## Setup

### Google Drive

In [95]:
import os
from google.colab import drive

# Mount Google Drive into the Colab filesystem, then show the working directory and its contents.
drive.mount('/content/drive')
print(os.getcwd(), os.listdir(os.getcwd())) #> /content ['.config', 'drive', 'sample_data']

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content ['.config', 'drive', 'sample_data']


In [96]:
# The data directory lives on Google Drive (mounted under /content/drive).
# You might need to create a Google Drive SHORTCUT that has this same path,
# ... or update the path to match your own Google Drive organization.
DATA_DIR = '/content/drive/MyDrive/Research/DS Research Shared 2023/data/impeachment_2020'
print(DATA_DIR)
# fail fast here if the folder is not reachable, before a later cell tries to read from it
assert os.path.isdir(DATA_DIR)

/content/drive/MyDrive/Research/DS Research Shared 2023/data/impeachment_2020


In [97]:
# CSV file holding the prepared users sample; it must already exist in the data directory
users_sample_csv_filepath = os.path.join(DATA_DIR, "users_sample_by_account_type_v2_and_their_tweets.csv")
assert os.path.isfile(users_sample_csv_filepath)

### OpenAI API Service

In [None]:
%%capture

!pip install openai

In [None]:
from getpass import getpass

# prompt for the key interactively (input is hidden) instead of hard-coding it in the notebook
OPENAI_API_KEY = getpass("Please provide your OpenAI API Key: ")
# show only the last four characters, as a sanity check that a key was entered
print("...", OPENAI_API_KEY[-4:])

Please provide your OpenAI API Key: ··········



  + https://github.com/openai/openai-python
  + https://platform.openai.com/account/api-keys
  + https://platform.openai.com/docs/introduction/key-concepts
  + https://platform.openai.com/docs/models/overview
  + https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
  + https://platform.openai.com/docs/guides/embeddings/embedding-models

> We recommend using `text-embedding-ada-002` for nearly all 
 (Embedding) use cases. It's better, cheaper, and simpler to use. Read the blog post announcement.


In [None]:
import openai
from openai import Model, Embedding
from pandas import DataFrame

# set the API key on the openai package (module-level setting)
openai.api_key = OPENAI_API_KEY

# default embedding model: the one the OpenAI docs quoted above recommend
MODEL_ID = "text-embedding-ada-002"

class OpenAIService():
    """Thin wrapper around the OpenAI model-listing and embeddings endpoints.

    Written against the `openai.Model` / `openai.Embedding` interfaces.

    Params:
        model_id (str): identifier of the embedding model that
            `get_embeddings` requests. Defaults to the notebook-wide MODEL_ID.
    """

    def __init__(self, model_id=MODEL_ID):
        self.model_id = model_id

    def get_models(self):
        """Lists the models returned by the API.

        Returns a DataFrame with one row per model, ordered by model id.
        """
        models = Model.list()

        records = []
        for model in sorted(models.data, key=lambda m: m.id):
            model_info = model.to_dict()
            # Drop the nested "permission" list so each record stays flat.
            # pop() rather than del, so a record without this key doesn't raise KeyError.
            model_info.pop("permission", None)
            records.append(model_info)

        return DataFrame(records)

    def get_embeddings(self, texts):
        """Pass in a list of strings. Returns a list of embeddings for each.

        All texts are sent in a single request, using this instance's
        `model_id` (not the module-level default), so a service constructed
        with a different model actually uses that model.
        """
        result = Embedding.create(input=texts, model=self.model_id)
        return [d["embedding"] for d in result["data"]]
    

# service instance used by the cells below (constructed with the default model id)
ai = OpenAIService()

In [None]:
#models_df = ai.get_models()
#models_df.head()

In [None]:
# smoke test: request embeddings for a few short example sentences
texts = [
    "I like apples, but bananas are gross.",
    "This is a tweet about bananas",
    "Drink apple juice!",
]
embeddings = ai.get_embeddings(texts)
print(len(embeddings)) # number of embeddings returned
print(len(embeddings[0])) # length of a single embedding vector

3
1536
[-0.00504452595487237, -0.01719384267926216, 0.026895813643932343, -0.0043577710166573524, 0.006321265362203121, -0.010357510298490524, -0.007242141291499138, -0.018504919484257698, 0.012848556973040104, -0.02166399173438549, 0.01654454693198204, 0.021189505234360695, 0.0013290265342220664, -0.01227418053895235, -0.022051071748137474, 0.03456249460577965, 0.03788388893008232, -0.0009286797139793634, -0.004545067902654409, -0.011512506753206253, -0.03169061243534088, 0.013697636313736439, 0.0070860604755580425, -0.02457333356142044, 0.00539414631202817, 0.010332537814974785, 0.027470190078020096, -0.031940340995788574, 0.008060003630816936, 0.006517927162349224, 0.0356113575398922, -0.016956599429249763, -0.005097593180835247, 3.616680623963475e-05, -0.016582006588578224, -0.01141261588782072, -0.012667504139244556, -0.0006282245158217847, -0.005868631415069103, -0.010619726032018661, -0.020277995616197586, 0.01589525118470192, -0.009096379391849041, -0.004501365125179291, -0.013

In [None]:
from pandas import Series

# label each embedding with the text it was requested for
Series(embeddings, index=texts)

I like apples, but bananas are gross.    [-0.00504452595487237, -0.01719384267926216, 0...
This is a tweet about bananas            [-0.014693296514451504, -0.019133443012833595,...
Drink apple juice!                       [0.004055480472743511, -0.01426590234041214, 0...
dtype: object

## Users Sample

Fetch a sample of users. Use the balanced sample we already prepared. 



In [None]:
from pandas import read_csv

def remove_delimeters(txt):
    """Replaces each " || " delimiter in the given text with a single space.

    Non-string values (for example None, or the float NaN that pandas uses for
    an empty CSV cell) are returned unchanged instead of raising an
    AttributeError, so the function can be applied to a column that has
    missing values.
    """
    if not isinstance(txt, str):
        return txt
    return txt.replace(" || ", " ")

# Load the prepared users sample, discarding the leftover "Unnamed: 0" column if it is present.
users_df = read_csv(users_sample_csv_filepath).drop(columns=["Unnamed: 0"], errors="ignore")
# Index the rows by user id, while keeping "user_id" available as a regular column too.
users_df = users_df.set_index("user_id", drop=False)
# Strip the delimiters that were inserted during the data export process:
cleaned_tweet_texts = users_df["tweet_texts"].apply(remove_delimeters).tolist()
users_df["tweet_texts"] = cleaned_tweet_texts
del cleaned_tweet_texts # temporary only; leave the namespace as it was

print(len(users_df))
print(users_df.columns)
users_df.iloc[0]

300
Index(['user_id', 'created_on', 'screen_name_count', 'screen_names',
       'status_count', 'rt_count', 'rt_pct', 'opinion_community', 'is_bot',
       'is_q', 'profile_descriptions', 'tweet_texts'],
      dtype='object')


user_id                                                          14710012
created_on                                                     2008-05-09
screen_name_count                                                       1
screen_names                                                     TREYPOLE
status_count                                                            5
rt_count                                                                1
rt_pct                                                                0.2
opinion_community                                                       0
is_bot                                                              False
is_q                                                                False
profile_descriptions    ACTOR, MUSICIAN, FILMMAKER, DATABASE ARCHITECT...
tweet_texts             RT @foxnewpolls: Daily FOX NEW POLL: (retweet ...
Name: 14710012, dtype: object

In [None]:
#users_df["profile_descriptions"].tolist()[0:5]

In [None]:
#users_df["tweet_texts"].tolist()[0:5]

## Embeddings


> NOTE: the embeddings request fails for some texts because the column contains NaN (missing) values. We therefore remove the NaNs before requesting embeddings, and join the results back onto the full set of users at the end.

In [98]:
#users_df["profile_descriptions"].isna()
#users_df["profile_descriptions"].notnull()

> ALSO: a pandas `Series` is not JSON serializable, so we request the embeddings for a plain list of strings (via `.tolist()`) instead of a Series.

### Profile Embeddings

In [99]:
# Keep only the users that have a profile description (missing values are excluded
# before requesting embeddings). A single .loc selection followed by .copy() gives an
# independent frame, so adding an "embeddings" column to it later does not trigger
# pandas' SettingWithCopyWarning about writing to a slice of users_df.
profiles_df = users_df.loc[
    users_df["profile_descriptions"].notnull(), ["user_id", "profile_descriptions"]
].copy()
#profiles_df.head()

In [100]:
# plain list of strings to send in the embeddings request
profile_texts = profiles_df["profile_descriptions"].tolist()
print("PROFILE TEXTS:", len(profile_texts))
print(profile_texts[0:5])

PROFILE TEXTS: 231
['ACTOR, MUSICIAN, FILMMAKER, DATABASE ARCHITECT, DATA PATHOLOGIST, SCIENCE LOVER, REGULAR LOVER, HISTORY BUFF, OTHER SUNDRY STUFF #THERESISTANCE', 'DFS PRO. FORMER POKER PRO. STL SPORTS ENTHUSIAST. @ROTOGRINDERS CONTRIBUTOR. @SIRIUSXMFANTASY HOST. VERY LUCKY GUY.', '2 GIRLS 2 BOYS ARMY VETüá∫üá≤ BSN RN CCM CLNC BA POLI SCI üèíü•ã AUTISM MOM #RESIST #NEVERTRUMP #AUTISM #FBRPARTY #FLINTWATERCRISIS #MSUSPARTANS | BLESSED W/ 2 GIRLS 2 BOYS ARMY VETüá∫üá≤ RN CCM CLNC üèíü•ã AUTISM MOM #RESIST  #NEVERTRUMP #AUTISM #FBRPARTY #FLINTWATERCRISIS', 'PRO #2A, #REPELTHEGOONS.  FOR GOD, COUNTRY & FAMILY. #USA.', 'JUST LIVE MY LIFE NOT TALKING ANYTHING SERIOUSLY HAVING FUN']


In [101]:
profile_embeddings = ai.get_embeddings(profile_texts) # API CALL
print("PROFILE EMBEDDINGS:", len(profile_embeddings))
print(len(profile_embeddings[0])) # length of a single embedding vector
print(profile_embeddings[0]) # NOTE: prints the entire first vector

PROFILE EMBEDDINGS: 231
1536
[-0.017691081389784813, -0.022827202454209328, -0.0025850445963442326, -0.028207900002598763, 0.0015889023197814822, 0.03290921449661255, -0.011189404875040054, -0.008804777637124062, -0.0016432527918368578, -0.013179991394281387, -0.006956861354410648, -0.009022179991006851, 0.007289757952094078, 0.004439754877239466, 0.03165915608406067, -0.0197428110986948, 0.04157811775803566, 0.005924202501773834, 0.012059012427926064, -0.02140050195157528, 0.002394818002358079, -0.0037298016250133514, -0.015449123457074165, -0.0028601940721273422, 0.00996651966124773, 0.010985590517520905, 0.01509584579616785, -0.03967585042119026, 0.0014453830663114786, -0.03016451559960842, 0.02160431630909443, -0.01925365813076496, -0.0024865344166755676, -0.028805755078792572, -0.02972971275448799, -0.027012187987565994, 0.0007052823784761131, -0.0028635908383876085, 0.008777602575719357, 0.0072829644195735455, 0.0018190426053479314, 0.0019634109921753407, -0.0223244596272707, -0.

In [102]:
# Positional assignment: the i-th embedding goes to the i-th row.
# NOTE(review): assumes the embeddings come back in the same order as profile_texts — confirm
profiles_df["embeddings"] = profile_embeddings

In [103]:
#profiles_df["embeddings"].iloc[0]

### Tweet Embeddings

In [104]:
# Keep only the users that have tweet text (missing values are excluded before
# requesting embeddings). A single .loc selection followed by .copy() gives an
# independent frame, so adding an "embeddings" column to it later does not trigger
# pandas' SettingWithCopyWarning about writing to a slice of users_df.
tweets_df = users_df.loc[
    users_df["tweet_texts"].notnull(), ["user_id", "tweet_texts"]
].copy()

In [105]:
# plain list of strings to send in the embeddings request
tweet_texts = tweets_df["tweet_texts"].tolist()
print("TWEET TEXTS:", len(tweet_texts))
print(tweet_texts[0:5])

TWEET TEXTS: 300
['RT @foxnewpolls: Daily FOX NEW POLL: (retweet for sample size)  Should Donald Trump be impeached and removed from office? @RepMikeJohnson Impeached forever. Acquitted for now. @GOPLeader He‚Äôll be impeached for life. And you and the rest of the GOP will be cowards for the rest of yours. @MarshaBlackburn Raised in Tennessee and he should be impeached. @GOPLeader Impeached for life. Acquitted for now.', 'RT @MattOswaltVA: "I JUST GOT ARRESTED FOR MAKING A PERFECT COCKTAIL!" - Bill Cosby https://t.co/xHorHsqTzM', "Just in time for impeachment!!! @Ryan449955 @bama_flyer @fleshmegazord @EaregoodSadie @brianhlp @charliekirk11 I cant wait for Brett Kavanaugh to be impeached for you believing Christine Ford now!! She is going to be so excited that you all believe her now!! @LindseyGrahamSC You've been silent for weeks.....remind yourself of your own words from Clinton's impeachment will you.  Point that finger at your own.", "I'd trust Casey Anthony with my kids more than I

In [106]:
tweet_embeddings = ai.get_embeddings(tweet_texts) # API CALL
# should match the number of tweet texts sent above
print("TWEET EMBEDDINGS:", len(tweet_embeddings))

TWEET EMBEDDINGS: 300


In [107]:
# Positional assignment: the i-th embedding goes to the i-th row.
# NOTE(review): assumes the embeddings come back in the same order as tweet_texts — confirm
tweets_df["embeddings"] = tweet_embeddings

## Save Embeddings

In [None]:
# Join the embeddings back onto the FULL users table. The embeddings frames are
# row subsets of users_df and keep its index, so we align on the index.
# Use LEFT joins: users without a profile description have no profile embedding,
# and the default inner join would silently drop those users (together with
# their tweet embeddings). With a left join they are kept, with a missing value.
# Each embeddings column is renamed up front so there is never an ambiguous
# intermediate "embeddings" column.
embeds_df = users_df.merge(
    profiles_df["embeddings"].rename("profile_embeddings"),
    how="left", left_index=True, right_index=True,
)
embeds_df = embeds_df.merge(
    tweets_df["embeddings"].rename("tweet_embeddings"),
    how="left", left_index=True, right_index=True,
)
# every user must appear exactly once, whether or not they have embeddings
assert len(embeds_df) == len(users_df)

#embeds_df.head()

In [113]:
# results are stored in a sub-directory of the data directory named after the embedding model
model_dirpath = os.path.join(DATA_DIR, MODEL_ID)
os.makedirs(model_dirpath, exist_ok=True) # no error if the directory already exists

embeddings_csv_filepath = os.path.join(model_dirpath, "users_sample_openai_embeddings.csv")
print(embeddings_csv_filepath)

/content/drive/MyDrive/Research/DS Research Shared 2023/data/impeachment_2020/text-embedding-ada-002/users_sample_openai_embeddings.csv


In [111]:
# NOTE(review): the index is written as well, so the file presumably ends up with the user id twice (index + "user_id" column) — confirm
# NOTE(review): the embedding columns hold Python lists, which presumably land in the CSV as text and need parsing when read back — confirm
embeds_df.to_csv(embeddings_csv_filepath)