In this notebook we load a previously pulled sample of users and their tweets from BigQuery, request embeddings for their tweet and profile texts from OpenAI, and save the embeddings to a new CSV file in Google Drive.

## Setup

### BigQuery Service

In [None]:
from google.colab import auth

# Prompts you to log in with your Google account (interactive).
auth.authenticate_user()

In [None]:
from google.cloud import bigquery
from pandas import DataFrame

PROJECT_ID = "tweet-research-shared"

class BigQueryService():
    """Thin wrapper around a BigQuery client for running SQL queries.

    Params:

        project_id : Optional Google Cloud project to run queries under.
            Falls back to the notebook-level PROJECT_ID when not provided.
    """

    def __init__(self, project_id=None):
        self.client = bigquery.Client(project=(project_id or PROJECT_ID))

    def execute_query(self, sql, verbose=True):
        """Submits the query and returns the job's result (an iterable of rows).

        Params:

            sql : The query string to run.

            verbose : Whether to print the query before running it.
        """
        if verbose:
            print(sql)
        job = self.client.query(sql)
        return job.result()

    def query_to_df(self, sql, verbose=True):
        """high-level wrapper to return a DataFrame (one row per result row)"""
        results = self.execute_query(sql, verbose=verbose)
        # each result row converts to a dict of column name -> value
        records = [dict(row) for row in results]
        return DataFrame(records)


In [None]:
# Create the BigQuery service used by the query cells, and confirm which project it points at.
bq = BigQueryService()
print("PROJECT:", bq.client.project)

PROJECT: tweet-research-shared


### Google Drive

In [None]:
import os
from google.colab import drive

# Mount Google Drive, then list the working directory to confirm the 'drive' folder is there.
drive.mount('/content/drive')
print(os.getcwd(), os.listdir(os.getcwd())) #> '/content', ['.config', 'drive', 'sample_data']

Mounted at /content/drive
/content ['.config', 'drive', 'sample_data']


In [None]:
# you might need to create a google drive SHORTCUT that has this same path
# ... or update the path to use your own google drive organization
DATA_DIR = '/content/drive/MyDrive/Research/DS Research Shared 2023/data/impeachment_2020'
print(DATA_DIR)
# fail early if the directory can't be found (e.g. drive not mounted, or shortcut missing)
assert os.path.isdir(DATA_DIR)

/content/drive/MyDrive/Research/DS Research Shared 2023/data/impeachment_2020


### Helper Functions

In [None]:
def split_into_batches(my_list, batch_size=10_000):
    """Yields consecutive slices of the list, each holding at most `batch_size` items.

    The final slice holds whatever remains, so it may be shorter than the others.
    """
    # h/t: https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    batch_starts = range(0, len(my_list), batch_size)
    yield from (my_list[start : start + batch_size] for start in batch_starts)


In [None]:
def dynamic_batches(texts, batch_char_limit=30_000):
    """Splits texts into batches, with specified max number of characters per batch.
        Batches may have different lengths.

        Texts stay in their original order, and every text ends up in exactly one
        batch, so the flattened batches line up one-to-one with the input texts.

        A text that is longer than the limit on its own gets truncated
        (to its first `batch_char_limit - 1` characters) rather than dropped.

        Params:

            texts : List of strings.

            batch_char_limit : Max total number of characters allowed in a single batch.

        Returns a list of batches, where each batch is a non-empty list of strings.
    """
    batches = []

    batch = []
    batch_chars = 0
    for text in texts:
        if len(text) > batch_char_limit:
            # CAP THE TEXT AT THE MAX BATCH LENGTH
            text = text[0:batch_char_limit-1]

        # measure AFTER capping, so the running total reflects what is actually in the batch
        text_chars = len(text)

        if batch and (batch_chars + text_chars) > batch_char_limit:
            # NO ROOM IN THIS BATCH, START A NEW ONE
            # (only when the current batch has something in it, so we never emit an empty batch)
            batches.append(batch)
            batch = []
            batch_chars = 0

        batch.append(text)
        batch_chars += text_chars

    if batch:
        batches.append(batch)

    return batches



In [None]:
texts = [
    "Short and sweet",
    "Short short",
    "I like apples, but bananas are gross.",
    "This is a tweet about bananas",
    "Drink apple juice!",
]
# small example frame: each text alongside its character count
texts_df = DataFrame({"text": texts}).assign(chars=lambda frame: frame["text"].str.len())
texts_df

Unnamed: 0,text,chars
0,Short and sweet,15
1,Short short,11
2,"I like apples, but bananas are gross.",37
3,This is a tweet about bananas,29
4,Drink apple juice!,18


In [None]:
# fixed-size batching: two texts per batch, with the remainder in the last batch
list(split_into_batches(texts_df["text"].tolist(), batch_size=2))

[['Short and sweet', 'Short short'],
 ['I like apples, but bananas are gross.', 'This is a tweet about bananas'],
 ['Drink apple juice!']]

In [None]:
# character-limited batching: at most 30 characters per batch, so the 37-character text gets truncated
dynamic_batches(texts_df["text"].tolist(), batch_char_limit=30)

[['Short and sweet', 'Short short'],
 ['I like apples, but bananas ar'],
 ['This is a tweet about bananas'],
 ['Drink apple juice!']]

### OpenAI API Service


  + https://github.com/openai/openai-python
  + https://platform.openai.com/account/api-keys
  + https://platform.openai.com/docs/introduction/key-concepts
  + https://platform.openai.com/docs/models/overview
  + https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
  + https://platform.openai.com/docs/guides/embeddings/embedding-models

> We recommend using `text-embedding-ada-002` for nearly all
 (Embedding) use cases. It's better, cheaper, and simpler to use.

In [None]:
%%capture
!pip install openai

In [None]:
from getpass import getpass

# prompt for the key (so it isn't stored in the notebook), then show only its last four characters as a sanity check
OPENAI_API_KEY = getpass("Please provide your OpenAI API Key: ")
print("...", OPENAI_API_KEY[-4:])

Please provide your OpenAI API Key: ··········
... AysE


In [None]:
import openai
from openai import Model, Embedding
from pandas import DataFrame
from time import sleep

# configure the openai package with the key provided above
openai.api_key = OPENAI_API_KEY

# identifier of the embedding model we request embeddings from
MODEL_ID = "text-embedding-ada-002"

class OpenAIService():
    """Wrapper around the OpenAI API for listing models and requesting text embeddings.

    Params:

        model_id : Identifier of the model to request embeddings from.
    """

    def __init__(self, model_id=MODEL_ID):
        self.model_id = model_id

    def get_models(self):
        """Returns a DataFrame describing the available models, one row per model, sorted by model id."""
        models = Model.list()

        records = []
        for model in sorted(models.data, key=lambda m: m.id):
            model_info = model.to_dict()
            # drop the nested "permission" list (when present) so each record stays flat
            model_info.pop("permission", None)
            records.append(model_info)

        return DataFrame(records)

    def get_embeddings(self, texts):
        """Pass in a list of strings. Returns a list of embeddings for each.

        The embeddings are returned in the same order as the texts.
        Makes a single API request, using this service's model.
        """
        if len(texts) == 0:
            return [] # nothing to embed, so skip the API call

        result = Embedding.create(input=texts, model=self.model_id) # API CALL
        # each item in the response reports the position ("index") of the text it belongs to,
        # so sort on that to guarantee the embeddings line up with the texts
        items = sorted(result["data"], key=lambda d: d["index"])
        return [d["embedding"] for d in items]

    def _get_embeddings_with_retry(self, texts_batch, sleep_seconds):
        """Requests embeddings for one batch, sleeping and retrying the same batch each time the rate limit is hit."""
        while True:
            try:
                return self.get_embeddings(texts_batch)  # API CALL
            except openai.error.RateLimitError:
                print(f"... Rate limit reached. Sleeping for {sleep_seconds} seconds.")
                sleep(sleep_seconds)
                # retry the same batch

    def _get_embeddings_for_batches(self, batches, sleep_seconds):
        """Requests embeddings batch by batch (printing progress), and returns them as one flat list."""
        embeddings = []
        for counter, texts_batch in enumerate(batches, start=1):
            print(counter, len(texts_batch))
            embeddings += self._get_embeddings_with_retry(texts_batch, sleep_seconds)
        return embeddings

    def get_embeddings_in_batches(self, texts, batch_size=250, sleep_seconds=60):
        """High level wrapper to work around RateLimitError:
                Rate limit reached for [MODEL] in [ORG] on tokens per min.
                Limit: 1_000_000 tokens / min.

            Splits the texts into batches of a fixed number of texts, and makes one
            request per batch. Whenever a request hits the rate limit, waits and then
            retries that same batch (there is no cap on the number of retries).

            Also beware InvalidRequestError:
                This model's maximum context length is 8191 tokens,
                however you requested X tokens (X in your prompt; 0 for the completion).
                Please reduce your prompt; or completion length.

            ... so we should make lots of smaller requests.

            Params:

                texts : List of strings.

                batch_size : Number of texts to request per API call

                sleep_seconds : Number of seconds to wait after hitting the rate limit, before retrying

            Returns a flat list of embeddings, one per text, in the same order as the texts.
        """
        batches = split_into_batches(texts, batch_size=batch_size)
        return self._get_embeddings_for_batches(batches, sleep_seconds=sleep_seconds)

    def get_embeddings_in_dynamic_batches(self, texts, batch_char_limit=30_000, sleep_seconds=60):
        """High level wrapper to work around API limitations

            RateLimitError:
                Rate limit reached for [MODEL] in [ORG] on tokens per min.
                Limit: 1_000_000 tokens / min.

            AND

            InvalidRequestError:
                This model's maximum context length is 8191 tokens,
                however you requested X tokens (X in your prompt; 0 for the completion).
                Please reduce your prompt; or completion length.

            Splits the texts into batches that each stay within a total character limit,
            and makes one request per batch. Whenever a request hits the rate limit,
            waits and then retries that same batch (there is no cap on the number of retries).

            Params:

                texts : List of strings. A text longer than the batch_char_limit gets truncated by the batching step.

                batch_char_limit : Number of max characters to request per API call. Should be less than around 32_000 based on API docs.

                sleep_seconds : Number of seconds to wait after hitting the rate limit, before retrying

            Returns a flat list of embeddings, one per text, in the same order as the texts.
        """
        batches = dynamic_batches(texts, batch_char_limit=batch_char_limit)
        return self._get_embeddings_for_batches(batches, sleep_seconds=sleep_seconds)

# service instance used by the embedding cells later in the notebook
ai = OpenAIService()

In [None]:
#models_df = ai.get_models()
#models_df.head()

In [None]:
#texts = [
#    "I like apples, but bananas are gross.",
#    "This is a tweet about bananas",
#    "Drink apple juice!",
#]
#embeddings = ai.get_embeddings(texts)
#print(len(embeddings))
#print(len(embeddings[0])) #> 1536

## Users Sample

### Summary

Summary of the user sample we previously pulled when cross-checking botometer scores:

In [None]:
# Count the distinct sampled users, and average their botometer scores,
# for each combination of opinion community / bot status / Q status.
sql = f"""
    SELECT
        u.opinion_community, u.is_bot, u.is_q
        ,count(distinct bom.user_id) as user_count
        , avg(cap) as avg_cap, avg(astroturf) as avg_astro
        --, avg(fake_follower) as avg_fakefollower, avg(financial) as avg_financial, avg(other) as avg_other
    FROM `tweet-research-shared.impeachment_2020.botometer_scores` bom
    JOIN `tweet-research-shared.impeachment_2020.user_details_v20210806_slim` u ON u.user_id = bom.user_id
    WHERE score_type='english' -- 7566
    GROUP BY 1,2,3
"""
print("SUMMARY OF BOTOMETER SAMPLE:")
bq.query_to_df(sql, verbose=False)

Unnamed: 0,opinion_community,is_bot,is_q,user_count,avg_cap,avg_astro
0,0,True,False,1881,0.750015,0.613244
1,1,True,False,1051,0.718188,0.434784
2,1,True,True,168,0.732622,0.525882
3,0,False,False,3010,0.48855,0.146859
4,1,False,False,1447,0.548545,0.141691
5,1,False,True,9,0.574715,0.328889


### Users

Fetch the sample of users we already pulled when checking botometer scores, along with their profiles (if they have them) and at most `TWEET_MAX` of their tweets (chosen at random). The botometer scores table has multiple rows for some users, so we average the botometer scores for these users to arrive at one row per user in the sample.

In [None]:
# max number of tweets to include per user (chosen at random by the query)
TWEET_MAX = 50
# string placed between a user's tweets when they are concatenated into a single text
TWEET_DELIMETER = " " # " || "

# One row per user: user details, profile descriptions (when available), a concatenation of
# up to TWEET_MAX randomly chosen tweets, and the user's average botometer scores.
# NOTE(review): the botometer table can hold several rows per user, so joining it to the tweets table
# before aggregating may repeat each tweet once per score row within `tweet_texts` — TODO confirm.
sql = f"""
    SELECT
        u.user_id
        ,u.created_on
        ,u.screen_name_count
        ,u.screen_names
        ,u.status_count
        ,u.rt_count
        ,(u.rt_count / u.status_count) as rt_pct
        ,u.avg_toxicity
        ,u.avg_fact_score

        ,u.opinion_community
        ,u.is_bot
        ,u.is_q

        ,up.descriptions as profile_descriptions

        -- here we are grabbing at max X of the user's tweets at random:
        ,string_agg(t.status_text, '{TWEET_DELIMETER}' ORDER BY rand() LIMIT {int(TWEET_MAX)}) as tweet_texts

        ,avg(bom.cap) as bom_cap
        ,avg(bom.astroturf) as bom_astroturf
        ,avg(bom.fake_follower) as bom_fake_follower
        ,avg(bom.financial) as bom_financial
        ,avg(bom.other) as bom_other

    FROM `tweet-research-shared.impeachment_2020.botometer_scores` bom
    JOIN `tweet-research-shared.impeachment_2020.user_details_v20210806_slim` u ON u.user_id = bom.user_id
    JOIN `tweet-research-shared.impeachment_2020.tweets_v2` t on t.user_id = u.user_id
    LEFT JOIN `tweet-research-shared.impeachment_2020.user_profiles_v2` up on up.user_id = u.user_id
    WHERE bom.score_type='english' -- 7566
    GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13
    -- LIMIT 10
"""
print(f"USERS SAMPLE AND THEIR TWEETS (MAX {TWEET_MAX}):")
df = bq.query_to_df(sql, verbose=False)
#df.index = df["user_id"]
print(len(df))
df.head()

USERS SAMPLE AND 50 OF THEIR TWEETS:
7566


Unnamed: 0,user_id,created_on,screen_name_count,screen_names,status_count,rt_count,rt_pct,avg_toxicity,avg_fact_score,opinion_community,is_bot,is_q,profile_descriptions,tweet_texts,bom_cap,bom_astroturf,bom_fake_follower,bom_financial,bom_other
0,479211236,2012-01-31,1,BIGREDMACHINE42,668,668,1.0,0.064429,1.809524,1,True,False,,RT @foxnewpolls: POLL: Should Donald Trump be ...,0.659646,0.32,0.16,0.35,0.54
1,34033550,2009-04-21,1,NURSINGPINS,763,753,0.986894,0.06952,2.528571,1,True,False,CRITICAL CARE R.N. STUDENT OF NURSING HISTOR...,RT @ouchinagirl: Pres.@realDonaldTrump Lawyers...,0.79897,0.47,0.51,0.255,0.43
2,515767837,2012-03-05,1,MARLAVAGTS,647,644,0.995363,0.046958,3.730159,0,True,False,PROGRESSIVE PASSIONATE COMPLICATED INDEPENDENT...,RT @politvidchannel: BREAKING: Sen. Tim Kaine ...,0.810078,0.79,0.42,0.21,0.56
3,3415696198,2015-08-11,1,NANMAC321,815,814,0.998773,0.047901,2.886905,0,True,False,👓👟🐶🦁🦋☘️🌊,"RT @TrialLawyerRich: Former Prosecutor, Senato...",0.833918,0.85,0.2,0.04,0.32
4,38444226,2009-05-07,1,GDIRTYDIME,1101,1097,0.996367,0.098514,3.345238,0,True,False,I AM JUST A CHILD OF GOD LIVING FOR MY JESUS.....,RT @Jwheels208: “Your favorite president” Some...,0.867012,0.9,0.56,0.04,0.47


In [None]:
# number of user ids that appear on more than one row
len(df[df["user_id"].duplicated()]["user_id"].unique()) #> 0, so there is one row per unique user in the sample

0

In [None]:
#def remove_delimeters(txt, delimeter=TWEET_DELIMETER):
#    return txt.replace(delimeter, " ")
#
# remove delimeters inserted during the data export process:
# df["tweet_texts"] = df["tweet_texts"].apply(remove_delimeters).tolist()

In [None]:
#users_df["profile_descriptions"].tolist()[0:5]

In [None]:
# peek at the first user's concatenated tweet texts
df["tweet_texts"].tolist()[0]

'RT @foxnewpolls: POLL: Should Donald Trump be impeached and removed? RT @realjuliasong: Does anyone know what Trump is being impeached for? I could never tell. RT @CarpeDonktum: It\'s funny, two weeks ago 90% of Americans didn\'t even understand how a Senate Impeachment Trial worked and today Twitter… RT @banks_harken: Nothing says complete bullshit quite like a socialist on a private jet.  https://t.co/lqQHsmBMQH RT @GOPChairwoman: So true @DonaldJTrumpJr.   Despite Democrats’ unprecedented attempts to impeach him, Americans see that @realDonaldTrump… RT @JackPosobiec: The US far left is trying to force impeachment on the voters like the UK far left tried to force Jeremy Corbyn   The resu… RT @RepMarkMeadows: There is no other way to spin it: The response today from the President’s legal team has been absolutely devastating to… RT @dbongino: It’s really incredible the damage the hapless Democrats have done to their own party. They initiated a sham impeachment, base… RT @BreitbartNews

Should we remove special characters?

### Tweets

In [None]:
# Select the rows and columns in a single .loc step and take an explicit copy, so the
# column assignments below modify this frame only (and don't raise SettingWithCopyWarning).
tweets_df = df.loc[df["tweet_texts"].notnull(), ["user_id", "tweet_texts"]].copy()

tweets_df["tweet_chars"] = tweets_df["tweet_texts"].str.len()
# rough token estimate, at around four characters per token
tweets_df["tweet_tokens"] = tweets_df["tweet_chars"] / 4
tweets_df.head()

Unnamed: 0,user_id,tweet_texts,tweet_chars,tweet_tokens
0,479211236,RT @foxnewpolls: POLL: Should Donald Trump be ...,6611,1652.75
1,34033550,RT @ouchinagirl: Pres.@realDonaldTrump Lawyers...,6654,1663.5
2,515767837,RT @politvidchannel: BREAKING: Sen. Tim Kaine ...,6899,1724.75
3,3415696198,"RT @TrialLawyerRich: Former Prosecutor, Senato...",6599,1649.75
4,38444226,RT @Jwheels208: “Your favorite president” Some...,6539,1634.75


In [None]:
# summary statistics for the per-user text lengths (in characters)
tweets_df["tweet_chars"].describe()

count     7566.000000
mean      3275.305578
std       3168.891538
min          7.000000
25%        140.000000
50%       1421.500000
75%       6663.000000
max      22467.000000
Name: tweet_chars, dtype: float64

In [None]:
import plotly.express as px

# horizontal violin plot (with an inner box plot) of the per-user text lengths
px.violin(tweets_df, x="tweet_chars", orientation="h",box=True, title="Distribution of User Tweet Text Lengths", height=350)

Distribution is bi-modal. Less tweet text for humans than bots. There are some outliers with much longer tweet texts. We may need to cap their text at a reasonable number of characters.

We need to cap the text length at the max number of characters allowable for an OpenAI API request.

> 1 token ~= 4 chars in English.

 https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them


But this token to character ratio is inexact, so we may need to further decrease in practice.

In [None]:
# Character budget for a single request, estimated from the model's token limit.
approx_chars_per_token = 4 # around four chars per token but this is inexact
max_tokens_per_request = 8_000 # 8191
max_characters_per_request = approx_chars_per_token * max_tokens_per_request
print("MAX CHARS PER REQUEST:", max_characters_per_request) #> 32_000

MAX CHARS PER REQUEST: 32000


In [None]:
#tweets_df["tweet_texts"].str[0:8]

In [None]:
# cap each user's text at a fixed number of characters (well below the per-request estimate)
TWEET_CHARS_MAX = 10_000 # 32_000

tweets_df["tweet_texts"] = tweets_df["tweet_texts"].str[0:TWEET_CHARS_MAX]
# recompute the length columns now that the texts have been capped
tweets_df["tweet_chars"] = tweets_df["tweet_texts"].str.len()
tweets_df["tweet_tokens"] = tweets_df["tweet_chars"] / 4

# re-plot the distribution after capping
px.violin(tweets_df, x="tweet_chars", orientation="h",box=True, title="Distribution of User Tweet Text Lengths", height=350)

In [None]:
# https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
# 1 token ~= 4 chars in English.
#tokens_limit = 1_000_000 # per minute
#characters_limit = tokens_limit * 4
#
#characters_total = tweets_df["tweet_chars"].sum()
#batches_needed = characters_total / characters_limit
#batch_size_needed = len(tweets_df) / batches_needed
#
#print("BATCHES NEEDED:", batches_needed)
#print("EST. BATCH SIZE:", batch_size_needed)

### Profiles


Not all users have profile texts. The API doesn't like null values or empty strings, so we will need to remove these rows before processing, and then outer join the profile embeddings back together with the tweet embeddings at the end.

In [None]:
#users_df["profile_descriptions"].isna()
#users_df["profile_descriptions"].notnull()

In [None]:
# keep only the users who have a (non-null) profile description
profiles_df = df.loc[df["profile_descriptions"].notnull(), ["user_id", "profile_descriptions"]]
print(len(profiles_df))

# filter out '' values
# ... and take an explicit copy, so a column can be added to this frame later on
# without modifying `df` or raising SettingWithCopyWarning
profiles_df = profiles_df[profiles_df["profile_descriptions"].str.strip() != ""].copy()
print(len(profiles_df))

7566
5746


## Embeddings

### Tweet Embeddings

In [None]:
# plain list of texts to embed, in the same row order as tweets_df
tweet_texts = tweets_df["tweet_texts"].tolist()
print(len(tweet_texts))

In [None]:
# request embeddings for every user's tweet text, batching by total characters per request
# (prints the batch number and the number of texts in each batch as it goes)
tweet_embeddings = ai.get_embeddings_in_dynamic_batches(
    tweet_texts,
    batch_char_limit=15_000
)
print(len(tweet_embeddings))

1 2
2 2
3 2
4 2
5 2
6 2
7 5
8 14
9 2
10 2
11 2
12 2
13 2
14 12
15 11
16 2
17 2
18 2
19 2
20 2
21 2
22 5
23 18
24 2
25 2
26 2
27 2
28 2
29 2
30 3
31 15
32 2
33 2
34 2
35 2
36 2
37 3
38 5
39 18
40 2
41 2
42 2
43 2
44 2
45 2
46 2
47 4
48 14
49 2
50 2
51 2
52 3
53 2
54 2
55 2
56 2
57 3
58 20
59 2
60 2
61 2
62 2
63 2
64 4
65 18
66 2
67 1
68 1
69 2
70 2
71 2
72 2
73 2
74 2
75 2
76 3
77 24
78 2
79 2
80 2
81 2
82 2
83 2
84 2
85 2
86 4
87 17
88 2
89 2
90 2
91 2
92 2
93 2
94 15
95 2
96 2
97 2
98 2
99 2
100 2
101 2
102 2
103 2
104 6
105 9
106 2
107 2
108 2
109 2
110 2
111 2
112 6
113 17
114 2
115 2
116 2
117 2
118 2
119 2
120 2
121 2
122 12
123 2
124 2
125 2
126 2
127 2
128 2
129 2
130 3
131 23
132 2
133 2
134 2
135 2
136 2
137 1
138 1
139 2
140 17
141 2
142 2
143 2
144 2
145 2
146 2
147 1
148 2
149 2
150 2
151 6
152 24
153 2
154 2
155 2
156 2
157 2
158 2
159 2
160 2
161 15
162 2
163 2
164 2
165 2
166 2
167 2
168 2
169 2
170 21
171 2
172 2
173 2
174 2
175 2
176 2
177 3
178 16
179 2
180 2
181 2
18

In [None]:
# one embedding per row (the texts were taken from this frame in row order)
tweets_df["embeddings"] = tweet_embeddings

### Profile Embeddings

In [None]:
# plain list of profile texts to embed, in the same row order as profiles_df
profile_texts = profiles_df["profile_descriptions"].tolist()
print("PROFILE TEXTS:", len(profile_texts))
print(profile_texts[0:5])

PROFILE TEXTS: 5746
['CRITICAL CARE R.N.   STUDENT OF NURSING HISTORY.   I COLLECT HISTORICAL NURSING RELATED ITEMS. NEW  ORLEANS', 'PROGRESSIVE PASSIONATE COMPLICATED INDEPENDENT MIDWESTERN WOMAN #RESISTERSISTER DUDES DON’T DM ME', '👓👟🐶🦁🦋☘️🌊', 'I AM JUST A CHILD OF GOD LIVING FOR MY JESUS...AMEN STAY BLESSED AND HIGHLY FAVORED.🙏🌊🌊#BLUEWAVE ITS COMING2020😎✌#RESISTANCE🌊#FBR', 'I WILL ALWAYS BE THERE FOR YOU SAID THE FLOOR...']


In [None]:
# request embeddings for every profile text, batching by total characters per request
# (prints the batch number and the number of texts in each batch as it goes)
profile_embeddings = ai.get_embeddings_in_dynamic_batches(
    profile_texts,
    batch_char_limit=15_000
)
print(len(profile_embeddings))

1 127
2 114
3 50
4 104
5 108
6 129
7 136
8 113
9 113
10 85
11 125
12 97
13 121
14 129
15 112
16 119
17 118
18 124
19 109
20 117
21 127
22 129
23 100
24 127
25 110
26 90
27 128
28 120
29 131
30 93
31 119
32 124
33 107
34 100
35 126
36 86
37 124
38 121
39 110
40 105
41 122
42 111
43 115
44 122
45 94
46 92
47 122
48 113
49 113
50 130
51 85
5746


In [None]:
# one embedding per row (the texts were taken from this frame in row order)
profiles_df["embeddings"] = profile_embeddings

In [None]:
#profiles_df["embeddings"].iloc[0]

## Save Embeddings

In [None]:
# Attach both sets of embeddings to the users frame, matching rows on the index.
# Each embeddings column gets its final name up front, so no renaming is needed after merging.
profile_embeds = profiles_df["embeddings"].rename("profile_embeddings")
tweet_embeds = tweets_df["embeddings"].rename("tweet_embeddings")

# outer join to keep users who don't have profiles
embeds_df = df.merge(profile_embeds, left_index=True, right_index=True, how="outer")
embeds_df = embeds_df.merge(tweet_embeds, left_index=True, right_index=True, how="outer")

#embeds_df.head()

In [None]:
# save into a subdirectory of the data directory named after the embedding model (created if missing)
model_dirpath = os.path.join(DATA_DIR, MODEL_ID)
os.makedirs(model_dirpath, exist_ok=True)

embeddings_csv_filepath = os.path.join(model_dirpath, "botometer_sample_openai_embeddings_20230704.csv")
print(embeddings_csv_filepath)
# writes the user columns plus the profile and tweet embeddings columns
embeds_df.to_csv(embeddings_csv_filepath)

/content/drive/MyDrive/Research/DS Research Shared 2023/data/impeachment_2020/text-embedding-ada-002/botometer_sample_openai_embeddings_20230704.csv
