This notebook needs the "High RAM" runtime in Colab; it crashes with the basic runtime.

## Setup

### Google Drive

In [None]:
import os
from google.colab import drive

# Mount Google Drive so the shared project folder is reachable under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# you might need to create a google drive SHORTCUT that has this same path
# ... or update the path to use your own google drive organization
DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024'
# Displays True when the folder is visible through the mounted drive.
os.path.isdir(DIRPATH)

True

In [None]:
# Project data folder; the embeddings parquet file is written to (and read from) this directory.
DATA_DIRPATH = os.path.join(DIRPATH, "projects", "Impeachment 2020 Embeddings", "data")
os.path.isdir(DATA_DIRPATH)

True

### BigQuery Service

In [None]:
from google.colab import auth

# asks you to login
# NOTE(review): presumably these are the credentials the BigQuery client picks up - confirm
auth.authenticate_user()

In [None]:
# SOURCE: https://github.com/s2t2/openai-embeddings-2023/blob/main/app/bq_service.py

from google.cloud import bigquery
from pandas import DataFrame, read_gbq
#from datetime import datetime

PROJECT_ID = "tweet-collector-py"

class BigQueryService():
    """Thin wrapper around a BigQuery client: run queries and insert rows in batches.

    Params:
        project_id (str) : project the client is created for (defaults to PROJECT_ID)
    """

    def __init__(self, project_id=PROJECT_ID):
        self.project_id = project_id
        self.client = bigquery.Client(project=self.project_id)

    def execute_query(self, sql, verbose=True):
        """Submits the query and returns whatever the query job's `result()` call returns.

        Params:
            sql (str) : the query text
            verbose (bool) : print the query text before submitting it
        """
        if verbose:
            print(sql)
        job = self.client.query(sql)
        return job.result()

    def query_to_df(self, sql, verbose=True):
        """high-level wrapper to return a DataFrame

        Params:
            sql (str) : the query text
            verbose (bool) : print the query text before submitting it
        """
        if verbose:
            print(sql)
        # https://pandas.pydata.org/docs/reference/api/pandas.read_gbq.html#pandas-read-gbq
        return read_gbq(sql, project_id=self.project_id, progress_bar_type="tqdm_notebook")

    # WRITING

    @staticmethod
    def split_into_batches(my_list, batch_size=10_000):
        """Splits a list into evenly sized batches (the last batch may be shorter).

        Generator: yields successive slices of `my_list`, each at most `batch_size` long.

        Raises:
            ValueError : if batch_size is less than 1. A negative size would otherwise
                yield no batches at all and silently drop every item.
        """
        # h/t: https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
        for i in range(0, len(my_list), batch_size):
            yield my_list[i : i + batch_size]

    def insert_records_in_batches(self, table, records, batch_size=5_000):
        """
        Inserts records in batches because attempting to insert too many rows at once
            may result in google.api_core.exceptions.BadRequest: 400

        Params:
            table (table ID string, Table, or TableReference)
            records (list of dictionaries)
            batch_size (int) : max number of rows sent per insert call

        Returns the concatenated list of errors reported by the insert calls
            (empty when nothing was reported).

        Raises:
            ValueError : if batch_size is less than 1
        """
        # Each record is sent as a positional list of its values, in the dict's own key order.
        # NOTE(review): presumably that order has to match the destination table's schema - confirm.
        rows_to_insert = [list(d.values()) for d in records]
        errors = []
        # batches are consumed lazily; there is no need to materialise them all up front
        for batch in BigQueryService.split_into_batches(rows_to_insert, batch_size=batch_size):
            errors.extend(self.client.insert_rows(table, batch))
        return errors



In [None]:
# Shared service instance used by the query and insert cells in the rest of the notebook.
bq = BigQueryService()
print(bq)

<__main__.BigQueryService object at 0x7ac6dc8fe800>


In [None]:
print("DATASETS:")
datasets = list(bq.client.list_datasets())
for ds in datasets:
    # leave out any dataset whose reference contains "user"
    if "user" in str(ds.reference):
        continue
    print("...", ds.reference)

DATASETS:
... tweet-collector-py.analysis_2021
... tweet-collector-py.analysis_2021_development
... tweet-collector-py.collection_2021
... tweet-collector-py.disinfo_2021_development
... tweet-collector-py.disinfo_2021_production
... tweet-collector-py.election_2020_analysis
... tweet-collector-py.election_2020_development
... tweet-collector-py.election_2020_production
... tweet-collector-py.f1_racing_2023_development
... tweet-collector-py.f1_racing_2023_production
... tweet-collector-py.impeachment_2021_development
... tweet-collector-py.impeachment_2021_production
... tweet-collector-py.impeachment_backup
... tweet-collector-py.impeachment_development
... tweet-collector-py.impeachment_production
... tweet-collector-py.impeachment_test
... tweet-collector-py.jan6_committee_development
... tweet-collector-py.jan6_committee_production
... tweet-collector-py.transition_2021_development
... tweet-collector-py.transition_2021_production
... tweet-collector-py.truth_2023_development
... 

### OpenAI Service

In [None]:
%%capture

!pip install openai==0.28 # ok so there is now a 1.0 interface but we originally obtained using earlier API, so pinning that here

In [None]:
from google.colab import userdata

# The key is looked up at runtime rather than being written into the notebook.
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY") #os.getenv("OPENAI_API_KEY")
# Print only the first three characters, enough to confirm that a key was loaded.
print(OPENAI_API_KEY[0:3], "...")

sk- ...


In [None]:
# SOURCE: https://github.com/s2t2/openai-embeddings-2023/blob/main/app/openai_service.py

def split_into_batches(my_list, batch_size=10_000):
    """Splits a list into evenly sized batches (the last batch may be shorter).

    Generator: yields successive slices of `my_list`, each at most `batch_size` long.

    Raises:
        ValueError : if batch_size is less than 1. A negative size would otherwise
            yield no batches at all and silently drop every item.
    """
    # h/t: https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for i in range(0, len(my_list), batch_size):
        yield my_list[i : i + batch_size]

def dynamic_batches(texts, batch_char_limit=30_000):
    """Splits texts into batches, with specified max number of characters per batch.

        Caps text length at the maximum batch size (individual text cannot exceed batch size).
        Batches may have different lengths. Input order is preserved, and no empty batch
        is ever produced.

        Params:
            texts (iterable of str)
            batch_char_limit (int) : max total characters per batch

        Returns a list of batches (each a non-empty list of str).

        Raises:
            ValueError : if batch_char_limit is less than 1
    """
    if batch_char_limit < 1:
        raise ValueError(f"batch_char_limit must be a positive integer, got {batch_char_limit!r}")

    batches = []

    batch = []
    batch_chars = 0
    for text in texts:
        if len(text) > batch_char_limit:
            # CAP THE TEXT AT THE MAX BATCH LENGTH
            text = text[:batch_char_limit]
        # measured AFTER capping, so the running total reflects what is actually in the batch
        text_chars = len(text)

        # only close a batch that has something in it, otherwise an over-long text
        # arriving while the batch is empty would emit an empty batch
        if batch and (batch_chars + text_chars) > batch_char_limit:
            # NO ROOM IN THIS BATCH, START A NEW ONE:
            batches.append(batch)
            batch = []
            batch_chars = 0

        batch.append(text)
        batch_chars += text_chars

    if batch:
        batches.append(batch)

    return batches


In [None]:
# SOURCE: https://github.com/s2t2/openai-embeddings-2023/blob/main/app/openai_service.py

import os
from time import sleep
from pprint import pprint
import json

import openai
from openai import Model, Embedding
from pandas import DataFrame
#from dotenv import load_dotenv

#load_dotenv()

MODEL_ID = "text-embedding-ada-002" #  os.getenv("OPENAI_EMBEDDING_MODEL_ID", default="text-embedding-ada-002")

openai.api_key = OPENAI_API_KEY

class OpenAIService():
    """OpenAI API Service

        + https://github.com/openai/openai-python
        + https://platform.openai.com/account/api-keys
        + https://platform.openai.com/docs/introduction/key-concepts
        + https://platform.openai.com/docs/models/overview
        + https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
        + https://platform.openai.com/docs/guides/embeddings/embedding-models

        > 2023: "We recommend using `text-embedding-ada-002` for nearly all
        (Embedding) use cases. It's better, cheaper, and simpler to use."

        > 2024: New "large" and "small" models (TODO)
    """

    def __init__(self, model_id=MODEL_ID):
        """Stores the id of the embedding model to request, and prints it."""
        self.model_id = model_id
        print("EMBEDDING MODEL:", self.model_id)


    def get_models(self):
        """Lists the models (API call), sorted by id.

        Returns a DataFrame with one row per model; the "permission" entry
        is dropped from each model's info when it is present.
        """
        models = Model.list()
        #print(type(models)) #> openai.openai_object.OpenAIObject

        records = []
        for model in sorted(models.data, key=lambda m: m.id):
            model_info = model.to_dict()
            # nested list; pop with a default so a model without the key is not an error
            model_info.pop("permission", None)
            records.append(model_info)

        return DataFrame(records)

    def get_embeddings(self, texts):
        """Pass in a list of strings. Returns a list of embeddings for each."""
        result = Embedding.create(input=texts, model=self.model_id) # API CALL
        # Each returned item carries the index of the input it belongs to; sorting on it
        # keeps the embeddings aligned with `texts` whatever order the response lists them in.
        ordered = sorted(result["data"], key=lambda d: d["index"])
        return [d["embedding"] for d in ordered]

    def _get_embeddings_with_retry(self, texts_batch, sleep_seconds, retry_when_unavailable=False):
        """Requests embeddings for one batch, sleeping and retrying the same batch on failure.

            RateLimitError is always retried. ServiceUnavailableError is retried only when
            `retry_when_unavailable` is set; otherwise it propagates to the caller.
            There is no cap on the number of retries.
        """
        while True:
            try:
                return self.get_embeddings(texts_batch)  # API CALL
            except openai.error.RateLimitError:
                print(f"... Rate limit reached. Sleeping for {sleep_seconds} seconds.")
            except openai.error.ServiceUnavailableError as err:
                if not retry_when_unavailable:
                    raise
                print(f"... Service Unavailz. Sleeping for {sleep_seconds} seconds.")
                print(err)
            sleep(sleep_seconds)
            # ... then retry the same batch

    def get_embeddings_in_batches(self, texts, batch_size=250, sleep_seconds=60):
        """High level wrapper to work around RateLimitError:
                Rate limit reached for [MODEL] in [ORG] on tokens per min.
                Limit: 1_000_000 tokens / min.

            batch_size : Number of texts to request per API call

            sleep_seconds : How long to wait after a rate limit error
                            before requesting the same batch again

            Also beware InvalidRequestError:
                This model's maximum context length is 8191 tokens,
                however you requested X tokens (X in your prompt; 0 for the completion).
                Please reduce your prompt; or completion length.

            ... so we should make lots of smaller requests.

            Returns a flat list with one embedding per input text, in input order.
        """
        embeddings = []
        batches = split_into_batches(texts, batch_size=batch_size)
        for counter, texts_batch in enumerate(batches, start=1):
            print(counter, len(texts_batch))
            embeddings += self._get_embeddings_with_retry(texts_batch, sleep_seconds)
        return embeddings

    def get_embeddings_in_dynamic_batches(self, texts, batch_char_limit=30_000, sleep_seconds=60):
        """High level wrapper to work around API limitations

            RateLimitError:
                Rate limit reached for [MODEL] in [ORG] on tokens per min.
                Limit: 1_000_000 tokens / min.

            AND

            InvalidRequestError:
                This model's maximum context length is 8191 tokens,
                however you requested X tokens (X in your prompt; 0 for the completion).
                Please reduce your prompt; or completion length.

            Params:

                batch_char_limit : Number of max characters to request per API call.
                                    Should be less than around 32_000 based on API docs.

                sleep_seconds : How long to wait after a rate limit or service unavailable
                                error before requesting the same batch again

            Returns a flat list of embeddings, in input order.
        """
        embeddings = []
        batches = dynamic_batches(texts, batch_char_limit=batch_char_limit)
        for counter, texts_batch in enumerate(batches, start=1):
            print("BATCH:", counter, "SIZE:", len(texts_batch))
            if not texts_batch:
                # nothing to embed in an empty batch, so don't spend an API call on it
                continue
            embeddings += self._get_embeddings_with_retry(
                texts_batch, sleep_seconds, retry_when_unavailable=True
            )
        return embeddings



In [None]:
# Service instance using the default embedding model (MODEL_ID); the constructor prints the model id.
ai = OpenAIService()

EMBEDDING MODEL: text-embedding-ada-002


In [None]:
# API call: one row per model returned by the OpenAI API, sorted by id.
models_df = ai.get_models()
models_df.head()

Unnamed: 0,id,object,created,owned_by
0,babbage-002,model,1692634615,system
1,dall-e-2,model,1698798177,system
2,dall-e-3,model,1698785189,system
3,davinci-002,model,1692634301,system
4,gpt-3.5-turbo,model,1677610602,openai


In [None]:
#models_df["id"].tolist()

Text embeddings models:

In [None]:
from pandas import to_datetime

# "created" is interpreted as seconds since the epoch (unit="s"); keep only the calendar date.
models_df["created_date"] = to_datetime(models_df["created"], unit="s").dt.date

# Show only the models whose id contains "text-embedding".
models_df[models_df["id"].str.contains("text-embedding")]

Unnamed: 0,id,object,created,owned_by,created_date
19,text-embedding-3-large,model,1705953180,system,2024-01-22
20,text-embedding-3-small,model,1705948997,system,2024-01-22
21,text-embedding-ada-002,model,1671217299,openai-internal,2022-12-16


## Sample of Users and Tweets

A sample of users, with at most 50 of each user's tweets sampled at random. This is the same dataset we are using for the user-level and status-level embeddings.

### Loading Data

In [None]:
# One row per sampled tweet, joined on user_id to the user details table to pick up
# the is_bot and opinion_community columns.
sql = f"""
    SELECT t.user_id, t.row_num, t.status_id, t.status_text, t.created_at
            ,u.is_bot, u.opinion_community --, u.avg_fact_score, u.avg_toxicity
    FROM `tweet-collector-py.impeachment_production.botometer_sample_max_50` t
    JOIN `tweet-collector-py.impeachment_production.user_details_v20240128_slim` u on u.user_id = t.user_id

    -- LIMIT 100
"""

df = bq.query_to_df(sql)
df.head()


    SELECT t.user_id, t.row_num, t.status_id, t.status_text, t.created_at
            ,u.is_bot, u.opinion_community --, u.avg_fact_score, u.avg_toxicity 
    FROM `tweet-collector-py.impeachment_production.botometer_sample_max_50` t
    JOIN `tweet-collector-py.impeachment_production.user_details_v20240128_slim` u on u.user_id = t.user_id 

    -- LIMIT 100



Downloading:   0%|          |

Unnamed: 0,user_id,row_num,status_id,status_text,created_at,is_bot,opinion_community
0,1213938997136777216,1,1214318056408780801,@realDonaldTrump @POTUS Trump is doing a great...,2020-01-06 22:49:20+00:00,False,1
1,1037808767117737984,1,1223111414971133954,RT @brianbeutler: Like clockwork. https://t.co...,2020-01-31 05:11:00+00:00,False,0
2,1205624758899019777,1,1218912815123136512,RT @SethAbramson: RETWEET: The Democrats can g...,2020-01-19 15:07:16+00:00,False,0
3,1205624758899019777,2,1206417518182748160,RT @CNN: Half of American voters want the Pres...,2019-12-16 03:35:25+00:00,False,0
4,1205624758899019777,3,1206415143443337217,RT @JoeNBC: Brutal numbers for the president. ...,2019-12-16 03:25:58+00:00,False,0


In [None]:
# Row count vs. distinct status ids: the row count is slightly higher, so a few
# statuses appear more than once in the query result.
print(len(df)) #> 183738
print(df["status_id"].nunique()) #> 183727

183738
183727


In [None]:
# row count before and after keeping a single row per status_id
print(len(df))
df = df.drop_duplicates(subset="status_id")
print(len(df))

183738
183727


### Inspecting the Data

In [None]:
from pandas import to_datetime

# parse the timestamps and derive the calendar date of each tweet
df["created_at"] = to_datetime(df["created_at"])
df["created_on"] = df["created_at"].dt.date

# order based on time within each user, then renumber the rows from zero
df = df.sort_values(by=["user_id", "created_at"]).reset_index(drop=True)

# 1-based position of each tweet within its user's (time ordered) timeline
df["user_ts"] = df.groupby("user_id").cumcount() + 1

df.tail(10)

Unnamed: 0,user_id,row_num,status_id,status_text,created_at,is_bot,opinion_community,created_on,user_ts
183717,1234200349600288772,23,1242190075510886403,RT @Lrihendry: @SpeakerPelosi @HouseDemocrats ...,2020-03-23 20:42:47+00:00,False,1,2020-03-23,48
183718,1234200349600288772,1,1242239910028992513,RT @RealJamesWoods: The two-headed hydra of Pe...,2020-03-24 00:00:48+00:00,False,1,2020-03-24,49
183719,1234200349600288772,47,1242326885532663808,RT @thebradfordfile: Raise your hand if you wa...,2020-03-24 05:46:25+00:00,False,1,2020-03-24,50
183720,1234846911028453376,1,1234854326331461632,This cause is very close to my heart - please ...,2020-03-03 14:53:08+00:00,False,0,2020-03-03,1
183721,1237940420136456192,1,1239244949935116289,RT @TomFitton: .@realDonaldTrump admin should ...,2020-03-15 17:39:54+00:00,False,1,2020-03-15,1
183722,1237940420136456192,2,1241248879103725571,RT @bfraser747: If that's not awesome check th...,2020-03-21 06:22:48+00:00,False,1,2020-03-21,2
183723,1237940420136456192,4,1241385078531305472,RT @actlightning: Speaker Pelosi's Heinrich Hi...,2020-03-21 15:24:01+00:00,False,1,2020-03-21,3
183724,1237940420136456192,3,1242083933472227334,RT @mjeannd: What in the hell is wrong with @S...,2020-03-23 13:41:01+00:00,False,1,2020-03-23,4
183725,1238854780191195136,1,1240621202252730370,RT @DonaIdJTrrump: Coronavirus is revenge for ...,2020-03-19 12:48:38+00:00,False,0,2020-03-19,1
183726,1240138605726760962,1,1240791023040827393,RT @DonaIdJTrrump: If I catch another motherfu...,2020-03-20 00:03:27+00:00,False,0,2020-03-20,1


In [None]:
# Number of sampled tweets per user: the largest user_ts value seen for that user.
user_counts = df.groupby("user_id")["user_ts"].max().rename("user_status_count")
# Assign the column by lookup instead of merging, so the cell is safe to re-run:
# a second merge would collide with the existing column and leave "_x" / "_y"
# suffixed columns in place of "user_status_count".
df["user_status_count"] = df["user_id"].map(user_counts)
df.tail(10)

Unnamed: 0,user_id,row_num,status_id,status_text,created_at,is_bot,opinion_community,created_on,user_ts,user_status_count
183717,1234200349600288772,23,1242190075510886403,RT @Lrihendry: @SpeakerPelosi @HouseDemocrats ...,2020-03-23 20:42:47+00:00,False,1,2020-03-23,48,50
183718,1234200349600288772,1,1242239910028992513,RT @RealJamesWoods: The two-headed hydra of Pe...,2020-03-24 00:00:48+00:00,False,1,2020-03-24,49,50
183719,1234200349600288772,47,1242326885532663808,RT @thebradfordfile: Raise your hand if you wa...,2020-03-24 05:46:25+00:00,False,1,2020-03-24,50,50
183720,1234846911028453376,1,1234854326331461632,This cause is very close to my heart - please ...,2020-03-03 14:53:08+00:00,False,0,2020-03-03,1,1
183721,1237940420136456192,1,1239244949935116289,RT @TomFitton: .@realDonaldTrump admin should ...,2020-03-15 17:39:54+00:00,False,1,2020-03-15,1,4
183722,1237940420136456192,2,1241248879103725571,RT @bfraser747: If that's not awesome check th...,2020-03-21 06:22:48+00:00,False,1,2020-03-21,2,4
183723,1237940420136456192,4,1241385078531305472,RT @actlightning: Speaker Pelosi's Heinrich Hi...,2020-03-21 15:24:01+00:00,False,1,2020-03-21,3,4
183724,1237940420136456192,3,1242083933472227334,RT @mjeannd: What in the hell is wrong with @S...,2020-03-23 13:41:01+00:00,False,1,2020-03-23,4,4
183725,1238854780191195136,1,1240621202252730370,RT @DonaIdJTrrump: Coronavirus is revenge for ...,2020-03-19 12:48:38+00:00,False,0,2020-03-19,1,1
183726,1240138605726760962,1,1240791023040827393,RT @DonaIdJTrrump: If I catch another motherfu...,2020-03-20 00:03:27+00:00,False,0,2020-03-20,1,1


In [None]:
import plotly.express as px

# number of distinct sampled tweets for each user
chart_df = df.groupby("user_id")["status_id"].nunique()
px.histogram(chart_df, title="Number of Tweets per User (sample max 50)")

In [None]:
#import plotly.express as px
#
#chart_df = df.groupby("user_id")["status_id"].cumcount()
#px.histogram(chart_df, title="Number of Users with at least X tweets in sample")

If we choose a minimum threshold to limit the sample to users who have a larger number of tweets (anywhere over 2 or 3), we are going to be capturing mostly bots, although there are a good number of humans in here as well.

In [None]:
# One bar per user_ts value. user_ts numbers each user's tweets 1..N, so a user with
# N sampled tweets contributes one row to each of the bars 1 through N.
px.histogram(df, x="user_ts", title="Users with at least X tweets",
             color="is_bot", color_discrete_map={True: "purple", False: "grey"},
             #labels={"user_ts": "Number of Tweets in Sample (max 50 per user)"}
)

In [None]:
# Restrict to users whose sample holds between 1 and 10 tweets (both ends included),
# then report the size of that subset and its split by bot status.
chart_df = df[df["user_status_count"].between(1, 10, inclusive="both")]
print(len(chart_df))
print("USERS:", chart_df["user_id"].nunique())
print("TWEETS:", chart_df["status_id"].nunique())
print("TEXTS:", chart_df["status_text"].nunique())
print("----------")
print(chart_df.groupby("is_bot")["user_id"].nunique())

px.histogram(chart_df, x="user_ts", title="Users with max of X tweets",
             color="is_bot", color_discrete_map={True: "purple", False: "grey"},
             labels={"user_ts": "Number of Tweets in Sample (max 50 per user)"},
             text_auto=True
)

8268
USERS: 3821
TWEETS: 8268
TEXTS: 6261
----------
is_bot
False    3821
Name: user_id, dtype: int64


In [None]:
LIMIT = 20 # 50

# keep exactly one row per qualifying user: the LIMIT-th tweet of each user
# who has at least LIMIT tweets in the sample
has_enough_tweets = df["user_status_count"] >= LIMIT
is_limit_th_tweet = df["user_ts"] == LIMIT
chart_df = df[has_enough_tweets & is_limit_th_tweet]

print("USERS:", chart_df["user_id"].nunique())
print("TWEETS:", chart_df["status_id"].nunique())
print("TEXTS:", chart_df["status_text"].nunique())

print(chart_df.groupby("is_bot")["user_id"].nunique())

#px.histogram(chart_df, x="user_ts", title=f"Users with at least {LIMIT} tweets",
#             color="is_bot", color_discrete_map={True: "purple", False: "grey"},
#             labels={"user_ts": "Number of Tweets in Sample (max 50 per user)"},
#             text_auto=True
#)

USERS: 3522
TWEETS: 3522
TEXTS: 3061
is_bot
False     422
True     3100
Name: user_id, dtype: int64


In [None]:
# Totals for the whole de-duplicated sample; distinct texts are far fewer than tweets.
print("TWEETS:", len(df))
print("TEXTS:", df["status_text"].nunique())

print("USERS:", df["user_id"].nunique())
#df.tail(10)

TWEETS: 183727
TEXTS: 80205
USERS: 7566


### User Time Series

We need embeddings for each user in a cumulative way. It doesn't make sense to get cumulative embeddings for a user who has only one tweet; those can be looked up individually from what we've already collected. We only care about chains of two or more tweets (X >= 2).

5289 users remain.

We will fetch embeddings for cumulative chains, essentially one for each tweet in the dataset (181,450).

Although with concern for cost, for users in the dataset that have 50 tweets, we will probably be able to see drift after 10 or 20 tweets.

If we make the number too small, we may not capture the entire period. Remember these are already a random sample of the tweets, so they are likely to be more dispersed over the entire time period.

Perhaps we can just get them all (to cover entire period).

In [None]:
CUMULATIVE_MAX = 20
# tweets kept for each candidate value of CUMULATIVE_MAX:
# 10: 43,441 tweets
# 15: 61,856 tweets
# 20: 79,662 tweets
# 25: 97,090 tweets
# 50: 181,450 tweets

# users with at least two sampled tweets (181_450 tweets) ...
has_multiple_tweets = df["user_status_count"] >= 2
# ... limited to the first CUMULATIVE_MAX tweets of each user's timeline
within_cumulative_max = df["user_ts"].between(1, CUMULATIVE_MAX, inclusive="both")

ts = df.loc[
    has_multiple_tweets & within_cumulative_max,
    ["user_id", "user_status_count", "user_ts", "status_id", "status_text", "created_at", "is_bot", "opinion_community"],
].reset_index(drop=True)

print("TWEETS:", ts["status_id"].nunique())
print("TEXTS:", ts["status_text"].nunique())
print("USERS:", ts["user_id"].nunique())

print(ts.groupby("is_bot")["user_id"].nunique())

ts.tail(10)

TWEETS: 79662
TEXTS: 39575
USERS: 5289
is_bot
False    2189
True     3100
Name: user_id, dtype: int64


Unnamed: 0,user_id,user_status_count,user_ts,status_id,status_text,created_at,is_bot,opinion_community
79652,1234200349600288772,50,15,1237187991656955904,RT @JoyLinPark: .⁦@SpeakerPelosi⁩ Yet another ...,2020-03-10 01:26:17+00:00,False,1
79653,1234200349600288772,50,16,1237817181079044097,RT @DonaldJTrumpJr: Good thread. https://t.co/...,2020-03-11 19:06:28+00:00,False,1
79654,1234200349600288772,50,17,1238160710154235904,RT @JamesMo96533181: @bbusa617 @RubyRockstar33...,2020-03-12 17:51:31+00:00,False,1
79655,1234200349600288772,50,18,1238161929195175938,RT @DonaldJTrumpJr: This pretty much sums ever...,2020-03-12 17:56:22+00:00,False,1
79656,1234200349600288772,50,19,1238162081851064321,RT @charliekirk11: Sick: Nancy Pelosi tried t...,2020-03-12 17:56:58+00:00,False,1
79657,1234200349600288772,50,20,1238182290926252032,RT @ROHLL5: Lt. Col. Allen West Suggests Nancy...,2020-03-12 19:17:17+00:00,False,1
79658,1237940420136456192,4,1,1239244949935116289,RT @TomFitton: .@realDonaldTrump admin should ...,2020-03-15 17:39:54+00:00,False,1
79659,1237940420136456192,4,2,1241248879103725571,RT @bfraser747: If that's not awesome check th...,2020-03-21 06:22:48+00:00,False,1
79660,1237940420136456192,4,3,1241385078531305472,RT @actlightning: Speaker Pelosi's Heinrich Hi...,2020-03-21 15:24:01+00:00,False,1
79661,1237940420136456192,4,4,1242083933472227334,RT @mjeannd: What in the hell is wrong with @S...,2020-03-23 13:41:01+00:00,False,1


### User Cumulative Timelines

Let's calculate the cumulative timelines.

In [None]:
# Peek at the per-user ordering (user_ts) that the cumulative text is built on.
ts[["user_id", "user_ts", "status_text"]].head()

Unnamed: 0,user_id,user_ts,status_text
0,2952,1,RT @Mikel_Jollett: “The party told you to reje...
1,2952,2,RT @Sky_Lee_1: #FactsMatter &amp; saying the o...
2,2952,3,"RT @MillenPolitics: Back in 1787, James Madiso..."
3,2952,4,RT @RBReich: So let me get this straight: An i...
4,2952,5,RT @RBReich: Just so we're all clear: This is ...


In [None]:
# ts.groupby("user_id")["status_text"].cumsum()

# Within each user: append a space to every status text and take the running
# concatenation, so row N holds that user's first N texts joined by spaces.
# The final strip removes leading/trailing whitespace, including the trailing separator.
ts["cumulative_text"] = ts.groupby("user_id", group_keys=False)["status_text"].apply(lambda txt: (txt + " ").cumsum().str.strip())
#ts[["user_id", "status_text", "cumulative_text"]].head()

In [None]:
# Character counts of the individual status text and of the cumulative text.
ts["status_length"] = ts["status_text"].str.len()
ts["cumulative_length"] = ts["cumulative_text"].str.len()
ts[["user_id", "status_text", "status_length", "cumulative_text", "cumulative_length"]].head()

Unnamed: 0,user_id,status_text,status_length,cumulative_text,cumulative_length
0,2952,RT @Mikel_Jollett: “The party told you to reje...,130,RT @Mikel_Jollett: “The party told you to reje...,130
1,2952,RT @Sky_Lee_1: #FactsMatter &amp; saying the o...,143,RT @Mikel_Jollett: “The party told you to reje...,274
2,2952,"RT @MillenPolitics: Back in 1787, James Madiso...",140,RT @Mikel_Jollett: “The party told you to reje...,415
3,2952,RT @RBReich: So let me get this straight: An i...,140,RT @Mikel_Jollett: “The party told you to reje...,556
4,2952,RT @RBReich: Just so we're all clear: This is ...,140,RT @Mikel_Jollett: “The party told you to reje...,697


In [None]:
# Summary statistics; the max is what matters for the character limit used when fetching embeddings.
ts["cumulative_length"].describe()

count    79662.000000
mean      1334.951282
std        818.522584
min          5.000000
25%        613.000000
50%       1275.000000
75%       2009.750000
max      11845.000000
Name: cumulative_length, dtype: float64

In [None]:
# Distribution of cumulative text lengths, coloured by bot status; every point is drawn.
px.violin(ts, x="cumulative_length", title="Cumulative Length of User Timelines",
        points="all", box=True,
        color="is_bot", color_discrete_map={True: "purple", False: "grey"},
)

In [None]:
# Distribution of individual status text lengths, coloured by bot status.
px.violin(ts, x="status_length", title="Length of Status Texts",
        #points="all",
        box=True,
        color="is_bot", color_discrete_map={True: "purple", False: "grey"},
)

## Fetching Embeddings

We have been setting the character limit at 15K. Since every user's cumulative timeline length is under this limit, no text gets truncated and we should get complete results.

In [None]:
# One cumulative text per row of ts, in row order.
texts = ts["cumulative_text"].tolist()
texts[0:3]

['RT @Mikel_Jollett: “The party told you to reject the evidence of your eyes and ears. It was their final, most essential command.”…',
 'RT @Mikel_Jollett: “The party told you to reject the evidence of your eyes and ears. It was their final, most essential command.”… RT @Sky_Lee_1: #FactsMatter &amp; saying the opposite doesn’t speak it into truth.   Thank you CNN for breaking down the #GOPComplicitTraitors…',
 'RT @Mikel_Jollett: “The party told you to reject the evidence of your eyes and ears. It was their final, most essential command.”… RT @Sky_Lee_1: #FactsMatter &amp; saying the opposite doesn’t speak it into truth.   Thank you CNN for breaking down the #GOPComplicitTraitors… RT @MillenPolitics: Back in 1787, James Madison envisioned 3 reasons why a President ought to be impeached:  1) They may lose their ability…']

In [None]:
# API calls: one request per batch, so this cell is slow and prints a line per batch.
# batch_char_limit=15_000 overrides the method's 30_000 default.
embeddings = ai.get_embeddings_in_dynamic_batches(texts, batch_char_limit=15_000)
print(len(embeddings))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
BATCH: 2546 SIZE: 10
BATCH: 2547 SIZE: 10
BATCH: 2548 SIZE: 9
BATCH: 2549 SIZE: 10
BATCH: 2550 SIZE: 5
BATCH: 2551 SIZE: 13
BATCH: 2552 SIZE: 6
BATCH: 2553 SIZE: 13
BATCH: 2554 SIZE: 7
BATCH: 2555 SIZE: 13
BATCH: 2556 SIZE: 7
BATCH: 2557 SIZE: 13
BATCH: 2558 SIZE: 7
BATCH: 2559 SIZE: 13
BATCH: 2560 SIZE: 7
BATCH: 2561 SIZE: 22
BATCH: 2562 SIZE: 8
BATCH: 2563 SIZE: 13
BATCH: 2564 SIZE: 7
BATCH: 2565 SIZE: 15
BATCH: 2566 SIZE: 7
BATCH: 2567 SIZE: 14
BATCH: 2568 SIZE: 7
BATCH: 2569 SIZE: 16
BATCH: 2570 SIZE: 9
BATCH: 2571 SIZE: 12
BATCH: 2572 SIZE: 8
BATCH: 2573 SIZE: 24
BATCH: 2574 SIZE: 7
BATCH: 2575 SIZE: 13
BATCH: 2576 SIZE: 7
BATCH: 2577 SIZE: 13
BATCH: 2578 SIZE: 7
BATCH: 2579 SIZE: 21
BATCH: 2580 SIZE: 8
BATCH: 2581 SIZE: 11
BATCH: 2582 SIZE: 9
BATCH: 2583 SIZE: 12
BATCH: 2584 SIZE: 14
BATCH: 2585 SIZE: 7
BATCH: 2586 SIZE: 19
BATCH: 2587 SIZE: 7
BATCH: 2588 SIZE: 12
BATCH: 2589 SIZE: 8
BATCH: 2590 SIZE: 12
BATCH: 2591

In [None]:
# Attach the embeddings to their rows; relies on `embeddings` being in the same order as `texts`.
ts["embeddings"] = embeddings

## Saving Embeddings

Save to a gzip-compressed Parquet file on Drive:

In [None]:
# Write the selected columns (including the embeddings) to a parquet file in the
# project data folder; compression="gzip" is passed through to to_parquet.
pq_filepath = os.path.join(DATA_DIRPATH, f"botometer_sample_max_50_openai_cumulative_embeddings.parquet.gz")
ts[[
    "user_id", "user_status_count", "user_ts",
    "status_id", "status_text", "created_at",
    "cumulative_length", "cumulative_text",
    "embeddings"
]].to_parquet(pq_filepath, index=False, compression="gzip")

Save to BQ:

In [None]:
# Fully qualified name of the destination table, and the table object fetched for it.
embeddings_table_name = f"tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_cumulative_embeddings"
embeddings_table = bq.client.get_table(embeddings_table_name) # API call!
embeddings_table

Table(TableReference(DatasetReference('tweet-collector-py', 'impeachment_production'), 'botometer_sample_max_50_openai_cumulative_embeddings'))

In [None]:
# insert_records_in_batches sends each record as a positional list of its values.
# NOTE(review): presumably the column order below has to match the table's schema - confirm.
# NOTE(review): re-running this cell sends the same rows again - confirm whether that duplicates them.
records = ts[[
    "user_id", "user_status_count", "user_ts",
    "status_id", #"status_text", "created_at",
    "cumulative_length", "cumulative_text",
    "embeddings"
]].to_dict("records")

# running into google api issues with larger batches -
# there are so many embeddings for each row, so we lower the batch count substantially
bq.insert_records_in_batches(embeddings_table, records, batch_size=50)

[]

In [None]:
# Sanity check: distinct users and statuses now in the table, for comparison with
# the USERS / TWEETS counts printed when `ts` was built.
sql = f"""
    SELECT
        count(distinct user_id) as user_count
        , count(distinct status_id)  as status_count
    FROM `{embeddings_table_name}`
    -- LIMIT 100
"""

bq.query_to_df(sql)



    SELECT  
        count(distinct user_id) as user_count
        , count(distinct status_id)  as status_count
    FROM `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_cumulative_embeddings`
    -- LIMIT 100



Downloading:   0%|          |

Unnamed: 0,user_count,status_count
0,5289,79662


In [None]:
# Preview a few of the stored rows, including the embeddings column.
sql = f"""
    SELECT
        *
    FROM `{embeddings_table_name}`
    LIMIT 10
"""

bq.query_to_df(sql)


    SELECT  
        *
    FROM `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_cumulative_embeddings`
    LIMIT 10



Downloading:   0%|          |

Unnamed: 0,user_id,user_status_count,user_ts,status_id,cumulative_length,cumulative_text,embeddings
0,1219819336266928129,50,13,1221530244668149760,1674,RT @thebradfordfile: Trump is the only preside...,"[-0.02055618353188038, -0.011393598280847073, ..."
1,1219819336266928129,50,14,1221641154896318464,1815,RT @thebradfordfile: Trump is the only preside...,"[-0.019131431356072426, -0.008759384043514729,..."
2,1219819336266928129,50,15,1221816999124393984,1901,RT @thebradfordfile: Trump is the only preside...,"[-0.0193190760910511, -0.009714343585073948, 0..."
3,1219819336266928129,50,16,1222326415439384576,2042,RT @thebradfordfile: Trump is the only preside...,"[-0.021313780918717384, -0.008096817880868912,..."
4,1219819336266928129,50,17,1223029562684190722,2183,RT @thebradfordfile: Trump is the only preside...,"[-0.019543197005987167, -0.00960690900683403, ..."
5,1219819336266928129,50,18,1223090067834433536,2324,RT @thebradfordfile: Trump is the only preside...,"[-0.02310226671397686, -0.0054807583801448345,..."
6,1219819336266928129,50,19,1223093594925326336,2464,RT @thebradfordfile: Trump is the only preside...,"[-0.02032068744301796, -0.004223151132464409, ..."
7,1219819336266928129,50,20,1223094300386291712,2605,RT @thebradfordfile: Trump is the only preside...,"[-0.020268414169549942, -0.004312060307711363,..."
8,1219835726235197441,50,1,1219990104732815360,130,RT @BillKristol: Right. Because if Trump is re...,"[-0.03653779253363609, -0.03582550212740898, -..."
9,1219835726235197441,50,2,1220412123127468033,261,RT @BillKristol: Right. Because if Trump is re...,"[-0.018464352935552597, -0.035196442157030106,..."


### Unpacked

In [None]:
import os
from pandas import read_parquet

# Load the "packed" dataset, where each row's embedding is held in a single
# "embeddings" column, from the shared data directory.
# The path is a plain string literal (no interpolation needed, so no f-prefix).
pq_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_cumulative_embeddings.parquet.gz")
df_packed = read_parquet(pq_filepath)

# Report the size only; the preview itself comes from the rich display of
# .head() below rather than a plain-text dump of the frame.
print(df_packed.shape)
df_packed.head()

                   user_id  ...                                         embeddings
0                     2952  ...  [-0.02606826461851597, 0.004675503820180893, -...
1                     2952  ...  [-0.031863126903772354, 0.006444419734179974, ...
2                     2952  ...  [-0.022716030478477478, 0.006075295619666576, ...
3                     2952  ...  [-0.027161207050085068, 0.006671885959804058, ...
4                     2952  ...  [-0.02330687642097473, 0.002930297749117017, -...
...                    ...  ...                                                ...
79657  1234200349600288772  ...  [-0.015118515118956566, -0.004739969968795776,...
79658  1237940420136456192  ...  [-0.00848793238401413, -0.006201688200235367, ...
79659  1237940420136456192  ...  [-0.007954545319080353, -0.0006160541088320315...
79660  1237940420136456192  ...  [-0.011712012812495232, -0.0029381639324128628...
79661  1237940420136456192  ...  [-0.01155290100723505, -0.006559472996741533, ...

[79

Unnamed: 0,user_id,user_status_count,user_ts,status_id,status_text,created_at,cumulative_length,cumulative_text,embeddings
0,2952,6,1,1209130970282123264,RT @Mikel_Jollett: “The party told you to reje...,2019-12-23 15:17:42+00:00,130,RT @Mikel_Jollett: “The party told you to reje...,"[-0.02606826461851597, 0.004675503820180893, -..."
1,2952,6,2,1211309643529674755,RT @Sky_Lee_1: #FactsMatter &amp; saying the o...,2019-12-29 15:34:58+00:00,274,RT @Mikel_Jollett: “The party told you to reje...,"[-0.031863126903772354, 0.006444419734179974, ..."
2,2952,6,3,1222518769383288837,"RT @MillenPolitics: Back in 1787, James Madiso...",2020-01-29 13:56:02+00:00,415,RT @Mikel_Jollett: “The party told you to reje...,"[-0.022716030478477478, 0.006075295619666576, ..."
3,2952,6,4,1223462557916246016,RT @RBReich: So let me get this straight: An i...,2020-02-01 04:26:19+00:00,556,RT @Mikel_Jollett: “The party told you to reje...,"[-0.027161207050085068, 0.006671885959804058, ..."
4,2952,6,5,1223464425266524160,RT @RBReich: Just so we're all clear: This is ...,2020-02-01 04:33:44+00:00,697,RT @Mikel_Jollett: “The party told you to reje...,"[-0.02330687642097473, 0.002930297749117017, -..."


In [None]:
import json
from pandas import DataFrame


def unpack(embeddings_str):
    """Decode one embeddings value.

    A string is parsed as JSON (expected to hold an array of floats) and the
    parsed result is returned. Any value that is not a string is assumed to be
    decoded already and is handed back untouched.
    """
    # Guard clause: nothing to decode unless we were given text.
    if not isinstance(embeddings_str, str):
        return embeddings_str

    return json.loads(embeddings_str)


def unpacked(df, col_prefix="openai"):
    """Takes a dataframe with a single "embeddings" column of OpenAI embeddings,
        and unpacks them into their own separate columns,
        and returns a new dataframe in which the original "embeddings" column
        is replaced by the new unpacked columns.

        Params:
            df (DataFrame): must contain an "embeddings" column, where each value
                is either a JSON string holding an array, or an already-decoded
                sequence of numbers (see `unpack`).

            col_prefix (str): prefix for the new column names, which are
                "{col_prefix}_0", "{col_prefix}_1", etc.

        Returns a DataFrame with the same rows, row order, and index as `df`.
            The remaining original columns come first, followed by the unpacked
            columns. The dataframe passed in is not modified.
    """
    # local import keeps this cell self-contained (the cell only imports DataFrame)
    from pandas import concat

    print("UNPACKING...")
    embeds = df["embeddings"].apply(unpack)
    print(type(embeds))

    print("RECONSTRUCTING...")
    # one row per original row (in the same order), one column per dimension
    embeds = DataFrame(embeds.values.tolist())
    embeds.columns = [f"{col_prefix}_{col}" for col in embeds.columns]
    print(embeds.shape)

    print("MERGING...")
    # Combine by row POSITION, not by index label. An index-on-index merge
    # multiplies rows whenever the index contains repeated labels. Dropping the
    # packed column first also avoids carrying it through the combine step.
    remaining = df.drop(columns=["embeddings"]).reset_index(drop=True)
    df_unpacked = concat([remaining, embeds], axis=1)
    df_unpacked.index = df.index  # restore the caller's original index
    print(df_unpacked.shape)
    return df_unpacked

In [None]:
# Expand the packed "embeddings" column into one column per dimension
# (using `unpacked`, defined in the previous cell), then sanity-check the result.
df_unpacked = unpacked(df_packed)
print(df_unpacked.shape)
# NOTE: this prints every column name, which is a very long line for a frame
# with one column per embedding dimension.
print(df_unpacked.columns.tolist())
df_unpacked.head()

UNPACKING...
<class 'pandas.core.series.Series'>
RECONSTRUCTING...
(79662, 1536)
MERGING...
(79662, 1544)
(79662, 1544)
['user_id', 'user_status_count', 'user_ts', 'status_id', 'status_text', 'created_at', 'cumulative_length', 'cumulative_text', 'openai_0', 'openai_1', 'openai_2', 'openai_3', 'openai_4', 'openai_5', 'openai_6', 'openai_7', 'openai_8', 'openai_9', 'openai_10', 'openai_11', 'openai_12', 'openai_13', 'openai_14', 'openai_15', 'openai_16', 'openai_17', 'openai_18', 'openai_19', 'openai_20', 'openai_21', 'openai_22', 'openai_23', 'openai_24', 'openai_25', 'openai_26', 'openai_27', 'openai_28', 'openai_29', 'openai_30', 'openai_31', 'openai_32', 'openai_33', 'openai_34', 'openai_35', 'openai_36', 'openai_37', 'openai_38', 'openai_39', 'openai_40', 'openai_41', 'openai_42', 'openai_43', 'openai_44', 'openai_45', 'openai_46', 'openai_47', 'openai_48', 'openai_49', 'openai_50', 'openai_51', 'openai_52', 'openai_53', 'openai_54', 'openai_55', 'openai_56', 'openai_57', 'openai_58

Unnamed: 0,user_id,user_status_count,user_ts,status_id,status_text,created_at,cumulative_length,cumulative_text,openai_0,openai_1,openai_2,openai_3,openai_4,openai_5,openai_6,openai_7,openai_8,openai_9,openai_10,openai_11,openai_12,openai_13,openai_14,openai_15,openai_16,openai_17,openai_18,openai_19,openai_20,openai_21,openai_22,openai_23,openai_24,openai_25,openai_26,openai_27,openai_28,openai_29,openai_30,openai_31,openai_32,openai_33,openai_34,openai_35,openai_36,openai_37,openai_38,openai_39,openai_40,openai_41,...,openai_1486,openai_1487,openai_1488,openai_1489,openai_1490,openai_1491,openai_1492,openai_1493,openai_1494,openai_1495,openai_1496,openai_1497,openai_1498,openai_1499,openai_1500,openai_1501,openai_1502,openai_1503,openai_1504,openai_1505,openai_1506,openai_1507,openai_1508,openai_1509,openai_1510,openai_1511,openai_1512,openai_1513,openai_1514,openai_1515,openai_1516,openai_1517,openai_1518,openai_1519,openai_1520,openai_1521,openai_1522,openai_1523,openai_1524,openai_1525,openai_1526,openai_1527,openai_1528,openai_1529,openai_1530,openai_1531,openai_1532,openai_1533,openai_1534,openai_1535
0,2952,6,1,1209130970282123264,RT @Mikel_Jollett: “The party told you to reje...,2019-12-23 15:17:42+00:00,130,RT @Mikel_Jollett: “The party told you to reje...,-0.026068,0.004676,-0.009185,-0.013853,-0.00176,0.013181,-0.007946,-0.002704,0.002449,-0.024789,0.027973,0.015612,0.006737,0.006907,-0.015518,0.005691,0.033115,0.016211,0.018702,-0.015572,-0.00728,0.0138,0.00728,-0.019994,1.5e-05,-0.018316,0.018023,-0.044384,-0.021539,-0.021353,-0.004083,0.005205,-0.011376,-0.019728,-0.035379,-0.001316,-0.006817,-0.023258,0.01709,-0.005894,0.036845,0.015931,...,-0.005668,0.042812,0.015159,-0.012468,0.001978,-0.021419,0.020887,-0.005475,0.005202,0.005861,-0.00222,0.023964,0.008006,0.02082,-0.032928,-0.008998,0.016877,0.021419,0.030344,-0.015905,-0.000597,0.025868,0.005082,0.003726,0.014226,-0.006244,-0.018409,0.021766,0.006967,-0.003367,-0.019261,0.023271,0.013014,-0.0349,-0.018369,0.01729,-0.022125,0.008758,-0.019088,0.027573,-0.005715,-0.002349,0.002967,-0.023284,-0.038177,0.016544,0.028319,0.000413,0.001131,-0.015159
1,2952,6,2,1211309643529674755,RT @Sky_Lee_1: #FactsMatter &amp; saying the o...,2019-12-29 15:34:58+00:00,274,RT @Mikel_Jollett: “The party told you to reje...,-0.031863,0.006444,-0.004872,-0.027487,0.012834,0.015453,0.002726,-0.016533,0.020226,-0.026708,0.030058,0.037196,0.000984,-0.012786,-0.001654,-3.4e-05,0.019528,0.008444,0.018762,-0.004937,0.003945,0.011316,-0.002106,-0.01853,0.003897,-0.007959,0.020554,-0.04893,-0.013839,-0.039931,0.008089,-0.006574,-0.021224,-0.02132,-0.02002,-0.00281,-0.012267,-0.027679,0.004858,-0.009094,0.041545,0.002752,...,-0.017873,0.05202,-0.002829,-0.011583,-0.018626,-0.012321,0.026708,-0.008219,-0.004909,-0.00625,-0.012513,-0.002631,-0.007501,0.009326,-0.024451,-0.00627,0.003403,0.028499,0.031535,-0.022468,0.007056,0.019856,0.002099,0.002564,0.007104,-0.006824,-0.018694,0.028007,-0.002354,-0.002159,-0.022605,0.012342,0.011214,-0.033395,-0.009737,0.024465,-0.015508,0.003679,-0.016793,0.020472,0.011877,0.027747,-0.002913,-0.03293,-0.048684,0.022386,0.029675,0.003116,-0.001966,-0.025709
2,2952,6,3,1222518769383288837,"RT @MillenPolitics: Back in 1787, James Madiso...",2020-01-29 13:56:02+00:00,415,RT @Mikel_Jollett: “The party told you to reje...,-0.022716,0.006075,-0.00353,-0.024677,-0.006959,0.000852,-0.002235,-0.013769,0.012057,-0.009699,0.040327,0.03267,0.009249,-0.011654,-0.002673,0.011486,0.032294,0.007382,0.023925,0.004806,0.005118,-0.015019,-0.003029,-0.014374,0.002248,-0.004258,0.028801,-0.039656,-0.003946,-0.02684,0.013716,-0.012957,-0.019022,-0.027485,-0.028748,0.006398,-0.012124,-0.022461,0.008799,-0.015193,0.046023,-0.001079,...,-0.022864,0.039978,0.013648,-0.001577,-0.009269,-0.01342,0.039521,-0.006891,-0.018068,0.013581,-0.00184,0.006965,-0.005608,0.000694,-0.03302,-0.0107,0.006495,0.031273,0.034954,-0.01792,0.000997,0.018968,0.021185,0.013346,-0.006005,-0.021991,-0.03259,0.030951,-0.023159,-0.00029,-0.008369,0.025215,0.03087,-0.027391,-0.00406,0.016872,-0.009209,0.008597,-0.01432,0.014172,0.0116,0.003028,-0.006777,-0.025577,-0.038071,0.005723,0.035491,0.008,-0.000417,-0.033611
3,2952,6,4,1223462557916246016,RT @RBReich: So let me get this straight: An i...,2020-02-01 04:26:19+00:00,556,RT @Mikel_Jollett: “The party told you to reje...,-0.027161,0.006672,-0.002653,-0.018175,-0.010752,-0.006137,-0.00833,-0.01375,0.002067,-0.006665,0.038326,0.024806,0.007998,-0.006567,-0.010576,0.004767,0.032155,0.004699,0.019894,-0.005031,0.014494,-0.015617,-0.00163,-0.012593,0.001749,-0.015482,0.031614,-0.036621,-0.000868,-0.026999,0.004419,-0.004002,-0.021775,-0.025686,-0.029719,0.005748,-0.01329,-0.022208,0.011923,-0.013554,0.047366,0.003668,...,-0.02141,0.043875,0.015834,-0.004632,-0.010245,-0.011253,0.036242,-0.009812,-0.012877,0.009392,0.0013,0.010394,-0.005654,0.002287,-0.035186,-0.011794,0.012295,0.032263,0.027824,-0.018378,0.008986,0.016294,0.014169,0.016795,-0.001324,-0.022573,-0.030396,0.036161,-0.022722,0.008424,-0.009236,0.024414,0.024752,-0.027662,-0.006926,0.008749,-0.001439,0.010224,-0.007775,0.011084,-0.00154,-0.000851,-0.001886,-0.026011,-0.038407,0.004818,0.03183,0.003847,-0.005941,-0.032994
4,2952,6,5,1223464425266524160,RT @RBReich: Just so we're all clear: This is ...,2020-02-01 04:33:44+00:00,697,RT @Mikel_Jollett: “The party told you to reje...,-0.023307,0.00293,-0.00374,-0.020231,-0.007466,-0.000966,-0.008171,-0.021925,0.001524,-0.005088,0.039839,0.023822,0.004031,-0.006044,-0.009587,0.002737,0.038727,0.001495,0.021301,-0.00746,0.009607,-0.019784,-0.002663,-0.001217,-0.002697,-0.015231,0.031762,-0.035611,-0.008618,-0.03038,0.002205,-0.003621,-0.02244,-0.027142,-0.026193,0.006443,-0.01664,-0.025001,0.003249,-0.014553,0.044961,0.003228,...,-0.020001,0.03393,0.012798,-0.008564,-0.015895,-0.010779,0.038375,-0.007067,-0.022982,0.010414,0.004882,0.00935,-0.00663,0.006816,-0.039161,-0.016762,0.01454,0.038727,0.023009,-0.016166,0.012189,0.015949,0.010624,0.019282,-0.006582,-0.013571,-0.032088,0.036424,-0.017521,0.000977,-0.01332,0.023429,0.019716,-0.025651,-0.013652,0.015231,0.000852,0.010292,-0.009316,0.013862,-0.003811,0.003571,0.000859,-0.031627,-0.036234,0.00897,0.029838,-0.000321,-0.005125,-0.033903


In [None]:
# Save the unpacked frame into the same data directory as the packed file,
# as gzip-compressed parquet. index=False keeps the dataframe index out of the file.
pq_filepath_unpacked = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_cumulative_embeddings_unpacked.parquet.gz")
df_unpacked.to_parquet(pq_filepath_unpacked, index=False, compression="gzip")