For a sample of users for which we have previously obtained Botometer scores, we have obtained OpenAI embeddings of their tweet texts and profile descriptions, respectively.

The file with all the embeddings is large, and the embeddings are in a JSON string format, so instead let's split the embeddings into a column per embedding. And split the single large file into two smaller files (one for tweets, one for profiles).

The resulting files will be easier to use for analysis.

### Google Drive

In [1]:
import os
from google.colab import drive

drive.mount('/content/drive')
print(os.getcwd(), os.listdir(os.getcwd())) #> 'content', ['.config', 'drive', 'sample_data']

Mounted at /content/drive
/content ['.config', 'drive', 'sample_data']


In [2]:
# you might need to create a google drive SHORTCUT that has this same path
# ... or update the path to use your own google drive organization
DATA_DIR = '/content/drive/MyDrive/Research/DS Research Shared 2023/data/impeachment_2020'
print(DATA_DIR)
assert os.path.isdir(DATA_DIR)

/content/drive/MyDrive/Research/DS Research Shared 2023/data/impeachment_2020


### Load Data

In [3]:
MODEL_ID = "text-embedding-ada-002"

embeddings_csv_filepath = os.path.join(DATA_DIR, MODEL_ID, "botometer_sample_openai_embeddings_20230704.csv")
assert os.path.isfile(embeddings_csv_filepath)

In [4]:
from pandas import read_csv

df = read_csv(embeddings_csv_filepath)
df.drop(columns=["Unnamed: 0"], inplace=True)
#df.index = df["user_id"]
print(df.columns.tolist())
df.head()

['user_id', 'created_on', 'screen_name_count', 'screen_names', 'status_count', 'rt_count', 'rt_pct', 'avg_toxicity', 'avg_fact_score', 'opinion_community', 'is_bot', 'is_q', 'profile_descriptions', 'tweet_texts', 'bom_cap', 'bom_astroturf', 'bom_fake_follower', 'bom_financial', 'bom_other', 'profile_embeddings', 'tweet_embeddings']


Unnamed: 0,user_id,created_on,screen_name_count,screen_names,status_count,rt_count,rt_pct,avg_toxicity,avg_fact_score,opinion_community,...,is_q,profile_descriptions,tweet_texts,bom_cap,bom_astroturf,bom_fake_follower,bom_financial,bom_other,profile_embeddings,tweet_embeddings
0,479211236,2012-01-31,1,BIGREDMACHINE42,668,668,1.0,0.064429,1.809524,1,...,False,,RT @foxnewpolls: POLL: Should Donald Trump be ...,0.659646,0.32,0.16,0.35,0.54,,"[-0.02560455910861492, -0.0007131877937354147,..."
1,34033550,2009-04-21,1,NURSINGPINS,763,753,0.986894,0.06952,2.528571,1,...,False,CRITICAL CARE R.N. STUDENT OF NURSING HISTOR...,RT @ouchinagirl: Pres.@realDonaldTrump Lawyers...,0.79897,0.47,0.51,0.255,0.43,"[-0.025425352156162262, -0.01051797904074192, ...","[-0.025602083653211594, -0.011307586915791035,..."
2,515767837,2012-03-05,1,MARLAVAGTS,647,644,0.995363,0.046958,3.730159,0,...,False,PROGRESSIVE PASSIONATE COMPLICATED INDEPENDENT...,RT @politvidchannel: BREAKING: Sen. Tim Kaine ...,0.810078,0.79,0.42,0.21,0.56,"[-0.05974208191037178, -0.022309767082333565, ...","[-0.017125703394412994, -0.014187934808433056,..."
3,3415696198,2015-08-11,1,NANMAC321,815,814,0.998773,0.047901,2.886905,0,...,False,👓👟🐶🦁🦋☘️🌊,"RT @TrialLawyerRich: Former Prosecutor, Senato...",0.833918,0.85,0.2,0.04,0.32,"[-0.0019808614160865545, -0.006983266212046146...","[-0.010583749040961266, -0.001130992197431624,..."
4,38444226,2009-05-07,1,GDIRTYDIME,1101,1097,0.996367,0.098514,3.345238,0,...,False,I AM JUST A CHILD OF GOD LIVING FOR MY JESUS.....,RT @Jwheels208: “Your favorite president” Some...,0.867012,0.9,0.56,0.04,0.47,"[-0.025338178500533104, -0.025195974856615067,...","[-0.03489087149500847, 0.007136144675314426, 0..."


In [5]:
print(len(df))
print(df["tweet_embeddings"].notna().sum())
print(df["profile_embeddings"].notna().sum())

7566
7566
5746


### Group Labels

In [6]:
df["opinion_label"] = df["opinion_community"].map({0:"Anti-Trump", 1:"Pro-Trump"})
df["bot_label"] = df["is_bot"].map({True:"Bot", False:"Human"})
df["q_label"] = df["is_q"].map({True:"Q-anon", False:"Normal"})

In [7]:
short_q_label = df["is_q"].map({True:" Q-anon ", False:" "})
df["group_label"] = df["opinion_label"] + short_q_label + df["bot_label"]
df["group_label"] = df["group_label"].replace(["Pro-Trump Q-anon Bot", "Pro-Trump Q-anon Human"], ["Q-anon Human", "Q-anon Bot"])
df["group_label"].value_counts()

Anti-Trump Human    3010
Anti-Trump Bot      1881
Pro-Trump Human     1447
Pro-Trump Bot       1051
Q-anon Human         168
Q-anon Bot             9
Name: group_label, dtype: int64

### Separate Datasets

In [8]:
#LABELS = ['user_id', 'created_on',
#          'screen_name_count', 'screen_names', 'status_count', 'rt_count', 'rt_pct',
#          'avg_toxicity', 'avg_fact_score', 'opinion_community', 'is_bot', 'is_q',
#          'bom_cap', 'bom_astroturf', 'bom_fake_follower', 'bom_financial', 'bom_other'
#          #'profile_descriptions', 'tweet_texts',
#]
#ENGINEERED_LABELS = ["opinion_label", "bot_label", "q_label", "group_label"] #, "group_color"
#TWEET_LABELS = LABELS + ENGINEERED_LABELS + ["tweet_texts"]
#PROFILE_LABELS = LABELS + ENGINEERED_LABELS + ["profile_descriptions"]
#
#tweets_df = df[TWEET_LABELS] #.merge(tweet_embeddings, left_index=True, right_index=True)
#profiles_df = df[PROFILE_LABELS] #.merge(profile_embeddings, left_index=True, right_index=True)
#
#print(len(tweets_df.columns))
#print(len(profiles_df.columns))

In [9]:
tweets_df = df.copy()
tweets_df.drop(columns=["profile_descriptions", "profile_embeddings"], inplace=True)
print(tweets_df.shape)

profiles_df = df.copy()
profiles_df.drop(columns=["tweet_texts", "tweet_embeddings"], inplace=True)
profiles_df = profiles_df[ profiles_df["profile_embeddings"].notna() ]  # drop rows where there are no profile descriptions (not all users have profiles)
profiles_df.reset_index(inplace=True, drop=True)
print(profiles_df.shape)

(7566, 23)
(5746, 23)


### Unpack Embeddings

The embeddings are stored in a single column as a JSON string, so we'll need to convert that single column into a column per value in the embeddings array. We'll get 1536 columns back.

In [10]:
import json

def unpack(embeddings_str):
    # idempotence check
    if isinstance(embeddings_str, str):
        return json.loads(embeddings_str)
    else:
        return embeddings_str

tweets_df["tweet_embeddings"] = tweets_df["tweet_embeddings"].apply(unpack)
profiles_df["profile_embeddings"] = profiles_df["profile_embeddings"].apply(unpack)

In [11]:
print(type(tweets_df["tweet_embeddings"][0]))
print(len(tweets_df["tweet_embeddings"][0])) #> 1536

print(type(profiles_df["profile_embeddings"][0]))
print(len(profiles_df["profile_embeddings"][0])) #> 1536

<class 'list'>
1536
<class 'list'>
1536


In [12]:
from pandas import DataFrame

tweet_embeddings = DataFrame(tweets_df["tweet_embeddings"].values.tolist())
print(len(tweet_embeddings))

profile_embeddings = DataFrame(profiles_df["profile_embeddings"].values.tolist())
print(len(profile_embeddings))

7566
5746


Merge embedding columns with label columns:

In [13]:
tweets_df = tweets_df.drop(columns=["tweet_embeddings"]).merge(tweet_embeddings, left_index=True, right_index=True)
profiles_df = profiles_df.drop(columns=["profile_embeddings"]).merge(profile_embeddings, left_index=True, right_index=True)

print(tweets_df.shape)
print(profiles_df.shape)

(7566, 1558)
(5746, 1558)


### Export Data

In [14]:
# export and download from google colab filesystem (file won't download from drive for some reason - maybe too big?)
# https://cmdlinetips.com/2020/05/how-to-save-pandas-dataframe-as-gzip-zip-file/
#df.to_csv("botometer_sample_openai_embeddings_20230704.csv.zip", index=False, compression="zip")

In [15]:
csv_filename = "botometer_sample_openai_tweet_embeddings_20230704.csv.gz"
csv_filepath = os.path.join(DATA_DIR, MODEL_ID, csv_filename)

#tweets_df.to_csv(csv_filename, index=False, compression="gzip")
tweets_df.to_csv(csv_filepath, index=False, compression="gzip")

In [16]:
csv_filename = "botometer_sample_openai_profile_embeddings_20230704.csv.gz"
csv_filepath = os.path.join(DATA_DIR, MODEL_ID, csv_filename)

#profiles_df.to_csv(csv_filename, index=False, compression="gzip")
profiles_df.to_csv(csv_filepath, index=False, compression="gzip")