We fetched user-level and tweet-level OpenAI embeddings, stored them in BigQuery (BQ), and copied the data to compressed CSV and parquet files on Google Drive.

This notebook provides an example of how to load those files (one CSV file for users, one parquet file for tweets). Feel free to make a copy of this notebook and perform your own analyses.

## Setup

### Google Drive

In [3]:
import os
from google.colab import drive

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')
# Show the working directory and its contents to confirm the mounted "drive" folder is visible.
print(os.getcwd(), os.listdir(os.getcwd()))

Mounted at /content/drive
/content ['.config', 'drive', 'sample_data']


In [4]:
# DIRPATH is the shared research folder inside the mounted Drive.
# You might need to create a Google Drive SHORTCUT that has this same path,
# or update the path to match your own Google Drive organization.
# Earlier folder names (apparently from previous years), kept for reference:
#DIRPATH = '/content/drive/MyDrive/Research/Disinfo Research Shared 2022'
#DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2023'
DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024'

print(DIRPATH)
# Should display True; False means the path above does not exist on the mounted Drive.
os.path.isdir(DIRPATH)

/content/drive/MyDrive/Research/DS Research Shared 2024


True

New project-based directory structure for 2024:

https://drive.google.com/drive/folders/1SuXkqVT400uZ2OYFGGV8SYBf7NhtBo5k?usp=drive_link

In [5]:
# The embeddings files live under projects/Impeachment 2020 Embeddings/data inside the shared folder.
DATA_DIRPATH = os.path.join(DIRPATH, "projects", "Impeachment 2020 Embeddings", "data")
# Should display True; False means the project data folder was not found under DIRPATH.
os.path.isdir(DATA_DIRPATH)

True

In [7]:
# List the files available in the data directory.
os.listdir(DATA_DIRPATH)

['botometer_sample_max_50_openai_user_embeddings.csv.gz',
 'botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz',
 'botometer_sample_max_50_openai_status_embeddings_v3.csv.gz',
 'botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip',
 'botometer_sample_max_50_openai_status_embeddings_v3_unpacked.csv.gz']

The "unpacked" versions have one column per embedding dimension (`openai_0` through `openai_1535`), and are generally easier to work with.

The files we will be working with are:
  +  "botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz" and
  + "botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip".

## User Embeddings

7566 users

Loading CSV from drive:

In [9]:
from pandas import read_csv

# Load the "unpacked" user-level embeddings: user metadata columns followed by
# the embedding columns openai_0 ... openai_1535.
csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz")
users_df = read_csv(csv_filepath, compression="gzip")
# Quick look at the table: (rows, columns), the column names, and the first rows.
print(users_df.shape)
print(users_df.columns)
users_df.head()

(7566, 1547)
Index(['user_id', 'created_on', 'status_count', 'rt_count', 'is_bot',
       'opinion_community', 'is_q', 'avg_toxicity', 'avg_fact_score',
       'bom_astroturf',
       ...
       'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529',
       'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533',
       'openai_1534', 'openai_1535'],
      dtype='object', length=1547)


Unnamed: 0,user_id,created_on,status_count,rt_count,is_bot,opinion_community,is_q,avg_toxicity,avg_fact_score,bom_astroturf,...,openai_1526,openai_1527,openai_1528,openai_1529,openai_1530,openai_1531,openai_1532,openai_1533,openai_1534,openai_1535
0,3420436216,2015-08-13,555,540,True,0,False,0.056113,1.983193,0.295,...,-0.001867,-0.013167,0.020885,-0.022568,-0.033631,0.016153,0.024127,-0.017519,0.002636,-0.039838
1,108121958,2010-01-24,2,2,False,0,False,0.45671,,0.58,...,0.017651,-0.009439,0.024375,-0.032553,-0.042185,0.013782,0.01132,-0.014862,-0.010413,-0.020359
2,3038308638,2015-02-23,755,665,True,0,False,0.06986,3.401786,0.97,...,-0.026273,-0.008139,0.030285,-0.029902,-0.030887,0.022481,-0.005476,-0.016279,-0.010138,-0.021454
3,332396536,2011-07-09,951,951,True,1,False,0.044264,2.304511,0.58,...,-0.00552,-0.005288,0.017071,-0.033637,-0.040202,0.041773,-0.00937,0.003352,0.009391,-0.042671
4,955082522479808512,2018-01-21,570,533,True,0,False,0.049325,4.714286,0.355,...,0.009959,0.004695,0.005555,-0.012851,-0.032229,0.031443,0.008163,-0.018501,-0.008724,-0.042027


In [13]:
# Number of distinct user ids; equal to the row count when each user appears exactly once.
users_df["user_id"].nunique()

7566

In [14]:
# Class balance of the is_bot flag.
users_df["is_bot"].value_counts()

False    4466
True     3100
Name: is_bot, dtype: int64

In [15]:
# Class balance of the opinion community (add_labels maps 0 / 1 to "Anti-Trump" / "Pro-Trump").
users_df["opinion_community"].value_counts()

0    4891
1    2675
Name: opinion_community, dtype: int64

In [16]:
# Check how many users actually have a fact score (compare the non-null count with the row count).
users_df["avg_fact_score"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 7566 entries, 0 to 7565
Series name: avg_fact_score
Non-Null Count  Dtype  
--------------  -----  
3292 non-null   float64
dtypes: float64(1)
memory usage: 59.2 KB


In [29]:


from pandas import isnull

def add_labels(users_df, toxic_threshold=0.1, fact_threshold=3.0):
    """Add binary flags and human-readable label columns to the users table.

    Applies the same labels as the original source code:
    https://github.com/s2t2/openai-embeddings-2023/blob/1b8372dd36982009df5d4a80871f4c182ada743d/notebooks/2_embeddings_data_export.py#L51
    https://github.com/s2t2/openai-embeddings-2023/blob/main/app/dataset.py#L37-L64

    The frame is modified IN PLACE (label columns are added, or overwritten if
    they already exist) and the same object is returned, so re-running the
    function on an already-labeled frame simply recomputes the same columns.

    Parameters:
        users_df: DataFrame with the columns "opinion_community", "is_bot",
            "avg_toxicity", "avg_fact_score", "bom_overall" and "bom_astroturf".
        toxic_threshold: users whose "avg_toxicity" is >= this value are
            flagged as toxic. Defaults to 0.1.
        fact_threshold: users whose "avg_fact_score" is >= this value are
            flagged as factual. Defaults to 3.0.

    Returns:
        The same DataFrame object, with these columns set: "opinion_label",
        "bot_label", "fourway_label", "is_toxic", "toxic_label", "is_factual",
        "is_bom_overall", "is_bom_astroturf", "bom_overall_label",
        "bom_astroturf_label", "bom_overall_fourway_label",
        "bom_astroturf_fourway_label".

    Raises:
        KeyError: if a required input column is missing. This is checked before
            anything is written, so the frame is never left half-labeled.
    """
    required_columns = (
        "opinion_community", "is_bot", "avg_toxicity",
        "avg_fact_score", "bom_overall", "bom_astroturf",
    )
    missing_columns = [col for col in required_columns if col not in users_df.columns]
    if missing_columns:
        raise KeyError(f"users_df is missing required column(s): {missing_columns}")

    # labels:
    users_df["opinion_label"] = users_df["opinion_community"].map({0:"Anti-Trump", 1:"Pro-Trump"})
    users_df["bot_label"] = users_df["is_bot"].map({True:"Bot", False:"Human"})
    users_df["fourway_label"] = users_df["opinion_label"] + " " + users_df["bot_label"]

    # language toxicity scores (0 low - 1 high)
    # NOTE: a missing avg_toxicity compares as False, so it ends up as 0 / "Normal".
    users_df["is_toxic"] = users_df["avg_toxicity"] >= toxic_threshold
    users_df["is_toxic"] = users_df["is_toxic"].map({True: 1, False :0 })
    users_df["toxic_label"] = users_df["is_toxic"].map({1: "Toxic", 0 :"Normal" })

    # fact check / media quality scores (1 low - 5 high)
    # Unlike toxicity, a missing score stays null here instead of becoming False.
    users_df["is_factual"] = users_df["avg_fact_score"].apply(lambda score: score if isnull(score) else score >= fact_threshold)

    # botometer binary and labels:
    # round() sends exact halves to the nearest even value, so a score of exactly 0.5 becomes 0 ("Human").
    # Missing scores stay null, and so do the labels derived from them.
    users_df["is_bom_overall"] = users_df["bom_overall"].round()
    users_df["is_bom_astroturf"] = users_df["bom_astroturf"].round()
    users_df["bom_overall_label"] = users_df["is_bom_overall"].map({1:"Bot", 0:"Human"})
    users_df["bom_astroturf_label"] = users_df["is_bom_astroturf"].map({1:"Bot", 0:"Human"})
    users_df["bom_overall_fourway_label"] = users_df["opinion_label"] + " " + users_df["bom_overall_label"]
    users_df["bom_astroturf_fourway_label"] = users_df["opinion_label"] + " " + users_df["bom_astroturf_label"]

    return users_df


# Attach the label columns (add_labels writes into users_df and returns it).
users_df = add_labels(users_df)
print(users_df.shape)
# Print every column name; the label columns are appended after the embedding columns.
print(users_df.columns.tolist())
users_df.head()

(7566, 1559)
['user_id', 'created_on', 'status_count', 'rt_count', 'is_bot', 'opinion_community', 'is_q', 'avg_toxicity', 'avg_fact_score', 'bom_astroturf', 'bom_overall', 'openai_0', 'openai_1', 'openai_2', 'openai_3', 'openai_4', 'openai_5', 'openai_6', 'openai_7', 'openai_8', 'openai_9', 'openai_10', 'openai_11', 'openai_12', 'openai_13', 'openai_14', 'openai_15', 'openai_16', 'openai_17', 'openai_18', 'openai_19', 'openai_20', 'openai_21', 'openai_22', 'openai_23', 'openai_24', 'openai_25', 'openai_26', 'openai_27', 'openai_28', 'openai_29', 'openai_30', 'openai_31', 'openai_32', 'openai_33', 'openai_34', 'openai_35', 'openai_36', 'openai_37', 'openai_38', 'openai_39', 'openai_40', 'openai_41', 'openai_42', 'openai_43', 'openai_44', 'openai_45', 'openai_46', 'openai_47', 'openai_48', 'openai_49', 'openai_50', 'openai_51', 'openai_52', 'openai_53', 'openai_54', 'openai_55', 'openai_56', 'openai_57', 'openai_58', 'openai_59', 'openai_60', 'openai_61', 'openai_62', 'openai_63', 'opena

Unnamed: 0,user_id,created_on,status_count,rt_count,is_bot,opinion_community,is_q,avg_toxicity,avg_fact_score,bom_astroturf,...,bot_label,fourway_label,is_toxic,toxic_label,is_bom_overall,is_bom_astroturf,bom_overall_label,bom_astroturf_label,bom_overall_fourway_label,bom_astroturf_fourway_label
0,3420436216,2015-08-13,555,540,True,0,False,0.056113,1.983193,0.295,...,Bot,Anti-Trump Bot,0,Normal,0.0,0.0,Human,Human,Anti-Trump Human,Anti-Trump Human
1,108121958,2010-01-24,2,2,False,0,False,0.45671,,0.58,...,Human,Anti-Trump Human,1,Toxic,0.0,1.0,Human,Bot,Anti-Trump Human,Anti-Trump Bot
2,3038308638,2015-02-23,755,665,True,0,False,0.06986,3.401786,0.97,...,Bot,Anti-Trump Bot,0,Normal,1.0,1.0,Bot,Bot,Anti-Trump Bot,Anti-Trump Bot
3,332396536,2011-07-09,951,951,True,1,False,0.044264,2.304511,0.58,...,Bot,Pro-Trump Bot,0,Normal,1.0,1.0,Bot,Bot,Pro-Trump Bot,Pro-Trump Bot
4,955082522479808512,2018-01-21,570,533,True,0,False,0.049325,4.714286,0.355,...,Bot,Anti-Trump Bot,0,Normal,0.0,0.0,Human,Human,Anti-Trump Human,Anti-Trump Human


In [24]:
# value_counts() skips nulls by default, so only users that have an avg_fact_score are counted here.
users_df["is_factual"].value_counts()

False    1696
True     1596
Name: is_factual, dtype: int64

In [25]:
# 1 = "Toxic", 0 = "Normal" (see the toxic_label mapping in add_labels).
users_df["is_toxic"].value_counts()


0    6132
1    1434
Name: is_toxic, dtype: int64

In [26]:
# Same split as is_bot, shown with readable labels.
users_df["bot_label"].value_counts()

Human    4466
Bot      3100
Name: bot_label, dtype: int64

In [27]:
# Same split as opinion_community, shown with readable labels.
users_df["opinion_label"].value_counts()

Anti-Trump    4891
Pro-Trump     2675
Name: opinion_label, dtype: int64

In [30]:
# Opinion label combined with bot label, giving four classes.
users_df["fourway_label"].value_counts()

Anti-Trump Human    3010
Anti-Trump Bot      1881
Pro-Trump Human     1456
Pro-Trump Bot       1219
Name: fourway_label, dtype: int64

## Tweet Embeddings

183K statuses:

In [12]:
from pandas import read_parquet

# Load the "unpacked" tweet-level embeddings: status metadata columns followed by
# the embedding columns openai_0 ... openai_1535.
# NOTE(review): presumably one row per status; status_id uniqueness is not checked here.
pq_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip")
statuses_df = read_parquet(pq_filepath)
# Quick look at the table: (rows, columns), the column names, and the first rows.
print(statuses_df.shape)
print(statuses_df.columns)
statuses_df.head()

(183815, 1541)
Index(['user_id', 'status_id', 'status_text', 'created_at', 'embeds_length',
       'openai_0', 'openai_1', 'openai_2', 'openai_3', 'openai_4',
       ...
       'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529',
       'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533',
       'openai_1534', 'openai_1535'],
      dtype='object', length=1541)


Unnamed: 0,user_id,status_id,status_text,created_at,embeds_length,openai_0,openai_1,openai_2,openai_3,openai_4,...,openai_1526,openai_1527,openai_1528,openai_1529,openai_1530,openai_1531,openai_1532,openai_1533,openai_1534,openai_1535
0,897845802701377536,1221540755451392001,Doubt it..It appears they all have gone the wa...,2020-01-26 21:09:45+00:00,1536,-0.020428,-0.00672,0.007308,-0.022157,-0.041841,...,0.014616,0.004705,0.012661,-0.020974,-0.003458,0.045166,0.029871,-0.021186,-0.003376,-0.024937
1,935739601301458947,1223458629837295619,RT @Wyn1745: Democrats are ‘setting the stage’...,2020-02-01 04:10:42+00:00,1536,-0.036689,-0.007481,0.007968,-0.006632,-0.022805,...,-0.001696,0.002522,0.020397,-0.046374,-0.046611,0.021068,-8.5e-05,-0.003701,-0.01537,-0.019213
2,571774622,1217445781663363072,RT @sarahdwire: I’m loathe to insert myself in...,2020-01-15 13:57:48+00:00,1536,-0.033382,-0.006886,-0.003244,-0.015834,0.000172,...,0.001027,0.002464,0.002013,-0.032766,-0.034265,0.006545,0.014804,0.003027,-0.001518,-0.030946
3,384679808,1223705594818748416,RT @RepRatcliffe: We warned them...As Schiff a...,2020-02-01 20:32:03+00:00,1536,-0.008477,-0.007364,0.000919,-0.006435,0.008101,...,-0.028269,0.003193,0.015056,-0.015333,-0.028137,0.03251,0.010327,-0.013621,-0.007686,-0.016216
4,701264221653217281,1218459840277729281,"RT @chipfranklin: Because ""impeachment"" in the...",2020-01-18 09:07:18+00:00,1536,-0.009454,0.017376,0.007016,-0.020075,-0.023674,...,-0.01359,0.015564,0.00513,0.003077,-0.029167,0.015523,0.017914,-0.008789,-0.019767,-0.042353


In [70]:
# Number of distinct users that have statuses in this table; compare with the number of rows in users_df.
statuses_df["user_id"].nunique()

7566