In this notebook, we prepare a clean (de-duped) version of the status embeddings. And we re-construct user embeddings using the average of their status embeddings.

This notebook saves both datasets back to drive for further analysis.

## Google Drive

In [2]:
import os
from google.colab import drive

drive.mount('/content/drive')
print(os.getcwd(), os.listdir(os.getcwd()))

Mounted at /content/drive
/content ['.config', 'drive', 'sample_data']


In [3]:
# you might need to create a google drive SHORTCUT that has this same path
# ... or update the path to use your own google drive organization
DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024'

print(DIRPATH)
os.path.isdir(DIRPATH)

/content/drive/MyDrive/Research/DS Research Shared 2024


True

In [4]:
DATA_DIRPATH = os.path.join(DIRPATH, "projects", "Impeachment 2020 Embeddings", "data")
os.path.isdir(DATA_DIRPATH)

True

## Data Loading

In [5]:
from pandas import read_parquet

pq_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip")
statuses_df = read_parquet(pq_filepath)
print(statuses_df.shape)
print(statuses_df.columns)
statuses_df.head()

(183815, 1541)
Index(['user_id', 'status_id', 'status_text', 'created_at', 'embeds_length',
       'openai_0', 'openai_1', 'openai_2', 'openai_3', 'openai_4',
       ...
       'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529',
       'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533',
       'openai_1534', 'openai_1535'],
      dtype='object', length=1541)


Unnamed: 0,user_id,status_id,status_text,created_at,embeds_length,openai_0,openai_1,openai_2,openai_3,openai_4,...,openai_1526,openai_1527,openai_1528,openai_1529,openai_1530,openai_1531,openai_1532,openai_1533,openai_1534,openai_1535
0,897845802701377536,1221540755451392001,Doubt it..It appears they all have gone the wa...,2020-01-26 21:09:45+00:00,1536,-0.020428,-0.00672,0.007308,-0.022157,-0.041841,...,0.014616,0.004705,0.012661,-0.020974,-0.003458,0.045166,0.029871,-0.021186,-0.003376,-0.024937
1,935739601301458947,1223458629837295619,RT @Wyn1745: Democrats are ‘setting the stage’...,2020-02-01 04:10:42+00:00,1536,-0.036689,-0.007481,0.007968,-0.006632,-0.022805,...,-0.001696,0.002522,0.020397,-0.046374,-0.046611,0.021068,-8.5e-05,-0.003701,-0.01537,-0.019213
2,571774622,1217445781663363072,RT @sarahdwire: I’m loathe to insert myself in...,2020-01-15 13:57:48+00:00,1536,-0.033382,-0.006886,-0.003244,-0.015834,0.000172,...,0.001027,0.002464,0.002013,-0.032766,-0.034265,0.006545,0.014804,0.003027,-0.001518,-0.030946
3,384679808,1223705594818748416,RT @RepRatcliffe: We warned them...As Schiff a...,2020-02-01 20:32:03+00:00,1536,-0.008477,-0.007364,0.000919,-0.006435,0.008101,...,-0.028269,0.003193,0.015056,-0.015333,-0.028137,0.03251,0.010327,-0.013621,-0.007686,-0.016216
4,701264221653217281,1218459840277729281,"RT @chipfranklin: Because ""impeachment"" in the...",2020-01-18 09:07:18+00:00,1536,-0.009454,0.017376,0.007016,-0.020075,-0.023674,...,-0.01359,0.015564,0.00513,0.003077,-0.029167,0.015523,0.017914,-0.008789,-0.019767,-0.042353


In [6]:
statuses_df["user_id"].nunique()

7566

In [7]:
len(statuses_df)

183815

In [8]:
statuses_df["status_id"].nunique()

183727

Oh no, statuses not unique?

In [9]:
statuses_df["status_id"].value_counts()

1234905353650761728    6
1209143341901737984    3
1209173027772076033    3
1207894148151308289    2
1217603880453718016    2
                      ..
1216442996260003840    1
1225979782745272325    1
1206336484170702849    1
1239320120071200771    1
1222940911023333376    1
Name: status_id, Length: 183727, dtype: Int64

In [None]:
statuses_df[statuses_df["status_id"].duplicated(keep=False)].sort_values("status_id")

The embeddings values appear to be the same for each status, so we can take the first row for each status.

## De-Duping

183,727 statuses

In [10]:
print(statuses_df.shape)
statuses_df.drop_duplicates(subset=["status_id"], inplace=True)
print(statuses_df.shape)

(183815, 1541)
(183727, 1541)


Saving to drive:

In [23]:
pq_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped.parquet.gzip")

statuses_df.to_parquet(pq_filepath, compression="gzip")

## Averaging Embeddings per User

In [12]:
statuses_df.groupby("user_id")["status_id"].count()

user_id
2952                    6
635553                 12
656993                  1
761154                  4
777554                  1
                       ..
1234200349600288772    50
1234846911028453376     1
1237940420136456192     4
1238854780191195136     1
1240138605726760962     1
Name: status_id, Length: 7566, dtype: int64

In [13]:
embeddings_cols = [col for col in statuses_df.columns if "openai" in col]
print(len(embeddings_cols))
print(embeddings_cols[0], "...", embeddings_cols[-1])

1536
openai_0 ... openai_1535


In [14]:
averages = statuses_df.groupby("user_id")[embeddings_cols].mean()
print(averages.shape)
averages.head()

(7566, 1536)


Unnamed: 0_level_0,openai_0,openai_1,openai_2,openai_3,openai_4,openai_5,openai_6,openai_7,openai_8,openai_9,...,openai_1526,openai_1527,openai_1528,openai_1529,openai_1530,openai_1531,openai_1532,openai_1533,openai_1534,openai_1535
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2952,-0.023816,0.002004,0.004429,-0.019361,-0.00986,0.004878,0.00096,-0.015426,-0.00643,0.001027,...,-0.012285,0.001094,0.015767,-0.026536,-0.024981,0.015113,0.018588,-0.002324,-0.003782,-0.028532
635553,-0.030022,-0.006063,0.017259,-0.018501,-0.008536,0.004416,-0.01184,-0.010581,-0.010859,-0.003771,...,-0.00596,-0.007866,0.010948,-0.021376,-0.023424,0.020705,0.005084,-0.011961,-0.003258,-0.026262
656993,-0.010723,0.008235,0.004192,-0.040441,-0.015172,0.012798,-0.015786,0.008556,-0.022145,-0.017026,...,-0.01811,0.007116,-0.004877,-0.032427,-0.023885,-0.000715,0.003886,-0.024242,0.003839,-0.048883
761154,-0.021389,-0.004747,0.006925,-0.017395,-0.0119,0.018309,-0.007047,-0.024175,0.001368,0.002065,...,0.013326,-0.020819,0.007364,-0.016794,-0.049548,0.013037,0.024798,-0.008543,0.006142,-0.035867
777554,-0.009369,-0.009612,0.01247,0.005079,-0.019303,-0.010459,0.019815,-0.019171,-0.017594,-0.006209,...,0.007358,0.010696,0.008784,-0.024808,-0.008042,0.011077,0.001996,-0.001104,-0.01946,-0.030301


Get user labels from CSV file:

In [15]:
from pandas import read_csv

csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz")
users_df = read_csv(csv_filepath, compression="gzip")
print(users_df.shape)
print(users_df.columns)
users_df.head()

(7566, 1547)
Index(['user_id', 'created_on', 'status_count', 'rt_count', 'is_bot',
       'opinion_community', 'is_q', 'avg_toxicity', 'avg_fact_score',
       'bom_astroturf',
       ...
       'openai_1526', 'openai_1527', 'openai_1528', 'openai_1529',
       'openai_1530', 'openai_1531', 'openai_1532', 'openai_1533',
       'openai_1534', 'openai_1535'],
      dtype='object', length=1547)


Unnamed: 0,user_id,created_on,status_count,rt_count,is_bot,opinion_community,is_q,avg_toxicity,avg_fact_score,bom_astroturf,...,openai_1526,openai_1527,openai_1528,openai_1529,openai_1530,openai_1531,openai_1532,openai_1533,openai_1534,openai_1535
0,3420436216,2015-08-13,555,540,True,0,False,0.056113,1.983193,0.295,...,-0.001867,-0.013167,0.020885,-0.022568,-0.033631,0.016153,0.024127,-0.017519,0.002636,-0.039838
1,108121958,2010-01-24,2,2,False,0,False,0.45671,,0.58,...,0.017651,-0.009439,0.024375,-0.032553,-0.042185,0.013782,0.01132,-0.014862,-0.010413,-0.020359
2,3038308638,2015-02-23,755,665,True,0,False,0.06986,3.401786,0.97,...,-0.026273,-0.008139,0.030285,-0.029902,-0.030887,0.022481,-0.005476,-0.016279,-0.010138,-0.021454
3,332396536,2011-07-09,951,951,True,1,False,0.044264,2.304511,0.58,...,-0.00552,-0.005288,0.017071,-0.033637,-0.040202,0.041773,-0.00937,0.003352,0.009391,-0.042671
4,955082522479808512,2018-01-21,570,533,True,0,False,0.049325,4.714286,0.355,...,0.009959,0.004695,0.005555,-0.012851,-0.032229,0.031443,0.008163,-0.018501,-0.008724,-0.042027


In [19]:
user_labels = users_df.drop(columns=embeddings_cols)
user_labels.index = user_labels["user_id"]
user_labels.head()

Unnamed: 0_level_0,user_id,created_on,status_count,rt_count,is_bot,opinion_community,is_q,avg_toxicity,avg_fact_score,bom_astroturf,bom_overall
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3420436216,3420436216,2015-08-13,555,540,True,0,False,0.056113,1.983193,0.295,0.19
108121958,108121958,2010-01-24,2,2,False,0,False,0.45671,,0.58,0.11
3038308638,3038308638,2015-02-23,755,665,True,0,False,0.06986,3.401786,0.97,0.97
332396536,332396536,2011-07-09,951,951,True,1,False,0.044264,2.304511,0.58,0.75
955082522479808512,955082522479808512,2018-01-21,570,533,True,0,False,0.049325,4.714286,0.355,0.225


Merge user labels columns back in:

In [20]:
averages = averages.merge(user_labels, left_index=True, right_index=True)
averages.head()

Unnamed: 0_level_0,openai_0,openai_1,openai_2,openai_3,openai_4,openai_5,openai_6,openai_7,openai_8,openai_9,...,created_on,status_count,rt_count,is_bot,opinion_community,is_q,avg_toxicity,avg_fact_score,bom_astroturf,bom_overall
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2952,-0.023816,0.002004,0.004429,-0.019361,-0.00986,0.004878,0.00096,-0.015426,-0.00643,0.001027,...,2006-07-24,6,6,False,0,False,0.006899,,0.21,0.2
635553,-0.030022,-0.006063,0.017259,-0.018501,-0.008536,0.004416,-0.01184,-0.010581,-0.010859,-0.003771,...,2007-01-15,12,12,False,0,False,0.077787,,0.24,0.16
656993,-0.010723,0.008235,0.004192,-0.040441,-0.015172,0.012798,-0.015786,0.008556,-0.022145,-0.017026,...,2007-01-17,1,1,False,0,False,0.025031,,0.11,0.1
761154,-0.021389,-0.004747,0.006925,-0.017395,-0.0119,0.018309,-0.007047,-0.024175,0.001368,0.002065,...,2007-02-09,4,0,False,0,False,0.172311,,0.13,0.72
777554,-0.009369,-0.009612,0.01247,0.005079,-0.019303,-0.010459,0.019815,-0.019171,-0.017594,-0.006209,...,2007-02-17,1,1,False,0,False,0.00166,,0.15,0.03


Saving to drive:

In [21]:
csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked_deduped_averaged.csv.gz")

averages.to_csv(csv_filepath, compression="gzip")