Looks like we originally didn't include the "bom_overall" score in the data file, so let's load the file from drive, query the database to obtain the scores, and merge the scores in.



## Query to Obtain Scores

In [1]:
from google.colab import auth

# asks you to login
auth.authenticate_user()

In [4]:

from google.cloud import bigquery
from pandas import DataFrame

class BigQueryService():
    def __init__(self):
        self.client = bigquery.Client(project="tweet-research-shared")

    def execute_query(self, sql, verbose=True):
        if verbose == True:
            print(sql)
        job = self.client.query(sql)
        return job.result()

    def query_to_df(self, sql, verbose=True):
        """high-level wrapper to return a DataFrame"""
        results = self.execute_query(sql, verbose=verbose)
        records = [dict(row) for row in list(results)]
        df = DataFrame(records)
        return df

bq = BigQueryService()
print(bq)

<__main__.BigQueryService object at 0x7d6dac49bdc0>


In [5]:
print("------------")
print("QUERY:")
sql = """
    SELECT *
    FROM `tweet-research-shared.impeachment_2020.tweets_v2`
    LIMIT 10
"""

results = list(bq.execute_query(sql, verbose=True))
records = [dict(row) for row in results]
print("------------")
print("RESULTS:" , len(records))


------------
QUERY:

    SELECT *
    FROM `tweet-research-shared.impeachment_2020.tweets_v2`
    LIMIT 10

------------
RESULTS: 10


In [8]:
sql = f"""
    SELECT
        u.user_id
        ,u.is_bot
        ,u.is_q ,u.opinion_community
        ,u.avg_fact_score, u.avg_toxicity, u.created_on

        --bom.user_id
        ,bom.score_type as bom_score_type
        ,count(distinct bom.lookup_at) as bom_lookup_count
        ,avg(bom.cap) as bom_cap
        ,avg(bom.astroturf) as bom_astroturf
        ,avg(bom.fake_follower) as bom_fake_follower
        ,avg(bom.financial) as bom_financial
        ,avg(bom.other) as bom_other
        ,avg(bom.overall) as bom_overall
        ,avg(bom.self_declared) as bom_self_declared
        ,avg(bom.spammer) as bom_spammer
    FROM `tweet-research-shared.impeachment_2020.botometer_scores` bom
    JOIN `tweet-research-shared.impeachment_2020.user_details_v20210806_slim` u ON bom.user_id = u.user_id -- 8683
    WHERE bom.score_type = 'english' -- 7,566 users with english scores
    GROUP BY 1,2,3,4,5,6,7,8
    -- HAVING lookup_count > 1 -- 333 users have multiple lookups, so we're going to average them instead of drop them
"""
bom_score_results = bq.query_to_df(sql, verbose=False)
bom_score_results.index = bom_score_results["user_id"]
bom_score_results.head()

Unnamed: 0_level_0,user_id,is_bot,is_q,opinion_community,avg_fact_score,avg_toxicity,created_on,bom_score_type,bom_lookup_count,bom_cap,bom_astroturf,bom_fake_follower,bom_financial,bom_other,bom_overall,bom_self_declared,bom_spammer
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
808369805094858752,808369805094858752,True,False,0,3.235294,0.063635,2016-12-12,english,1,0.915838,0.96,0.35,0.02,0.36,0.96,0.01,0.0
34472721,34472721,True,False,0,3.696429,0.060307,2009-04-23,english,1,0.794672,0.54,0.3,0.37,0.47,0.4,0.07,0.01
1187747527,1187747527,True,False,0,3.857143,0.080285,2013-02-16,english,1,0.794672,0.43,0.22,0.22,0.38,0.4,0.03,0.19
265461503,265461503,True,False,0,3.291667,0.0651,2011-03-13,english,1,0.882963,0.92,0.48,0.25,0.49,0.92,0.05,0.07
4843245081,4843245081,True,False,0,2.857143,0.054542,2016-01-24,english,1,0.796622,0.62,0.43,0.32,0.62,0.62,0.07,0.05


In [22]:
scores_lookup = bom_score_results[["bom_overall", "bom_self_declared", "bom_spammer"]]
scores_lookup.head()

Unnamed: 0_level_0,bom_overall,bom_self_declared,bom_spammer
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
808369805094858752,0.96,0.01,0.0
34472721,0.4,0.07,0.01
1187747527,0.4,0.03,0.19
265461503,0.92,0.05,0.07
4843245081,0.62,0.07,0.05


### Load Data from Google Drive

In [9]:
import os
from google.colab import drive

drive.mount('/content/drive')
print(os.getcwd(), os.listdir(os.getcwd())) #> 'content', ['.config', 'drive', 'sample_data']

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content ['.config', 'drive', 'sample_data']


In [10]:
# you might need to create a google drive SHORTCUT that has this same path
# ... or update the path to use your own google drive organization
DATA_DIR = '/content/drive/MyDrive/Research/DS Research Shared 2023/data/impeachment_2020'
print(DATA_DIR)
assert os.path.isdir(DATA_DIR)

/content/drive/MyDrive/Research/DS Research Shared 2023/data/impeachment_2020


In [11]:
MODEL_ID = "text-embedding-ada-002"

#embeddings_csv_filepath = os.path.join(DATA_DIR, MODEL_ID, "botometer_sample_openai_embeddings_20230704.csv") # messy format
#assert os.path.isfile(embeddings_csv_filepath)

tweet_embeddings_csv_filepath = os.path.join(DATA_DIR, MODEL_ID, "botometer_sample_openai_tweet_embeddings_20230704.csv.gz") # column per embedding, with label cols
assert os.path.isfile(tweet_embeddings_csv_filepath)

In [12]:
from pandas import read_csv

df = read_csv(tweet_embeddings_csv_filepath)
#df.drop(columns=["Unnamed: 0"], inplace=True)
df.index = df["user_id"]
print(df.columns.tolist())
df.head()

['user_id', 'created_on', 'screen_name_count', 'screen_names', 'status_count', 'rt_count', 'rt_pct', 'avg_toxicity', 'avg_fact_score', 'opinion_community', 'is_bot', 'is_q', 'tweet_texts', 'bom_cap', 'bom_astroturf', 'bom_fake_follower', 'bom_financial', 'bom_other', 'opinion_label', 'bot_label', 'q_label', 'group_label', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '1

Unnamed: 0_level_0,user_id,created_on,screen_name_count,screen_names,status_count,rt_count,rt_pct,avg_toxicity,avg_fact_score,opinion_community,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
479211236,479211236,2012-01-31,1,BIGREDMACHINE42,668,668,1.0,0.064429,1.809524,1,...,-0.022184,0.001605,0.017806,-0.027392,-0.028566,0.017704,0.006305,-0.012687,-0.017678,-0.043244
34033550,34033550,2009-04-21,1,NURSINGPINS,763,753,0.986894,0.06952,2.528571,1,...,-0.005602,-0.00669,0.021927,-0.040633,-0.017619,0.010771,0.017564,-0.026235,-0.013882,-0.032292
515767837,515767837,2012-03-05,1,MARLAVAGTS,647,644,0.995363,0.046958,3.730159,0,...,-0.002997,-0.015799,0.026494,-0.011663,-0.050199,0.029838,0.023462,-0.013863,-0.000938,-0.039667
3415696198,3415696198,2015-08-11,1,NANMAC321,815,814,0.998773,0.047901,2.886905,0,...,-0.008434,-0.017602,0.029888,-0.018951,-0.025267,0.026469,-0.009555,-0.030155,-0.008601,-0.040225
38444226,38444226,2009-05-07,1,GDIRTYDIME,1101,1097,0.996367,0.098514,3.345238,0,...,-0.009671,-0.000391,0.011717,-0.027693,-0.03794,0.028984,0.009849,-0.01316,-0.004221,-0.03371


In [21]:
[col for col in df.columns if "bom_" in col]

['bom_cap', 'bom_astroturf', 'bom_fake_follower', 'bom_financial', 'bom_other']

## Merge In Scores

In [25]:
merged_df = df.merge(scores_lookup, how="left", left_index=True, right_index=True)
merged_df[["user_id", "is_bot", "bom_overall", "bom_astroturf"]].head()

Unnamed: 0_level_0,user_id,is_bot,bom_overall,bom_astroturf
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
479211236,479211236,True,0.18,0.32
34033550,34033550,True,0.61,0.47
515767837,515767837,True,0.79,0.79
3415696198,3415696198,True,0.85,0.85
38444226,38444226,True,0.9,0.9


In [18]:
merged_df["bom_overall"].isna().sum()

0

In [26]:
[col for col in merged_df.columns if "bom_" in col]

['bom_cap',
 'bom_astroturf',
 'bom_fake_follower',
 'bom_financial',
 'bom_other',
 'bom_overall',
 'bom_self_declared',
 'bom_spammer']

Save new version to drive (then download for local analysis as well):

In [27]:
new_csv_filepath = os.path.join(DATA_DIR, "text-embedding-ada-002", "botometer_sample_openai_tweet_embeddings_20230724.csv.gz")

merged_df.to_csv(new_csv_filepath, compression="gzip", index=False)