We fetched OpenAI embeddings and stored on BQ. Let's download a CSV file to drive for further analysis.

## Setup

### Google Drive

In [3]:
import os
from google.colab import drive

drive.mount('/content/drive')
print(os.getcwd(), os.listdir(os.getcwd()))

Mounted at /content/drive
/content ['.config', 'drive', 'sample_data']


In [4]:
# you might need to create a google drive SHORTCUT that has this same path
# ... or update the path to use your own google drive organization
#DIRPATH = '/content/drive/MyDrive/Research/Disinfo Research Shared 2022'
#DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2023'
DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024'

print(DIRPATH)
os.path.isdir(DIRPATH)

/content/drive/MyDrive/Research/DS Research Shared 2024


True

In [5]:
DATA_DIRPATH = os.path.join(DIRPATH, "projects", "Impeachment 2020 Embeddings", "data")
os.path.isdir(DATA_DIRPATH)

True

In [6]:
#csv_filepath = os.path.join("")
#print(os.path.isfile(users_csv_filepath))

### BigQuery Service

In [2]:
from google.colab import auth

# asks you to login
auth.authenticate_user()

In [27]:
from google.cloud import bigquery
from pandas import DataFrame, read_gbq


PROJECT_ID = "tweet-collector-py"

class BigQueryService():
    def __init__(self, project_id=PROJECT_ID):
        self.project_id = project_id
        self.client = bigquery.Client(project=self.project_id)

    def execute_query(self, sql, verbose=True):
        if verbose == True:
            print(sql)
        job = self.client.query(sql)
        return job.result()

    #def query_to_df(self, sql, verbose=True):
    #    """high-level wrapper to return a DataFrame"""
    #    results = self.execute_query(sql, verbose=verbose)
    #    return DataFrame([dict(row) for row in results])

    def query_to_df(self, sql, verbose=True):
        """high-level wrapper to return a DataFrame"""
        if verbose == True:
            print(sql)
        # https://pandas.pydata.org/docs/reference/api/pandas.read_gbq.html#pandas-read-gbq
        return read_gbq(sql, project_id=self.project_id) # progress_bar_type="tqdm_notebook"





In [28]:
bq = BigQueryService()
print(bq)

<__main__.BigQueryService object at 0x79f65c9f99c0>


In [25]:
print("DATASETS:")
datasets = list(bq.client.list_datasets())
for ds in datasets:
    #print("...", ds.project, ds.dataset_id)
    print("...", ds.reference)

DATASETS:
... tweet-collector-py.analysis_2021
... tweet-collector-py.analysis_2021_development
... tweet-collector-py.collection_2021
... tweet-collector-py.disinfo_2021_development
... tweet-collector-py.disinfo_2021_production
... tweet-collector-py.election_2020_analysis
... tweet-collector-py.election_2020_development
... tweet-collector-py.election_2020_production
... tweet-collector-py.f1_racing_2023_development
... tweet-collector-py.f1_racing_2023_production
... tweet-collector-py.impeachment_2021_development
... tweet-collector-py.impeachment_2021_production
... tweet-collector-py.impeachment_backup
... tweet-collector-py.impeachment_development
... tweet-collector-py.impeachment_production
... tweet-collector-py.impeachment_test
... tweet-collector-py.jan6_committee_development
... tweet-collector-py.jan6_committee_production
... tweet-collector-py.transition_2021_development
... tweet-collector-py.transition_2021_production
... tweet-collector-py.truth_2023_development
... 

# Embeddings

In [12]:
DATASET_ADDRESS = "tweet-collector-py.impeachment_production"

In [29]:
sql = f"""
    SELECT
        count(distinct s.user_id) as user_count
        ,count(distinct s.status_id) as status_count
    FROM `{DATASET_ADDRESS}.botometer_sample` s
    JOIN `{DATASET_ADDRESS}.botometer_sample_max_50_openai_status_embeddings` emb
        ON s.status_id = emb.status_id
"""
bq.query_to_df(sql, verbose=False)

Unnamed: 0,user_count,status_count
0,7566,183727


## User Embeddings

7566 users

In [18]:
##%%time

sql = f"""
     SELECT
        u.user_id, u.created_on
        --, u.screen_name_count, u.screen_names, split(u.screen_names, ",")[0] as screen_name
        ,u.status_count, u.rt_count
        ,u.is_bot --, u.bot_rt_network
        ,u.opinion_community --, u.avg_score_lr, avg_score_nb, avg_score_bert
        , u.is_q --, u.q_status_count
        --, u.follower_count, u.follower_count_b, u.follower_count_h
        --, u.friend_count, u.friend_count_b, u.friend_count_h

        ,u.avg_toxicity --, u.avg_severe_toxicity, u.avg_insult, u.avg_obscene, u.avg_threat, u.avg_identity_hate
        , u.avg_fact_score -- ,u.fact_scored_count

        ,u.bom_astroturf, u.bom_overall --, u.bom_cap  --,u.bom_lookup_count
        --,u.bom_fake_follower, u.bom_financial, u.bom_other, u.bom_self_declared, u.bom_spammer

        ,emb.embeddings

    FROM `{DATASET_ADDRESS}.user_details_v20240128_slim` u
    JOIN `{DATASET_ADDRESS}.botometer_sample_max_50_openai_user_embeddings` emb
        ON emb.user_id = u.user_id
    -- LIMIT 10
"""

users_df = bq.query_to_df(sql, verbose=True)
print(users_df.shape)
users_df.head()


     SELECT 
        u.user_id, u.created_on 
        --, u.screen_name_count, u.screen_names, split(u.screen_names, ",")[0] as screen_name
        ,u.status_count, u.rt_count
        ,u.is_bot --, u.bot_rt_network
        ,u.opinion_community --, u.avg_score_lr, avg_score_nb, avg_score_bert
        , u.is_q --, u.q_status_count 
        --, u.follower_count, u.follower_count_b, u.follower_count_h
        --, u.friend_count, u.friend_count_b, u.friend_count_h

        ,u.avg_toxicity --, u.avg_severe_toxicity, u.avg_insult, u.avg_obscene, u.avg_threat, u.avg_identity_hate
        , u.avg_fact_score -- ,u.fact_scored_count

        ,u.bom_astroturf, u.bom_overall --, u.bom_cap  --,u.bom_lookup_count 
        --,u.bom_fake_follower, u.bom_financial, u.bom_other, u.bom_self_declared, u.bom_spammer

        ,emb.embeddings

    FROM `tweet-collector-py.impeachment_production.user_details_v20240128_slim` u 
    JOIN `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_

Unnamed: 0,user_id,created_on,status_count,rt_count,is_bot,opinion_community,is_q,avg_toxicity,avg_fact_score,bom_astroturf,bom_overall,embeddings
0,1196938918874107904,2019-11-19,145,145,True,0,False,0.059316,3.678571,0.8,0.8,"[-0.021835120394825935, -0.004839153029024601,..."
1,57175518,2009-07-15,685,655,True,1,False,0.042347,2.377551,0.415,0.805,"[-0.027428433299064636, 0.011793000623583794, ..."
2,827175115200876544,2017-02-02,749,714,True,0,False,0.055041,4.653061,0.83,0.83,"[-0.015219379216432571, -0.010519277304410934,..."
3,405039097,2011-11-04,479,467,True,1,False,0.05324,1.8,0.535,0.31,"[-0.02283899299800396, 0.004357530735433102, 0..."
4,826747343571664897,2017-02-01,697,669,True,1,False,0.04858,1.571429,0.245,0.08,"[-0.03725171461701393, -0.011983302421867847, ..."


Saving CSV to drive:

In [19]:
user_embeddings_csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_user_embeddings.csv")
users_df.to_csv(user_embeddings_csv_filepath, index=False)

In [None]:
#import json
#
#def unpack(embeddings_str):
#    # idempotence check
#    if isinstance(embeddings_str, str):
#        return json.loads(embeddings_str)
#    else:
#        return embeddings_str
#
#embeds = df["embeddings"].apply(unpack)
#print(type(embeds))
#
#embeds = DataFrame(embeds.values.tolist())
#embeds.columns = [f"openai_{col}" for col in embeds.columns]
#embeds.index = df.index
#print(embeds.shape)
#embeds.head()

In [None]:
#df_embeds = df.drop(columns=["embeddings"]).merge(embeds, left_index=True, right_index=True)
#df_embeds.head()

In [24]:
#df.to_csv("status_embeddings.csv", index=False)

## Tweet Embeddings

183K statuses

Wow wow wow this is taking a long time (1hr +...) to stream the data down over the network...

In [31]:

sql = f"""
    SELECT s.user_id, s.status_id, s.status_text, s.created_at, emb.embeddings
    FROM `{DATASET_ADDRESS}.botometer_sample` s
    JOIN `{DATASET_ADDRESS}.botometer_sample_max_50_openai_status_embeddings` emb
        ON s.status_id = emb.status_id
    -- LIMIT 10000
"""

tweets_df = bq.query_to_df(sql, verbose=True)
print(tweets_df.shape)
tweets_df.head()


    SELECT s.user_id, s.status_id, s.status_text, s.created_at, emb.embeddings
    FROM `tweet-collector-py.impeachment_production.botometer_sample` s 
    JOIN `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_status_embeddings` emb 
        ON s.status_id = emb.status_id
    -- LIMIT 10000

(198477, 5)


Unnamed: 0,user_id,status_id,status_text,created_at,embeddings
0,999756421188202498,1218190214906155009,RT @jimsciutto: Immediate question for Starr a...,2020-01-17 15:15:54+00:00,"[-0.0176332239061594, -0.018955715000629425, 0..."
1,295393566,1224859633216630784,RT @village_jordan: Rand Paul on Senate floor ...,2020-02-05 00:57:48+00:00,"[-0.02314981445670128, -0.009875185787677765, ..."
2,1173707986038067200,1225270015110320128,RT @DemWrite: 🔥 Share this 1 million times 🔥 ...,2020-02-06 04:08:30+00:00,"[-0.022219592705368996, 0.0038568368181586266,..."
3,850117708477853697,1210648904716443649,RT @realDonaldTrump: “Pelosi’s stall tactics e...,2019-12-27 19:49:26+00:00,"[-0.0222681425511837, -0.006929074414074421, -..."
4,888143550638170112,1210261760835555328,RT @joncoopertweets: FUN FACT: According to a ...,2019-12-26 18:11:04+00:00,"[-0.026049431413412094, 0.00031084506190381944..."


In [34]:
tweets_df.shape

(198477, 5)

Saving CSV to drive:

In [33]:
status_embeddings_csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings.csv")
tweets_df.to_csv(status_embeddings_csv_filepath, index=False)

In [None]:
##from pandas import concat
##
##limit = 1_000
##offset = 0
##
##all = DataFrame()
##
##while offset < 5_500:
##    sql = f"""
##        SELECT s.user_id, s.status_id, s.status_text, s.created_at, emb.embeddings
##        FROM `{DATASET_ADDRESS}.botometer_sample` s
##        JOIN `{DATASET_ADDRESS}.botometer_sample_max_50_openai_status_embeddings` emb
##            ON s.status_id = emb.status_id
##        LIMIT {int(limit)}
##        OFFSET {int(offset)}
##    """
##
##    batch = bq.query_to_df(sql, verbose=True)
##    print(tweets_df.shape)
##    if batch.empty:
##        print("ALL DONE!")
##        break
##
##    concat(all, batch)
##    offset += limit

