We fetched OpenAI embeddings and stored them in BigQuery (BQ). This notebook downloads them and saves CSV files to Google Drive for further analysis.

## Setup

### Google Drive

In [1]:
import os
from google.colab import drive

drive.mount('/content/drive')
# show the working directory and what is visible in it once the drive is mounted
cwd = os.getcwd()
print(cwd, os.listdir(cwd))

Mounted at /content/drive
/content ['.config', 'drive', 'sample_data']


In [2]:
# Root of the shared research folder on the mounted Google Drive.
# You may need to create a Google Drive SHORTCUT so that this exact path exists,
# or change DIRPATH to match how your own Google Drive is organized.
# Previous paths, kept for reference:
#DIRPATH = '/content/drive/MyDrive/Research/Disinfo Research Shared 2022'
#DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2023'
DIRPATH = '/content/drive/MyDrive/Research/DS Research Shared 2024'

print(DIRPATH)
# Displays True when the folder exists; False means the path above needs fixing.
os.path.isdir(DIRPATH)

/content/drive/MyDrive/Research/DS Research Shared 2024


True

In [3]:
# Project data folder on Drive; the export cells further down write their files here.
DATA_DIRPATH = os.path.join(DIRPATH, "projects", "Impeachment 2020 Embeddings", "data")
# Displays True when the folder exists.
os.path.isdir(DATA_DIRPATH)

True

### BigQuery Service

In [4]:
from google.colab import auth

# Asks you to log in with your Google account (the BigQuery client is created after this step).
auth.authenticate_user()

In [5]:
from google.cloud import bigquery
from pandas import DataFrame, read_gbq


PROJECT_ID = "tweet-collector-py"

class BigQueryService():
    """Thin convenience wrapper around a BigQuery client for a single project.

    Attributes:
        project_id: the Google Cloud project that queries run against.
        client: the `bigquery.Client` created for that project.
    """

    def __init__(self, project_id=PROJECT_ID):
        self.project_id = project_id
        self.client = bigquery.Client(project=self.project_id)

    def execute_query(self, sql, verbose=True):
        """Runs a query through the client and waits for it to finish.

        Params:
            sql (str): the query text to execute.
            verbose: when truthy, prints the query text before running it.

        Returns whatever the query job's `result()` call returns.
        """
        if verbose:
            print(sql)
        job = self.client.query(sql)
        return job.result()

    def query_to_df(self, sql, verbose=True):
        """High-level wrapper that returns the query results as a DataFrame.

        Params:
            sql (str): the query text to execute.
            verbose: when truthy, prints the query text before running it.

        Returns the DataFrame produced by `read_gbq`.
        """
        if verbose:
            print(sql)
        # The query is issued through pandas (using only the project id), not through self.client.
        # https://pandas.pydata.org/docs/reference/api/pandas.read_gbq.html#pandas-read-gbq
        # NOTE: pandas.read_gbq is deprecated as of pandas 2.2 in favor of pandas_gbq.read_gbq;
        # the call is left as-is so the returned frame (and its dtypes) do not change.
        return read_gbq(sql, project_id=self.project_id, progress_bar_type="tqdm_notebook")




In [6]:
# One shared service instance, reused by the query cells in this notebook.
bq = BigQueryService()
print(bq)

<__main__.BigQueryService object at 0x785284479510>


In [7]:
print("DATASETS:")
datasets = list(bq.client.list_datasets())
for ds in datasets:
    # ds.reference prints as "<project>.<dataset_id>" (see this cell's output)
    print("...", ds.reference)

DATASETS:
... tweet-collector-py.analysis_2021
... tweet-collector-py.analysis_2021_development
... tweet-collector-py.collection_2021
... tweet-collector-py.disinfo_2021_development
... tweet-collector-py.disinfo_2021_production
... tweet-collector-py.election_2020_analysis
... tweet-collector-py.election_2020_development
... tweet-collector-py.election_2020_production
... tweet-collector-py.f1_racing_2023_development
... tweet-collector-py.f1_racing_2023_production
... tweet-collector-py.impeachment_2021_development
... tweet-collector-py.impeachment_2021_production
... tweet-collector-py.impeachment_backup
... tweet-collector-py.impeachment_development
... tweet-collector-py.impeachment_production
... tweet-collector-py.impeachment_test
... tweet-collector-py.jan6_committee_development
... tweet-collector-py.jan6_committee_production
... tweet-collector-py.transition_2021_development
... tweet-collector-py.transition_2021_production
... tweet-collector-py.truth_2023_development
... 

## Helper Functions

### Unpacking Embeddings

In [30]:
import json
from pandas import DataFrame


def unpack(embeddings_str):
    """Parses a JSON-encoded embeddings value into a Python object.

    Params:
        embeddings_str: a JSON array string such as "[0.1, -0.2]",
            or a value that has already been parsed.

    Returns the decoded JSON value (a list of floats for a JSON array of numbers)
        when given a string; any non-string value is returned unchanged.

    Raises json.JSONDecodeError when given a string that is not valid JSON.
    """
    if isinstance(embeddings_str, str):
        return json.loads(embeddings_str)
    return embeddings_str


def unpacked(df, col_prefix="openai", embeddings_col="embeddings"):
    """Takes a dataframe with a single column of OpenAI embeddings,
        unpacks them into their own separate columns,
        and returns a new dataframe in which the original embeddings column
        is replaced by the new unpacked columns. The input dataframe is not modified.

    Params:
        df (DataFrame): the frame holding the packed embeddings column.
        col_prefix (str): prefix for the new columns, named "{col_prefix}_{position}".
        embeddings_col (str): name of the packed embeddings column.

    Returns a DataFrame with the same rows and index as `df`.

    Raises KeyError when `embeddings_col` is not a column of `df`.
    """
    from pandas import concat  # local import keeps this helper self-contained

    print("UNPACKING...")
    embeds = df[embeddings_col].apply(unpack)
    print(type(embeds))

    print("RECONSTRUCTING...")
    # building on the original index keeps the new columns row-aligned with df
    embeds = DataFrame(embeds.values.tolist(), index=df.index)
    embeds.columns = [f"{col_prefix}_{col}" for col in embeds.columns]
    print(embeds.shape)

    print("MERGING...")
    # Column-wise concat pairs rows by position in the shared index. An index-on-index
    # merge would instead multiply rows whenever the index contains duplicate labels.
    df_unpacked = concat([df.drop(columns=[embeddings_col]), embeds], axis=1)
    print(df_unpacked.shape)
    return df_unpacked



# Embeddings

In [8]:
# "<project>.<dataset>" prefix interpolated into the table names of the queries in this notebook.
DATASET_ADDRESS = "tweet-collector-py.impeachment_production"

In [9]:
# Sanity check: count the distinct users and statuses in the botometer sample
# that have a matching row in the v2 status embeddings table.
sql = f"""
    SELECT
        count(distinct s.user_id) as user_count
        ,count(distinct s.status_id) as status_count
    FROM `{DATASET_ADDRESS}.botometer_sample` s
    JOIN `{DATASET_ADDRESS}.botometer_sample_max_50_openai_status_embeddings_v2` emb
        ON s.status_id = emb.status_id
"""
bq.query_to_df(sql, verbose=False)

Downloading:   0%|          |

Unnamed: 0,user_count,status_count
0,7566,183727


## User Embeddings

7566 users

In [None]:
# Fetch selected user-level attributes joined to the user embeddings table.
# The `--` fragments inside the SQL are columns that are currently switched off.
sql = f"""
     SELECT
        u.user_id, u.created_on
        --, u.screen_name_count, u.screen_names, split(u.screen_names, ",")[0] as screen_name
        ,u.status_count, u.rt_count
        ,u.is_bot --, u.bot_rt_network
        ,u.opinion_community --, u.avg_score_lr, avg_score_nb, avg_score_bert
        , u.is_q --, u.q_status_count
        --, u.follower_count, u.follower_count_b, u.follower_count_h
        --, u.friend_count, u.friend_count_b, u.friend_count_h

        ,u.avg_toxicity --, u.avg_severe_toxicity, u.avg_insult, u.avg_obscene, u.avg_threat, u.avg_identity_hate
        , u.avg_fact_score -- ,u.fact_scored_count

        ,u.bom_astroturf, u.bom_overall --, u.bom_cap  --,u.bom_lookup_count
        --,u.bom_fake_follower, u.bom_financial, u.bom_other, u.bom_self_declared, u.bom_spammer

        ,emb.embeddings

    FROM `{DATASET_ADDRESS}.user_details_v20240128_slim` u
    JOIN `{DATASET_ADDRESS}.botometer_sample_max_50_openai_user_embeddings` emb
        ON emb.user_id = u.user_id
    -- LIMIT 10
"""

# verbose=False keeps the long query text out of the cell output
users_df = bq.query_to_df(sql, verbose=False)
print(users_df.shape)

In [20]:
users_df.head()

Unnamed: 0,user_id,created_on,status_count,rt_count,is_bot,opinion_community,is_q,avg_toxicity,avg_fact_score,bom_astroturf,bom_overall,embeddings
0,3420436216,2015-08-13,555,540,True,0,False,0.056113,1.983193,0.295,0.19,"[-0.018801862373948097, -0.007904230616986752,..."
1,108121958,2010-01-24,2,2,False,0,False,0.45671,,0.58,0.11,"[-0.030551623553037643, -0.0053298575803637505..."
2,3038308638,2015-02-23,755,665,True,0,False,0.06986,3.401786,0.97,0.97,"[-0.007297390140593052, 0.0010276929242536426,..."
3,332396536,2011-07-09,951,951,True,1,False,0.044264,2.304511,0.58,0.75,"[-0.01834747940301895, -0.007322159130126238, ..."
4,955082522479808512,2018-01-21,570,533,True,0,False,0.049325,4.714286,0.355,0.225,"[-0.024803657084703445, 0.007516898214817047, ..."


Saving CSV to drive:

In [21]:
# Save the user frame (embeddings still packed in one column) as a gzip-compressed CSV;
# index=False leaves the row index out of the file.
csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_user_embeddings.csv.gz")
users_df.to_csv(csv_filepath, index=False, compression="gzip")

### ... Unpacked

In [31]:
# Expand the packed embeddings column into one column per embedding dimension.
users_df_unpacked = unpacked(users_df)
# Shape of the original (packed) frame, for comparison with the unpacked shape logged by unpacked().
# NOTE(review): the saved output of this cell does not show this line's output - confirm it is intended.
print(users_df.shape)
users_df_unpacked.head()

UNPACKING...
<class 'pandas.core.series.Series'>
RECONSTRUCTING...
(7566, 1536)
MERGING...
(7566, 1547)


Unnamed: 0,user_id,created_on,status_count,rt_count,is_bot,opinion_community,is_q,avg_toxicity,avg_fact_score,bom_astroturf,...,openai_1526,openai_1527,openai_1528,openai_1529,openai_1530,openai_1531,openai_1532,openai_1533,openai_1534,openai_1535
0,3420436216,2015-08-13,555,540,True,0,False,0.056113,1.983193,0.295,...,-0.001867,-0.013167,0.020885,-0.022568,-0.033631,0.016153,0.024127,-0.017519,0.002636,-0.039838
1,108121958,2010-01-24,2,2,False,0,False,0.45671,,0.58,...,0.017651,-0.009439,0.024375,-0.032553,-0.042185,0.013782,0.01132,-0.014862,-0.010413,-0.020359
2,3038308638,2015-02-23,755,665,True,0,False,0.06986,3.401786,0.97,...,-0.026273,-0.008139,0.030285,-0.029902,-0.030887,0.022481,-0.005476,-0.016279,-0.010138,-0.021454
3,332396536,2011-07-09,951,951,True,1,False,0.044264,2.304511,0.58,...,-0.00552,-0.005288,0.017071,-0.033637,-0.040202,0.041773,-0.00937,0.003352,0.009391,-0.042671
4,955082522479808512,2018-01-21,570,533,True,0,False,0.049325,4.714286,0.355,...,0.009959,0.004695,0.005555,-0.012851,-0.032229,0.031443,0.008163,-0.018501,-0.008724,-0.042027


In [32]:
# Save the unpacked user frame (one column per embedding dimension) as a gzip-compressed CSV.
csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_user_embeddings_unpacked.csv.gz")
users_df_unpacked.to_csv(csv_filepath, index=False, compression="gzip")

## Tweet Embeddings

183K statuses

Note: streaming this data down over the network takes a long time (1+ hour).

First retry: used the statuses table v2, which has duplicate lookups removed (one row per unique status).

Second retry: used the statuses table v3, which also includes the status texts.

In [33]:
# Fetch every row of the v3 status embeddings table (status text plus packed embeddings).
# Un-comment the LIMIT line for a quicker trial run.
sql = f"""
    SELECT user_id, status_id, status_text, created_at, embeds_length, embeddings
    FROM `{DATASET_ADDRESS}.botometer_sample_max_50_openai_status_embeddings_v3`
    -- LIMIT 10000
"""

tweets_df = bq.query_to_df(sql, verbose=True)
print(tweets_df.shape)
tweets_df.head()


    SELECT user_id, status_id, status_text, created_at, embeds_length, embeddings
    FROM `tweet-collector-py.impeachment_production.botometer_sample_max_50_openai_status_embeddings_v3` 
    -- LIMIT 10000



Downloading:   0%|          |

(183815, 6)


Unnamed: 0,user_id,status_id,status_text,created_at,embeds_length,embeddings
0,897845802701377536,1221540755451392001,Doubt it..It appears they all have gone the wa...,2020-01-26 21:09:45+00:00,1536,"[-0.020428381860256195, -0.006719687487930059,..."
1,935739601301458947,1223458629837295619,RT @Wyn1745: Democrats are ‘setting the stage’...,2020-02-01 04:10:42+00:00,1536,"[-0.03668860346078873, -0.0074811591766774654,..."
2,571774622,1217445781663363072,RT @sarahdwire: I’m loathe to insert myself in...,2020-01-15 13:57:48+00:00,1536,"[-0.033381544053554535, -0.006886449176818132,..."
3,384679808,1223705594818748416,RT @RepRatcliffe: We warned them...As Schiff a...,2020-02-01 20:32:03+00:00,1536,"[-0.008476617746055126, -0.007363526616245508,..."
4,701264221653217281,1218459840277729281,"RT @chipfranklin: Because ""impeachment"" in the...",2020-01-18 09:07:18+00:00,1536,"[-0.009453612379729748, 0.017376383766531944, ..."


In [35]:
tweets_df.head()

Unnamed: 0,user_id,status_id,status_text,created_at,embeds_length,embeddings
0,897845802701377536,1221540755451392001,Doubt it..It appears they all have gone the wa...,2020-01-26 21:09:45+00:00,1536,"[-0.020428381860256195, -0.006719687487930059,..."
1,935739601301458947,1223458629837295619,RT @Wyn1745: Democrats are ‘setting the stage’...,2020-02-01 04:10:42+00:00,1536,"[-0.03668860346078873, -0.0074811591766774654,..."
2,571774622,1217445781663363072,RT @sarahdwire: I’m loathe to insert myself in...,2020-01-15 13:57:48+00:00,1536,"[-0.033381544053554535, -0.006886449176818132,..."
3,384679808,1223705594818748416,RT @RepRatcliffe: We warned them...As Schiff a...,2020-02-01 20:32:03+00:00,1536,"[-0.008476617746055126, -0.007363526616245508,..."
4,701264221653217281,1218459840277729281,"RT @chipfranklin: Because ""impeachment"" in the...",2020-01-18 09:07:18+00:00,1536,"[-0.009453612379729748, 0.017376383766531944, ..."


Saving CSV to drive:

In [36]:
# Save the tweet frame (embeddings still packed in one column) as a gzip-compressed CSV.
csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3.csv.gz")
tweets_df.to_csv(csv_filepath, index=False, compression="gzip")

### ... Unpacked

In [37]:
# Expand the packed embeddings column into one column per embedding dimension.
unpacked_tweets_df = unpacked(tweets_df)
unpacked_tweets_df.head()

UNPACKING...
<class 'pandas.core.series.Series'>
RECONSTRUCTING...
(183815, 1536)
MERGING...
(183815, 1541)


Unnamed: 0,user_id,status_id,status_text,created_at,embeds_length,openai_0,openai_1,openai_2,openai_3,openai_4,...,openai_1526,openai_1527,openai_1528,openai_1529,openai_1530,openai_1531,openai_1532,openai_1533,openai_1534,openai_1535
0,897845802701377536,1221540755451392001,Doubt it..It appears they all have gone the wa...,2020-01-26 21:09:45+00:00,1536,-0.020428,-0.00672,0.007308,-0.022157,-0.041841,...,0.014616,0.004705,0.012661,-0.020974,-0.003458,0.045166,0.029871,-0.021186,-0.003376,-0.024937
1,935739601301458947,1223458629837295619,RT @Wyn1745: Democrats are ‘setting the stage’...,2020-02-01 04:10:42+00:00,1536,-0.036689,-0.007481,0.007968,-0.006632,-0.022805,...,-0.001696,0.002522,0.020397,-0.046374,-0.046611,0.021068,-8.5e-05,-0.003701,-0.01537,-0.019213
2,571774622,1217445781663363072,RT @sarahdwire: I’m loathe to insert myself in...,2020-01-15 13:57:48+00:00,1536,-0.033382,-0.006886,-0.003244,-0.015834,0.000172,...,0.001027,0.002464,0.002013,-0.032766,-0.034265,0.006545,0.014804,0.003027,-0.001518,-0.030946
3,384679808,1223705594818748416,RT @RepRatcliffe: We warned them...As Schiff a...,2020-02-01 20:32:03+00:00,1536,-0.008477,-0.007364,0.000919,-0.006435,0.008101,...,-0.028269,0.003193,0.015056,-0.015333,-0.028137,0.03251,0.010327,-0.013621,-0.007686,-0.016216
4,701264221653217281,1218459840277729281,"RT @chipfranklin: Because ""impeachment"" in the...",2020-01-18 09:07:18+00:00,1536,-0.009454,0.017376,0.007016,-0.020075,-0.023674,...,-0.01359,0.015564,0.00513,0.003077,-0.029167,0.015523,0.017914,-0.008789,-0.019767,-0.042353


In [39]:
# Save the unpacked tweet frame as a gzip-compressed parquet file.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_parquet.html

pq_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked.parquet.gzip")
unpacked_tweets_df.to_parquet(pq_filepath, compression="gzip")

In [None]:
# Save the unpacked tweet frame as a gzip-compressed CSV.
# NOTE(review): this cell has no execution count in the saved notebook, so the file may
# not have been written - verify it exists on Drive before relying on it.
csv_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked.csv.gz")
unpacked_tweets_df.to_csv(csv_filepath, index=False, compression="gzip")

In [None]:
#arrow_filepath = os.path.join(DATA_DIRPATH, "botometer_sample_max_50_openai_status_embeddings_v3_unpacked.arrow")
#df.to_feather(arrow_filepath)

## Scratch Work

In [None]:
## NOTE(review): commented-out paging experiment. Before reviving it:
##   - `concat(all, batch)` discards its result and does not pass the frames as a list;
##     collect the batches in a list and call `concat(batches)` once after the loop
##   - `print(tweets_df.shape)` reports a different frame; `batch.shape` looks intended
##   - `all` shadows the Python builtin of the same name
##   - LIMIT / OFFSET without an ORDER BY does not guarantee stable pages
##
##from pandas import concat
##
##limit = 1_000
##offset = 0
##
##all = DataFrame()
##
##while offset < 5_500:
##    sql = f"""
##        SELECT s.user_id, s.status_id, s.status_text, s.created_at, emb.embeddings
##        FROM `{DATASET_ADDRESS}.botometer_sample` s
##        JOIN `{DATASET_ADDRESS}.botometer_sample_max_50_openai_status_embeddings` emb
##            ON s.status_id = emb.status_id
##        LIMIT {int(limit)}
##        OFFSET {int(offset)}
##    """
##
##    batch = bq.query_to_df(sql, verbose=True)
##    print(tweets_df.shape)
##    if batch.empty:
##        print("ALL DONE!")
##        break
##
##    concat(all, batch)
##    offset += limit



### Compressed Table

https://cloud.google.com/bigquery/docs/exporting-data#bigquery_extract_table_compressed-python

In [None]:
# from google.cloud import bigquery
# client = bigquery.Client()
# bucket_name = 'my-bucket'

#destination_uri = "gs://{}/{}".format(bucket_name, "shakespeare.csv.gz")
#dataset_ref = bigquery.DatasetReference(project, dataset_id)
#table_ref = dataset_ref.table("shakespeare")
#job_config = bigquery.job.ExtractJobConfig()
#job_config.compression = bigquery.Compression.GZIP
#
#extract_job = client.extract_table(
#    table_ref,
#    destination_uri,
#    # Location must match that of the source table.
#    location="US",
#    job_config=job_config,
#)  # API request
#extract_job.result()  # Waits for job to complete.

In [None]:
# from google.cloud import bigquery
# client = bigquery.Client()
# bucket_name = 'my-bucket'


#from google.cloud import bigquery
#
#
##ds_ref = bigquery.DatasetReference(PROJECT_ID, DATASET_ADDRESS)
#DATASET_ID = "impeachment_production"
#ds_ref = bigquery.DatasetReference(PROJECT_ID, DATASET_ID)
#table_ref = ds_ref.table("botometer_sample_max_50_openai_status_embeddings_v3")
#
#job_config = bigquery.job.ExtractJobConfig()
#job_config.compression = bigquery.Compression.GZIP
#
#BUCKET_NAME = "impeachment-analysis-2020"
##destination_uri = f"gs://{BUCKET_NAME}/impeachment_production/botometer_sample_max_50_openai_status_embeddings_v4.csv.gz"
##> too large to be exported to a single file. Specify a uri including a * to shard export. See 'Exporting data into one or more files' in https://cloud.google.com/bigquery/docs/exporting-data.
#destination_uri = f"gs://{BUCKET_NAME}/impeachment_production/botometer_sample_max_50_openai_status_embeddings_v4_*.csv.gz"
#
#client = bq.client
#extract_job = client.extract_table(
#    table_ref,
#    destination_uri,
#    # Location must match that of the source table.
#    location="US",
#    job_config=job_config,
#)  # API request
#extract_job.result()  # Waits for job to complete.