# Setup

## Mounting the Drive

In [1]:
import os

from google.colab import drive

drive.mount('/content/drive')
print(os.listdir(os.getcwd())) 

Mounted at /content/drive
['.config', 'drive', 'sample_data']


In [2]:
# you might need to create a google drive SHORTCUT that has this same path
# ... or update the path to use your own google drive organization
DIRPATH = '/content/drive/MyDrive/Research/Disinfo Research Shared 2022'
print(DIRPATH)
os.path.isdir(DIRPATH)

/content/drive/MyDrive/Research/Disinfo Research Shared 2022


True

## Configuring Credentials 


In [3]:
# google.cloud checks the file at path designated by the GOOGLE_APPLICATION_CREDENTIALS env var
# so we set it here using the shared credentials JSON file from our shared google drive
# and verify it for good measure

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(DIRPATH, "credentials", "tweet-research-shared-268bbccc0aac.json") 

GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") # implicit check by google.cloud
print(GOOGLE_APPLICATION_CREDENTIALS) # verification for implicit check
print(os.path.isfile(GOOGLE_APPLICATION_CREDENTIALS)) # verification for implicit check

/content/drive/MyDrive/Research/Disinfo Research Shared 2022/credentials/tweet-research-shared-268bbccc0aac.json
True


# Helpers

### BigQuery Service

In [4]:

from google.cloud import bigquery
from pandas import DataFrame

class BigQueryService():
    def __init__(self):
        self.client = bigquery.Client()

    def execute_query(self, sql, verbose=True):
        if verbose == True: 
            print(sql)
        job = self.client.query(sql)
        return job.result()
    
    def query_to_df(self, sql, verbose=True):
        """high-level wrapper to return a DataFrame"""
        results = self.execute_query(sql, verbose=verbose)
        records = [dict(row) for row in list(results)]
        df = DataFrame(records)
        return df


In [5]:
bq_service = BigQueryService()
print(bq_service)

<__main__.BigQueryService object at 0x7f3731378a50>


### Combinations Example

In [8]:
from itertools import combinations

print(list(combinations(["a"], 2)))
print(list(combinations(["a", "b"], 2)))
print(list(combinations(["a", "b", "c"], 2)))
print(list(combinations(["a", "b", "c", "d"], 2)))

[]
[('a', 'b')]
[('a', 'b'), ('a', 'c'), ('b', 'c')]
[('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]


# Usage

## User Profile Hashtag Lists

First, let's fetch all user profile tags from the database. We have a row per user_id, with a list of distinct tags that they ever mentioned in their profile during the duration of our collection period.

In [7]:
sql = f"""
    SELECT user_id, distinct_tags as tags
    FROM `tweet-research-shared.impeachment_2020.profile_tags_v2_distinct`
    WHERE ARRAY_LENGTH(distinct_tags) > 1 -- we only care about co-occurances
"""

user_tags_df = bq_service.query_to_df(sql)
print(len(user_tags_df))
user_tags_df.head()


    SELECT user_id, distinct_tags as tags
    FROM `tweet-research-shared.impeachment_2020.profile_tags_v2_distinct`
    WHERE ARRAY_LENGTH(distinct_tags) > 1 -- we only care about co-occurances

258821


Unnamed: 0,user_id,tags
0,22762000,"[#0, #2]"
1,1864065690,"[#0, #JUNKIE]"
2,2785726018,"[#1, #2]"
3,1125970519298269185,"[#1, #2]"
4,860854368,"[#1, #2]"


In [10]:
user_tags_df.iloc[0]

user_id    22762000
tags       [#0, #2]
Name: 0, dtype: object

In [9]:
user_tags_df.iloc[0]["tags"]

['#0', '#2']

In [12]:
user_tags_df.head()

Unnamed: 0,user_id,tags
0,22762000,"[#0, #2]"
1,1864065690,"[#0, #JUNKIE]"
2,2785726018,"[#1, #2]"
3,1125970519298269185,"[#1, #2]"
4,860854368,"[#1, #2]"


## User Profile Hashtag Pairs

Right now we have a row per unique user, but we want a row per unique user per hashtag pair.

In [21]:
#%%time #> 25 seconds

records = []
for _, row in user_tags_df.iterrows():
    
    user_tag_pairs = list(combinations(row["tags"], 2)) #> list of tuples

    for tag_pair in user_tag_pairs:
        records.append({
            "user_id": row["user_id"], 
            "tag_0": tag_pair[0], 
            "tag_1": tag_pair[1]
        })

user_pairs_df = DataFrame(records)
user_pairs_df

CPU times: user 26.8 s, sys: 763 ms, total: 27.5 s
Wall time: 27.6 s


Unnamed: 0,user_id,tag_0,tag_1
0,22762000,#0,#2
1,1864065690,#0,#JUNKIE
2,2785726018,#1,#2
3,1125970519298269185,#1,#2
4,860854368,#1,#2
...,...,...,...
2846954,1163835134,#TEAMROUSEY,#NOTREDAME
2846955,1163835134,#TEAMMMA4LIFE,#WARLOVSKI
2846956,1163835134,#TEAMMMA4LIFE,#NOTREDAME
2846957,1163835134,#WARLOVSKI,#NOTREDAME


In [22]:
csv_filepath = os.path.join(DIRPATH, "data", "user_profile_hashtag_pairs_v4.csv")
user_pairs_df.to_csv(csv_filepath)

### User Profile Hashtag Co-occurrence Matrix

In [None]:
# https://www.kaggle.com/code/rtatman/co-occurrence-matrix-plot-in-python

from pandas import crosstab

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.crosstab.html
# ... Compute a simple cross tabulation of two (or more) factors.
# ... By default, computes a frequency table of the factors 
# ... index: Values to group by in the rows
# ... columns: Values to group by in the columns.
# ... rownames: If passed, must match number of row arrays passed
# ... colnames: If passed, must match number of column arrays passed.
#
co_matrix = crosstab(user_pairs_df.tag_0, user_pairs_df.tag_1) 

print(type(co_matrix))
print(co_matrix.shape)
co_matrix

In [None]:
#> Your session crashed after using all available RAM. If you are interested in access to high-RAM runtimes, you may want to check out Colab Pro.
# ... (hmmm, maybe need to do this locally...)