# Setup

## Mounting the Drive

In [92]:
import os

from google.colab import drive

drive.mount('/content/drive')
print(os.listdir(os.getcwd())) 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['.config', 'drive', 'sample_data']


In [93]:
# you might need to create a google drive SHORTCUT that has this same path
# ... or update the path to use your own google drive organization
DIRPATH = '/content/drive/MyDrive/Research/Disinfo Research Shared 2022'
print(DIRPATH)
os.path.isdir(DIRPATH)

/content/drive/MyDrive/Research/Disinfo Research Shared 2022


True

## Configuring Credentials 


In [94]:
# google.cloud checks the file at path designated by the GOOGLE_APPLICATION_CREDENTIALS env var
# so we set it here using the shared credentials JSON file from our shared google drive
# and verify it for good measure

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(DIRPATH, "credentials", "tweet-research-shared-268bbccc0aac.json") 

GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") # implicit check by google.cloud
print(GOOGLE_APPLICATION_CREDENTIALS) # verification for implicit check
print(os.path.isfile(GOOGLE_APPLICATION_CREDENTIALS)) # verification for implicit check

/content/drive/MyDrive/Research/Disinfo Research Shared 2022/credentials/tweet-research-shared-268bbccc0aac.json
True


# Helpers

### BigQuery Service

In [95]:

from google.cloud import bigquery
from pandas import DataFrame

class BigQueryService():
    def __init__(self):
        self.client = bigquery.Client()

    def execute_query(self, sql, verbose=True):
        if verbose == True: 
            print(sql)
        job = self.client.query(sql)
        return job.result()
    
    def query_to_df(self, sql, verbose=True):
        """high-level wrapper to return a DataFrame"""
        results = self.execute_query(sql, verbose=verbose)
        records = [dict(row) for row in list(results)]
        df = DataFrame(records)
        return df


In [96]:
bq_service = BigQueryService()
print(bq_service)

<__main__.BigQueryService object at 0x7f675c2b4a10>


# Usage

In [97]:
print("------------")
print("QUERY:")
sql = """
    SELECT user_id, tags
    FROM `tweet-research-shared.impeachment_2020.profile_tags_v2`
    WHERE ARRAY_LENGTH(tags) > 0 -- 451,698
"""

df = bq_service.query_to_df(sql, verbose=True)
print("------------")
print("RESULTS:" , len(df))
df.head()

------------
QUERY:

    SELECT user_id, tags
    FROM `tweet-research-shared.impeachment_2020.profile_tags_v2`
    WHERE ARRAY_LENGTH(tags) > 0 -- 451,698

------------
RESULTS: 451698


Unnamed: 0,user_id,tags
0,1022720011977285633,[#0]
1,1055646664889954304,[#0]
2,63358235,[#0]
3,1380664326,[#0]
4,1169702174374473732,[#0]


## Top Profile Tags

In [98]:
sql = f"""
    SELECT tag, count(distinct user_id) as user_count
    FROM `tweet-research-shared.impeachment_2020.profile_tags_v2_flat`
    GROUP BY 1
    ORDER BY 2 DESC
"""

top_tags_df = bq_service.query_to_df(sql)
top_tags_df.head()


    SELECT tag, count(distinct user_id) as user_count
    FROM `tweet-research-shared.impeachment_2020.profile_tags_v2_flat`
    GROUP BY 1
    ORDER BY 2 DESC



Unnamed: 0,tag,user_count
0,#MAGA,63744
1,#RESIST,35313
2,#KAG,27820
3,#TRUMP2020,20984
4,#WWG1WGA,14999


In [99]:
from plotly.express import histogram

#fig = histogram(top_tags_df, x="user_count", nbins=20)
#fig.show()
#
#fig = histogram(top_tags_df[top_tags_df["user_count"] < 20], x="user_count", nbins=20)
#fig.show()
#
#fig = histogram(top_tags_df[top_tags_df["user_count"] > 100], x="user_count", nbins=20)
#fig.show()

fig = histogram(top_tags_df.head(50), x="user_count", nbins=20)
fig.show()

### Profile Tag Co-occurances

First, let's fetch all user profile tags from the database. We have a row per user_id, with a list of distinct tags that they ever mentioned in their profile during the duration of our collection period.

In [100]:
sql = f"""
    SELECT user_id, distinct_tags as tags
    FROM `tweet-research-shared.impeachment_2020.profile_tags_v2_distinct`
    WHERE ARRAY_LENGTH(distinct_tags) > 1 -- we only care about co-occurances
"""

user_tags_df = bq_service.query_to_df(sql)
print(len(user_tags_df))
user_tags_df.head()


    SELECT user_id, distinct_tags as tags
    FROM `tweet-research-shared.impeachment_2020.profile_tags_v2_distinct`
    WHERE ARRAY_LENGTH(distinct_tags) > 1 -- we only care about co-occurances

258821


Unnamed: 0,user_id,tags
0,22762000,"[#0, #2]"
1,1864065690,"[#0, #JUNKIE]"
2,2785726018,"[#1, #2]"
3,1125970519298269185,"[#1, #2]"
4,860854368,"[#1, #2]"


In [101]:
user_tags_df.iloc[0]["tags"]

['#0', '#2']

Now, let's compile a list of combintations each hashtag was used in the same profile as another hashtag. Along the way let's retain information about the user id who mentioned both.

In [102]:
from itertools import combinations

print(list(combinations(["a"], 2)))
print(list(combinations(["a", "b"], 2)))
print(list(combinations(["a", "b", "c"], 2)))
print(list(combinations(["a", "b", "c", "d"], 2)))

[]
[('a', 'b')]
[('a', 'b'), ('a', 'c'), ('b', 'c')]
[('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]


In [103]:
edge_list = []

for _, row in user_tags_df.iterrows():
    tag_pairs = list(combinations(row["tags"], 2)) #> list of tuples
    for tag_pair in tag_pairs:
        edge_list.append({"hashtag_pair": tag_pair, "user_id": row["user_id"]})

print(len(edge_list))
print(edge_list[0])

2846959
{'hashtag_pair': ('#0', '#2'), 'user_id': 22762000}


Here is an initial attempt to aggregate the number of times a given hashtag pair co-occured (count refers to the number of users).

In [104]:
edge_aggs = {}
for row in edge_list:
    tags_pair = row["hashtag_pair"]
    if tags_pair not in edge_aggs:
        edge_aggs[row["hashtag_pair"]] = 1
    else:
        edge_aggs[row["hashtag_pair"]] += 1

len(edge_aggs.keys()) -- 1,808,382

(1808383, 808, 382)

In [105]:
# NOTE THESE HAVE TO BE IN ALPHA ORDER
print(edge_aggs[('#1A', '#2A')]) #> 4,360
print(edge_aggs[('#KAG', '#MAGA')]) #> 21,389 
print(edge_aggs[('#MAGA', '#QANON')]) #> 4,728
print(edge_aggs[('#QANON', '#RESIST')]) #> 7

4360
21389
4728
7


But let's also compile a list of user_ids in the edge info, instead of just counting them, in case there are interesting similarity-related things we can do with this info later.

In [106]:
from operator import itemgetter

sorted_edge_list = sorted(edge_list, key=itemgetter("hashtag_pair")) # sort by some attribute
print(sorted_edge_list[0:5])

[{'hashtag_pair': ('#0', '#2'), 'user_id': 22762000}, {'hashtag_pair': ('#0', '#2'), 'user_id': 2403975446}, {'hashtag_pair': ('#0', '#2KLEAGUEPROSPECT'), 'user_id': 2403975446}, {'hashtag_pair': ('#0', '#GODFIRST'), 'user_id': 325995486}, {'hashtag_pair': ('#0', '#JUNKIE'), 'user_id': 1864065690}]


In [107]:
from itertools import groupby

groupby_tag_pair = groupby(sorted_edge_list, key=itemgetter("hashtag_pair")) # group by the sorted attribute
print(groupby_tag_pair) #> <itertools.groupby object at 0x10339dc50>

<itertools.groupby object at 0x7f6718fad770>


In [108]:
edge_aggs = {}

for tag_pair, items in groupby_tag_pair:
    user_ids = [item["user_id"] for item in items]
    #print(tag_pair, len(user_ids), user_ids)
    edge_aggs[tag_pair] = {"weight": len(user_ids), "user_ids": user_ids}

print(len(edge_aggs.keys()))

1808382


In [119]:

print(edge_aggs[('#1A', '#2A')]["weight"]) #> 4360
print(edge_aggs[('#KAG', '#MAGA')]["weight"]) #> 21389
print(edge_aggs[('#MAGA', '#QANON')]["weight"]) #> 4728
edge_aggs[('#QANON', '#RESIST')]


4360
21389
4728


{'weight': 7,
 'user_ids': [942185523279306752,
  1198294309512122368,
  1204488199298572291,
  887488437812682753,
  779104303269249024,
  994340333138755586,
  1309537879]}

# Graph Compilation and Storage

In [110]:
from networkx import Graph

# https://networkx.org/documentation/stable/reference/classes/graph.html
graph = Graph()
print(graph)

Graph with 0 nodes and 0 edges


In [111]:

# ADD NODES

for _, row in top_tags_df.iterrows():
    graph.add_node(row["tag"], user_count=row["user_count"])

print(graph)

Graph with 258622 nodes and 0 edges


In [116]:
# ADD EDGES

for tag_pair, edge_info in edge_aggs.items():
    tag, other_tag = tag_pair
    graph.add_edge(tag, other_tag, **edge_info)

print(graph)

Graph with 258623 nodes and 1722323 edges


Export the graph so we can save it to Google Drive and use it later.

In [117]:
from networkx import write_gpickle

graph_filepath = "user_profile_tags_coocurance_network.gpickle"

write_gpickle(graph, graph_filepath)