<a href="https://colab.research.google.com/github/s2t2/tweet-analysis-2021/blob/warehouse/notebooks/Collection_Results_and_Dataset_Summaries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook summarizes each dataset: the topics that were tracked, the number of tweets collected per day, and the number of users active per day.

# Setup

## Mounting the Drive

In [2]:
import os

from google.colab import drive

# Mount Google Drive at /content/drive, then list the working directory
# so the output confirms that the "drive" folder is present.
drive.mount('/content/drive')
print(os.listdir(os.getcwd())) 

Mounted at /content/drive
['.config', 'drive', 'sample_data']


In [3]:
# Root folder of the shared research Drive; the credentials path below is built from it.
# you might need to create a google drive SHORTCUT that has this same path
# ... or update the path to use your own google drive organization
DIRPATH = '/content/drive/MyDrive/Research/Disinfo Research Shared 2022'
print(DIRPATH)
# Last expression of the cell: displays True when the folder is reachable.
os.path.isdir(DIRPATH)

/content/drive/MyDrive/Research/Disinfo Research Shared 2022


True

## Configuring Credentials 


In [4]:
# google.cloud reads the file named by the GOOGLE_APPLICATION_CREDENTIALS env var,
# so point that variable at the credentials JSON kept in the shared google drive.
GOOGLE_APPLICATION_CREDENTIALS = os.path.join(DIRPATH, "credentials", "tweet-research-shared-268bbccc0aac.json")
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_APPLICATION_CREDENTIALS

# Sanity check: show the configured path and whether a file actually exists there.
print(GOOGLE_APPLICATION_CREDENTIALS)
print(os.path.isfile(GOOGLE_APPLICATION_CREDENTIALS))

/content/drive/MyDrive/Research/Disinfo Research Shared 2022/credentials/tweet-research-shared-268bbccc0aac.json
True


# Helpers

### BigQuery Service

In [5]:

from google.cloud import bigquery
from pandas import DataFrame

class BigQueryService():
    """Small convenience wrapper around a `bigquery.Client`.

    The client is built with no arguments, so it relies on whatever
    credentials the google.cloud library finds on its own (this notebook sets
    the GOOGLE_APPLICATION_CREDENTIALS env var before creating the service).
    """

    def __init__(self):
        self.client = bigquery.Client()

    def execute_query(self, sql, verbose=True, job_config=None):
        """Submit a query and wait for its results.

        Params:
            sql (str): the query text.
            verbose: when truthy, print the query text before submitting it.
            job_config: optional job configuration, forwarded unchanged to
                `client.query` (for example a `bigquery.QueryJobConfig`
                carrying query parameters). Defaults to None.

        Returns: the object returned by `job.result()`, an iterable of rows.
        """
        if verbose:
            print(sql)
        job = self.client.query(sql, job_config=job_config)
        return job.result()

    def query_to_df(self, sql, verbose=True, job_config=None):
        """High-level wrapper that runs a query and returns a DataFrame.

        Each result row is converted to a dict, so the columns of the frame
        are the row keys. A query with no rows yields an empty DataFrame
        that has no columns.

        Params: same as `execute_query`.
        """
        results = self.execute_query(sql, verbose=verbose, job_config=job_config)
        records = [dict(row) for row in results]
        return DataFrame(records)


In [6]:
# Shared service instance used by the helper functions defined below.
bq_service = BigQueryService()
print(bq_service)

<__main__.BigQueryService object at 0x7f80d047f310>


### Helper Functions

In [None]:

def topics_list(dataset_name):
    """Fetch the topics recorded for a dataset, ordered by creation time.

    Params:
        dataset_name (str): dataset in the `tweet-research-shared` project,
            for example "impeachment_2020".

    Returns: a DataFrame with "topic" and "created_at" columns, sorted by
        "created_at" ascending. When the query yields no rows, the empty
        (column-less) DataFrame is returned as-is.

    Raises:
        ValueError: if the name, after semicolons are stripped, contains
            anything other than ASCII letters, digits and underscores.
    """
    import re  # local import so this cell does not depend on another cell's imports

    dataset_name = dataset_name.replace(";","") # be extra safe to prevent SQL injection
    # The name lands inside a backtick-quoted identifier, which cannot be sent
    # as a query parameter, so reject any character that could break out of it.
    if not re.fullmatch(r"[A-Za-z0-9_]+", dataset_name):
        raise ValueError(f"Invalid dataset name: {dataset_name!r}")

    sql = f"""
        SELECT
            topic, created_at
        FROM `tweet-research-shared.{dataset_name}.topics`
    """
    df = bq_service.query_to_df(sql, verbose=False)
    if df.empty:
        # no rows means no "created_at" column to sort on
        return df
    return df.sort_values(by=["created_at"])
    



In [None]:
from plotly.express import bar

def tweets_by_day(dataset_name):
    """Show a bar chart of distinct tweets per day for a dataset.

    Counts distinct `status_id` values grouped by the date part of
    `created_at`, then renders the chart with `fig.show()`.

    Params:
        dataset_name (str): dataset in the `tweet-research-shared` project.

    Returns: None. If the query yields no rows, a message is printed and no
        chart is drawn.

    Raises:
        ValueError: if the name, after semicolons are stripped, contains
            anything other than ASCII letters, digits and underscores.
    """
    import re  # local import so this cell does not depend on another cell's imports

    dataset_name = dataset_name.replace(";","") # be extra safe to prevent SQL injection
    # The name lands inside a backtick-quoted identifier, which cannot be sent
    # as a query parameter, so reject any character that could break out of it.
    if not re.fullmatch(r"[A-Za-z0-9_]+", dataset_name):
        raise ValueError(f"Invalid dataset name: {dataset_name!r}")

    # impeachment_2020 is read from "tweets_v2"; every other dataset from "tweets_v2_slim"
    table_name = "tweets_v2" if dataset_name == "impeachment_2020" else "tweets_v2_slim"

    sql = f"""
        SELECT
            cast(created_at as date) as created_on
            ,count(distinct status_id) as status_count
        FROM `tweet-research-shared.{dataset_name}.{table_name}`
        GROUP BY 1
        ORDER BY 1
    """
    df = bq_service.query_to_df(sql, verbose=False)
    if df.empty:
        # an empty result has no columns, so there is nothing to plot
        print(f"No tweets found for dataset '{dataset_name}'")
        return

    title = f"Tweets Collected by Day (Dataset: '{dataset_name}')"
    fig = bar(df, x="created_on", y="status_count", title=title)
    fig.show()



In [None]:
def users_by_day(dataset_name):
    """Show a bar chart of distinct users per day for a dataset.

    Counts distinct `user_id` values grouped by the date part of
    `created_at`, then renders the chart with `fig.show()`.

    Params:
        dataset_name (str): dataset in the `tweet-research-shared` project.

    Returns: None. If the query yields no rows, a message is printed and no
        chart is drawn.

    Raises:
        ValueError: if the name, after semicolons are stripped, contains
            anything other than ASCII letters, digits and underscores.
    """
    import re  # local import so this cell does not depend on another cell's imports

    dataset_name = dataset_name.replace(";","") # be extra safe to prevent SQL injection
    # The name lands inside a backtick-quoted identifier, which cannot be sent
    # as a query parameter, so reject any character that could break out of it.
    if not re.fullmatch(r"[A-Za-z0-9_]+", dataset_name):
        raise ValueError(f"Invalid dataset name: {dataset_name!r}")

    # impeachment_2020 is read from "tweets_v2"; every other dataset from "tweets_v2_slim"
    table_name = "tweets_v2" if dataset_name == "impeachment_2020" else "tweets_v2_slim"

    sql = f"""
        SELECT
            cast(created_at as date) as created_on
            ,count(distinct user_id) as user_count
        FROM `tweet-research-shared.{dataset_name}.{table_name}`
        GROUP BY 1
        ORDER BY 1
    """
    df = bq_service.query_to_df(sql, verbose=False)
    if df.empty:
        # an empty result has no columns, so there is nothing to plot
        print(f"No users found for dataset '{dataset_name}'")
        return

    title = f"Users Active by Day (Dataset: '{dataset_name}')"
    fig = bar(df, x="created_on", y="user_count", title=title)
    fig.show()

In [15]:
from plotly.express import bar


def timeline_tweets_by_day(after_date="2010-01-01"):
    """Show per-day bar charts of timeline tweets and of their users.

    Queries the `timeline_tweets` table of the "disinfo_2021" dataset for
    rows whose `created_at` is on or after `after_date`, counts distinct
    `status_id` and `user_id` values per day, and renders two charts
    (tweets first, then users) with `fig.show()`.

    Params:
        after_date (str): lower bound compared against `created_at`,
            for example "2020-01-01".

    Returns: None. If the query yields no rows, a message is printed and no
        charts are drawn.

    Raises:
        ValueError: if `after_date`, after semicolons are stripped, is empty
            or contains characters outside letters, digits, spaces and
            ``_ : + - . /``.
    """
    import re  # local import so this cell does not depend on another cell's imports

    # WE ONLY HAVE TIMELINE TWEETS FOR THE DISINFO_2021 DATASET ATM
    dataset_name = "disinfo_2021"
    after_date = after_date.replace(";","") # sql inj
    # The value is placed inside a single-quoted literal; refusing quotes,
    # backslashes and newlines keeps it from terminating that literal.
    if not re.fullmatch(r"[0-9A-Za-z_:+\-./ ]+", after_date):
        raise ValueError(f"Invalid after_date: {after_date!r}")

    sql = f"""
        SELECT
            cast(created_at as date) as created_on
            ,count(distinct user_id) as user_count
            ,count(distinct status_id) as status_count
        FROM `tweet-research-shared.{dataset_name}.timeline_tweets`
        WHERE created_at >= '{after_date}'
        GROUP BY 1
        ORDER BY 1
    """
    df = bq_service.query_to_df(sql, verbose=False)
    if df.empty:
        # an empty result has no columns, so there is nothing to plot
        print(f"No timeline tweets found on or after '{after_date}'")
        return

    tweets_title = f"Timeline Tweets Recollected by Day (Dataset: '{dataset_name}')"
    tweets_fig = bar(df, x="created_on", y="status_count", title=tweets_title)
    tweets_fig.show()

    users_title = f"Timeline Tweet Users by Day (Dataset: '{dataset_name}')"
    users_fig = bar(df, x="created_on", y="user_count", title=users_title)
    users_fig.show()



# Dataset Summaries

## Impeachment 2020

In [None]:
# Display the topics recorded for this dataset, sorted by creation time.
topics_list("impeachment_2020")

Unnamed: 0,topic,created_at
23,impeachment,2019-12-17 17:48:23+00:00
14,#SenateHearing,2019-12-17 17:48:23+00:00
13,impeached,2019-12-17 17:48:23+00:00
22,#ImpeachAndConvictTrump,2019-12-17 17:48:23+00:00
17,Trump to Pelosi,2019-12-17 17:48:23+00:00
15,#IGHearing,2019-12-17 17:48:23+00:00
18,impeach,2019-12-17 17:48:23+00:00
16,#IGReport,2019-12-17 17:48:23+00:00
20,#ImpeachAndConvict,2019-12-17 17:48:23+00:00
21,#FactsMatter,2019-12-17 17:48:23+00:00


In [None]:
# Bar chart of distinct tweets per day (this dataset is read from the tweets_v2 table).
tweets_by_day("impeachment_2020")

In [None]:
# Bar chart of distinct users per day (this dataset is read from the tweets_v2 table).
users_by_day("impeachment_2020")

## Election 2020

In [None]:
# Display the topics recorded for this dataset, sorted by creation time.
topics_list("election_2020")

Unnamed: 0,topic,created_at
30,Trump,2020-08-07 16:08:15+00:00
29,Biden,2020-08-07 16:08:15+00:00
28,#KAG2020,2020-08-07 16:08:15+00:00
27,#2020Election,2020-08-07 16:08:15+00:00
26,#MAGA,2020-08-07 16:08:15+00:00
31,#Election2020,2020-08-07 16:08:15+00:00
32,#creepyjoe,2020-08-07 16:57:47+00:00
20,#voteblue,2020-08-09 07:40:34+00:00
19,#sleepyjoe,2020-08-09 07:40:34+00:00
5,#DNCC,2020-08-10 10:56:44+00:00


In [None]:
# Bar chart of distinct tweets per day, read from the tweets_v2_slim table.
tweets_by_day("election_2020")

In [None]:
# Bar chart of distinct users per day, read from the tweets_v2_slim table.
users_by_day("election_2020")

## Disinfo 2021

In [None]:
# Display the topics recorded for this dataset, sorted by creation time.
topics_list("disinfo_2021")

Unnamed: 0,topic,created_at
0,#WAKEUPAMERICA,2020-11-15 11:25:36+00:00
1,#QAnon,2020-11-15 11:25:36+00:00
2,#WWG1WGA,2020-11-15 11:25:36+00:00
3,#WEARETHENEWSNOW,2020-11-15 11:25:36+00:00
4,q anon,2020-11-15 11:25:36+00:00
5,#GREATAWAKENING,2020-11-15 11:25:36+00:00


In [None]:
# Bar chart of distinct tweets per day, read from the tweets_v2_slim table.
tweets_by_day("disinfo_2021")

In [None]:
# Bar chart of distinct users per day, read from the tweets_v2_slim table.
users_by_day("disinfo_2021")

In [17]:
# Two bar charts (tweets, then users) for timeline tweets created on or after 2020-01-01.
timeline_tweets_by_day(after_date="2020-01-01")


## Transition 2021

In [None]:
# Display the topics recorded for this dataset, sorted by creation time.
topics_list("transition_2021")

Unnamed: 0,topic,created_at
6,#RiggedElection,2020-11-15 12:44:57+00:00
7,#TrumpConceded,2020-11-15 12:44:57+00:00
8,#StopTheSteal,2020-11-15 12:44:57+00:00
9,#SharpieGate,2020-11-15 12:44:57+00:00
12,#HeWon,2020-11-15 13:13:55+00:00
0,#ElectoralCollegeVote,2021-01-06 15:44:00+00:00
1,#Sedition,2021-01-06 15:44:00+00:00
2,#DCProtests,2021-01-06 15:44:00+00:00
3,#coup,2021-01-06 16:43:57+00:00
4,#25thAmendment,2021-01-06 16:43:57+00:00


In [None]:
# Bar chart of distinct tweets per day, read from the tweets_v2_slim table.
tweets_by_day("transition_2021")

In [None]:
# Bar chart of distinct users per day, read from the tweets_v2_slim table.
users_by_day("transition_2021")

## Impeachment 2021

In [None]:
# Bar chart of distinct tweets per day, read from the tweets_v2_slim table.
tweets_by_day("impeachment_2021")

In [None]:
# Bar chart of distinct users per day, read from the tweets_v2_slim table.
users_by_day("impeachment_2021")