In [1]:
# Resolve paths from root project directory

import os
import sys
# Absolute path of the parent of the current working directory; adding it to
# sys.path lets the cells below import packages that live one level up.
module_path = os.path.abspath('..')
# Register it at most once so re-running this cell does not keep growing sys.path.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import pandas as pd, numpy

In [3]:
# Snapshot label (presumably the crawl date); it selects the sub-directory used
# for both the input files and the exported dataframes.
DATE = "16-dec"
# Directory handed to the create_*_df loaders below as `data_dir`.
DATA_DIR = "../data/{}/".format(DATE)
# Directory the export cells write the pickled results to.
EXPORT_DIR = "../data/dataframes/{}/".format(DATE)

In [4]:
from data_tools.dataframes import create_tweet_df, create_user_df, create_retweet_df, aggregate_counts_by_hour, aggregate_most_common_hashtags
from data_tools import load_crawled_terms 

In [5]:
# Load the crawled search terms from the keywords file.
# NOTE(review): the keyword file is labelled "3nov" while DATE is "16-dec" —
# confirm this is still the intended term list for this snapshot.
CRAWLED_TERMS = load_crawled_terms("../keywords-3nov.txt")

In [6]:
# Build the retweet dataframe from the snapshot directory; the progress log
# printed by this cell shows it reading parsed_retweets.json from DATA_DIR.
retweet_df = create_retweet_df(data_dir=DATA_DIR)
# Show the column/dtype/memory summary rather than rendering the large frame.
retweet_df.info()

Loading 25566698 json lines
(0%): 100000 lines in ../data/16-dec/parsed_retweets.json processed (0.34247398376464844 sec)
(1%): 200000 lines in ../data/16-dec/parsed_retweets.json processed (0.3321700096130371 sec)
(1%): 300000 lines in ../data/16-dec/parsed_retweets.json processed (0.37143874168395996 sec)
(2%): 400000 lines in ../data/16-dec/parsed_retweets.json processed (0.33399295806884766 sec)
(2%): 500000 lines in ../data/16-dec/parsed_retweets.json processed (0.40375328063964844 sec)
(2%): 600000 lines in ../data/16-dec/parsed_retweets.json processed (0.40134692192077637 sec)
(3%): 700000 lines in ../data/16-dec/parsed_retweets.json processed (0.39596986770629883 sec)
(3%): 800000 lines in ../data/16-dec/parsed_retweets.json processed (0.39369988441467285 sec)
(4%): 900000 lines in ../data/16-dec/parsed_retweets.json processed (0.3447408676147461 sec)
(4%): 1000000 lines in ../data/16-dec/parsed_retweets.json processed (0.3477671146392822 sec)
(4%): 1100000 lines in ../data/16-

In [7]:
# Build the tweet dataframes from parsed_tweets.json (see the progress log);
# the call returns two frames, unpacked here as "old" and "recent".
# NOTE(review): presumably the earliest retweet timestamp is the cut-off that
# separates old from recent tweets, and CRAWLED_TERMS become the per-term
# indicator columns used later in this notebook — confirm in data_tools.dataframes.
old_tweet_df, recent_tweet_df = create_tweet_df(
    retweet_df.timestamp.min(), CRAWLED_TERMS, data_dir=DATA_DIR
)

Loading 7609005 json lines
(1%): 100000 lines in ../data/16-dec/parsed_tweets.json processed (2.6809768676757812 sec)
(3%): 200000 lines in ../data/16-dec/parsed_tweets.json processed (3.286905288696289 sec)
(4%): 300000 lines in ../data/16-dec/parsed_tweets.json processed (2.3683621883392334 sec)
(5%): 400000 lines in ../data/16-dec/parsed_tweets.json processed (2.5968551635742188 sec)
(7%): 500000 lines in ../data/16-dec/parsed_tweets.json processed (2.7551989555358887 sec)
(8%): 600000 lines in ../data/16-dec/parsed_tweets.json processed (3.262377977371216 sec)
(9%): 700000 lines in ../data/16-dec/parsed_tweets.json processed (1.82088303565979 sec)
(11%): 800000 lines in ../data/16-dec/parsed_tweets.json processed (3.635631799697876 sec)
(12%): 900000 lines in ../data/16-dec/parsed_tweets.json processed (4.123608112335205 sec)
(13%): 1000000 lines in ../data/16-dec/parsed_tweets.json processed (1.9939191341400146 sec)
(14%): 1100000 lines in ../data/16-dec/parsed_tweets.json process

In [8]:
# Build the user dataframe from the snapshot directory; the progress log
# printed by this cell shows it reading parsed_users.json from DATA_DIR.
df_users = create_user_df(data_dir=DATA_DIR)
# Show the column/dtype/memory summary rather than rendering the frame.
df_users.info()

Loading 1388621 json lines
(7%): 100000 lines in ../data/16-dec/parsed_users.json processed (6.144344091415405 sec)
(14%): 200000 lines in ../data/16-dec/parsed_users.json processed (4.8742687702178955 sec)
(22%): 300000 lines in ../data/16-dec/parsed_users.json processed (4.657926797866821 sec)
(29%): 400000 lines in ../data/16-dec/parsed_users.json processed (4.414881229400635 sec)
(36%): 500000 lines in ../data/16-dec/parsed_users.json processed (4.135060787200928 sec)
(43%): 600000 lines in ../data/16-dec/parsed_users.json processed (4.198259353637695 sec)
(50%): 700000 lines in ../data/16-dec/parsed_users.json processed (4.581743001937866 sec)
(58%): 800000 lines in ../data/16-dec/parsed_users.json processed (4.835165977478027 sec)
(65%): 900000 lines in ../data/16-dec/parsed_users.json processed (5.120298862457275 sec)
(72%): 1000000 lines in ../data/16-dec/parsed_users.json processed (4.460118055343628 sec)
(79%): 1100000 lines in ../data/16-dec/parsed_users.json processed (4.70

In [9]:
def create_crawled_terms_df(crawled_terms, tweet_df):
    """Count, for every crawled term, how many tweets are flagged with it.

    Parameters
    ----------
    crawled_terms : iterable of str
        Candidate terms. A term without a same-named column in ``tweet_df``
        is skipped.
    tweet_df : pandas.DataFrame
        Tweet frame whose per-term columns hold ``1`` for tweets that match
        the term.

    Returns
    -------
    pandas.DataFrame
        One row per term found in ``tweet_df`` with the columns ``term`` and
        ``tweet count``, sorted by ``tweet count`` in descending order. When
        no term matches a column, an empty frame with both columns is
        returned.
    """
    crawled_terms_stats = [
        {
            "term": term,
            # Count the 1-flags explicitly. Reading position 1 of
            # value_counts() only yields this number when 0 is the more
            # frequent value, and it fails when the column holds a single
            # distinct value (e.g. a term that never matched).
            "tweet count": int((tweet_df[term] == 1).sum()),
        }
        for term in crawled_terms
        if term in tweet_df.columns
    ]

    # Naming the columns up front keeps sort_values working on an empty result.
    crawled_terms_df = pd.DataFrame(
        crawled_terms_stats, columns=["term", "tweet count"]
    ).sort_values(by=["tweet count"], ascending=False)

    return crawled_terms_df

# Per-term tweet counts, computed on the recent tweets only (old tweets are not included).
crawled_terms_df = create_crawled_terms_df(CRAWLED_TERMS, recent_tweet_df)
# Quick sanity check of row count and dtypes.
crawled_terms_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36 entries, 35 to 15
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   term         36 non-null     object
 1   tweet count  36 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 864.0+ bytes


## Basic stats & Coverage

In [10]:

# Row counts of every dataframe plus the time span covered by tweets and retweets.
coverage_stats = {
    "old_tweet_count": len(old_tweet_df.index),
    "recent_tweet_count": len(recent_tweet_df.index),
    "total_tweet_count": len(recent_tweet_df.index) + len(old_tweet_df.index),
    "retweet_count": len(retweet_df.index),
    "user_count": len(df_users.index),
    # The earliest tweet is read from the old frame, the latest from the recent frame.
    "earliest_tweet": old_tweet_df["timestamp"].min(),
    "latest_tweet": recent_tweet_df["timestamp"].max(),
    "earliest_retweet": retweet_df["timestamp"].min(),
    "latest_retweet": retweet_df["timestamp"].max(),
}

coverage_stats

{'old_tweet_count': 5902,
 'recent_tweet_count': 7603103,
 'total_tweet_count': 7609005,
 'retweet_count': 25566698,
 'user_count': 1388621,
 'earliest_tweet': '2008-11-05T02:44:00Z',
 'latest_tweet': '2020-12-16T13:08:49Z',
 'earliest_retweet': '2020-10-23T16:59:58Z',
 'latest_retweet': '2020-12-16T13:42:14Z'}

## Terms grouped by hour

In [11]:
# Aggregate the recent tweets and the retweets per hour; the resulting frame
# (see the preview) is indexed by hourly `date` and has a `tweet count`, a
# `retweet count` and one column per crawled term.
df_aggregated_by_hour = aggregate_counts_by_hour(recent_tweet_df, retweet_df, crawled_terms_df["term"].values)
# Preview the first rows only; the full frame is wide.
df_aggregated_by_hour.head()

Unnamed: 0_level_0,tweet count,retweet count,voter fraud,election fraud,#stopthesteal,#voterfraud,#electionfraud,election interference,ballot harvesting,ballot fraud,...,hacked voting machine,pre-filled ballot,#ilhanomarballotharvesting,#ballotvoterfraud,#votebymailfraud,#nomailinvoting,#ilhanomarvoterfraud,#hackedvotingmachines,#discardedballots,#stopgopvoterfraud
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-10-23 17:00:00+00:00,306,681.0,179,4.0,0.0,24.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-10-23 18:00:00+00:00,419,1272.0,238,9.0,0.0,22.0,1.0,0.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-10-23 19:00:00+00:00,409,561.0,250,8.0,1.0,50.0,0.0,2.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-10-23 20:00:00+00:00,645,932.0,372,18.0,0.0,77.0,2.0,2.0,6.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-10-23 21:00:00+00:00,539,847.0,322,11.0,0.0,57.0,1.0,0.0,4.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Most common hashtags

In [12]:
# Hashtag counts over the recent tweets; the resulting frame (see the preview)
# is indexed by `hashtag` and has an `all tweets` column plus one column per
# crawled term.
df_most_common_hashtags = aggregate_most_common_hashtags(recent_tweet_df, crawled_terms_df["term"].values)
# Preview the first rows only; the full frame is wide.
df_most_common_hashtags.head()

Unnamed: 0_level_0,all tweets,voter fraud,election fraud,#stopthesteal,#voterfraud,#electionfraud,election interference,ballot harvesting,ballot fraud,#electioninterference,...,hacked voting machine,pre-filled ballot,#ilhanomarballotharvesting,#ballotvoterfraud,#votebymailfraud,#nomailinvoting,#ilhanomarvoterfraud,#hackedvotingmachines,#discardedballots,#stopgopvoterfraud
hashtag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#1,0,741,530,0,0,0,47,21,18,0,...,0,0,0,0,0,0,0,0,0,0
#12news,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#1a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#2,0,0,0,0,0,0,0,12,0,0,...,0,0,0,0,0,0,0,0,0,0
#2020election,9507,934,701,2282,2680,2608,62,14,19,340,...,0,0,0,2,0,0,0,0,0,0


## Most common tokens

In [17]:
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS
# Extend spaCy's English stop words with extra noise tokens: empty and
# single-space strings, curly quotes, a lone regional-indicator symbol and "pron".
# NOTE(review): "pron" is presumably the residue of spaCy's "-PRON-" lemma
# after punctuation stripping — confirm against the tokenizer.
# The imported name is rebound, so STOP_WORDS below refers to the extended set.
STOP_WORDS = STOP_WORDS.union({"pron", "", " ", "”", "“", "🇺"})

def include_token(token):
    """Return True when ``token`` should be counted.

    A token is rejected when it is one of the (extended) stop words or when
    it starts with the text "hashtag".
    """
    if token in STOP_WORDS:
        return False
    return not token.startswith("hashtag")

def create_most_common_tokens_df(df_tweets, count_label, k=100):
    """Return the ``k`` most frequent tokens of ``df_tweets`` as a dataframe.

    Parameters
    ----------
    df_tweets : pandas.DataFrame
        Frame with a ``tokens`` column whose entries are iterables of tokens.
    count_label : str
        Name given to the count column of the result.
    k : int, default 100
        Number of top tokens to keep.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``token`` with a single column ``count_label``. Only
        tokens accepted by ``include_token`` are counted.
    """
    token_counts = Counter()
    for tweet_tokens in df_tweets["tokens"]:
        # Feed the accepted tokens of one tweet at a time into the counter.
        token_counts.update(filter(include_token, tweet_tokens))

    top_tokens = token_counts.most_common(k)
    top_tokens_df = pd.DataFrame(top_tokens, columns=["token", count_label])
    return top_tokens_df.set_index("token")

def aggregate_most_common_tokens(df_tweets, crawled_terms, k=100):
    """Combine the top-``k`` token counts of all tweets and of each crawled term.

    Parameters
    ----------
    df_tweets : pandas.DataFrame
        Frame with a ``tokens`` column and one indicator column per term.
    crawled_terms : iterable of str
        Terms to break the counts down by; each must be a column of
        ``df_tweets``.
    k : int, default 100
        Number of top tokens taken per subset.

    Returns
    -------
    pandas.DataFrame
        Indexed by token, with an ``all tweets`` column followed by one
        integer column per term. Each column only carries the counts of that
        subset's own top ``k`` tokens; every other cell is filled with 0, so
        a 0 means "not in that subset's top k", not necessarily "never
        occurred".
    """
    combined = create_most_common_tokens_df(df_tweets, count_label="all tweets", k=k)

    for term in crawled_terms:
        has_term = df_tweets[term] == 1
        term_top_tokens = create_most_common_tokens_df(
            df_tweets[has_term], count_label=term, k=k
        )
        # The outer join keeps tokens that are top-k in any subset.
        combined = combined.join(term_top_tokens, how='outer')

    combined = combined.fillna(0)
    return combined.astype(int)


# Top-token table for the recent tweets, broken down by every crawled term
# that has a column in recent_tweet_df.
df_most_common_tokens = aggregate_most_common_tokens(recent_tweet_df, crawled_terms_df["term"].values)
# Preview the first rows only; the full frame is wide.
df_most_common_tokens.head()

Unnamed: 0_level_0,all tweets,voter fraud,election fraud,#stopthesteal,#voterfraud,#electionfraud,election interference,ballot harvesting,ballot fraud,#electioninterference,...,hacked voting machine,pre-filled ballot,#ilhanomarballotharvesting,#ballotvoterfraud,#votebymailfraud,#nomailinvoting,#ilhanomarvoterfraud,#hackedvotingmachines,#discardedballots,#stopgopvoterfraud
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,17,8,0,0,1,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,206,...,0,0,0,0,0,0,0,0,0,0
100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1000s,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


## Co-occurrence

In [None]:
# Only terms flagged on more than this many recent tweets enter the matrix.
crawled_term_threshold = 5000
filtered_crawled_terms = crawled_terms_df.loc[
    crawled_terms_df["tweet count"].gt(crawled_term_threshold)
]
terms_in_df = list(filtered_crawled_terms["term"])
# Densify the sparse indicator columns and cast them to int32 before multiplying.
crawled_terms_tweet_df = recent_tweet_df[terms_in_df].sparse.to_dense().astype("int32")
# (terms x tweets) @ (tweets x terms): for each pair of terms, the sum over
# tweets of the product of their indicator values.
df_cooccurrence = crawled_terms_tweet_df.T @ crawled_terms_tweet_df

## Export

In [None]:
import pickle

# Neither DataFrame.to_pickle nor open(..., "wb") creates missing directories,
# so make sure the export target for this DATE exists before writing.
os.makedirs(EXPORT_DIR, exist_ok=True)

# Raw dataframes.
df_users.to_pickle(EXPORT_DIR + "df_users.pickle")
retweet_df.to_pickle(EXPORT_DIR + "df_retweets.pickle")
recent_tweet_df.to_pickle(EXPORT_DIR + "df_recent_tweets.pickle")
old_tweet_df.to_pickle(EXPORT_DIR + "df_old_tweets.pickle")
# Derived aggregates.
df_aggregated_by_hour.to_pickle(EXPORT_DIR + "df_counts_by_hour.pickle")
crawled_terms_df.to_pickle(EXPORT_DIR + "df_crawled_terms.pickle")
df_most_common_hashtags.to_pickle(EXPORT_DIR + "df_most_common_hashtags.pickle")
df_most_common_tokens.to_pickle(EXPORT_DIR + "df_most_common_tokens.pickle")
df_cooccurrence.to_pickle(EXPORT_DIR + "df_cooccurrence.pickle")

# coverage_stats is a plain dict, so it is pickled directly.
with open(EXPORT_DIR + "coverage_stats.pickle", "wb") as f:
    pickle.dump(coverage_stats, f)

In [18]:
# NOTE(review): this repeats the df_most_common_tokens export already present
# in the export cell; it looks like a leftover from re-running only the token
# step — consider removing it so a top-to-bottom run has a single export cell.
df_most_common_tokens.to_pickle(EXPORT_DIR + "df_most_common_tokens.pickle")