# US 2020 political observatory

Analysing Twitter language using BERT

### Pre-requisites

Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive so the dataset stored on Drive can be read.
# `google.colab` is a Colab-specific package; this cell is meant to run on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Install `transformers`

In [None]:
# Versions are not pinned; the recorded run of this cell downloaded transformers 3.4.0.
!pip install transformers
# -U upgrades sentence-transformers if a copy is already installed.
!pip install -U sentence-transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |▎                               | 10kB 24.2MB/s eta 0:00:01[K     |▌                               | 20kB 3.5MB/s eta 0:00:01[K     |▉                               | 30kB 4.5MB/s eta 0:00:01[K     |█                               | 40kB 4.7MB/s eta 0:00:01[K     |█▎                              | 51kB 4.0MB/s eta 0:00:01[K     |█▋                              | 61kB 4.4MB/s eta 0:00:01[K     |█▉                              | 71kB 4.9MB/s eta 0:00:01[K     |██                              | 81kB 5.3MB/s eta 0:00:01[K     |██▍                             | 92kB 5.5MB/s eta 0:00:01[K     |██▋                             | 102kB 5.5MB/s eta 0:00:01[K     |██▉                             | 112kB 5.5MB/s eta 0:00:01[K     |███▏                            | 122kB 5.5M

In [None]:
# general
import re
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path

# viz
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# nlp
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

%matplotlib inline

#### Paths

In [None]:
# Project folder on the mounted Drive (a relative path, resolved against the working directory).
root = Path("drive/My Drive/us-2020")

# Account metadata, relative to `root`.
user_path = Path("data/user_handles.json")

# Tweet snapshot files, relative to `root`.
tweets_path = Path("data/tweets-04112020/tweets.csv")
hashtags_path = Path("data/tweets-04112020/hashtags.csv")
mentions_path = Path("data/tweets-04112020/user-mentions.csv")

#### Datasets

In [None]:
def _exact_id(field):
    """Parse one raw CSV id field into an exact Python int (NaN when missing).

    Id columns that contain blanks are otherwise inferred as float64, and
    float64 cannot represent every 64-bit id, so large ids get rounded before
    they are later cast to strings. Parsing the text directly keeps all digits.

    Args:
        field (str): The raw text of the CSV cell.

    Returns:
        int for a parseable id, ``np.nan`` for an empty or non-numeric cell.
    """
    field = field.strip()
    if field.isdigit():
        return int(field)
    try:
        # Covers '', 'NaN' and ids that were already written in float notation
        # (e.g. '208566585.0'); the latter cannot be made more precise here.
        as_float = float(field)
    except ValueError:
        return np.nan
    return int(as_float) if np.isfinite(as_float) else np.nan


# Nullable id columns are parsed exactly; they stay compatible with a later
# `.fillna(0).astype(int).astype(str)` coercion.
df_tweets = pd.read_csv(
    root/tweets_path,
    converters={
        'original_user_id': _exact_id,
        'in_reply_to_status_id_str': _exact_id,
        'in_reply_to_user_id_str': _exact_id,
    },
)
df_hashtags = pd.read_csv(root/hashtags_path)
df_mentions = pd.read_csv(root/mentions_path, converters={'mentioned_user_id': _exact_id})

print(f"tweets: {df_tweets.shape}")
print(f"hashtags: {df_hashtags.shape}")
print(f"mentions: {df_mentions.shape}")

tweets: (1569371, 19)
hashtags: (630657, 4)
mentions: (1312092, 5)


In [None]:
df_tweets.head()

Unnamed: 0,tweet_id,created_at,tweet_content,user_id,screen_name,retweet_count,favorite_count,place_id,place_type,place_name,lon,lat,country_code,original_user_id,original_screen_name,original_favorite_count,in_reply_to_status_id_str,in_reply_to_user_id_str,in_reply_to_screen_name
0,1297974719707308033,Mon Aug 24 19:11:03 +0000 2020,"As a member of @HouseScience, I'm proud to sup...",1009269193,RepLipinski,5,8,,,,,,,,,,,,
1,1241136765886496768,Fri Mar 20 22:57:18 +0000 2020,[2/4] To learn more about what is or is not pe...,1009269193,RepLipinski,0,2,,,,,,,,,,1.241137e+18,1009269000.0,RepLipinski
2,1245860684706926593,Thu Apr 02 23:48:28 +0000 2020,Together we can stop the spread of #COVID19 by...,1009269193,RepLipinski,2,16,,,,,,,,,,,,
3,1244784554361192448,Tue Mar 31 00:32:19 +0000 2020,And thank you @RepLipinski for your investment...,1009269193,RepLipinski,4,0,,,,,,,208566585.0,PaceSuburbanBus,12.0,,,
4,1234148366147346439,Sun Mar 01 16:07:54 +0000 2020,Enjoyed talking with parents and teachers at t...,1009269193,RepLipinski,4,13,014241bf2253c205,city,"Lockport, IL",,,US,,,,,,


> Note: The presence of `original_`-prefixed entries suggests the tweet is a retweet (RT).

Sanity checks

In [None]:
# Every row should be a distinct tweet: compare the distinct-id count with the row count.
_n_rows = df_tweets.shape[0]
_n_unique_ids = df_tweets.tweet_id.nunique()
print(f"Number of unique tweet IDs == Number of total rows: {_n_unique_ids == _n_rows}")

Number of unique tweet IDs == Number of total rows: True


#### Data type coercion

In [None]:
df_tweets.dtypes

tweet_id                       int64
created_at                    object
tweet_content                 object
user_id                        int64
screen_name                   object
retweet_count                  int64
favorite_count                 int64
place_id                      object
place_type                    object
place_name                    object
lon                          float64
lat                          float64
country_code                  object
original_user_id             float64
original_screen_name          object
original_favorite_count      float64
in_reply_to_status_id_str    float64
in_reply_to_user_id_str      float64
in_reply_to_screen_name       object
dtype: object

All IDs to `str`

In [None]:
# tweet_id / user_id: present in all three frames, no missing values -> plain cast to str
for _frame in (df_tweets, df_hashtags, df_mentions):
    for _col in ('tweet_id', 'user_id'):
        _frame[_col] = _frame[_col].astype(str)

# others: nullable id columns
# Note: here "0" indicates NULL
# NOTE(review): any of these columns that was parsed as float64 may already hold
# rounded values for ids above 2**53; this cast cannot restore the lost digits.
for _frame, _col in (
    (df_tweets, 'original_user_id'),
    (df_tweets, 'in_reply_to_status_id_str'),
    (df_tweets, 'in_reply_to_user_id_str'),
    (df_mentions, 'mentioned_user_id'),
):
    _frame[_col] = _frame[_col].fillna(0).astype(int).astype(str)

sanity check

In [None]:
df_tweets.dtypes

tweet_id                      object
created_at                    object
tweet_content                 object
user_id                       object
screen_name                   object
retweet_count                  int64
favorite_count                 int64
place_id                      object
place_type                    object
place_name                    object
lon                          float64
lat                          float64
country_code                  object
original_user_id              object
original_screen_name          object
original_favorite_count      float64
in_reply_to_status_id_str     object
in_reply_to_user_id_str       object
in_reply_to_screen_name       object
dtype: object

#### Primary actors

In [None]:
# Twitter user ids as strings, the same type the id columns are coerced to.
trump_id, biden_id, harris_id = '25073877', '939091', '803694179079458816'  # trump_id: @realDonaldTrump
# Screen names of the primary actors -- pence?
primary_actors = ["realDonaldTrump", "JoeBiden", "SenKamalaHarris"]

#### Senators

In [None]:
import os
# Load the account metadata (a JSON list of user records).
# JSON is UTF-8 by specification, so do not depend on the platform's default encoding.
with open(root/user_path, 'r', encoding='utf-8') as f:
    user_info = json.load(f)

In [None]:
# Lookup table: user id (string) -> screen name
users = dict((entry['id_str'], entry['screen_name']) for entry in user_info)

In [None]:
# `slug` is a comma-separated list of list names; keep the ids of accounts tagged 'senators'.
senators = [
    str(account['id_str'])
    for account in user_info
    if 'senators' in [part.strip() for part in account['slug'].split(',')]
]

In [None]:
print(f"Total no of senators: {len(senators)}")

Total no of senators: 104


### Summary statistics

No. of tweets per user

In [None]:
# One row per account: number of (non-null) tweet ids it authored.
tweet_counts = (
    df_tweets
    .groupby(['user_id'])['tweet_id']
    .count()
    .rename('tweet_count')
    .reset_index()
)

In [None]:
tweet_counts.sort_values('tweet_count').tail(5)

Unnamed: 0,user_id,tweet_count
884,822215679726100480,3251
345,17494010,3254
241,14412533,3256
243,14465607,3260
883,822215673812119553,3263


Only senators

In [None]:
tweet_counts[tweet_counts.user_id.isin(senators)].sort_values('tweet_count').tail(5)

Unnamed: 0,user_id,tweet_count
342,172858784,3245
641,2964174789,3246
157,109287731,3247
820,76456274,3250
345,17494010,3254


### Filter tweets

Tweets from `@realDonaldTrump`

In [None]:
# Ids of every row authored by the Trump account (retweets included).
_by_trump = df_tweets.user_id == trump_id
trump_tweet_ids = set(df_tweets.loc[_by_trump, 'tweet_id'])
print(f"Total Trump tweets: {len(trump_tweet_ids)}")

Total Trump tweets: 3192


Tweets mentioning `@realDonaldTrump` AND not from `@realDonaldTrump`

In [None]:
# Mention rows that point at Trump but were not written by Trump himself.
_mentions_trump = df_mentions['mentioned_user_id'] == trump_id
_not_by_trump = df_mentions['user_id'] != trump_id
trump_mentions_tweet_ids = set(df_mentions.loc[_mentions_trump & _not_by_trump, 'tweet_id'])
print(f"Total Trump mentions: {len(trump_mentions_tweet_ids)}")

Total Trump mentions: 36051


Retweets (RTs) of `@realDonaldTrump`'s tweets, i.e. tweets whose original author is `@realDonaldTrump`

In [None]:
# Rows whose original author is Trump, i.e. retweets of his tweets.
_retweets_trump = df_tweets.original_user_id == trump_id
trump_rt_tweet_ids = set(df_tweets.loc[_retweets_trump, 'tweet_id'])
print(f"Trump RTs: {len(trump_rt_tweet_ids)}")

Trump RTs: 4628


Filtering out RTs of `@realDonaldTrump` from the *actual* mentions (to cut down on redundant signals)

In [None]:
# Mentions of Trump that are not retweets of his own tweets.
trump_no_rt_mentions_ids = trump_mentions_tweet_ids.difference(trump_rt_tweet_ids)
# NOTE(review): this compares the set difference with itself, so it always prints True.
print(f"Sanity check: {len(trump_mentions_tweet_ids - trump_rt_tweet_ids) == len(trump_no_rt_mentions_ids)}")

Sanity check: True


In [None]:
len(trump_no_rt_mentions_ids)

31592

Get the tweets

In [None]:
# Pull the full tweet rows for the retained mention ids; re-index from 0 and detach from df_tweets.
_is_kept_mention = df_tweets.tweet_id.isin(trump_no_rt_mentions_ids)
df_trump_mentions = df_tweets.loc[_is_kept_mention].reset_index(drop=True).copy()
print(df_trump_mentions.shape)
df_trump_mentions.head()

(31592, 19)


Unnamed: 0,tweet_id,created_at,tweet_content,user_id,screen_name,retweet_count,favorite_count,place_id,place_type,place_name,lon,lat,country_code,original_user_id,original_screen_name,original_favorite_count,in_reply_to_status_id_str,in_reply_to_user_id_str,in_reply_to_screen_name
0,1026516623719587840,Mon Aug 06 17:13:29 +0000 2018,"Just found the Trump Tower in Mamou, Louisiana...",1017500185356853248,SenBillCassidy,9,19,,,,,,,0,,,0,0,
1,1025485773875937283,Fri Aug 03 20:57:15 +0000 2018,We need better border security. Washington Dem...,1017500185356853248,SenBillCassidy,1,5,,,,,,,0,,,0,0,
2,1032951113815928832,Fri Aug 24 11:21:51 +0000 2018,President @realDonaldTrump is standing up for ...,1017500185356853248,SenBillCassidy,8,0,,,,,,,1017500185356853248,SenBillCassidy,24.0,0,0,
3,1052157223646978048,Tue Oct 16 11:20:04 +0000 2018,American manufacturers are #BetterOffNow.\n\n→...,1017500185356853248,SenBillCassidy,1768,0,,,,,,,1209417007,SteveScalise,6189.0,0,0,
4,1026545492354691072,Mon Aug 06 19:08:12 +0000 2018,Great work by Senator @BillCassidy and @realDo...,1017500185356853248,SenBillCassidy,4,0,,,,,,,2706910842,ChrisNeiweem,9.0,0,0,


Getting Trump's direct tweets

In [None]:
# Trump's own tweets only: authored by him and with no original author ('0' marks NULL, i.e. not a RT).
_is_trump_tweet = df_tweets.tweet_id.isin(trump_tweet_ids)
_is_not_rt = df_tweets.original_user_id == '0'
df_trump = df_tweets.loc[_is_trump_tweet & _is_not_rt].reset_index(drop=True).copy()
print(df_trump.shape)
df_trump.head()

(1507, 19)


Unnamed: 0,tweet_id,created_at,tweet_content,user_id,screen_name,retweet_count,favorite_count,place_id,place_type,place_name,lon,lat,country_code,original_user_id,original_screen_name,original_favorite_count,in_reply_to_status_id_str,in_reply_to_user_id_str,in_reply_to_screen_name
0,1268685511755026432,Thu Jun 04 23:26:11 +0000 2020,Great to be with our wonderful Men and Women o...,25073877,realDonaldTrump,23514,96180,,,,,,,0,,,0,0,
1,1268874882827378688,Fri Jun 05 11:58:41 +0000 2020,Great going Mike! https://t.co/fmInHTfj9k,25073877,realDonaldTrump,4769,23127,,,,,,,0,,,0,0,
2,1268723566046044160,Fri Jun 05 01:57:24 +0000 2020,Sleepy Joe Biden’s 1994 Crime Bill was a total...,25073877,realDonaldTrump,38970,140965,,,,,,,0,,,0,0,
3,1268869099431608320,Fri Jun 05 11:35:42 +0000 2020,USA! https://t.co/p6LrDNkSB9,25073877,realDonaldTrump,20376,92842,,,,,,,0,,,0,0,
4,1268998143733051394,Fri Jun 05 20:08:28 +0000 2020,"...We should be standing up straight and tall,...",25073877,realDonaldTrump,41234,205898,,,,,,,0,,,1268998142860627968,25073877,realDonaldTrump


## Embeddings

### Contextual embeddings

1. BERT (base)
2. DistilBERT
3. RoBERTa

#### Text preprocessing

As per https://web.stanford.edu/class/cs224n/reports/custom/15785631.pdf:
> 2.1 Text preprocessing
Texts are lowercased. Non-ascii letters, urls, @RT:[NAME], @[NAME] are removed. For BERT, an
additional [CLS] token is inserted to the beginning of each text. Texts with length less than 4 are
thrown away. No lemmatization is performed and no punctuation mark is removed since pre-trained
embeddings are always used. No stop-word is removed for fluency purpose.


In [None]:
def bert_preprocessor(tweet: str) -> str:
    """Strip Twitter-specific noise from a tweet before it is embedded.

    Steps, in order:

    1. URL removal (``http(s)://`` and ``www.`` links)
    2. @[screen_name] removal
    3. #hashtag removal
    4. Removal of leading and trailing whitespace

    Case is preserved (no lowercasing) and the gaps left behind by removed
    tokens are not collapsed.

    Args:
        tweet: Raw tweet text. Anything that is not a ``str`` (e.g. the NaN
            pandas uses for a missing text cell, or ``None``) is treated as
            empty text.

    Returns:
        The cleaned text; ``""`` for non-string input.
    """
    if not isinstance(tweet, str):
        # re.sub would raise TypeError here and abort a whole Series.apply().
        return ""
    # No regex flags are needed: the patterns use neither anchors nor '.'.
    tweet = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "", tweet) # remove URLs
    tweet = re.sub(r"@\w+", "", tweet) # @user
    tweet = re.sub(r"#\S+", "", tweet) # #hashtags
    return tweet.strip()

Apply

In [None]:
# Add the cleaned text as `_tweet_content`, leaving the raw `tweet_content` column untouched.
for _frame in (df_trump, df_trump_mentions):
    _frame['_tweet_content'] = _frame.tweet_content.apply(bert_preprocessor)

BERT to perform [Semantic Textual Similarity (STS) ](https://www.sbert.net/docs/usage/semantic_textual_similarity.html)

Select model from: https://docs.google.com/spreadsheets/d/14QplCdTCDwEmTqrn1LH4yrbKvdogK4oQvYO1K1aPR5M

In [None]:
# efficiency vs performance trade-off
model = 'distilbert-base-nli-stsb-mean-tokens' # 'roberta-base-nli-stsb-mean-tokens'
bert = SentenceTransformer(model) # DistillBERT is more efficient

100%|██████████| 245M/245M [00:24<00:00, 9.85MB/s]


In [None]:
def get_bert_embeddings(tweets, model=None):
    """Compute a sentence embedding for every tweet.

    Runs inference only (no training) by delegating to the encoder's
    ``encode`` method.

    Args:
        tweets (list): A list of (preprocessed) tweet texts.
        model: Optional encoder object exposing
            ``encode(sentences, convert_to_tensor=...)``. Defaults to the
            notebook-level ``bert`` model, so existing calls are unchanged;
            passing it explicitly avoids relying on global state.

    Returns:
        Whatever ``encode`` returns for ``convert_to_tensor=False`` -- one
        vector per tweet (768-dimensional for the model loaded in this
        notebook, judging by the saved embedding columns).
    """
    encoder = bert if model is None else model
    # A torch tensor is not needed downstream, hence convert_to_tensor=False.
    return encoder.encode(tweets, convert_to_tensor=False)

Get the list of tweets

In [None]:
%%time
# Embed Trump's own tweets.
# NOTE: `tweets` and `bert_embeddings` are reassigned by the mentions embedding cell further
# down; the cell that builds `df_trump_embed` reads `bert_embeddings`, so it must run before that one.
tweets = df_trump['_tweet_content'].tolist()
bert_embeddings = get_bert_embeddings(tweets)

CPU times: user 5.23 s, sys: 1.98 s, total: 7.21 s
Wall time: 16.1 s


In [None]:
# Tweet metadata side by side with its embedding; both frames carry the same 0..n-1 index.
_meta = df_trump[['tweet_id', 'created_at', 'user_id', 'retweet_count', 'favorite_count']]
_vectors = pd.DataFrame(bert_embeddings)
df_trump_embed = pd.concat([_meta, _vectors], axis=1)
# Persist to Drive so the embeddings do not have to be recomputed.
df_trump_embed.to_csv(root/'trump_embed.csv', index=False)
df_trump_embed.head()

Unnamed: 0,tweet_id,created_at,user_id,retweet_count,favorite_count,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
0,1268685511755026432,Thu Jun 04 23:26:11 +0000 2020,25073877,23514,96180,1.103124,0.424156,0.050481,0.141552,0.025443,-0.275829,-0.403517,-0.030968,-0.982882,-1.13635,-0.87308,0.647381,0.431796,0.20921,0.019798,-0.521178,1.341752,0.361453,-0.891255,-0.459986,1.232851,-1.218535,-0.037328,0.027012,0.371096,-0.386542,-0.769604,0.250894,0.344037,0.141113,-0.490978,-0.923257,-0.224342,0.328991,-0.557138,...,0.178903,-0.246942,-0.38981,-0.561453,0.802101,0.823415,0.288733,0.559953,-0.654947,0.320738,0.754348,0.468034,-0.747992,-0.486636,-0.046141,-1.585838,0.08189,0.572955,-0.550021,-0.006114,0.137136,0.399055,-0.074018,-0.886708,0.474638,-0.490245,-0.329489,0.512363,-0.290211,0.254168,-0.880716,-0.748653,-0.077986,0.36262,-0.871173,0.408367,0.435463,-0.178711,0.151998,-0.970517
1,1268874882827378688,Fri Jun 05 11:58:41 +0000 2020,25073877,4769,23127,0.002316,0.889421,-0.105876,-1.386304,0.100477,-0.22105,-0.079756,0.80979,-0.523869,-0.341162,-0.572579,0.655067,0.481434,-0.933281,0.043895,0.438669,-0.300771,-0.502657,-0.407984,0.026069,-0.683694,-0.183004,-0.428313,0.12463,0.214569,0.091484,0.475242,0.105166,-0.599871,0.59199,-0.078524,0.664413,-0.38349,-0.387154,-0.62113,...,-0.24824,-0.108317,-0.499268,0.253148,0.385276,1.105777,0.068909,-0.5276,0.484778,-0.112996,1.063261,-0.053312,-0.344455,0.108098,-0.231784,-1.214793,0.806518,0.266271,-0.553391,-0.240308,-0.677317,-0.048945,0.741028,-0.228335,0.209252,-1.145957,0.006479,-0.610886,0.239446,0.738753,0.065963,-0.092622,-0.188571,1.692709,0.493308,0.294067,0.167081,0.341486,0.383013,-1.398136
2,1268723566046044160,Fri Jun 05 01:57:24 +0000 2020,25073877,38970,140965,0.665973,-0.100513,-0.164027,0.139198,-0.03246,0.390195,0.143771,-0.959026,0.867784,-0.237218,0.384009,0.749585,-0.562197,0.548478,0.479267,0.007717,0.193133,-0.816202,-0.129182,-0.076623,-0.215628,0.807896,-0.199856,0.866422,-0.592638,-0.030237,-1.031496,-0.079839,0.340891,0.414805,0.342915,0.253907,-0.622601,-0.391279,-0.144034,...,0.403575,0.147734,-1.058175,-0.378817,0.487404,0.194929,0.405415,0.129211,-0.203467,0.754181,0.285376,0.097831,0.084294,0.192409,-0.204263,-0.703972,0.203187,0.652797,-0.232823,0.667632,-0.521989,0.637491,-0.638542,0.397937,0.340223,-0.734231,0.419678,-0.027287,-0.04496,-0.527267,-0.514494,0.408547,-0.364139,0.762514,0.483082,-0.007674,0.136244,0.4643,0.044225,-0.663777
3,1268869099431608320,Fri Jun 05 11:35:42 +0000 2020,25073877,20376,92842,-0.124394,0.178987,-0.749291,-0.440119,-0.677025,-0.066555,0.809576,0.691965,0.02223,-0.263378,0.22111,0.84731,-0.476807,0.753762,-0.129881,-0.545644,-0.474164,0.664445,-0.290609,0.35327,0.70941,-0.20912,-0.031486,-0.175357,-0.066709,0.274672,-0.663042,-0.155775,-0.626229,-0.458706,-0.537441,-0.223214,0.328447,-0.065055,-0.284714,...,0.181631,-0.125888,-0.563426,-0.305767,0.661888,0.514929,0.456533,-0.307952,0.544417,-0.026941,0.412454,0.081916,-0.106447,-0.283791,-0.582741,0.057383,-0.494649,-0.056708,-0.792143,-0.206483,-0.592827,-0.07213,0.316784,-1.188334,-1.042614,-1.009845,-0.065083,-0.093712,0.731537,0.449066,-1.770977,-0.607551,-1.028697,0.893659,-0.452105,0.198086,0.73987,-0.955381,0.322228,-1.159263
4,1268998143733051394,Fri Jun 05 20:08:28 +0000 2020,25073877,41234,205898,0.699836,0.215625,-0.362011,0.20636,-0.55115,-0.077212,0.265463,-0.770413,0.055084,-0.470636,0.074159,0.84308,-0.669345,0.588671,0.215185,-0.0431,0.293181,0.700903,-0.28089,0.057553,-0.685977,0.239619,0.312351,0.043481,-0.536498,0.000219,0.529216,-0.176834,0.619924,0.383311,0.515964,-0.158169,-0.522149,-1.094282,-0.273721,...,0.104677,-0.320151,-1.013057,-0.030049,0.641755,-0.457022,0.278227,0.631839,-0.463971,1.206755,0.26474,0.376495,-0.022828,0.346248,-0.780784,0.390352,0.435484,0.002853,0.170745,-0.177248,0.169696,0.626583,0.331661,0.03225,0.300451,-0.374162,0.800378,0.360194,0.168164,-0.050756,-0.647762,0.35582,-0.227254,0.930033,0.249645,0.028556,-0.23014,0.202612,0.431107,-0.375893


> Note: Might take some time! One time job.

In [None]:
%%time
# Embed the tweets that mention Trump.
# NOTE: this overwrites the `tweets` and `bert_embeddings` produced for Trump's own tweets.
tweets = df_trump_mentions['_tweet_content'].tolist()
bert_embeddings = get_bert_embeddings(tweets)

CPU times: user 34.3 s, sys: 8.4 s, total: 42.7 s
Wall time: 42.7 s


In [None]:
# Tweet metadata side by side with its embedding; both frames carry the same 0..n-1 index.
_meta = df_trump_mentions[['tweet_id', 'created_at', 'user_id', 'retweet_count', 'favorite_count']]
_vectors = pd.DataFrame(bert_embeddings)
df_trump_mentions_embed = pd.concat([_meta, _vectors], axis=1)
# Persist to Drive so the embeddings do not have to be recomputed.
df_trump_mentions_embed.to_csv(root/'trump_mentions_embed.csv', index=False)
df_trump_mentions_embed.head()

Unnamed: 0,tweet_id,created_at,user_id,retweet_count,favorite_count,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
0,1026516623719587840,Mon Aug 06 17:13:29 +0000 2018,1017500185356853248,9,19,0.296496,0.542614,-0.453187,-0.891998,-0.090881,0.006193,-0.303954,0.249364,0.039712,-1.286487,-0.044627,0.651053,-0.044583,0.429252,-0.263339,-0.05301,-0.06454,-0.490691,0.902055,-0.489038,-0.743008,-0.636298,0.292564,0.059178,-0.512473,-0.125629,0.196875,0.320604,-0.05262,0.13937,0.169842,-0.916152,-0.737109,-0.501976,-1.668477,...,0.137196,0.087176,-0.54064,0.261148,-0.043927,-0.706609,0.249547,0.359341,-0.103794,-0.247834,0.453887,0.845283,0.388629,-0.332676,-1.132645,-0.196032,0.688646,0.308717,0.471878,0.163504,-0.867474,0.350448,-0.380599,-0.468762,-0.478088,-1.165385,0.754126,-0.780248,0.090353,-0.203745,-0.647261,0.53068,-0.382568,0.500944,-0.336224,0.220711,-0.159,0.271637,-0.350223,-0.095384
1,1025485773875937283,Fri Aug 03 20:57:15 +0000 2018,1017500185356853248,1,5,-0.428294,0.578304,-0.162056,-0.228074,-0.462658,-0.029283,0.585583,-0.699514,0.418962,-0.697391,0.008845,0.467787,-0.850617,0.578669,0.503724,-0.292381,-0.335499,-0.5139,0.467011,-0.201876,-0.32594,-0.264376,-0.404916,0.226402,-0.26881,0.272352,0.341223,0.022195,0.300996,0.597184,0.506188,0.317362,-1.087153,-0.091599,0.560678,...,-0.024032,-0.5086,-0.617541,-0.141626,0.212026,-0.215322,-0.076981,0.786223,-0.373745,1.697614,-0.188368,-1.000211,-0.077082,0.331316,0.197368,-0.804826,0.122856,0.936029,-0.013965,-0.648371,0.435433,1.29155,0.43242,0.438572,0.477754,-0.859429,0.141657,0.090128,0.415385,0.226492,-0.741634,0.742004,0.266928,0.736419,0.843931,0.181311,-0.242699,-0.222048,-0.937796,0.367424
2,1032951113815928832,Fri Aug 24 11:21:51 +0000 2018,1017500185356853248,8,0,0.094901,0.615759,-0.689562,-0.422887,-0.052428,0.750496,-0.346024,-0.458219,-0.192849,-0.275984,0.621015,0.478449,-0.607759,0.617976,-0.312946,0.143848,-0.13754,-0.043492,-0.706348,-0.28107,-0.386789,0.381325,0.477924,0.664289,-0.434742,-0.361319,-0.555706,-0.514309,-0.08451,0.227876,0.466844,0.023709,-0.058591,-0.451548,-0.874648,...,0.530859,-0.034789,-0.165674,-0.450385,0.487636,0.197166,0.162187,0.060101,-0.543129,1.340839,-0.511484,0.059931,0.326432,0.029854,0.204637,-0.187105,0.569561,0.136692,0.176373,0.357851,0.275542,-0.104816,-0.528044,-0.137594,0.773938,0.606297,-0.017134,0.730254,0.242864,-0.256488,-0.175479,1.154517,-0.763381,0.273006,0.393731,-0.328782,0.047578,0.292758,0.503481,-0.528074
3,1052157223646978048,Tue Oct 16 11:20:04 +0000 2018,1017500185356853248,1768,0,0.708865,0.51862,-0.047712,-0.53295,0.146501,0.262488,-0.133962,-0.24348,0.077844,-0.575408,0.416497,0.301206,0.100369,0.13147,0.091316,0.096511,-0.530616,0.394006,0.682099,0.119839,0.000791,-0.176863,0.101857,0.824487,-0.039022,-0.681879,-0.264388,-0.763861,0.305914,0.3712,-0.193137,-1.119112,-0.011367,0.185721,-0.457377,...,-0.011262,0.389398,-0.49047,-0.7169,0.228523,0.842964,-0.146859,-0.393022,-0.250563,0.212922,-0.211197,0.490965,0.056749,0.332958,-0.354161,0.230801,0.788439,0.540131,-0.149457,-0.001058,-0.06901,1.179062,-0.17666,-0.245552,0.19954,-0.26054,-0.185489,0.423785,0.149124,-0.360008,-1.027331,0.512619,0.401362,1.076128,0.050959,0.066167,0.402367,-0.095855,0.446844,-0.901757
4,1026545492354691072,Mon Aug 06 19:08:12 +0000 2018,1017500185356853248,4,0,0.51916,0.554478,0.116808,-0.250892,-0.240644,0.040561,0.597544,-0.934036,-0.545247,-0.765313,-0.10542,0.265675,0.288152,0.926259,0.303288,-0.544012,0.575286,-0.166188,-0.047129,-0.242889,-0.33921,0.046245,-0.292118,0.535238,-0.174444,-0.086731,-0.178658,0.053102,0.246698,0.013568,0.466314,-0.719602,-0.97139,0.047073,-0.125972,...,1.026371,-0.095357,-0.273369,-0.453069,0.289113,0.670054,0.224019,0.220753,-0.867422,0.382044,-0.616033,0.23273,-0.707207,-0.221402,0.573499,-0.441059,-0.287151,0.497306,0.080583,0.036897,-0.25711,-0.0441,0.471426,-0.036785,0.523028,-0.690391,0.555457,-0.245956,-0.338947,-0.236552,-1.147223,-0.790507,-0.208819,1.266072,0.120563,0.354832,-0.198582,0.860355,-0.193678,-0.59183


> Note: Load the saved embeddings.

Start here -> Load 

In [None]:
# Ids must come back as strings, exactly as they were before saving: left to type
# inference they become int64, which no longer matches the string ids used elsewhere
# (e.g. `trump_id`, `senators`) and turns `tweet_id` into a numeric column.
# NOTE: embedding column labels are read back from the CSV header as strings ('0', '1', ...).
_id_dtypes = {'tweet_id': str, 'user_id': str}
df_trump_embed = pd.read_csv(root/'trump_embed.csv', dtype=_id_dtypes)
df_trump_mentions_embed = pd.read_csv(root/'trump_mentions_embed.csv', dtype=_id_dtypes)

### Aggregate

#### Weekly aggregation
- As daily granularity might not be sufficient for all handles
- Trump "events" are mostly (?) a weekly affair

Convert to datetime

In [None]:
df_trump_embed['_created_at'] = pd.to_datetime(df_trump_embed.created_at) # convert to datetime
# https://strftime.org/ -- '%Y-%W' is the year plus the Monday-based week number
# ('00' covers the days before the year's first Monday).
_week = df_trump_embed['_created_at'].dt.strftime('%Y-%W')
# Average only the value columns (engagement counts + embedding dimensions). Selecting them
# explicitly avoids relying on pandas silently dropping non-numeric columns (an error in
# pandas >= 2.0) and keeps `tweet_id` out of the mean even when it is stored as a number.
_value_cols = [c for c in df_trump_embed.columns
               if c not in ('tweet_id', 'created_at', '_created_at', 'user_id')]
_weekly = df_trump_embed.groupby(['user_id', _week])
df_trump_embed_agg = _weekly[_value_cols].mean().reset_index() # Aggregate mean
# Tweets per (user, week); both results come from the same groupby, so rows line up.
df_trump_embed_agg.insert(
    df_trump_embed_agg.columns.get_loc('favorite_count') + 1,
    'tweet_count',
    _weekly.size().to_numpy(),
)
df_trump_embed_agg.head()

Unnamed: 0,user_id,_created_at,retweet_count,favorite_count,tweet_count,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
0,25073877,2020-22,23284.24359,106426.115385,78,0.149069,0.196724,0.009865,-0.074891,-0.136623,-0.050789,-0.018215,-0.121087,-0.002508,-0.253522,0.048538,0.538218,0.031037,0.207657,-0.059558,-0.223908,0.374647,0.010845,-0.232791,-0.212621,-0.070813,-0.162409,0.031521,0.432515,-0.153892,-0.118426,-0.053464,-0.01732,0.233058,0.44613,0.297773,-0.162308,-0.426458,0.046976,-0.199064,...,0.06025,0.065337,-0.47238,-0.135391,0.348175,0.40807,0.176138,-0.000476,-0.161571,0.128069,0.01833,-0.058814,-0.381077,0.114411,-0.188778,-0.669264,0.168758,0.179931,-0.238355,0.297934,-0.32413,0.287228,0.014956,-0.128813,-0.010256,-0.500943,0.131608,-0.049081,0.069148,0.018128,-0.644699,-0.137394,0.069613,0.474739,-0.150652,0.181468,0.109798,0.098584,0.071442,-0.349355
1,25073877,2020-23,25813.372881,119981.940678,118,0.130131,0.168596,0.059496,-0.147593,-0.120444,0.0314,0.085227,-0.136017,0.091151,-0.169573,0.077318,0.502065,-0.096762,0.263359,0.03633,-0.180619,0.172534,-0.008196,-0.179198,-0.16339,-0.113877,-0.068395,0.052847,0.445153,-0.140546,-0.128827,0.024212,-0.015598,0.211106,0.454623,0.253302,-0.103111,-0.35653,0.071913,-0.108193,...,0.084192,-0.046942,-0.516986,-0.129547,0.239534,0.425506,0.159567,-0.004157,-0.127883,0.161891,-0.033009,-0.184439,-0.38396,0.013948,-0.132854,-0.498902,0.091804,0.15821,-0.19965,0.233135,-0.317617,0.353699,-0.023758,-0.112575,-0.043616,-0.442584,0.070816,0.075494,0.025127,0.103591,-0.617178,0.029935,0.201681,0.50305,-0.090961,0.166065,0.122945,0.089543,-0.0033,-0.342832
2,25073877,2020-24,23270.346154,108389.875,104,0.174402,0.209939,0.212131,-0.211437,-0.028061,0.037372,0.064046,-0.168289,0.165889,-0.250349,-0.00084,0.500226,-0.100721,0.200725,0.047075,-0.059068,0.163065,-0.081924,-0.180536,-0.176664,-0.136622,-0.024289,0.011005,0.497681,-0.084454,-0.084653,0.059277,0.084912,0.213907,0.372498,0.419329,-0.153556,-0.357528,-0.058441,-0.079701,...,0.170166,-0.003388,-0.526447,-0.149801,0.225415,0.338546,0.120587,-0.011296,-0.084735,0.127489,-0.013782,-0.127446,-0.326326,0.135246,-0.113703,-0.465171,0.129767,0.156625,-0.119274,0.263168,-0.322511,0.408654,-0.064433,-0.099455,-0.374443,-0.484099,0.054322,-0.009954,0.187017,0.061123,-0.504695,-0.018486,0.272336,0.41356,-0.022964,-0.034976,0.057769,0.13094,-0.109448,-0.211156
3,25073877,2020-25,30583.536364,119448.854545,110,0.126918,0.14093,0.102434,-0.180341,0.065021,0.020687,0.116005,-0.084946,-0.000715,-0.17509,-0.010302,0.386464,-0.113783,0.205333,-0.019877,-0.04104,0.090974,-0.064094,-0.265847,-0.21289,-0.101543,-0.07639,-0.062079,0.463331,-0.194141,-0.117052,0.065792,0.009253,0.233397,0.252097,0.355374,-0.015534,-0.20887,0.035357,-0.088392,...,0.146909,-0.013857,-0.467288,-0.174678,0.193679,0.278724,0.125561,-0.032361,0.153473,0.040698,-0.002883,-0.181792,-0.336598,0.068713,-0.144788,-0.431569,0.13655,-0.056801,-0.155427,0.023514,-0.339566,0.335186,0.0037,-0.191827,-0.707371,-0.404956,-0.102287,0.057777,0.095256,0.114299,-0.499153,0.040431,0.30582,0.387635,-0.082689,-0.044691,0.12292,0.019053,-0.10262,-0.185542
4,25073877,2020-26,23813.743119,101047.220183,109,0.04309,0.186572,0.15035,-0.136757,-0.098916,0.05972,0.101506,-0.129037,-0.059347,-0.169613,-0.071304,0.469229,-0.031224,0.166891,-0.005362,-0.083954,0.149434,-0.034979,-0.348998,-0.200863,-0.088236,-0.048189,-0.024036,0.50979,-0.178232,-0.075938,0.083486,-0.008226,0.183436,0.287209,0.365199,-0.038842,-0.16045,0.037199,-0.020076,...,0.125978,-0.035469,-0.448616,-0.191778,0.206719,0.257885,0.046402,0.04867,0.135372,0.014421,0.035622,-0.209451,-0.359327,0.025559,-0.146101,-0.551064,0.120041,-0.056089,-0.18259,0.032812,-0.379215,0.326321,-0.048775,-0.131914,-0.593004,-0.399126,-0.098912,0.055167,0.160132,0.21357,-0.550226,-0.006206,0.301971,0.332442,-0.105304,0.028203,0.114484,0.003019,-0.026187,-0.149166


In [None]:
df_trump_mentions_embed['_created_at'] = pd.to_datetime(df_trump_mentions_embed.created_at) # convert to datetime
# https://strftime.org/ -- '%Y-%W' is the year plus the Monday-based week number
# ('00' covers the days before the year's first Monday).
_week = df_trump_mentions_embed['_created_at'].dt.strftime('%Y-%W')
# Average only the value columns (engagement counts + embedding dimensions). Selecting them
# explicitly avoids relying on pandas silently dropping non-numeric columns (an error in
# pandas >= 2.0) and keeps `tweet_id` out of the mean even when it is stored as a number.
_value_cols = [c for c in df_trump_mentions_embed.columns
               if c not in ('tweet_id', 'created_at', '_created_at', 'user_id')]
_weekly = df_trump_mentions_embed.groupby(['user_id', _week])
df_trump_mentions_embed_agg = _weekly[_value_cols].mean().reset_index() # Aggregate mean
# Tweets per (user, week); both results come from the same groupby, so rows line up.
df_trump_mentions_embed_agg.insert(
    df_trump_mentions_embed_agg.columns.get_loc('favorite_count') + 1,
    'tweet_count',
    _weekly.size().to_numpy(),
)
df_trump_mentions_embed_agg.head()

Unnamed: 0,user_id,_created_at,retweet_count,favorite_count,tweet_count,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
0,1017500185356853248,2018-31,1.0,5.0,1,-0.428294,0.578304,-0.162056,-0.228074,-0.462658,-0.029283,0.585583,-0.699514,0.418962,-0.697391,0.008845,0.467787,-0.850617,0.578669,0.503724,-0.292381,-0.335499,-0.5139,0.467011,-0.201876,-0.32594,-0.264376,-0.404916,0.226402,-0.26881,0.272352,0.341223,0.022195,0.300996,0.597184,0.506188,0.317362,-1.087153,-0.091599,0.560678,...,-0.024032,-0.5086,-0.617541,-0.141626,0.212026,-0.215322,-0.076981,0.786223,-0.373745,1.697614,-0.188368,-1.000211,-0.077082,0.331316,0.197368,-0.804826,0.122856,0.936029,-0.013965,-0.648371,0.435433,1.29155,0.43242,0.438572,0.477754,-0.859429,0.141657,0.090128,0.415385,0.226492,-0.741634,0.742004,0.266928,0.736419,0.843931,0.181311,-0.242699,-0.222048,-0.937796,0.367424
1,1017500185356853248,2018-32,6.5,9.5,2,0.407828,0.548546,-0.16819,-0.571445,-0.165762,0.023377,0.146795,-0.342336,-0.252767,-1.0259,-0.075023,0.458364,0.121785,0.677755,0.019974,-0.298511,0.255373,-0.328439,0.427463,-0.365964,-0.541109,-0.295026,0.000223,0.297208,-0.343458,-0.10618,0.009108,0.186853,0.097039,0.076469,0.318078,-0.817877,-0.854249,-0.227452,-0.897224,...,0.581784,-0.00409,-0.407004,-0.095961,0.122593,-0.018278,0.236783,0.290047,-0.485608,0.067105,-0.081073,0.539006,-0.159289,-0.277039,-0.279573,-0.318546,0.200748,0.403011,0.27623,0.1002,-0.562292,0.153174,0.045413,-0.252773,0.02247,-0.927888,0.654791,-0.513102,-0.124297,-0.220148,-0.897242,-0.129914,-0.295694,0.883508,-0.10783,0.287772,-0.178791,0.565996,-0.27195,-0.343607
2,1017500185356853248,2018-33,4.5,11.0,2,-0.24276,0.395839,-0.477542,-0.713028,0.205286,0.188811,0.466179,-0.301803,0.409106,-0.251343,0.348974,0.53992,-0.866484,0.167234,-0.317577,-0.343521,0.645656,0.047172,-0.312369,-0.113443,-0.550799,-0.015574,0.115371,0.33819,-0.240382,-0.605067,0.000111,0.354407,0.262328,-0.049443,0.277493,-0.248988,-0.847912,0.388378,-0.46177,...,0.009541,0.056349,-0.589419,-0.124256,0.264127,0.302321,-0.332644,0.078343,-0.271641,0.383267,-0.477686,-0.150009,-0.059783,-0.068449,-0.025285,-0.546396,0.431892,0.23691,-0.318586,0.425535,0.351555,0.182487,-0.16216,-0.240141,0.107445,-0.010343,0.109944,0.518258,-0.082819,-0.187585,-1.006294,-0.134078,0.245479,0.465279,-0.121299,-0.083107,0.086489,-0.105323,0.177168,-0.197506
3,1017500185356853248,2018-34,8.0,12.0,2,0.094901,0.615759,-0.689562,-0.422887,-0.052428,0.750496,-0.346024,-0.458219,-0.192849,-0.275984,0.621015,0.478449,-0.607759,0.617976,-0.312946,0.143848,-0.13754,-0.043492,-0.706348,-0.28107,-0.386789,0.381325,0.477924,0.664289,-0.434742,-0.361319,-0.555706,-0.514309,-0.08451,0.227876,0.466844,0.023709,-0.058591,-0.451548,-0.874648,...,0.530859,-0.034789,-0.165674,-0.450385,0.487636,0.197166,0.162187,0.060101,-0.543129,1.340839,-0.511484,0.059931,0.326432,0.029854,0.204637,-0.187105,0.569561,0.136692,0.176373,0.357851,0.275542,-0.104816,-0.528044,-0.137594,0.773938,0.606297,-0.017134,0.730254,0.242864,-0.256488,-0.175479,1.154517,-0.763381,0.273006,0.393731,-0.328782,0.047578,0.292758,0.503481,-0.528074
4,1017500185356853248,2018-40,18.0,60.0,1,0.272741,0.01406,-0.742705,-0.497896,-0.330765,0.615861,0.060828,-0.270494,-0.882317,-0.540798,-0.285746,0.767597,0.279633,1.126107,-0.059896,0.372671,1.232064,-0.182995,-0.030169,-0.048287,-0.809972,-0.772041,0.522172,0.512993,0.496799,-0.905688,-0.38042,0.931951,0.237534,0.450448,-0.114126,-0.602381,-0.845676,0.252365,-0.726876,...,1.447782,0.218642,-0.426976,0.020233,-0.005178,-0.609852,-0.548549,0.36305,-0.915708,0.392533,-0.939132,1.328627,-0.485925,-0.155848,-0.074452,-0.398399,0.291838,0.690752,0.459696,-0.081944,-0.527908,0.652632,-0.585126,-0.909187,0.092761,-0.53999,0.410262,0.297158,-0.082341,-0.03367,-0.967036,-0.317589,0.135748,0.745604,-0.528449,-0.520045,0.285236,0.555256,-0.008609,-0.636543


In [None]:
# Embedding columns only: positions 0-4 hold user_id, _created_at, retweet_count,
# favorite_count and tweet_count, so everything from position 5 on is a vector dimension.
df_trump_mentions_embed_agg.iloc[:, 5:]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
0,-0.428294,0.578304,-0.162056,-0.228074,-0.462658,-0.029283,0.585583,-0.699514,0.418962,-0.697391,0.008845,0.467787,-0.850617,0.578669,0.503724,-0.292381,-0.335499,-0.513900,0.467011,-0.201876,-0.325940,-0.264376,-0.404916,0.226402,-0.268810,0.272352,0.341223,0.022195,0.300996,0.597184,0.506188,0.317362,-1.087153,-0.091599,0.560678,0.561274,0.271579,0.320099,0.199576,-0.091365,...,-0.024032,-0.508600,-0.617541,-0.141626,0.212026,-0.215322,-0.076981,0.786223,-0.373745,1.697614,-0.188368,-1.000211,-0.077082,0.331316,0.197368,-0.804826,0.122856,0.936029,-0.013965,-0.648371,0.435433,1.291550,0.432420,0.438572,0.477754,-0.859429,0.141657,0.090128,0.415385,0.226492,-0.741634,0.742004,0.266928,0.736419,0.843931,0.181311,-0.242699,-0.222048,-0.937796,0.367424
1,0.407828,0.548546,-0.168190,-0.571445,-0.165762,0.023377,0.146795,-0.342336,-0.252767,-1.025900,-0.075023,0.458364,0.121785,0.677755,0.019974,-0.298511,0.255373,-0.328439,0.427463,-0.365964,-0.541109,-0.295026,0.000223,0.297208,-0.343458,-0.106180,0.009108,0.186853,0.097039,0.076469,0.318078,-0.817877,-0.854249,-0.227452,-0.897224,0.553147,-0.154255,0.065378,0.259456,-0.232948,...,0.581784,-0.004090,-0.407004,-0.095961,0.122593,-0.018278,0.236783,0.290047,-0.485608,0.067105,-0.081073,0.539006,-0.159289,-0.277039,-0.279573,-0.318546,0.200748,0.403011,0.276230,0.100200,-0.562292,0.153174,0.045413,-0.252773,0.022470,-0.927888,0.654791,-0.513102,-0.124297,-0.220148,-0.897242,-0.129914,-0.295694,0.883508,-0.107830,0.287772,-0.178791,0.565996,-0.271950,-0.343607
2,-0.242760,0.395839,-0.477542,-0.713028,0.205286,0.188811,0.466179,-0.301803,0.409106,-0.251343,0.348974,0.539920,-0.866484,0.167234,-0.317577,-0.343521,0.645656,0.047172,-0.312369,-0.113443,-0.550799,-0.015574,0.115371,0.338190,-0.240382,-0.605067,0.000111,0.354407,0.262328,-0.049443,0.277493,-0.248988,-0.847912,0.388378,-0.461770,0.768112,-0.286755,-0.249266,-0.117617,-0.316830,...,0.009541,0.056349,-0.589419,-0.124256,0.264127,0.302321,-0.332644,0.078343,-0.271641,0.383267,-0.477686,-0.150009,-0.059783,-0.068449,-0.025285,-0.546396,0.431892,0.236910,-0.318586,0.425535,0.351555,0.182487,-0.162160,-0.240141,0.107445,-0.010343,0.109944,0.518258,-0.082819,-0.187585,-1.006294,-0.134078,0.245479,0.465279,-0.121299,-0.083107,0.086489,-0.105323,0.177168,-0.197506
3,0.094901,0.615759,-0.689562,-0.422887,-0.052428,0.750496,-0.346024,-0.458219,-0.192849,-0.275984,0.621015,0.478449,-0.607759,0.617976,-0.312946,0.143848,-0.137540,-0.043492,-0.706348,-0.281070,-0.386789,0.381325,0.477924,0.664289,-0.434742,-0.361319,-0.555706,-0.514309,-0.084510,0.227876,0.466844,0.023709,-0.058591,-0.451548,-0.874648,-0.173086,-0.053622,-0.369348,0.585817,-0.553789,...,0.530859,-0.034789,-0.165674,-0.450385,0.487636,0.197166,0.162187,0.060101,-0.543129,1.340839,-0.511484,0.059931,0.326432,0.029854,0.204637,-0.187105,0.569561,0.136692,0.176373,0.357851,0.275542,-0.104816,-0.528044,-0.137594,0.773938,0.606297,-0.017134,0.730254,0.242864,-0.256488,-0.175479,1.154517,-0.763381,0.273006,0.393731,-0.328782,0.047578,0.292758,0.503481,-0.528074
4,0.272741,0.014060,-0.742705,-0.497896,-0.330765,0.615861,0.060828,-0.270494,-0.882317,-0.540798,-0.285746,0.767597,0.279633,1.126107,-0.059896,0.372671,1.232064,-0.182995,-0.030169,-0.048287,-0.809972,-0.772041,0.522172,0.512993,0.496799,-0.905688,-0.380420,0.931951,0.237534,0.450448,-0.114126,-0.602381,-0.845676,0.252365,-0.726876,0.823105,-0.029162,0.146674,-0.228206,-0.780928,...,1.447782,0.218642,-0.426976,0.020233,-0.005178,-0.609852,-0.548549,0.363050,-0.915708,0.392533,-0.939132,1.328627,-0.485925,-0.155848,-0.074452,-0.398399,0.291838,0.690752,0.459696,-0.081944,-0.527908,0.652632,-0.585126,-0.909187,0.092761,-0.539990,0.410262,0.297158,-0.082341,-0.033670,-0.967036,-0.317589,0.135748,0.745604,-0.528449,-0.520045,0.285236,0.555256,-0.008609,-0.636543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13084,0.215936,0.366439,0.231274,-0.259537,0.100848,-0.309814,0.102817,-0.265775,0.227498,-0.501895,-0.238847,0.740178,-0.535157,0.417838,-0.056899,-0.633119,0.154940,0.228781,-0.197275,0.078303,-0.230805,0.059776,-0.079930,0.291462,-0.028809,-0.456514,0.138559,0.250878,0.644368,0.456812,0.256868,-0.497173,-1.142857,0.141020,-0.022545,0.443382,0.077660,-0.214748,-0.300696,-0.318975,...,0.807204,0.036270,-0.362439,0.184718,0.537860,0.448733,0.271127,-0.088373,-0.751656,0.278584,-0.004400,0.122947,-0.618982,0.099981,-0.118136,-0.334018,0.039384,0.254787,-0.139029,0.588689,-0.114834,0.350072,0.140787,-0.039029,0.232852,-0.194935,0.303728,0.034409,-0.153247,-0.235681,-0.707784,-0.656749,0.263236,0.846527,0.194489,0.261617,-0.045762,0.415062,0.019635,-0.516164
13085,0.256496,0.203196,-0.124192,-0.198204,-0.124198,-0.205402,0.325370,-0.429885,0.313698,-0.489927,0.029023,0.366346,-0.623321,0.438458,-0.130106,-0.271847,0.361970,-0.014071,-0.003615,0.056601,-0.318568,-0.023200,-0.123064,0.423259,-0.246092,-0.231499,-0.060414,0.138951,0.258191,0.336035,0.266630,-0.535135,-0.768234,-0.052635,-0.121078,0.375613,0.028286,-0.104197,-0.010357,-0.151298,...,0.447372,0.013965,-0.504433,-0.112539,0.348582,0.128589,0.022237,-0.053625,-0.581130,0.735020,0.200632,-0.053361,-0.380903,0.218173,-0.162683,-0.412743,0.021205,0.538315,-0.238811,0.234367,-0.204756,0.446495,-0.104512,0.264053,0.361631,-0.260163,0.089676,0.099575,-0.062122,-0.191560,-0.637717,0.008851,0.028327,0.908031,0.123413,0.231645,0.030346,0.202748,0.021384,-0.374967
13086,0.330945,0.322491,-0.273344,-0.106729,-0.412516,0.245652,-0.182706,-0.475557,0.131754,-0.469704,0.256757,0.570370,-0.470006,0.467516,-0.061178,-0.052571,0.743308,0.085184,-0.039109,0.020118,-0.354075,0.118436,-0.208735,0.491885,-0.080740,-0.277205,-0.168272,0.219671,0.150110,0.085747,0.421018,-0.658743,-0.845508,0.349852,-0.142102,0.167697,0.102943,-0.174621,0.170869,-0.043633,...,0.446606,-0.102802,-0.104742,-0.164597,0.193834,0.083448,-0.024609,0.436058,-0.643746,0.727530,-0.016054,0.000602,-0.410060,0.244780,0.009526,-0.527677,0.099693,0.178526,-0.187576,0.442857,-0.060650,0.520454,0.130646,0.176822,0.396183,-0.133471,0.356883,0.105028,0.037281,0.021538,-0.580487,-0.132103,-0.308153,0.498150,0.368595,0.126295,-0.030942,0.394457,-0.039455,-0.317926
13087,0.275776,0.076512,0.189165,0.045735,0.190736,-0.280917,0.006109,-0.644933,0.765397,-0.562594,0.109648,0.645933,-0.329574,0.796130,0.019375,-0.271539,-0.216531,-0.142217,0.048025,-0.112457,-0.162260,-0.030032,-0.310416,0.482205,0.060933,-0.055254,-0.191260,-0.028432,0.544539,0.682934,-0.034535,-0.072657,-0.803071,0.162252,0.183033,0.577149,0.047068,0.445291,0.484852,-0.195834,...,0.564614,0.031134,-0.779313,-0.022134,0.686816,0.720112,-0.049758,-0.179232,-0.728642,0.798426,0.228575,-0.363424,-0.629539,0.726691,-0.059408,-0.587651,-0.265548,0.539076,0.026374,0.161763,-0.184446,0.537553,0.150719,0.113521,0.192916,-0.411155,0.134595,-0.017145,0.233249,-0.442545,-0.696300,0.277682,0.214173,1.235255,0.511708,0.207571,-0.048087,-0.074788,-0.360032,-0.475339


### Vector similarity

- X-axis: Date
- Y-axis: Cosine similarity



Compute the cosine similarity between Trump's weekly aggregated tweet embedding and every other account's embedding for the same week.

In [None]:
# Latest week label present in Trump's weekly aggregate (string maximum).
# NOTE(review): labels look like 'YYYY-WW'; a string max is only chronological
# if the week part is zero-padded — confirm.
df_trump_embed_agg['_created_at'].max()

'2020-36'

Create a weekly index of cosine similarity between Trump and others

In [None]:
def func(row):
    """Score one user-week row against Trump's embedding for the same week.

    Parameters
    ----------
    row : pandas.Series
        One row of the mentions frame. ``row._created_at`` is the week label
        and the embedding values start at position 5.

    Returns
    -------
    float or None
        Cosine similarity between the row's embedding and Trump's embedding
        for ``row._created_at``. ``None`` when ``df_trump_embed_agg`` has no
        row for that week (this becomes NaN after ``DataFrame.apply``).
    """
    # Trump's embedding(s) for the same week; columns before position 5 are skipped.
    trump_week = df_trump_embed_agg[df_trump_embed_agg._created_at == row._created_at].iloc[:, 5:]
    if trump_week.shape[0] == 0:
        return None

    # Take exactly as many values from the row as Trump's vector has. An
    # open-ended `row[5:]` would also pick up columns appended to the frame
    # later (e.g. trump_similarity_score), which breaks re-running the cell.
    n_dims = trump_week.shape[1]
    # Use a single Trump row: with several rows the norm of the whole block
    # would be used as the denominator and distort the score.
    x = trump_week.iloc[0].to_numpy(dtype=float)
    # The row is object-dtype (it mixes strings and numbers), so cast to float.
    y = row.iloc[5:5 + n_dims].to_numpy(dtype=float)
    return float(cosine_score(x, y))

In [None]:
def cosine_score(x, y):
    """Return the cosine similarity dot(x, y) / (||x|| * ||y||).

    ``x`` and ``y`` are array-likes accepted by ``np.dot``. When ``x`` is
    two-dimensional the result is an array with one entry per row of ``x``,
    and ``np.linalg.norm(x)`` is the norm of the whole matrix.
    """
    numerator = np.dot(x, y)
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    return numerator / denominator

In [None]:
# Row-wise scoring: `func` looks up Trump's embedding for the row's week and
# returns the cosine similarity, or None (stored as NaN) when Trump has no row for that week.
df_trump_mentions_embed_agg["trump_similarity_score"] = df_trump_mentions_embed_agg.apply(func, axis=1)

Previous results with GloVe
```
0        0.989125
1        0.984315
2        0.986207
3        0.990281
4        0.986024
           ...   
10283    0.991514
10284    0.982669
10285    0.970768
10286    0.992367
10287    0.987436
Name: trump_similarity_score, Length: 10273, dtype: float64
```

In [None]:
# Inspect the new column; NaN marks user-weeks for which `func` found no Trump row.
df_trump_mentions_embed_agg["trump_similarity_score"]

0             NaN
1             NaN
2             NaN
3             NaN
4             NaN
           ...   
13084    0.669645
13085    0.676860
13086   -0.117782
13087    0.409059
13088         NaN
Name: trump_similarity_score, Length: 13089, dtype: float64

Where `trump_similarity_score` is NOT NA

In [None]:
# Keep only the user-weeks that received a similarity score.
df_trump_mentions_embed_agg[df_trump_mentions_embed_agg["trump_similarity_score"].notna()]

Unnamed: 0,user_id,_created_at,retweet_count,favorite_count,tweet_count,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,trump_similarity_score
50,1017500185356853248,2020-22,5.500000,23.500000,2,0.578469,0.382775,-0.211122,-0.529742,0.220981,0.174923,0.386238,-0.368511,-0.521326,-0.324194,0.265130,0.624595,-0.125465,0.435626,-0.391925,-0.126850,1.398330,0.371573,-0.208245,-0.354946,-0.318033,-0.156227,0.166782,0.557080,-0.246753,-0.291566,-0.220615,0.435122,0.443528,0.974567,-0.014414,-0.371126,-0.944439,0.015631,0.181982,...,-0.173446,-0.527119,-0.550636,0.723483,0.688339,-0.101384,-0.415139,-0.919286,0.044089,-0.719294,0.107086,-0.775596,0.384175,-0.169958,-0.248016,0.084495,0.339532,-0.040263,0.258807,-0.260607,0.236294,-0.001676,-0.584467,0.566636,-0.111435,0.349002,0.438965,-0.226874,-0.080537,-1.265147,-0.504492,0.495186,0.971563,-0.236220,0.333156,0.242572,0.610855,-0.279292,-0.777029,0.620914
51,1017500185356853248,2020-23,12.000000,65.000000,1,0.415133,0.379527,-0.099740,-0.897400,-0.799075,0.125143,0.200443,-0.029199,0.819083,0.220121,0.261527,0.498688,-1.317475,1.063578,-0.117230,0.366636,0.345720,-0.178392,0.297701,0.284173,-0.520559,-0.153453,0.722448,0.537420,0.089217,-0.429460,0.432719,0.719954,0.202621,0.740574,0.827624,-0.097467,-0.446857,-0.687982,0.502333,...,0.205044,-0.354662,-0.364937,-0.228000,0.108822,-0.250345,0.040203,-0.579387,0.089110,-0.089038,0.247360,-0.764936,-0.104885,-0.529356,-0.548878,-0.236123,0.586617,0.061304,0.876636,0.047540,0.121374,-0.509653,-0.291509,0.232516,-0.211053,0.302600,0.773064,0.404088,0.132660,-0.046414,0.212946,0.508343,0.917213,-0.329371,0.408069,0.153776,-0.048324,-0.155285,-0.446324,0.539433
52,1017500185356853248,2020-24,16.000000,63.000000,2,-0.250559,0.091244,-0.301057,-0.681981,-0.015299,-0.143077,0.205556,-0.425046,-0.369346,-0.362294,-0.506262,0.770842,-0.204277,0.233750,-0.200922,-0.119250,0.528334,-0.324203,0.231272,-0.093023,-0.555862,0.250268,0.126660,0.591646,0.035086,-0.052111,0.259456,0.452343,0.531371,0.596029,0.151922,0.123769,-1.142920,0.121410,-0.550222,...,-0.022805,-0.429753,0.140078,0.209008,0.165410,-0.451978,-0.099314,-0.375515,0.489790,0.329868,-0.072078,-0.306108,0.413814,0.151408,-0.679806,0.358573,0.346114,-0.313919,0.391032,-0.274049,1.261627,-0.303266,-0.013670,-0.088431,-0.637620,0.374948,-0.446546,0.053114,-0.272932,-0.599970,-0.498046,0.235857,0.701505,0.338427,-0.194006,-0.012183,0.708668,-0.603780,-0.399090,0.522314
53,1017500185356853248,2020-25,66.333333,170.666667,3,0.688894,0.084292,0.209936,0.256559,-0.187573,0.225341,0.035416,-0.594263,0.387330,0.207881,0.141298,0.807224,-0.280189,0.666393,-0.236650,-0.050008,0.374042,-0.194323,0.057787,-0.101273,-0.496598,0.519911,-0.002505,0.724212,-0.563288,0.133108,0.086551,0.136700,0.424881,0.682373,0.490565,-0.151378,-0.491300,0.025838,0.232182,...,-0.192378,-0.733744,0.045929,-0.234282,0.284167,0.239868,0.383138,-0.647426,0.579491,-0.160794,-0.405885,-0.466424,0.593144,-0.547006,-0.785030,-0.059935,0.359741,-0.475777,0.175000,-0.070978,0.667746,-0.571209,0.488283,0.428253,-0.707128,-0.033267,0.021069,0.119960,-0.134574,-0.519354,0.337993,0.712474,0.381080,0.354880,0.161034,-0.380630,-0.007110,-0.361711,-0.074117,0.529402
54,1017500185356853248,2020-26,39.000000,109.666667,3,0.238489,-0.049215,0.012358,-0.193455,-0.161872,0.193864,0.418272,-0.312250,-0.049817,-0.274501,0.020438,0.839938,-0.553129,0.521240,-0.065869,-0.210198,1.098208,0.069449,-0.279507,-0.272535,-0.161232,-0.074851,0.502252,0.311159,0.102755,-0.527196,-0.308286,0.480614,0.546294,0.441716,-0.002488,0.001302,-0.444891,-0.213730,-0.210198,...,0.107406,-0.454884,-0.264762,0.327209,0.258401,-0.267670,0.251800,-0.494076,0.427700,-0.151545,-0.066145,-0.662348,0.004795,-0.284783,-0.744615,0.332404,0.110881,0.116543,0.357582,-0.136611,0.554178,0.167693,-0.275550,0.440527,-0.446463,0.126090,0.537624,-0.120916,-0.123079,-0.732515,-0.237145,0.550038,0.333195,-0.226512,0.292278,0.189782,0.470662,0.236972,-0.966554,0.531211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13083,996094929733652481,2020-31,102.210526,302.578947,19,0.305969,0.156688,-0.039649,-0.244932,-0.014690,-0.373230,0.113910,-0.224740,0.306974,-0.592372,-0.068721,0.489579,-0.470647,0.582457,-0.075625,-0.378334,0.266071,0.078553,-0.113178,-0.010484,-0.224231,-0.133563,-0.218370,0.456284,-0.204397,-0.386758,-0.182824,0.058118,0.565652,0.519320,0.033105,-0.540936,-0.880307,0.166982,-0.335358,...,-0.024058,-0.518753,0.185236,0.556821,0.439014,0.058764,-0.173479,-0.718560,0.521174,0.073409,-0.027607,-0.657197,0.313892,-0.164808,-0.500603,0.099388,0.389446,-0.298585,0.212902,-0.258560,0.390793,0.098888,0.082276,0.227855,-0.557487,0.209767,0.016993,0.078010,-0.284275,-0.724102,-0.056982,0.145844,1.104532,0.307578,0.455140,-0.095489,0.203224,0.207388,-0.558739,0.617216
13084,996094929733652481,2020-32,79.600000,226.000000,5,0.215936,0.366439,0.231274,-0.259537,0.100848,-0.309814,0.102817,-0.265775,0.227498,-0.501895,-0.238847,0.740178,-0.535157,0.417838,-0.056899,-0.633119,0.154940,0.228781,-0.197275,0.078303,-0.230805,0.059776,-0.079930,0.291462,-0.028809,-0.456514,0.138559,0.250878,0.644368,0.456812,0.256868,-0.497173,-1.142857,0.141020,-0.022545,...,0.036270,-0.362439,0.184718,0.537860,0.448733,0.271127,-0.088373,-0.751656,0.278584,-0.004400,0.122947,-0.618982,0.099981,-0.118136,-0.334018,0.039384,0.254787,-0.139029,0.588689,-0.114834,0.350072,0.140787,-0.039029,0.232852,-0.194935,0.303728,0.034409,-0.153247,-0.235681,-0.707784,-0.656749,0.263236,0.846527,0.194489,0.261617,-0.045762,0.415062,0.019635,-0.516164,0.669645
13085,996094929733652481,2020-33,205.727273,456.681818,22,0.256496,0.203196,-0.124192,-0.198204,-0.124198,-0.205402,0.325370,-0.429885,0.313698,-0.489927,0.029023,0.366346,-0.623321,0.438458,-0.130106,-0.271847,0.361970,-0.014071,-0.003615,0.056601,-0.318568,-0.023200,-0.123064,0.423259,-0.246092,-0.231499,-0.060414,0.138951,0.258191,0.336035,0.266630,-0.535135,-0.768234,-0.052635,-0.121078,...,0.013965,-0.504433,-0.112539,0.348582,0.128589,0.022237,-0.053625,-0.581130,0.735020,0.200632,-0.053361,-0.380903,0.218173,-0.162683,-0.412743,0.021205,0.538315,-0.238811,0.234367,-0.204756,0.446495,-0.104512,0.264053,0.361631,-0.260163,0.089676,0.099575,-0.062122,-0.191560,-0.637717,0.008851,0.028327,0.908031,0.123413,0.231645,0.030346,0.202748,0.021384,-0.374967,0.676860
13086,996094929733652481,2020-34,250.800000,407.800000,5,0.330945,0.322491,-0.273344,-0.106729,-0.412516,0.245652,-0.182706,-0.475557,0.131754,-0.469704,0.256757,0.570370,-0.470006,0.467516,-0.061178,-0.052571,0.743308,0.085184,-0.039109,0.020118,-0.354075,0.118436,-0.208735,0.491885,-0.080740,-0.277205,-0.168272,0.219671,0.150110,0.085747,0.421018,-0.658743,-0.845508,0.349852,-0.142102,...,-0.102802,-0.104742,-0.164597,0.193834,0.083448,-0.024609,0.436058,-0.643746,0.727530,-0.016054,0.000602,-0.410060,0.244780,0.009526,-0.527677,0.099693,0.178526,-0.187576,0.442857,-0.060650,0.520454,0.130646,0.176822,0.396183,-0.133471,0.356883,0.105028,0.037281,0.021538,-0.580487,-0.132103,-0.308153,0.498150,0.368595,0.126295,-0.030942,0.394457,-0.039455,-0.317926,-0.117782


Get `screen_name` for each `user_id`

In [None]:
# Attach a readable handle to every row by looking each user_id up in `users`.
# NOTE(review): `users` is defined outside this section — presumably a
# user_id -> screen_name mapping; confirm that every user_id is present in it.
df_trump_mentions_embed_agg["screen_name"] = df_trump_mentions_embed_agg.user_id.apply(lambda x: users[x])

Sort weeks

In [None]:
# `_created_at` holds year-week labels (e.g. '2020-36'), so report the range as weeks.
print(f"Min year-week: {df_trump_mentions_embed_agg._created_at.min()} / Max year-week: {df_trump_mentions_embed_agg._created_at.max()}")
# Newest week first. The labels are strings, so this is a lexicographic sort.
# NOTE(review): chronological only if the week part is zero-padded — confirm.
df_trump_mentions_embed_agg = df_trump_mentions_embed_agg.sort_values(by=['_created_at'], ascending=False)

Min year-month: 2013-19 / Max year-month: 2020-36


Number the weeks starting from the most recent one (0) and counting backwards in time

In [None]:
# Distinct week labels, newest first. Sorting here, rather than trusting the
# frame's current row order, keeps the numbering right even if the frame was
# not sorted beforehand.
weeks = sorted(df_trump_mentions_embed_agg._created_at.dropna().unique(), reverse=True)
# 0 = most recent week in the data, 1 = the one before it, and so on. This counts
# weeks that occur in the data, so calendar weeks without tweets are not counted.
week_idx = {week: i for i, week in enumerate(weeks)}
df_trump_mentions_embed_agg["weeks_elapsed"] = df_trump_mentions_embed_agg._created_at.map(week_idx)

In [None]:
# Preview the frame with the added trump_similarity_score, screen_name and weeks_elapsed columns.
df_trump_mentions_embed_agg.head()

Unnamed: 0,user_id,_created_at,retweet_count,favorite_count,tweet_count,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,trump_similarity_score,screen_name,weeks_elapsed
5097,1917731,2020-36,76.285714,199.428571,7,-0.20527,0.01638,-0.148027,-0.373647,0.273769,-0.065402,-0.276937,0.072599,0.134119,-0.047255,0.214167,0.760659,-0.208453,0.412981,0.214158,-0.039573,0.185129,-0.049116,0.015651,-0.150683,-0.409431,0.494808,0.245545,0.259508,-0.216896,-0.132889,-0.517054,-0.387,0.362352,0.357618,0.561307,0.189935,-0.474394,-0.314114,-0.297588,...,0.48929,0.294624,0.022372,0.503211,0.007623,-0.182789,0.400301,0.000656,-0.124834,-0.176468,0.130508,-0.206619,-0.591681,0.183079,0.112897,-0.337118,0.184059,-0.178293,0.591839,0.11431,-0.183829,0.273752,-0.880183,0.123339,0.095722,0.148057,-0.364999,0.071211,0.551505,0.237369,0.2832,0.534408,-0.125079,-0.260351,-0.117501,-0.030025,-0.522277,0.421886,thehill,0
4675,18029328,2020-36,69.5,234.0,2,0.118476,0.207613,-0.088429,-0.624597,0.319908,0.071351,-0.34497,-0.407576,0.303681,-0.514474,0.392652,0.261499,-0.02955,0.430041,0.007616,-0.621135,-0.021993,0.168961,0.266399,-0.389957,-0.483733,0.293492,-0.106175,-0.193985,0.065344,-0.334661,-0.092723,0.039644,0.228293,1.015544,0.583909,-0.401638,-0.344669,-0.236624,-0.42915,...,0.152601,0.157286,0.469065,-0.039645,0.473586,0.372301,-0.26637,0.127052,0.055948,0.160544,-0.096229,-0.068353,-0.517633,0.390785,0.421639,-0.061583,0.595827,-0.22065,0.908683,0.207161,0.127111,0.044898,-0.231695,0.646802,-0.01829,0.451102,0.09091,-0.672925,-0.021985,0.348408,0.900875,0.636198,0.189733,-0.03486,-0.512973,-0.186765,-0.611793,0.428459,SecretsBedard,0
9598,39249305,2020-36,4.0,9.0,1,0.326287,-0.276758,-0.391384,0.300816,-0.453985,0.412052,0.142072,-0.511366,0.101578,0.185175,-0.374523,0.451404,0.360811,0.689495,-0.21644,-0.218805,0.686634,0.005188,-0.200913,-0.197942,-1.090176,0.5644,-0.069836,1.02986,0.060898,0.280931,0.439461,-0.081554,0.208392,0.483002,0.844378,0.104777,-0.532354,-0.27267,-0.71546,...,0.017462,0.485488,-0.085207,0.175009,0.9057,-0.99584,0.250485,-0.500054,-0.358088,-0.716817,-0.268404,-0.037481,-1.116962,0.295353,0.020719,-0.199071,0.226603,-0.397508,-0.165689,0.248406,0.027174,0.563224,-0.580477,0.696069,0.003664,-0.471037,0.393062,-0.78315,0.565805,0.87438,0.65199,0.338414,0.879705,0.022909,0.897779,0.079184,-0.423548,0.455533,USRepMikeDoyle,0
4710,18172905,2020-36,4.0,0.0,1,-0.440572,-0.510889,0.333811,0.42034,-0.16892,-0.315663,-0.620747,-0.764247,0.663397,-0.234013,0.390651,0.604288,0.63593,-0.237807,-0.180062,0.27728,0.147519,0.015277,-0.027139,-0.372259,0.098902,-0.094035,-0.017918,0.511894,-0.074842,-0.200307,0.087245,-0.260887,0.271158,1.139333,0.774194,-0.055085,0.154457,-0.405563,-0.04551,...,-0.18143,-0.17675,0.290982,0.0209,0.297553,-0.182182,0.267102,0.508749,0.179591,-0.344424,-0.262217,-0.288288,-0.55337,0.453551,0.236683,0.071006,0.641599,-0.449468,1.248119,-0.410105,0.913731,-0.275578,-0.058719,-0.640368,0.339884,0.108033,0.114227,-0.243408,0.156417,0.154646,0.919207,0.311572,0.374534,-0.079234,-0.356988,0.308405,-0.142568,0.437298,rickklein,0
681,10615232,2020-36,1366.0,3966.0,1,-0.107576,0.35021,0.036407,0.079485,-0.308997,-0.320512,-0.298839,-0.628479,0.355197,-0.791496,-0.446755,0.50663,0.232336,0.185109,0.196265,0.207596,-0.208938,-0.8221,0.09582,-0.10286,0.078436,0.272592,0.033451,0.795278,0.035792,0.191285,-0.622537,0.687709,0.336798,1.143336,0.494549,-0.804263,-0.493593,0.038548,-0.495253,...,0.004894,0.364928,0.528113,0.637313,0.499301,-0.242641,0.393663,0.743477,0.07186,-0.760769,-0.148221,-0.118557,-1.134515,-0.084811,0.32513,-0.840417,0.574795,-0.364717,0.982124,0.18748,0.457385,0.168959,-1.04114,0.007434,-0.674878,0.331772,-0.263563,-0.167091,0.501658,0.079491,1.227904,0.158272,0.193435,-0.174915,0.073403,0.148029,-0.890452,0.493006,ChuckGrassley,0


#### Plots

In [None]:
# Cosine similarity lies in [-1, 1]. A log y-axis cannot draw zero or negative
# scores, so those points would vanish; keep the axis linear.
fig = px.scatter(
    df_trump_mentions_embed_agg,
    x="weeks_elapsed",
    y="trump_similarity_score",
    color="screen_name",
    hover_name="screen_name",
    title="Weekly tweet similarity to Trump, by account",
    # Zoom in on the most recent weeks: from the lowest weeks_elapsed up to 15.
    range_x=[df_trump_mentions_embed_agg.weeks_elapsed.min(), 15],
)
fig.show()

Showing by Senators

In [None]:
import plotly.express as px

# NOTE(review): `senators` must already hold user ids at this point. Further
# down the notebook the same name is rebound to a screen_name -> grid-position
# dict, which would likely make this filter match nothing if this cell is run
# after that one — confirm where `senators` is first defined.
# Cosine similarity lies in [-1, 1]. A log y-axis cannot draw zero or negative
# scores, so those points would vanish; keep the axis linear.
fig = px.scatter(
    df_trump_mentions_embed_agg[df_trump_mentions_embed_agg.user_id.isin(senators)],
    x="weeks_elapsed",
    y="trump_similarity_score",
    color="screen_name",
    hover_name="screen_name",
    title="Weekly tweet similarity to Trump, senators only",
    # Zoom in on the most recent weeks: from the lowest weeks_elapsed up to 15.
    range_x=[df_trump_mentions_embed_agg.weeks_elapsed.min(), 15],
)
fig.show()

### Republican Senators

- Source: https://triagecancer.org/congressional-social-media

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

web_url = 'https://triagecancer.org/congressional-social-media'
# A timeout (seconds) stops the cell from hanging forever on an unresponsive server.
response = requests.get(web_url, timeout=30)
print(f"response status: {response.status_code}")
# Stop here on a 4xx/5xx answer instead of parsing an error page further down.
response.raise_for_status()

response status: 200


In [None]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
# "html.parser" ships with Python and needs no extra dependency.
soup_object = BeautifulSoup(response.content, "html.parser")
# The listing is the first <table> on the page.
data_table = soup_object.find_all('table')[0]

In [None]:
# The text of each header cell (<th>) becomes a column name.
columns = [header_cell.get_text() for header_cell in data_table.find_all('th')]
print(columns)

['State', 'Chamber of Congress', 'Name', 'Name Links', 'Party', 'Twitter', 'Twitter Links', 'Instagram', 'Facebook Page', 'Facebook']


In [None]:
# One list of cell texts per table row; the first <tr> is skipped.
all_values = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in data_table.find_all('tr')[1:]
]

In [None]:
# Build the directory frame from the scraped rows. Despite its name it holds
# every row of the table, Representatives as well as Senators.
df_senators = pd.DataFrame(all_values, columns=columns)
df_senators.head()

Unnamed: 0,State,Chamber of Congress,Name,Name Links,Party,Twitter,Twitter Links,Instagram,Facebook Page,Facebook
0,Alabama,Senator,Richard Shelby,https://www.shelby.senate.gov/public/,R,@SenShelby,https://twitter.com/SenShelby?ref_src=twsrc%5E...,@senatorshelby,x,https://www.facebook.com/RichardShelby
1,Alabama,Senator,Doug Jones,https://www.jones.senate.gov/,D,@DougJones,https://twitter.com/DougJones?ref_src=twsrc%5E...,@dougjonesbama,x,https://www.facebook.com/senatordougjones/
2,Alabama 1st District,Representative,"Byrne, Bradley",https://byrne.house.gov/,R,@RepByrne,https://twitter.com/RepByrne,@repbyrne,x,https://www.facebook.com/RepByrne
3,Alabama 2nd District,U.S. Representative,"Roby, Martha",https://roby.house.gov/,R,@RepMarthaRoby,https://twitter.com/RepMarthaRoby,@martharoby,x,https://www.facebook.com/Representative.Martha...
4,Alabama 3rd District,U.S. Representative,"Rogers, Mike",http://mikerogers.house.gov/,R,@RepMikeRogersAL,https://twitter.com/RepMikeRogersAL,@repmikerogersal,x,https://www.facebook.com/pages/Mike-Rogers/640...


In [None]:
# List the distinct chamber labels before filtering on them.
df_senators['Chamber of Congress'].unique()

array(['Senator', 'Representative', 'U.S. Representative', 'U.S. Senator'],
      dtype=object)

Republican Senators

In [None]:
# Republican senators as a list of {'State': ..., 'Twitter': ...} records.
# Both spellings of the chamber label are accepted.
rep_senators = (
    df_senators
    .loc[
        (df_senators['Party'] == 'R')
        & df_senators['Chamber of Congress'].isin(['Senator', 'U.S. Senator']),
        ['State', 'Twitter'],
    ]
    .to_dict(orient='records')
)

In [None]:
# Re-key the records as handle -> state, dropping the '@' from each handle.
# NOTE: this rebinds `rep_senators` from a list of records to a dict, so the
# cell cannot be run twice without re-running the one that builds the records.
rep_senators = {
    record['Twitter'].replace('@', ''): record['State']
    for record in rep_senators
}
print(f"Total R senators: {len(rep_senators)}")

Total R senators: 53


In [None]:
# Twitter handle -> state for each Republican senator found in the table.
rep_senators

{'BillCassidy': 'Louisiana',
 'Braun4Indiana': 'Indiana',
 'ChuckGrassley': 'Iowa',
 'HawleyMO': 'Missouri',
 'InhofePress': 'Oklahoma',
 'JerryMoran': 'Kansas',
 'JohnBoozman': 'Arkansas',
 'JohnCornyn': 'Texas',
 'JohnKennedyLA': 'Louisiana',
 'LindsayGrahamSC': 'South Carolina',
 'MarshaBlackburn': 'Tennessee',
 'McConnellPress': 'Kentucky',
 'MikeCrapo': 'Idaho',
 'RandPaul': 'Kentucky',
 'RoyBlunt': 'Missouri',
 'ScottforFlorida': 'Florida',
 'SenAlexander': 'Tennessee',
 'SenCapito': 'West Virginia',
 'SenCoryGardner': 'Colorado',
 'SenDanSullivan': 'Alaska',
 'SenHydeSmith': 'Mississippi',
 'SenJohnBarrasso': 'Wyoming',
 'SenJohnHoeven': 'North Dakota',
 'SenJohnThune': 'South Dakota',
 'SenJoniErnst': 'Iowa',
 'SenKevinCramer': 'North Dakota',
 'SenMcSallyAZ': 'Arizona',
 'SenMikeLee': 'Utah',
 'SenPatRoberts': 'Kansas',
 'SenRonJohnson': 'Wisconsin',
 'SenSasse': 'Nebraska',
 'SenShelby': 'Alabama',
 'SenTedCruz': 'Texas',
 'SenThomTillis': 'North Carolina',
 'SenToddYoung': '

Republican senators who are present in the Twitter dataset

In [None]:
# Handles from the scraped directory that actually occur in the tweet data.
rep_sen = (
    df_trump_mentions_embed_agg
    .loc[df_trump_mentions_embed_agg['screen_name'].isin(list(rep_senators)), 'screen_name']
    .unique()
    .tolist()
)

Subplots

In [None]:
# Five subplots per row, with as many rows as the senators need. Deriving the
# row count from the data means zip() below never drops a senator, which a
# fixed 9x5 grid would do silently past 45 entries.
n_cols = 5
n_rows = max(1, -(-len(rep_sen) // n_cols))  # ceiling division, at least one row
dim = (n_rows, n_cols)
# 1-based (row, col) positions in row-major order: (1, 1), (1, 2), ..., (2, 1), ...
grid = [(r + 1, c + 1) for r in range(n_rows) for c in range(n_cols)]
# screen_name -> the subplot cell that senator is drawn in
senators = dict(zip(rep_sen, grid))

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# One subplot title per senator, "<handle> - <state>", in grid order.
screen_names = tuple(f"{handle} - {rep_senators[handle]}" for handle in senators)
# Empty subplot grid with shared outer axis titles
fig = make_subplots(
    rows=dim[0],
    cols=dim[1],
    subplot_titles=screen_names,
    x_title='Weeks elapsed',
    y_title='Trump Similarity Score',
)

In [None]:
# One trace per senator, drawn in that senator's grid cell.
for senator, (row, col) in senators.items():
    df_sub = df_trump_mentions_embed_agg[df_trump_mentions_embed_agg.screen_name == senator]
    weeks = df_sub.weeks_elapsed.tolist()
    # Scores can contain NaN (weeks for which no Trump row was found).
    scores = df_sub.trump_similarity_score.tolist()
    fig.add_trace(
        go.Scatter(x=weeks, y=scores, name=senator, mode='markers+lines'),
        # Cast to plain ints in case the grid positions are numpy integers.
        row=int(row),
        col=int(col),
    )

# The y-axes stay linear: cosine similarity lies in [-1, 1] and a log axis
# cannot draw zero or negative scores.
fig.update_layout(title_text="Twitter Semantic Similarity between Rep Senators and Trump (weekly tweet aggregate)", height=1500)

fig.show()

Saving the interactive plot as HTML

In [None]:
# Write next to the rest of the project data, reusing `root` from the paths
# cell ("drive/My Drive/us-2020") instead of repeating the literal path.
fig.write_html(str(root / "tweet_semantic_similarity.html"))