# US 2020 political observatory

Analysing twitter language using BERT

### Pre-requisites

Mount Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Install `transformers`

In [3]:
!pip install transformers
!pip install -U sentence-transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 2.5MB/s 
[?25hCollecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 17.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 28.6MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1

In [4]:
# general
import re
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path

# viz
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# nlp
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

%matplotlib inline

#### Paths

In [5]:
root = Path("drive/My Drive/us-2020")
tweets_path = Path("data/tweets-04112020/tweets.csv")
hashtags_path = Path("data/tweets-04112020/hashtags.csv")
mentions_path = Path("data/tweets-04112020/user-mentions.csv")
user_path = Path("data/user_handles.json")

#### Datasets

In [6]:
df_tweets = pd.read_csv(root/tweets_path)
df_hashtags = pd.read_csv(root/hashtags_path)
df_mentions = pd.read_csv(root/mentions_path)

print(f"tweets: {df_tweets.shape}")
print(f"hashtags: {df_hashtags.shape}")
print(f"mentions: {df_mentions.shape}")

tweets: (1569371, 19)
hashtags: (630657, 4)
mentions: (1312092, 5)


In [7]:
df_tweets.head()

Unnamed: 0,tweet_id,created_at,tweet_content,user_id,screen_name,retweet_count,favorite_count,place_id,place_type,place_name,lon,lat,country_code,original_user_id,original_screen_name,original_favorite_count,in_reply_to_status_id_str,in_reply_to_user_id_str,in_reply_to_screen_name
0,1297974719707308033,Mon Aug 24 19:11:03 +0000 2020,"As a member of @HouseScience, I'm proud to sup...",1009269193,RepLipinski,5,8,,,,,,,,,,,,
1,1241136765886496768,Fri Mar 20 22:57:18 +0000 2020,[2/4] To learn more about what is or is not pe...,1009269193,RepLipinski,0,2,,,,,,,,,,1.241137e+18,1009269000.0,RepLipinski
2,1245860684706926593,Thu Apr 02 23:48:28 +0000 2020,Together we can stop the spread of #COVID19 by...,1009269193,RepLipinski,2,16,,,,,,,,,,,,
3,1244784554361192448,Tue Mar 31 00:32:19 +0000 2020,And thank you @RepLipinski for your investment...,1009269193,RepLipinski,4,0,,,,,,,208566585.0,PaceSuburbanBus,12.0,,,
4,1234148366147346439,Sun Mar 01 16:07:54 +0000 2020,Enjoyed talking with parents and teachers at t...,1009269193,RepLipinski,4,13,014241bf2253c205,city,"Lockport, IL",,,US,,,,,,


> Note: The presence of `original_` prefixed entries suggest the tweet was a RT.

Sanity checks

In [8]:
print(f"Number of unique tweet IDs == Number of total rows: {df_tweets.tweet_id.nunique() == df_tweets.shape[0]}")

Number of unique tweet IDs == Number of total rows: True


#### Data type coercion

In [9]:
df_tweets.dtypes

tweet_id                       int64
created_at                    object
tweet_content                 object
user_id                        int64
screen_name                   object
retweet_count                  int64
favorite_count                 int64
place_id                      object
place_type                    object
place_name                    object
lon                          float64
lat                          float64
country_code                  object
original_user_id             float64
original_screen_name          object
original_favorite_count      float64
in_reply_to_status_id_str    float64
in_reply_to_user_id_str      float64
in_reply_to_screen_name       object
dtype: object

All IDs to `str`

In [10]:
# tweet_id
df_tweets['tweet_id'] = df_tweets.tweet_id.astype(str)
df_hashtags['tweet_id'] = df_hashtags.tweet_id.astype(str)
df_mentions['tweet_id'] = df_mentions.tweet_id.astype(str)

# user_id
df_tweets['user_id'] = df_tweets.user_id.astype(str)
df_hashtags['user_id'] = df_hashtags.user_id.astype(str)
df_mentions['user_id'] = df_mentions.user_id.astype(str)

# others 
# Note: here "0" indicates NULL
df_tweets['original_user_id'] = df_tweets.original_user_id.fillna(0).astype(int).astype(str)
df_tweets['in_reply_to_status_id_str'] = df_tweets.in_reply_to_status_id_str.fillna(0).astype(int).astype(str)
df_tweets['in_reply_to_user_id_str'] = df_tweets.in_reply_to_user_id_str.fillna(0).astype(int).astype(str)
df_mentions['mentioned_user_id'] = df_mentions.mentioned_user_id.fillna(0).astype(int).astype(str)

sanity check

In [11]:
df_tweets.dtypes

tweet_id                      object
created_at                    object
tweet_content                 object
user_id                       object
screen_name                   object
retweet_count                  int64
favorite_count                 int64
place_id                      object
place_type                    object
place_name                    object
lon                          float64
lat                          float64
country_code                  object
original_user_id              object
original_screen_name          object
original_favorite_count      float64
in_reply_to_status_id_str     object
in_reply_to_user_id_str       object
in_reply_to_screen_name       object
dtype: object

#### Primary actors

In [12]:
trump_id = '25073877' # @realDonaldTrump
biden_id = '939091'
harris_id = '803694179079458816'
primary_actors = ["realDonaldTrump", "JoeBiden", "SenKamalaHarris"] # pence?

#### Senators

In [13]:
import os
user_info = []
with open(root/user_path, 'r') as f:
    user_info = json.load(f)

In [171]:
users = {x['id_str'] : x['screen_name'] for x in user_info}

In [15]:
senators = []
for user in user_info:
    slugs = list(map(lambda x: x.strip(), user['slug'].split(',')))
    if 'senators' in slugs:
        senators.append(str(user['id_str']))

In [16]:
print(f"Total no of senators: {len(senators)}")

Total no of senators: 104


### Summary statistics

No. of tweets per user

In [17]:
tweet_counts = df_tweets.groupby(['user_id'])['tweet_id'].count().reset_index(name="tweet_count")

In [18]:
tweet_counts.sort_values('tweet_count').tail(5)

Unnamed: 0,user_id,tweet_count
884,822215679726100480,3251
345,17494010,3254
241,14412533,3256
243,14465607,3260
883,822215673812119553,3263


Only senators

In [19]:
tweet_counts[tweet_counts.user_id.isin(senators)].sort_values('tweet_count').tail(5)

Unnamed: 0,user_id,tweet_count
342,172858784,3245
641,2964174789,3246
157,109287731,3247
820,76456274,3250
345,17494010,3254


### Filter tweets

Tweets from `@realDonaldTrump`

In [105]:
trump_tweet_ids = set(df_tweets[df_tweets.user_id == trump_id].tweet_id.tolist())
print(f"Total Trump tweets: {len(trump_tweet_ids)}")

Total Trump tweets: 3192


Tweets mentioning `@realDonaldTrump` AND not from `@realDonaldTrump`

In [106]:
trump_mentions_tweet_ids = set(df_mentions[(df_mentions['mentioned_user_id'] == trump_id) & (df_mentions['user_id'] != trump_id)].tweet_id.tolist())
print(f"Total Trump mentions: {len(trump_mentions_tweet_ids)}")

Total Trump mentions: 36051


Tweets mentioning `@realDonaldTrump` that are retweets (RTs)

In [107]:
trump_rt_tweet_ids = set(df_tweets[df_tweets.original_user_id == trump_id].tweet_id.tolist())
print(f"Trump RTs: {len(trump_rt_tweet_ids)}")

Trump RTs: 4628


Filtering out RTs from `@realDonadTrump` FROM the *actual* mentions (to cut down on redundant signals)

In [108]:
trump_no_rt_mentions_ids = trump_mentions_tweet_ids - trump_rt_tweet_ids
print(f"Sanity check: {len(trump_mentions_tweet_ids - trump_rt_tweet_ids) == len(trump_no_rt_mentions_ids)}")

Sanity check: True


In [109]:
len(trump_no_rt_mentions_ids)

31592

Get the tweets

In [112]:
df_trump_mentions = df_tweets[df_tweets.tweet_id.isin(trump_no_rt_mentions_ids)].reset_index(drop=True).copy()
print(df_trump_mentions.shape)
df_trump_mentions.head()

(31592, 19)


Unnamed: 0,tweet_id,created_at,tweet_content,user_id,screen_name,retweet_count,favorite_count,place_id,place_type,place_name,lon,lat,country_code,original_user_id,original_screen_name,original_favorite_count,in_reply_to_status_id_str,in_reply_to_user_id_str,in_reply_to_screen_name
0,1026516623719587840,Mon Aug 06 17:13:29 +0000 2018,"Just found the Trump Tower in Mamou, Louisiana...",1017500185356853248,SenBillCassidy,9,19,,,,,,,0,,,0,0,
1,1025485773875937283,Fri Aug 03 20:57:15 +0000 2018,We need better border security. Washington Dem...,1017500185356853248,SenBillCassidy,1,5,,,,,,,0,,,0,0,
2,1032951113815928832,Fri Aug 24 11:21:51 +0000 2018,President @realDonaldTrump is standing up for ...,1017500185356853248,SenBillCassidy,8,0,,,,,,,1017500185356853248,SenBillCassidy,24.0,0,0,
3,1052157223646978048,Tue Oct 16 11:20:04 +0000 2018,American manufacturers are #BetterOffNow.\n\n→...,1017500185356853248,SenBillCassidy,1768,0,,,,,,,1209417007,SteveScalise,6189.0,0,0,
4,1026545492354691072,Mon Aug 06 19:08:12 +0000 2018,Great work by Senator @BillCassidy and @realDo...,1017500185356853248,SenBillCassidy,4,0,,,,,,,2706910842,ChrisNeiweem,9.0,0,0,


Getting Trump's direct tweets

In [113]:
df_trump = df_tweets[(df_tweets.tweet_id.isin(trump_tweet_ids)) & (df_tweets.original_user_id == '0')].reset_index(drop=True).copy()
print(df_trump.shape)
df_trump.head()

(1507, 19)


Unnamed: 0,tweet_id,created_at,tweet_content,user_id,screen_name,retweet_count,favorite_count,place_id,place_type,place_name,lon,lat,country_code,original_user_id,original_screen_name,original_favorite_count,in_reply_to_status_id_str,in_reply_to_user_id_str,in_reply_to_screen_name
0,1268685511755026432,Thu Jun 04 23:26:11 +0000 2020,Great to be with our wonderful Men and Women o...,25073877,realDonaldTrump,23514,96180,,,,,,,0,,,0,0,
1,1268874882827378688,Fri Jun 05 11:58:41 +0000 2020,Great going Mike! https://t.co/fmInHTfj9k,25073877,realDonaldTrump,4769,23127,,,,,,,0,,,0,0,
2,1268723566046044160,Fri Jun 05 01:57:24 +0000 2020,Sleepy Joe Biden’s 1994 Crime Bill was a total...,25073877,realDonaldTrump,38970,140965,,,,,,,0,,,0,0,
3,1268869099431608320,Fri Jun 05 11:35:42 +0000 2020,USA! https://t.co/p6LrDNkSB9,25073877,realDonaldTrump,20376,92842,,,,,,,0,,,0,0,
4,1268998143733051394,Fri Jun 05 20:08:28 +0000 2020,"...We should be standing up straight and tall,...",25073877,realDonaldTrump,41234,205898,,,,,,,0,,,1268998142860627968,25073877,realDonaldTrump


## Embeddings

### Non-contextual embeddings (static vectors)

1. GloVe 
2. word2vec

Convert tweets to their embeddings

In [None]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

Choose embedding size

In [None]:
embedding_size = 100 # 50 100 200

In [None]:
embed_path = root/Path("embeddings")
glove_embed = f"glove.twitter.27B.{embedding_size}d.txt"
glove_file = datapath(embed_path/glove_embed)
tmp_file = get_tmpfile(embed_path/f"glove.twitter.27B.{embedding_size}d.word2vec.txt")
glove_fname = f"glove.twitter.27B.{embedding_size}d.word2vec.txt"

To convert Glove to w2v model (for Gensim)

In [None]:
# _ = glove2word2vec(glove_file, tmp_file) # convert Glove to word2vec format

Load the model (in memory)

In [None]:
model = KeyedVectors.load_word2vec_format(embed_path/glove_fname)

Vocab size

In [None]:
print(f"Vocab size: {len(model.vocab)}")

Vocab size: 1193514


#### Preprocessing for GloVe

In [None]:
import string
puncs = ['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
#  '-',
 '.',
 '/',
 ':',
 ';',
#  '<',
 '=',
#  '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~',
'’',
'‘', 
'“', 
'”'    
]

In [None]:
"""
preprocess-twitter.py
python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"
Script for preprocessing tweets by Romain Paulus
with small modifications by Jeffrey Pennington
with translation to Python by Motoki Wu
Translation of Ruby script to create features for GloVe vectors for Twitter data.
http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
"""

import sys
import regex as re

FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = " {} ".format(hashtag_body.lower())
    else:
        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result

def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"

# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def tokenize(text):
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
    
    ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
    # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
    text = re_sub(r"([A-Z]){2,}", allcaps)
    text = re_sub(r"[\n\r]", r" ") # added SBG: newlines carriage returns
    text = re_sub(r"[!\"\#\$\%\&\'\(\)\*\+\,\.\/\:\;\=\?\@\[\\\]\^\_\`\{\|\}\~\’\‘\“\”]", r"") # filter out puncs
    text = remove_emoji(text)
    return text.lower()

In [None]:
remove_emoji("Omg another Earthquake 😔😔")

'Omg another Earthquake '

In [None]:
text = "I TEST alllll \n\nkinds\n /r :) 😔😔😔😔 “ <hashaha> ‘” \rking's king\"s of #hashtags and #WelcomeRefugees #HASHTAGS, :))) @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"
tokens = tokenize(text)
tokens

'i test <allcaps> al <elong>   kinds    r <smile>   <hashaha>   kings kings of <hashtag> hashtags and <hashtag> welcomerefugees  hashtags  <smile> <user> and <number> <url> w   <heart> <smile> haha <repeat>'

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
stop_words = stopwords.words("english")

In [None]:
def create_corpus(df: pd.DataFrame):
    corpus=[]
    vocab = []
    for tweet in tqdm(df['tweet_content']):
        tokens = tokenize(tweet).split(" ")
        tokens = [token for token in tokens if token.strip() != ""]
        # stopwords filtering
#         tokens = [token for token in tokens if token not in stop_words]
#         print(f"original tweet: {tweet}\nTokenized: {tokens}")
        corpus.append(tokens)
        vocab.extend(tokens)
    
    return corpus, set(vocab)

In [None]:
model["trump"].shape

(100,)

corpus is a list (tweets) of list (tokens) and vocab is the set of all unique tokens.

In [None]:
corpus, vocab = create_corpus(df)

100%|██████████| 31866/31866 [00:05<00:00, 5424.65it/s]


In [None]:
model["trump"].shape

(100,)

In [None]:
df.iloc[10000].tweet_content

'RT @margbrennan: "It is not racist at all. It comes from China. I want to be accurate,” @realDonaldTrump in response to @CeciliaVega questi…'

Tokenize and create vocab

In [None]:
vocab

{'treats',
 'alarmed',
 'unsure',
 'succeeds…',
 'orion',
 'credits',
 'incessant',
 'divestdonald',
 'source',
 'grade',
 'crim…',
 'banned',
 'censuretrump',
 '🤦\u200d🤦\u200d🤦\u200d🤦\u200d🤦\u200d',
 'perjudicar',
 'recep',
 'springfield',
 'explodes',
 'cub',
 'all-weather',
 'coerce',
 'rejoin',
 '<elong>usmca',
 'memphis',
 'schumershutdown',
 'rephrase',
 '<number>vcf',
 'gouging',
 'trashed',
 'prosperous',
 'birtherism',
 'man…',
 'woman-founded',
 '\u2066<user>\u2069',
 'beacon',
 'borrower',
 'cant-make-it-up',
 'misbehavior',
 'irrelevant',
 'w',
 'blast',
 'glenn',
 'wld',
 'buffalo',
 'months-since',
 'crackdown',
 'huma…',
 'in<number>',
 'mcgee',
 'tanker',
 'test-run',
 'one-third',
 'nellie',
 'league',
 'catastrophes',
 're-elect',
 'nys',
 'nationalities',
 'snapshot',
 'protocols',
 'isolating',
 'damages',
 'thebriankilmeadeshow',
 'rabbit',
 'dialed',
 'prospect',
 'provisions—in',
 'romance',
 'art',
 '-pressured',
 'adore',
 'deepwaterdisaster',
 'reg…',
 'fulfil

['entrepreneurship.',
 '"coronavirus',
 'people\n\nsubject',
 'workshop',
 '\n\nthere',
 'screaming.',
 'wxjb',
 'booming.',
 'firing,',
 'navarro']

Do this after getting pca1 and pca2

#### Get a single handle's view

- groupby `user_id`

In [None]:
# df_sub = df_mentions[df_mentions.user_id == 939091]

In [None]:
num_words = len(model.vocab)
# lambda x: True if x % 2 == 0 else False

In [None]:
def tweet_vec(tweet, model):
    """Get the embeddings and return the mean across a tweet
    
    Retrieves the GloVe embedding or returns a vector of zeros in case OOV
    """
    wv_agg = np.mean(list(map(lambda x: model.get_vector(x) if x in model else np.zeros(embedding_size), tweet)), axis=0)
    return wv_agg

Get mean Trump embedding i.e. across all tweets (?)

Get cosine distance

Using mean GloVe embeddings for each tweet

In [None]:
def get_wv_agg(df, model):
    """Get the aggregate distance from a user
    realDonaldTrump: 25073877
    """
    wv_agg = []
    corpus, vocab = create_corpus(df) # tokenizes and create a corpus
    for tweet in tqdm(corpus):
        wv_agg.append(tweet_vec(tweet, model)) # get the mean embedding for each tweet
    return corpus, vocab, np.array(wv_agg)

In [None]:
corpus, vocab, wv_agg = get_wv_agg(df_mentions, model)

100%|██████████| 24846/24846 [00:03<00:00, 7744.62it/s]
100%|██████████| 24846/24846 [00:01<00:00, 18747.86it/s]


### Contextual embeddings

1. BERT (base)
2. DistillBERT
3. RoBERTa

#### Text preprocessing

As per https://web.stanford.edu/class/cs224n/reports/custom/15785631.pdf:
> 2.1 Text preprocessing
Texts are lowercased. Non-ascii letters, urls, @RT:[NAME], @[NAME] are removed. For BERT, an
additional [CLS] token is inserted to the beginning of each text. Texts with length less than 4 are
thrown away. No lemmatization is performed and no punctuation mark is removed since pre-trained
embeddings are always used. No stop-word is removed for fluency purpose.


In [124]:
def bert_preprocessor(tweet: str):
    """Minimal preprocessing for BERT 
    
    1. URL removal
    2. @[screen_name] removal
    3. #hashtag removal
    4. Removal of leading and trailing spaces

    """
    FLAGS = re.MULTILINE | re.DOTALL
    # tweet = tweet.lower() # lowercase
    tweet = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "", tweet, flags=FLAGS) # remove URLs
    tweet = re.sub(r"@\w+", "", tweet, flags=FLAGS) # @user
    tweet = re.sub(r"#\S+", "", tweet, flags=FLAGS) # #hashtags
    tweet = tweet.strip()
    return tweet

Apply

In [125]:
df_trump['_tweet_content'] = df_trump.tweet_content.apply(bert_preprocessor)
df_trump_mentions['_tweet_content'] = df_trump_mentions.tweet_content.apply(bert_preprocessor)

BERT to perform [Semantic Textual Similarity (STS) ](https://www.sbert.net/docs/usage/semantic_textual_similarity.html)

Select model from: https://docs.google.com/spreadsheets/d/14QplCdTCDwEmTqrn1LH4yrbKvdogK4oQvYO1K1aPR5M

In [43]:
# efficiency vs performance trade-off
model = 'distilbert-base-nli-stsb-mean-tokens' # 'roberta-base-nli-stsb-mean-tokens'
bert = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens') # DistillBERT is more efficient

100%|██████████| 245M/245M [00:05<00:00, 48.0MB/s]


In [126]:
def get_bert_embeddings(tweets):
    """Computes the BERT embeddings for all tweets
    
    Args:
    tweets (list): A list of tweets
    
    Returns a 768-dimensional embedding for each tweet (mean)
    """
    embeddings = bert.encode(tweets, convert_to_tensor=False) # do not need a torch tensor
    return embeddings

Get the list of tweets

In [127]:
%%time
tweets = df_trump['_tweet_content'].tolist()
bert_embeddings = get_bert_embeddings(tweets)

CPU times: user 1.45 s, sys: 37.3 ms, total: 1.49 s
Wall time: 1.49 s


In [137]:
df_trump_embed = pd.concat([df_trump[['tweet_id', 'created_at', 'user_id', 'retweet_count',	'favorite_count']], pd.DataFrame(bert_embeddings)], axis=1)
df_trump_embed.to_csv(root/'trump_embed.csv', index=False)
df_trump_embed.head()

Unnamed: 0,tweet_id,created_at,user_id,retweet_count,favorite_count,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
0,1268685511755026432,Thu Jun 04 23:26:11 +0000 2020,25073877,23514,96180,1.103124,0.424156,0.050481,0.141553,0.025443,-0.275829,-0.403517,-0.030968,-0.982882,-1.13635,-0.87308,0.647381,0.431796,0.20921,0.019798,-0.521178,1.341751,0.361453,-0.891255,-0.459986,1.232851,-1.218536,-0.037328,0.027012,0.371096,-0.386543,-0.769604,0.250894,0.344037,0.141113,-0.490979,-0.923257,-0.224342,0.328991,-0.557137,...,0.178904,-0.246942,-0.389809,-0.561453,0.802101,0.823415,0.288733,0.559953,-0.654947,0.320738,0.754348,0.468034,-0.747992,-0.486636,-0.046141,-1.585838,0.081889,0.572955,-0.550021,-0.006115,0.137136,0.399055,-0.074018,-0.886708,0.474638,-0.490246,-0.329489,0.512363,-0.290211,0.254168,-0.880716,-0.748653,-0.077986,0.362621,-0.871173,0.408366,0.435463,-0.178711,0.151998,-0.970517
1,1268874882827378688,Fri Jun 05 11:58:41 +0000 2020,25073877,4769,23127,0.002316,0.889421,-0.105876,-1.386303,0.100477,-0.22105,-0.079756,0.809791,-0.523869,-0.341162,-0.572579,0.655067,0.481434,-0.933281,0.043895,0.438669,-0.300771,-0.502657,-0.407984,0.026068,-0.683694,-0.183004,-0.428313,0.12463,0.214569,0.091484,0.475241,0.105165,-0.599871,0.59199,-0.078524,0.664413,-0.38349,-0.387154,-0.62113,...,-0.24824,-0.108317,-0.499268,0.253147,0.385276,1.105778,0.068909,-0.5276,0.484778,-0.112996,1.063261,-0.053312,-0.344455,0.108098,-0.231784,-1.214794,0.806519,0.266271,-0.553391,-0.240308,-0.677317,-0.048945,0.741028,-0.228336,0.209252,-1.145957,0.006479,-0.610886,0.239447,0.738753,0.065962,-0.092621,-0.188571,1.692709,0.493308,0.294067,0.16708,0.341486,0.383013,-1.398136
2,1268723566046044160,Fri Jun 05 01:57:24 +0000 2020,25073877,38970,140965,0.665973,-0.100513,-0.164027,0.139198,-0.03246,0.390195,0.143771,-0.959026,0.867784,-0.237218,0.384009,0.749585,-0.562197,0.548477,0.479267,0.007716,0.193133,-0.816202,-0.129182,-0.076623,-0.215628,0.807896,-0.199856,0.866422,-0.592638,-0.030237,-1.031495,-0.079839,0.340891,0.414805,0.342915,0.253906,-0.622601,-0.391279,-0.144035,...,0.403574,0.147734,-1.058175,-0.378817,0.487404,0.194929,0.405415,0.129211,-0.203467,0.754181,0.285376,0.097831,0.084293,0.192409,-0.204263,-0.703972,0.203187,0.652797,-0.232823,0.667632,-0.521989,0.637491,-0.638542,0.397937,0.340223,-0.734231,0.419679,-0.027287,-0.044959,-0.527267,-0.514494,0.408547,-0.36414,0.762515,0.483081,-0.007675,0.136244,0.4643,0.044225,-0.663777
3,1268869099431608320,Fri Jun 05 11:35:42 +0000 2020,25073877,20376,92842,-0.124393,0.178987,-0.749291,-0.440119,-0.677026,-0.066555,0.809576,0.691964,0.022229,-0.263378,0.221109,0.847311,-0.476806,0.753762,-0.129881,-0.545644,-0.474164,0.664445,-0.290609,0.35327,0.709411,-0.20912,-0.031486,-0.175357,-0.066709,0.274672,-0.663042,-0.155775,-0.626229,-0.458706,-0.537441,-0.223215,0.328447,-0.065055,-0.284714,...,0.181631,-0.125888,-0.563426,-0.305768,0.661888,0.514929,0.456534,-0.307952,0.544418,-0.026941,0.412455,0.081916,-0.106447,-0.28379,-0.582741,0.057384,-0.494649,-0.056708,-0.792144,-0.206483,-0.592827,-0.072131,0.316783,-1.188334,-1.042616,-1.009844,-0.065083,-0.093712,0.731537,0.449065,-1.770977,-0.607552,-1.028697,0.89366,-0.452105,0.198086,0.739869,-0.955381,0.322227,-1.159262
4,1268998143733051394,Fri Jun 05 20:08:28 +0000 2020,25073877,41234,205898,0.699836,0.215626,-0.36201,0.206359,-0.55115,-0.077212,0.265463,-0.770412,0.055084,-0.470637,0.07416,0.84308,-0.669345,0.588671,0.215185,-0.043101,0.293181,0.700903,-0.28089,0.057552,-0.685977,0.239619,0.312351,0.043481,-0.536498,0.000219,0.529217,-0.176834,0.619924,0.383312,0.515965,-0.158169,-0.52215,-1.094283,-0.273722,...,0.104677,-0.320151,-1.013057,-0.03005,0.641755,-0.457022,0.278227,0.631839,-0.46397,1.206755,0.26474,0.376495,-0.022828,0.346247,-0.780784,0.390352,0.435485,0.002853,0.170745,-0.177247,0.169696,0.626583,0.331661,0.03225,0.300451,-0.374163,0.800378,0.360195,0.168164,-0.050756,-0.647762,0.35582,-0.227254,0.930034,0.249645,0.028556,-0.23014,0.202612,0.431107,-0.375893


> Note: Might take some time! One time job.

In [139]:
%%time
tweets = df_trump_mentions['_tweet_content'].tolist()
bert_embeddings = get_bert_embeddings(tweets)

CPU times: user 33.5 s, sys: 274 ms, total: 33.8 s
Wall time: 33.8 s


In [141]:
df_trump_mentions_embed = pd.concat([df_trump_mentions[['tweet_id', 'created_at', 'user_id', 'retweet_count', 'favorite_count']], pd.DataFrame(bert_embeddings)], axis=1)
df_trump_mentions_embed.to_csv(root/'trump_mentions_embed.csv', index=False)
df_trump_mentions_embed.head()

Unnamed: 0,tweet_id,created_at,user_id,retweet_count,favorite_count,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
0,1026516623719587840,Mon Aug 06 17:13:29 +0000 2018,1017500185356853248,9,19,0.296495,0.542614,-0.453187,-0.891998,-0.090881,0.006193,-0.303954,0.249364,0.039713,-1.286487,-0.044627,0.651054,-0.044583,0.429252,-0.263339,-0.05301,-0.06454,-0.490691,0.902055,-0.489038,-0.743008,-0.636297,0.292564,0.059178,-0.512472,-0.12563,0.196875,0.320604,-0.05262,0.13937,0.169841,-0.916152,-0.737109,-0.501976,-1.668477,...,0.137196,0.087176,-0.54064,0.261148,-0.043927,-0.70661,0.249548,0.359341,-0.103795,-0.247834,0.453886,0.845284,0.388629,-0.332676,-1.132646,-0.196032,0.688646,0.308717,0.471878,0.163504,-0.867474,0.350447,-0.380599,-0.468762,-0.478088,-1.165384,0.754126,-0.780247,0.090352,-0.203745,-0.647261,0.530679,-0.382568,0.500944,-0.336224,0.220711,-0.158999,0.271637,-0.350223,-0.095383
1,1025485773875937283,Fri Aug 03 20:57:15 +0000 2018,1017500185356853248,1,5,-0.428294,0.578304,-0.162057,-0.228073,-0.462657,-0.029283,0.585583,-0.699514,0.418962,-0.69739,0.008845,0.467787,-0.850617,0.578669,0.503725,-0.292381,-0.335498,-0.5139,0.467011,-0.201876,-0.32594,-0.264376,-0.404915,0.226402,-0.26881,0.272352,0.341224,0.022196,0.300996,0.597184,0.506189,0.317362,-1.087152,-0.091599,0.560678,...,-0.024032,-0.5086,-0.617541,-0.141626,0.212026,-0.215321,-0.076981,0.786224,-0.373745,1.697614,-0.188368,-1.000211,-0.077082,0.331316,0.197367,-0.804825,0.122857,0.936029,-0.013965,-0.648372,0.435433,1.291549,0.43242,0.438572,0.477754,-0.859429,0.141657,0.090128,0.415385,0.226492,-0.741634,0.742004,0.266929,0.736418,0.843931,0.181311,-0.242699,-0.222048,-0.937796,0.367424
2,1032951113815928832,Fri Aug 24 11:21:51 +0000 2018,1017500185356853248,8,0,0.094901,0.615759,-0.689562,-0.422887,-0.052428,0.750495,-0.346024,-0.458219,-0.192848,-0.275985,0.621015,0.478449,-0.607758,0.617976,-0.312946,0.143847,-0.13754,-0.043492,-0.706348,-0.28107,-0.386788,0.381326,0.477924,0.664289,-0.434742,-0.361318,-0.555707,-0.514309,-0.08451,0.227876,0.466844,0.023709,-0.05859,-0.451548,-0.874648,...,0.53086,-0.034788,-0.165674,-0.450385,0.487636,0.197166,0.162186,0.060101,-0.543128,1.340839,-0.511484,0.059931,0.326431,0.029854,0.204637,-0.187105,0.569561,0.136692,0.176373,0.357852,0.275542,-0.104815,-0.528044,-0.137594,0.773937,0.606296,-0.017135,0.730254,0.242864,-0.256488,-0.175478,1.154517,-0.763382,0.273006,0.393731,-0.328781,0.047578,0.292757,0.503482,-0.528074
3,1052157223646978048,Tue Oct 16 11:20:04 +0000 2018,1017500185356853248,1768,0,0.708866,0.51862,-0.047712,-0.53295,0.1465,0.262488,-0.133962,-0.243481,0.077845,-0.575408,0.416497,0.301207,0.100368,0.13147,0.091317,0.096511,-0.530615,0.394006,0.682099,0.119839,0.00079,-0.176863,0.101857,0.824486,-0.039021,-0.68188,-0.264388,-0.76386,0.305914,0.3712,-0.193137,-1.119111,-0.011367,0.185722,-0.457377,...,-0.011262,0.389398,-0.490471,-0.7169,0.228522,0.842964,-0.146859,-0.393021,-0.250562,0.212922,-0.211197,0.490965,0.056749,0.332958,-0.354161,0.230801,0.788439,0.540131,-0.149457,-0.001058,-0.069011,1.179062,-0.17666,-0.245552,0.19954,-0.26054,-0.185489,0.423785,0.149124,-0.360008,-1.02733,0.512619,0.401362,1.076128,0.050959,0.066167,0.402367,-0.095855,0.446844,-0.901757
4,1026545492354691072,Mon Aug 06 19:08:12 +0000 2018,1017500185356853248,4,0,0.51916,0.554478,0.116807,-0.250892,-0.240643,0.040561,0.597543,-0.934036,-0.545247,-0.765313,-0.10542,0.265675,0.288152,0.926259,0.303288,-0.544012,0.575287,-0.166187,-0.047129,-0.24289,-0.33921,0.046246,-0.292118,0.535238,-0.174444,-0.086731,-0.178658,0.053102,0.246698,0.013568,0.466313,-0.719602,-0.97139,0.047073,-0.125972,...,1.026371,-0.095356,-0.273369,-0.453069,0.289113,0.670054,0.224019,0.220753,-0.867422,0.382044,-0.616032,0.232729,-0.707206,-0.221402,0.573499,-0.441059,-0.287151,0.497305,0.080583,0.036896,-0.25711,-0.0441,0.471426,-0.036785,0.523029,-0.690391,0.555457,-0.245956,-0.338947,-0.236552,-1.147224,-0.790507,-0.208819,1.266071,0.120563,0.354832,-0.198582,0.860355,-0.193678,-0.59183


> Note: Load the saved embeddings.

### Aggregate

#### Weekly aggregation
- As daily granularity might not be sufficient for all handles
- Trump "events" are a mostly (?) weekly affair

Convert to datetime

In [148]:
df_trump_embed['_created_at'] = pd.to_datetime(df_trump_embed.created_at) # convert to datetime
# https://strftime.org/
df_trump_embed_agg = df_trump_embed.groupby(['user_id', df_trump_embed['_created_at'].dt.strftime('%Y-%W')]).mean().reset_index() # Aggregate mean 
df_trump_embed_agg.insert(4, 'tweet_count', df_trump_embed.groupby(['user_id', df_trump_embed['_created_at'].dt.strftime('%Y-%W')]).size().reset_index(name='counts')['counts'])
df_trump_embed_agg.head()

Unnamed: 0,user_id,_created_at,retweet_count,favorite_count,tweet_count,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
0,25073877,2020-22,23284.24359,106426.115385,78,0.149069,0.196724,0.009865,-0.074891,-0.136623,-0.050789,-0.018215,-0.121087,-0.002508,-0.253522,0.048538,0.538218,0.031037,0.207657,-0.059558,-0.223908,0.374647,0.010845,-0.232791,-0.212621,-0.070813,-0.162409,0.031521,0.432515,-0.153893,-0.118426,-0.053464,-0.01732,0.233058,0.44613,0.297773,-0.162308,-0.426458,0.046976,-0.199064,...,0.06025,0.065337,-0.47238,-0.135391,0.348175,0.40807,0.176138,-0.000476,-0.161571,0.128069,0.01833,-0.058815,-0.381077,0.114411,-0.188778,-0.669264,0.168758,0.179931,-0.238355,0.297934,-0.32413,0.287228,0.014956,-0.128813,-0.010256,-0.500943,0.131608,-0.049081,0.069148,0.018128,-0.644699,-0.137394,0.069613,0.474739,-0.150652,0.181468,0.109798,0.098584,0.071442,-0.349355
1,25073877,2020-23,25813.372881,119981.940678,118,0.130131,0.168596,0.059496,-0.147593,-0.120444,0.0314,0.085227,-0.136017,0.091151,-0.169573,0.077318,0.502065,-0.096762,0.263359,0.03633,-0.180619,0.172534,-0.008196,-0.179198,-0.16339,-0.113877,-0.068395,0.052847,0.445153,-0.140547,-0.128827,0.024212,-0.015598,0.211106,0.454623,0.253302,-0.103111,-0.35653,0.071913,-0.108193,...,0.084192,-0.046942,-0.516986,-0.129547,0.239534,0.425506,0.159567,-0.004157,-0.127883,0.161891,-0.033009,-0.184439,-0.38396,0.013948,-0.132854,-0.498902,0.091804,0.15821,-0.19965,0.233135,-0.317617,0.353699,-0.023758,-0.112575,-0.043616,-0.442584,0.070816,0.075494,0.025127,0.103591,-0.617178,0.029935,0.201681,0.50305,-0.090961,0.166065,0.122945,0.089543,-0.0033,-0.342832
2,25073877,2020-24,23270.346154,108389.875,104,0.174402,0.209939,0.212131,-0.211437,-0.028061,0.037372,0.064046,-0.168289,0.165889,-0.250349,-0.00084,0.500226,-0.100721,0.200725,0.047075,-0.059068,0.163066,-0.081924,-0.180537,-0.176664,-0.136622,-0.024289,0.011006,0.497681,-0.084454,-0.084653,0.059277,0.084912,0.213906,0.372498,0.419329,-0.153556,-0.357528,-0.058441,-0.079701,...,0.170166,-0.003388,-0.526447,-0.149801,0.225414,0.338546,0.120587,-0.011296,-0.084735,0.127489,-0.013782,-0.127446,-0.326326,0.135246,-0.113703,-0.465171,0.129767,0.156625,-0.119274,0.263168,-0.322511,0.408654,-0.064433,-0.099455,-0.374443,-0.484099,0.054322,-0.009954,0.187017,0.061123,-0.504695,-0.018486,0.272336,0.41356,-0.022964,-0.034976,0.057769,0.13094,-0.109448,-0.211156
3,25073877,2020-25,30583.536364,119448.854545,110,0.126918,0.14093,0.102434,-0.180341,0.065021,0.020687,0.116004,-0.084946,-0.000715,-0.175089,-0.010302,0.386464,-0.113783,0.205333,-0.019877,-0.04104,0.090974,-0.064094,-0.265847,-0.21289,-0.101543,-0.07639,-0.062079,0.463331,-0.194141,-0.117052,0.065792,0.009253,0.233397,0.252097,0.355374,-0.015534,-0.20887,0.035357,-0.088392,...,0.146909,-0.013857,-0.467288,-0.174678,0.193679,0.278724,0.125561,-0.032361,0.153473,0.040697,-0.002883,-0.181792,-0.336598,0.068713,-0.144788,-0.431569,0.13655,-0.056801,-0.155427,0.023514,-0.339566,0.335187,0.0037,-0.191827,-0.707371,-0.404956,-0.102287,0.057777,0.095256,0.114299,-0.499153,0.040431,0.30582,0.387635,-0.082689,-0.044691,0.12292,0.019053,-0.10262,-0.185542
4,25073877,2020-26,23813.743119,101047.220183,109,0.04309,0.186572,0.15035,-0.136757,-0.098916,0.05972,0.101506,-0.129037,-0.059347,-0.169612,-0.071304,0.469229,-0.031224,0.166891,-0.005362,-0.083954,0.149434,-0.034979,-0.348998,-0.200863,-0.088236,-0.048189,-0.024036,0.50979,-0.178232,-0.075938,0.083486,-0.008226,0.183436,0.287209,0.365199,-0.038842,-0.16045,0.037199,-0.020076,...,0.125978,-0.035469,-0.448616,-0.191778,0.206719,0.257885,0.046402,0.04867,0.135372,0.014421,0.035622,-0.209451,-0.359327,0.025559,-0.146101,-0.551064,0.120041,-0.056089,-0.18259,0.032812,-0.379216,0.326321,-0.048775,-0.131914,-0.593004,-0.399126,-0.098912,0.055167,0.160132,0.21357,-0.550226,-0.006206,0.301971,0.332442,-0.105304,0.028203,0.114484,0.003019,-0.026187,-0.149166


In [149]:
df_trump_mentions_embed['_created_at'] = pd.to_datetime(df_trump_mentions_embed.created_at) # convert to datetime
# https://strftime.org/
df_trump_mentions_embed_agg = df_trump_mentions_embed.groupby(['user_id', df_trump_mentions_embed['_created_at'].dt.strftime('%Y-%W')]).mean().reset_index() # Aggregate mean 
df_trump_mentions_embed_agg.insert(4, 'tweet_count', df_trump_mentions_embed.groupby(['user_id', df_trump_mentions_embed['_created_at'].dt.strftime('%Y-%W')]).size().reset_index(name='counts')['counts'])
df_trump_mentions_embed_agg.head()

Unnamed: 0,user_id,_created_at,retweet_count,favorite_count,tweet_count,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
0,1017500185356853248,2018-31,1.0,5.0,1,-0.428294,0.578304,-0.162057,-0.228073,-0.462657,-0.029283,0.585583,-0.699514,0.418962,-0.69739,0.008845,0.467787,-0.850617,0.578669,0.503725,-0.292381,-0.335498,-0.5139,0.467011,-0.201876,-0.32594,-0.264376,-0.404915,0.226402,-0.26881,0.272352,0.341224,0.022196,0.300996,0.597184,0.506189,0.317362,-1.087152,-0.091599,0.560678,...,-0.024032,-0.5086,-0.617541,-0.141626,0.212026,-0.215321,-0.076981,0.786224,-0.373745,1.697614,-0.188368,-1.000211,-0.077082,0.331316,0.197367,-0.804825,0.122857,0.936029,-0.013965,-0.648372,0.435433,1.291549,0.43242,0.438572,0.477754,-0.859429,0.141657,0.090128,0.415385,0.226492,-0.741634,0.742004,0.266929,0.736418,0.843931,0.181311,-0.242699,-0.222048,-0.937796,0.367424
1,1017500185356853248,2018-32,6.5,9.5,2,0.407828,0.548546,-0.16819,-0.571445,-0.165762,0.023377,0.146795,-0.342336,-0.252767,-1.0259,-0.075024,0.458364,0.121784,0.677755,0.019975,-0.298511,0.255373,-0.328439,0.427463,-0.365964,-0.541109,-0.295026,0.000223,0.297208,-0.343458,-0.10618,0.009108,0.186853,0.097039,0.076469,0.318077,-0.817877,-0.854249,-0.227452,-0.897224,...,0.581784,-0.00409,-0.407005,-0.095961,0.122593,-0.018278,0.236783,0.290047,-0.485608,0.067105,-0.081073,0.539007,-0.159288,-0.277039,-0.279574,-0.318545,0.200748,0.403011,0.276231,0.1002,-0.562292,0.153174,0.045413,-0.252773,0.02247,-0.927887,0.654791,-0.513101,-0.124298,-0.220148,-0.897242,-0.129914,-0.295693,0.883507,-0.107831,0.287772,-0.178791,0.565996,-0.27195,-0.343607
2,1017500185356853248,2018-33,4.5,11.0,2,-0.24276,0.395839,-0.477542,-0.713028,0.205286,0.188811,0.466178,-0.301802,0.409106,-0.251343,0.348974,0.53992,-0.866483,0.167234,-0.317578,-0.343521,0.645655,0.047172,-0.312369,-0.113443,-0.550799,-0.015574,0.115371,0.33819,-0.240381,-0.605067,0.000111,0.354407,0.262327,-0.049443,0.277493,-0.248988,-0.847912,0.388378,-0.46177,...,0.009541,0.056349,-0.589419,-0.124256,0.264126,0.302321,-0.332644,0.078343,-0.27164,0.383266,-0.477686,-0.150009,-0.059784,-0.068449,-0.025285,-0.546396,0.431892,0.23691,-0.318586,0.425535,0.351555,0.182488,-0.16216,-0.240141,0.107444,-0.010343,0.109944,0.518258,-0.082818,-0.187585,-1.006294,-0.134078,0.24548,0.465279,-0.121298,-0.083107,0.086489,-0.105323,0.177168,-0.197506
3,1017500185356853248,2018-34,8.0,12.0,2,0.094901,0.615759,-0.689562,-0.422887,-0.052428,0.750495,-0.346024,-0.458219,-0.192848,-0.275985,0.621015,0.478449,-0.607758,0.617976,-0.312946,0.143847,-0.13754,-0.043492,-0.706348,-0.28107,-0.386788,0.381326,0.477924,0.664289,-0.434742,-0.361318,-0.555707,-0.514309,-0.08451,0.227876,0.466844,0.023709,-0.05859,-0.451548,-0.874648,...,0.53086,-0.034788,-0.165674,-0.450385,0.487636,0.197166,0.162186,0.060101,-0.543128,1.340839,-0.511484,0.059931,0.326431,0.029854,0.204637,-0.187105,0.569561,0.136692,0.176373,0.357852,0.275542,-0.104815,-0.528044,-0.137594,0.773937,0.606296,-0.017135,0.730254,0.242864,-0.256488,-0.175478,1.154517,-0.763382,0.273006,0.393731,-0.328781,0.047578,0.292757,0.503482,-0.528074
4,1017500185356853248,2018-40,18.0,60.0,1,0.272741,0.01406,-0.742705,-0.497896,-0.330765,0.615861,0.060828,-0.270493,-0.882317,-0.540799,-0.285746,0.767598,0.279633,1.126107,-0.059896,0.372671,1.232063,-0.182995,-0.030168,-0.048287,-0.809972,-0.772041,0.522172,0.512994,0.496799,-0.905688,-0.38042,0.931951,0.237535,0.450447,-0.114126,-0.602381,-0.845676,0.252365,-0.726877,...,1.447782,0.218642,-0.426976,0.020234,-0.005178,-0.609852,-0.548548,0.36305,-0.915707,0.392533,-0.939132,1.328627,-0.485925,-0.155849,-0.074452,-0.398398,0.291838,0.690752,0.459696,-0.081944,-0.527907,0.652632,-0.585126,-0.909187,0.092761,-0.539991,0.410262,0.297158,-0.082341,-0.03367,-0.967036,-0.317589,0.135749,0.745604,-0.52845,-0.520045,0.285236,0.555256,-0.008609,-0.636543


### Vector similarity

- X-axis: Date
- Y-axis: Cosine similarity



Compute pairwise distance between Trump and everyone else: aggregated weekly tweets

In [145]:
df_trump_embed_agg['_created_at'].max()

'2020-36'

Create a weekly index of cosine similarity between Trump and others

In [156]:
def func(row):
    # get the query vector filtered on week
    x = df_trump_embed_agg[df_trump_embed_agg._created_at == row._created_at].iloc[:, 5:] #.to_numpy()
    y = row[5:] #.to_numpy() # get only the embeddings
    cos_score = cosine_score(x, y)
    if cos_score.shape[0] > 0: 
        return cos_score[0]
    else:
        return None

In [157]:
cosine_score = lambda x, y: np.dot(x, y)/(np.linalg.norm(x) * np.linalg.norm(y))

In [158]:
df_trump_mentions_embed_agg["trump_similarity_score"] = df_trump_mentions_embed_agg.apply(func, axis=1)

Previous results with GloVe
```
0        0.989125
1        0.984315
2        0.986207
3        0.990281
4        0.986024
           ...   
10283    0.991514
10284    0.982669
10285    0.970768
10286    0.992367
10287    0.987436
Name: trump_similarity_score, Length: 10273, dtype: float64
```

In [159]:
df_trump_mentions_embed_agg["trump_similarity_score"]

0             NaN
1             NaN
2             NaN
3             NaN
4             NaN
           ...   
13084    0.669645
13085    0.676860
13086   -0.117782
13087    0.409059
13088         NaN
Name: trump_similarity_score, Length: 13089, dtype: float64

In [160]:
df_trump_mentions_embed_agg[~df_trump_mentions_embed_agg["trump_similarity_score"].isna()]

Unnamed: 0,user_id,_created_at,retweet_count,favorite_count,tweet_count,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,trump_similarity_score
50,1017500185356853248,2020-22,5.500000,23.500000,2,0.578469,0.382775,-0.211122,-0.529742,0.220981,0.174922,0.386238,-0.368512,-0.521326,-0.324194,0.265130,0.624595,-0.125465,0.435626,-0.391925,-0.126850,1.398329,0.371572,-0.208245,-0.354946,-0.318033,-0.156228,0.166782,0.557080,-0.246753,-0.291566,-0.220615,0.435122,0.443528,0.974567,-0.014414,-0.371126,-0.944439,0.015630,0.181982,...,-0.173446,-0.527119,-0.550636,0.723483,0.688339,-0.101384,-0.415139,-0.919286,0.044089,-0.719294,0.107086,-0.775596,0.384175,-0.169958,-0.248016,0.084495,0.339532,-0.040263,0.258807,-0.260606,0.236294,-0.001676,-0.584467,0.566635,-0.111435,0.349002,0.438965,-0.226874,-0.080538,-1.265147,-0.504492,0.495186,0.971563,-0.236221,0.333156,0.242572,0.610855,-0.279292,-0.777029,0.620914
51,1017500185356853248,2020-23,12.000000,65.000000,1,0.415133,0.379527,-0.099740,-0.897401,-0.799076,0.125144,0.200442,-0.029199,0.819083,0.220121,0.261527,0.498689,-1.317474,1.063579,-0.117230,0.366635,0.345720,-0.178392,0.297701,0.284173,-0.520560,-0.153452,0.722448,0.537421,0.089217,-0.429460,0.432720,0.719954,0.202621,0.740574,0.827624,-0.097467,-0.446858,-0.687983,0.502332,...,0.205044,-0.354662,-0.364937,-0.228000,0.108822,-0.250345,0.040203,-0.579386,0.089110,-0.089038,0.247360,-0.764935,-0.104884,-0.529356,-0.548878,-0.236123,0.586617,0.061304,0.876636,0.047540,0.121374,-0.509653,-0.291508,0.232516,-0.211054,0.302600,0.773064,0.404088,0.132659,-0.046414,0.212947,0.508343,0.917213,-0.329371,0.408069,0.153776,-0.048324,-0.155285,-0.446324,0.539433
52,1017500185356853248,2020-24,16.000000,63.000000,2,-0.250559,0.091244,-0.301057,-0.681981,-0.015299,-0.143077,0.205556,-0.425046,-0.369346,-0.362294,-0.506263,0.770843,-0.204277,0.233750,-0.200922,-0.119250,0.528333,-0.324203,0.231272,-0.093023,-0.555862,0.250267,0.126660,0.591646,0.035086,-0.052111,0.259456,0.452343,0.531371,0.596029,0.151921,0.123769,-1.142921,0.121410,-0.550222,...,-0.022804,-0.429753,0.140078,0.209008,0.165411,-0.451978,-0.099314,-0.375516,0.489790,0.329867,-0.072078,-0.306108,0.413814,0.151409,-0.679806,0.358573,0.346114,-0.313919,0.391032,-0.274049,1.261627,-0.303266,-0.013670,-0.088431,-0.637620,0.374948,-0.446546,0.053114,-0.272932,-0.599970,-0.498047,0.235857,0.701505,0.338427,-0.194006,-0.012183,0.708668,-0.603780,-0.399089,0.522314
53,1017500185356853248,2020-25,66.333333,170.666667,3,0.688894,0.084292,0.209936,0.256560,-0.187573,0.225341,0.035416,-0.594263,0.387330,0.207881,0.141297,0.807224,-0.280189,0.666393,-0.236650,-0.050008,0.374042,-0.194323,0.057787,-0.101273,-0.496598,0.519911,-0.002505,0.724211,-0.563288,0.133108,0.086551,0.136700,0.424881,0.682373,0.490565,-0.151378,-0.491300,0.025838,0.232182,...,-0.192378,-0.733744,0.045929,-0.234282,0.284166,0.239868,0.383138,-0.647426,0.579491,-0.160794,-0.405885,-0.466424,0.593145,-0.547006,-0.785030,-0.059935,0.359741,-0.475778,0.175000,-0.070978,0.667746,-0.571209,0.488283,0.428253,-0.707128,-0.033267,0.021069,0.119960,-0.134573,-0.519354,0.337993,0.712474,0.381080,0.354880,0.161034,-0.380630,-0.007110,-0.361712,-0.074117,0.529402
54,1017500185356853248,2020-26,39.000000,109.666667,3,0.238490,-0.049215,0.012358,-0.193455,-0.161872,0.193864,0.418272,-0.312250,-0.049817,-0.274502,0.020438,0.839938,-0.553129,0.521240,-0.065869,-0.210198,1.098208,0.069449,-0.279507,-0.272535,-0.161232,-0.074851,0.502253,0.311159,0.102754,-0.527196,-0.308286,0.480614,0.546294,0.441716,-0.002488,0.001302,-0.444891,-0.213731,-0.210198,...,0.107407,-0.454884,-0.264762,0.327209,0.258401,-0.267670,0.251801,-0.494076,0.427700,-0.151545,-0.066145,-0.662348,0.004795,-0.284782,-0.744616,0.332404,0.110881,0.116543,0.357582,-0.136611,0.554178,0.167694,-0.275550,0.440527,-0.446463,0.126090,0.537624,-0.120916,-0.123079,-0.732515,-0.237145,0.550038,0.333195,-0.226512,0.292278,0.189782,0.470662,0.236972,-0.966554,0.531211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13083,996094929733652481,2020-31,102.210526,302.578947,19,0.305969,0.156688,-0.039649,-0.244932,-0.014690,-0.373230,0.113911,-0.224740,0.306974,-0.592372,-0.068721,0.489579,-0.470647,0.582458,-0.075625,-0.378334,0.266071,0.078553,-0.113178,-0.010484,-0.224231,-0.133562,-0.218370,0.456284,-0.204397,-0.386758,-0.182824,0.058118,0.565652,0.519320,0.033105,-0.540936,-0.880307,0.166982,-0.335358,...,-0.024058,-0.518754,0.185236,0.556821,0.439014,0.058764,-0.173479,-0.718560,0.521174,0.073409,-0.027607,-0.657197,0.313892,-0.164808,-0.500602,0.099388,0.389446,-0.298586,0.212902,-0.258560,0.390793,0.098888,0.082276,0.227855,-0.557487,0.209767,0.016993,0.078010,-0.284275,-0.724102,-0.056981,0.145844,1.104532,0.307578,0.455140,-0.095489,0.203224,0.207388,-0.558739,0.617216
13084,996094929733652481,2020-32,79.600000,226.000000,5,0.215936,0.366439,0.231274,-0.259536,0.100848,-0.309814,0.102817,-0.265775,0.227498,-0.501895,-0.238847,0.740178,-0.535158,0.417838,-0.056899,-0.633119,0.154940,0.228781,-0.197275,0.078303,-0.230805,0.059776,-0.079930,0.291462,-0.028808,-0.456514,0.138559,0.250878,0.644368,0.456812,0.256868,-0.497173,-1.142858,0.141021,-0.022545,...,0.036270,-0.362439,0.184718,0.537860,0.448733,0.271127,-0.088373,-0.751656,0.278584,-0.004400,0.122947,-0.618982,0.099981,-0.118136,-0.334017,0.039384,0.254787,-0.139029,0.588689,-0.114834,0.350072,0.140787,-0.039029,0.232852,-0.194935,0.303728,0.034409,-0.153247,-0.235681,-0.707784,-0.656749,0.263236,0.846527,0.194489,0.261616,-0.045762,0.415062,0.019635,-0.516164,0.669645
13085,996094929733652481,2020-33,205.727273,456.681818,22,0.256496,0.203196,-0.124192,-0.198204,-0.124198,-0.205402,0.325369,-0.429885,0.313698,-0.489927,0.029023,0.366346,-0.623322,0.438458,-0.130106,-0.271847,0.361970,-0.014071,-0.003615,0.056601,-0.318568,-0.023200,-0.123064,0.423259,-0.246092,-0.231499,-0.060414,0.138951,0.258191,0.336035,0.266630,-0.535136,-0.768234,-0.052635,-0.121078,...,0.013965,-0.504433,-0.112539,0.348582,0.128589,0.022237,-0.053625,-0.581130,0.735020,0.200632,-0.053361,-0.380903,0.218173,-0.162683,-0.412743,0.021205,0.538315,-0.238811,0.234368,-0.204756,0.446495,-0.104512,0.264053,0.361631,-0.260162,0.089676,0.099575,-0.062122,-0.191560,-0.637717,0.008851,0.028327,0.908031,0.123413,0.231645,0.030346,0.202748,0.021384,-0.374967,0.676860
13086,996094929733652481,2020-34,250.800000,407.800000,5,0.330945,0.322491,-0.273344,-0.106729,-0.412516,0.245652,-0.182706,-0.475557,0.131754,-0.469703,0.256757,0.570370,-0.470006,0.467516,-0.061179,-0.052571,0.743308,0.085184,-0.039108,0.020118,-0.354075,0.118436,-0.208735,0.491885,-0.080740,-0.277205,-0.168272,0.219671,0.150110,0.085747,0.421018,-0.658743,-0.845508,0.349852,-0.142102,...,-0.102802,-0.104742,-0.164597,0.193834,0.083447,-0.024609,0.436058,-0.643746,0.727530,-0.016054,0.000602,-0.410060,0.244780,0.009526,-0.527677,0.099693,0.178526,-0.187576,0.442857,-0.060650,0.520454,0.130646,0.176822,0.396183,-0.133471,0.356883,0.105028,0.037281,0.021538,-0.580488,-0.132103,-0.308153,0.498150,0.368595,0.126295,-0.030942,0.394458,-0.039455,-0.317926,-0.117782


Get `screen_name` for each `user_id`

In [172]:
df_trump_mentions_embed_agg["screen_name"] = df_trump_mentions_embed_agg.user_id.apply(lambda x: users[x])

Sort weeks

In [173]:
df_trump_mentions_embed_agg = df_trump_mentions_embed_agg.sort_values(by=['_created_at'], ascending=False)

In [175]:
df_trump_mentions_embed_agg._created_at.min()

'2013-19'

In [None]:
df_weekly_others_embed.created_at.max()

'2020-36'

In [176]:
df_trump_mentions_embed_agg.head()

Unnamed: 0,user_id,_created_at,retweet_count,favorite_count,tweet_count,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,...,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,trump_similarity_score,screen_name
5097,1917731,2020-36,76.285714,199.428571,7,-0.20527,0.01638,-0.148027,-0.373646,0.273769,-0.065402,-0.276937,0.072599,0.134118,-0.047255,0.214167,0.760659,-0.208453,0.412981,0.214158,-0.039573,0.185129,-0.049115,0.015651,-0.150683,-0.409432,0.494808,0.245545,0.259508,-0.216896,-0.132889,-0.517054,-0.386999,0.362353,0.357618,0.561307,0.189935,-0.474394,-0.314114,-0.297588,...,-0.579419,0.48929,0.294624,0.022372,0.503211,0.007623,-0.182789,0.400301,0.000656,-0.124834,-0.176468,0.130508,-0.206619,-0.591681,0.183079,0.112897,-0.337118,0.18406,-0.178293,0.591839,0.11431,-0.183829,0.273752,-0.880183,0.123339,0.095722,0.148057,-0.364999,0.071211,0.551505,0.23737,0.2832,0.534408,-0.125079,-0.260351,-0.117501,-0.030025,-0.522277,0.421886,thehill
5715,21696279,2020-36,738.0,97.0,3,-0.021061,0.222514,0.649465,-0.255125,-0.028632,-0.097316,0.279793,0.134701,-0.442638,-0.218087,-0.553521,0.214341,0.332342,-0.086623,-0.076847,0.516595,-0.286839,0.311267,-0.550089,-0.241928,0.099921,-0.206818,-0.066471,-0.054246,0.180753,-0.077524,0.502179,0.1449,0.294147,0.096128,0.072989,0.241614,0.067956,0.103271,0.458181,...,-0.370099,0.083035,0.039856,-0.302186,0.209626,-0.235106,1.188753,-0.654802,0.321347,0.032617,-0.262012,-0.135199,-0.041768,0.250837,0.186935,-0.846544,-0.448185,-0.222159,-0.409533,0.222468,0.207678,-0.363457,-2.62639,-0.274822,-0.51531,0.034278,0.084907,0.600008,-0.33385,-0.325568,0.455258,-0.001252,-0.076525,-0.344567,0.068647,-0.152771,-0.527953,0.341258,0.076172,brianbeutler
12556,93069110,2020-36,222.0,0.0,1,0.621847,-0.096286,-0.309783,0.142259,0.494554,0.033701,-0.393716,-0.495325,0.384767,-0.048979,0.51757,0.353271,-0.113513,-0.067785,0.471005,-0.458388,0.082413,-0.543366,0.661846,-0.505674,-0.688531,0.494589,0.752816,-0.266022,-0.040148,0.132922,-0.68779,-0.053066,0.12747,0.262811,0.787098,-0.303072,-0.664732,-0.531677,-0.373935,...,-0.703687,-0.006867,0.633304,-0.180992,0.548834,0.501007,-0.39825,0.31853,0.050208,-0.195299,-0.495778,0.016913,-0.502314,-0.399661,0.779825,0.619647,0.326783,0.87667,-0.17512,1.746396,0.062424,0.438659,0.170426,-0.378461,0.045979,-0.012058,0.382388,-0.164259,-0.358121,0.938438,0.471099,1.063151,0.902451,0.81102,-0.605454,-0.505614,-0.407206,-0.624524,0.392045,maggieNYT
11386,816652616625168388,2020-36,166.0,235.5,2,0.055431,-0.518336,-0.052383,0.418397,-0.08207,-0.141478,0.355776,-0.632093,0.309339,0.447394,-0.156531,1.110093,-0.184785,0.682687,0.032424,0.252509,0.574353,0.370221,0.22752,-0.384655,-0.574779,0.43152,-0.098929,0.095287,-0.028064,-0.18019,-0.348591,-0.372659,0.359567,0.316395,0.538298,0.261953,-0.121178,0.233015,0.23415,...,-0.519524,0.214819,-0.010454,0.628758,0.102082,0.442745,-0.761501,0.889979,-0.918414,-0.478194,-0.359181,-0.513736,-0.394107,0.034155,0.39987,0.140414,-0.434079,0.280298,-0.284547,0.473133,-0.444147,-0.072406,0.244321,-0.098906,0.133305,-0.07289,-0.265612,0.118448,-1.239698,-0.266682,-0.105232,-0.1917,0.3347,-0.313913,0.757481,0.206458,0.151883,-0.100743,0.423515,RepAndyBiggsAZ
11316,816012124505931780,2020-36,38.0,118.0,1,-0.345538,-0.289368,0.252833,0.262042,0.071931,-0.826752,-0.003032,0.115132,1.26766,-1.475306,0.441729,0.587466,-0.049784,1.280296,0.217518,0.411858,-0.338393,-0.13878,-0.117636,-0.121738,-0.596369,-0.252046,-0.363713,0.574575,-0.082812,-0.852116,0.444099,0.239793,0.821904,1.194098,0.086255,-0.747336,-0.5062,0.610848,0.75473,...,-0.692573,-0.102154,0.279008,0.14646,-0.374439,0.105753,-0.814307,0.37205,-0.099854,0.912053,-0.096533,0.724773,-0.439682,-0.378797,-0.162787,0.482262,-0.686154,-0.234054,0.029672,1.164121,-0.4353,0.583019,0.044589,-1.091236,0.442476,0.088172,0.358356,-0.609954,-0.023752,-0.273454,0.604052,0.7293,0.599465,-0.628541,0.138252,0.041054,0.592071,0.508146,0.302053,RepDavidKustoff


Assign week number from most recent to way back

In [177]:
weeks = df_trump_mentions_embed_agg._created_at.unique().tolist()
week_idx = {week : i for i, week in enumerate(weeks)}

In [178]:
df_trump_mentions_embed_agg["weeks_elapsed"] = df_trump_mentions_embed_agg._created_at.apply(lambda x : week_idx[x])

In [180]:
fig = px.scatter(df_trump_mentions_embed_agg, x="weeks_elapsed", y="trump_similarity_score", color="screen_name",
#               line_group="country", 
              hover_name="screen_name",
                 log_y=True, 
                 range_x=[df_trump_mentions_embed_agg.weeks_elapsed.min(), 15]
                )
fig.show()

Showing by Senators

In [185]:
import plotly.express as px

fig = px.scatter(df_trump_mentions_embed_agg[df_trump_mentions_embed_agg.user_id.isin(senators)], x="weeks_elapsed", y="trump_similarity_score", color="screen_name",
#               line_group="country", 
              log_y=True,
              hover_name="screen_name",
             range_x=[df_trump_mentions_embed_agg.weeks_elapsed.min(), 15]
             )
fig.show()