# Unsupervised map of the political message on Twitter

![a](https://i.imgur.com/jPBQ6di.jpg)



My goal is to plot a 2D map of the political message broadcast from the candidates' Twitter accounts. 

# Method used

- First I clean the tweets and select a subset relevant to my scope.
- Use a simple GloVe + GRU RNN to identify the tweet author. Very high accuracy is not needed; a difference of a few percentage points wouldn't matter.
- Use *predict_proba* score as new features
- Apply PCA and keep the first 2 components

# Check the results

A good map should cluster together ideologically similar candidates. To check if the map has any value I will plot the Republican candidates in red, and the Progressive Senators in yellow.

The "unbiased" (at least no bias from me) data for progressiveness comes from [progressivepunch](https://progressivepunch.org/scores.htm?house=senate). I would have marked more ideological groups, but this gets more complicated when moving towards the moderates.

If there is any merit in my approach, three groups should emerge. Judge for yourself!

In [None]:
import os
import numpy as np
import pandas as pd
import re
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import colorlover as cl

plotly.offline.init_notebook_mode() 

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.decomposition import PCA

from keras.models import Model
from keras.optimizers import Adamax
from sklearn.metrics import log_loss
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Dense, Bidirectional, CuDNNGRU, GlobalMaxPooling1D

import seaborn as sns
import matplotlib.pyplot as plt  

# Define functions

In [None]:
# Multi-character tokens that are blanked out as a whole. They are handled
# before the single characters because "/" on its own is blanked as well, which
# would otherwise break up "//t" before it could be matched.
_CLEAN_TOKENS = ("//t", "&amp;")

# Every single character that is turned into a space, as one translation table
# so the text is scanned once instead of once per character.
_CLEAN_TABLE = str.maketrans(
    {ch: " " for ch in "\\`*_{}[]()>+-.!'\"?:,/…’—–“”"}
)


def clean(text):
    """Replace hashtags, mentions, links and punctuation in ``text`` by spaces.

    Steps, in order:

    1. every ``#...``, ``@...`` and ``http...`` run of non-space characters
       becomes a single space;
    2. the tokens ``//t`` and ``&amp;`` become a single space;
    3. each remaining punctuation character listed in ``_CLEAN_TABLE``
       becomes a single space.

    Whitespace is not collapsed here and case is left untouched.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'#\S*', ' ', text)
    text = re.sub(r'@\S*', ' ', text)
    text = re.sub(r'http\S*', ' ', text)

    for token in _CLEAN_TOKENS:
        text = text.replace(token, " ")

    return text.translate(_CLEAN_TABLE)

def clean_tweet(tweet):
    """Normalise one tweet: lower-case it, run ``clean`` and collapse whitespace.

    Args:
        tweet: the raw tweet text.

    Returns:
        The remaining words joined by single spaces.
    """
    lowered = tweet.lower()
    words = clean(lowered).split()
    return ' '.join(words)

def print_table(header_values, content, colors):
    """Display a plotly table inside the notebook.

    Args:
        header_values: values used for the header row.
        content: values used for the table cells.
        colors: passed through unchanged as the ``fill`` setting of the cells.
    """
    header_color = "rgb(70,130,180)"

    header_spec = {
        'values': header_values,
        'line': {'color': header_color},
        'fill': {'color': header_color},
        'align': 'center',
        'font': {'color': 'black', 'size': 9},
    }
    cells_spec = {
        'values': content,
        'fill': colors,
        'align': 'center',
        'font': {'color': 'black', 'size': 9},
        'height': 40,
    }

    table = go.Table(header=header_spec, cells=cells_spec)
    plotly.offline.iplot([table])

def load_glove(word_index, max_features=18000,
               embedding_file='../input/glove840b300dtxt/glove.840B.300d.txt'):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Each line of the file is expected to hold a word followed by its
    space-separated vector components. Row ``i`` of the result holds the
    vector of the word whose index is ``i``; rows without a pre-trained
    vector keep a random initialisation drawn from a normal distribution with
    the mean and standard deviation of all loaded vectors.

    Args:
        word_index: mapping word -> integer index. Indices are assumed to
            start at 1, as produced by the Keras ``Tokenizer`` used in
            ``load_data`` -- TODO confirm for other callers.
        max_features: upper bound on the number of rows; words with an index
            at or above the resulting row count are ignored.
        embedding_file: path of the GloVe text file.

    Returns:
        A ``(nb_words, embed_size)`` array with
        ``nb_words = min(max_features, len(word_index) + 1)``.
    """
    embeddings_index = {}
    # Context manager so the (large) file is closed as soon as it is parsed.
    with open(embedding_file, encoding='utf-8') as glove_file:
        for line in glove_file:
            word, *coefs = line.rstrip().split(" ")
            if not coefs:
                # Blank line: there is no vector to store.
                continue
            embeddings_index[word] = np.asarray(coefs, dtype='float32')

    # np.stack needs a real sequence, not a dict view.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # +1 because indices start at 1: the highest index equals len(word_index)
    # and must still be a valid row when the vocabulary is below max_features.
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

def load_data(X, y, train_index, test_index):
    """Split, tokenize and pad the texts for one cross-validation fold.

    The tokenizer (18000-word cap) is fitted on the training texts only; both
    splits are then converted to integer sequences padded to length 50.

    Args:
        X: texts, selected positionally with ``.iloc``.
        y: targets, selected positionally with ``.iloc``.
        train_index: positions of the training rows.
        test_index: positions of the held-out rows.

    Returns:
        ``(Xtr, ytr, Xte, yte, word_index)``: padded training sequences,
        training targets, padded test sequences, test targets and the fitted
        tokenizer's word -> index mapping.
    """
    train_texts = X.iloc[train_index]
    test_texts = X.iloc[test_index]

    # Vocabulary comes from the training split only.
    tok = Tokenizer(num_words=18000)
    tok.fit_on_texts(list(train_texts))

    train_seq = pad_sequences(tok.texts_to_sequences(train_texts), maxlen=50)
    test_seq = pad_sequences(tok.texts_to_sequences(test_texts), maxlen=50)

    train_target = y.iloc[train_index]
    test_target = y.iloc[test_index]

    return train_seq, train_target, test_seq, test_target, tok.word_index

# Clean the Tweets

In [None]:
# Directory listings of the raw inputs and the per-candidate metadata table.
twitter_files = os.listdir("../input/2020-united-states-presidential-election/twitter")
twitter_users_files = os.listdir("../input/2020-united-states-presidential-election/twitter_users")
pic_files = os.listdir("../input/2020-united-states-presidential-election/pics")
metadata = pd.read_csv("../input/2020-united-states-presidential-election/candidates_info.csv")

# The tweet files are named after the handle minus its first character
# (presumably the leading "@" -- verify against candidates_info.csv).
metadata["filename"] = metadata["handle"].apply(lambda x: x[1:])
metadata["age"] = ((datetime.today() - pd.to_datetime(metadata["born"])).dt.days/365).astype(int)
metadata = metadata.sort_values("filename")
metadata.reset_index(inplace=True)

# Read every candidate's tweets and concatenate once; concatenating inside the
# loop re-copies all previously loaded rows on each iteration.
dataset = pd.concat(
    [
        pd.read_csv("../input/2020-united-states-presidential-election/twitter/%s.csv" % filename)
        for filename in metadata["filename"]
    ],
    ignore_index=True,
)

dataset["clean tweet"] = dataset['Text'].apply(clean_tweet)
dataset['number of characters'] = dataset["clean tweet"].str.len()

# Row filters, each computed once. pd.to_datetime replaces the unit-less
# .astype("datetime64") cast, which recent pandas versions reject.
is_recent = pd.to_datetime(dataset["Created At"]).dt.year > 2018
is_long = dataset["number of characters"] > 39
is_tweet = dataset["Tweet Type"] == "Tweet"
is_english = dataset["Language"] == "English"

# All accounts present in the data, in groupby (sorted) order.
accounts = dataset.groupby("Screen Name")["Tweet Id"].count().index


def _tweets_per_account(frame):
    """Count the tweets in ``frame`` per account, with 0 for absent accounts.

    The result always has one entry per account in ``accounts``, so a subset
    in which some account has no rows no longer yields a shorter array.
    """
    counts = frame.groupby("Screen Name")["Tweet Id"].count()
    return counts.reindex(accounts, fill_value=0).values


# NOTE(review): the counts are assigned positionally, which assumes that the
# metadata rows (sorted by "filename") are in the same order as the sorted
# "Screen Name" values -- confirm the two columns sort identically.
metadata["number of all tweets"] = _tweets_per_account(dataset)
metadata["tweets in english"] = _tweets_per_account(dataset[is_english])
metadata["tweets not in english"] = _tweets_per_account(dataset[dataset["Language"] != "English"])
metadata["tweets only"] = _tweets_per_account(dataset[is_tweet])
metadata["retweets only"] = _tweets_per_account(dataset[dataset["Tweet Type"] == "Retweet"])
metadata["replies only"] = _tweets_per_account(dataset[dataset["Tweet Type"] == "Reply"])
metadata["after 2019"] = _tweets_per_account(dataset[is_recent])
metadata["before 2019"] = metadata["number of all tweets"] - metadata["after 2019"]
metadata["more than 40 characters"] = _tweets_per_account(dataset[is_long])
metadata["less than 40 characters"] = _tweets_per_account(dataset[dataset["number of characters"] < 40])

# Keep only recent, sufficiently long, original, English tweets.
dataset = dataset[is_recent & is_long & is_tweet & is_english]

X = dataset["clean tweet"]
y = dataset["Name"]

metadata["useful tweets"] = _tweets_per_account(dataset)

# Tweet selection
 
Most candidates have ~3200 tweets in the database, but not all of them are used for separating the candidates into clusters.

- keep only "Tweets in English" (for this purpose it's another way of saying that I only use worded tweets, not pictures or links)
- drop retweets
- drop replies
- drop tweets before 2019, irrelevant to the current campaign
- drop tweets shorter than 40 characters.

When all is said and done, I am left with most candidates having a reasonable amount of tweets, but a few without enough data to work on. Going forward I do not account for the imbalance in the dataset, so please keep this in mind when interpreting the results.

In [None]:
# Metadata columns shown in the summary table, in display order.
columns = [
    "name",
    "number of all tweets",
    "tweets in english",
    "tweets not in english",
    "tweets only",
    "retweets only",
    "replies only",
    "after 2019",
    "before 2019",
    "more than 40 characters",
    "less than 40 characters",
    "useful tweets",
]

# Header labels are wrapped in <b> tags.
header_values = ['<b>%s</b>' % label for label in columns]

# Candidates with the most useful tweets come first; the frame is transposed
# before being handed to the table.
ranked = metadata.sort_values("useful tweets", ascending=False)
content = ranked[columns].T

colors = {}
print_table(header_values, content, colors)

# Predict the tweet author using GloVe and GRU

Accuracy is ~50%

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# 10-fold stratified cross-validation: every tweet gets exactly one
# out-of-fold prediction.
kfold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
scores = []
test_list = []     # one-hot true labels of each held-out fold
predict_list = []  # predicted class probabilities of each held-out fold

# The one-hot targets do not depend on the fold, so encode them once.
yoh = pd.get_dummies(y)

for train_index, test_index in kfold.split(X, y):

    train_X, train_y, test_X, test_y, word_index = load_data(X, yoh, train_index, test_index)

    # NOTE(review): load_glove re-reads the whole GloVe file on every fold.
    embedding = load_glove(word_index)

    # Sequences are padded to length 50 and the vocabulary is capped at 18000
    # words in load_data; the embedding is initialised from the GloVe matrix.
    inp = Input(shape=(50,))
    x = Embedding(18000, 300, weights=[embedding])(inp)

    x = Bidirectional(CuDNNGRU(256, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)
    # One softmax output per one-hot target column.
    x = Dense(train_y.shape[1], activation='softmax')(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(loss='categorical_crossentropy', optimizer=Adamax(lr=0.002), metrics=['accuracy'])

    model.fit(train_X, train_y, batch_size=1024, epochs=15, validation_data=(test_X, test_y), verbose=0)
    pred_y = model.predict(test_X)

    test_list.append(test_y)
    predict_list.append(pd.DataFrame(pred_y))

# Concatenate once after the loop instead of growing the frames fold by fold.
test_df = pd.concat(test_list, axis=0, ignore_index=True)
pred_df = pd.concat(predict_list, axis=0, ignore_index=True)

# Give the probability columns the same labels as the one-hot columns.
pred_df.columns = test_df.columns.values
test_label = test_df.idxmax(axis=1)
pred_label = pred_df.idxmax(axis=1)

print("Accuracy:", accuracy_score(test_label, pred_label))

# Confusion matrix

In [None]:
# `labels` must be passed by keyword: current scikit-learn releases no longer
# accept it positionally. It fixes the row/column order to the label columns.
cm = confusion_matrix(test_label, pred_label, labels=test_df.columns.values)

# Row-normalise (each row is divided by its own total), then round to two
# decimals via integer truncation of the percentage + 0.5.
cm = ((cm.astype('float')*100 / cm.sum(axis=1)[:, np.newaxis] + 0.5).astype('int'))/100

f, ax = plt.subplots(figsize=(14, 12))
sns.set(font_scale=1.4)  # for label size
sns.heatmap(cm, annot=True, annot_kws={"size": 8},
            xticklabels=test_df.columns.values, yticklabels=test_df.columns.values,
            ax=ax)

# Final Result

In [None]:
# Hand-labelled groups used only to colour the final map.
progressive_candidates = ["Bernie Sanders", "Elizabeth Warren", "Kamala Harris", 'Cory Booker', 'Kirsten Gillibrand' ]
republican = ['Donald J. Trump', 'Mike Pence', 'Gov. Bill Weld']
all_others = ['Amy Klobuchar', 'Andrew Yang', "Beto O'Rourke",'Bill de Blasio', 'Eric Swalwell', 'Jay Inslee', 'John Delaney', 
              'John Hickenlooper', 'Julián Castro', 'Marianne Williamson', 'Michael Bennet', 'Sen. Mike Gravel', 'Seth Moulton',
              'Steve Bullock', 'Tim Ryan', 'Tulsi Gabbard', 'Wayne Messam', "Pete Buttigieg", "Joe Biden"]


# Average predicted probability vector of each true author.
pred_df["true_label"] = test_label
average_pred = pred_df.groupby(['true_label']).mean()

# Project the per-author averages onto their first two principal components.
pca = PCA(n_components=2)
pca.fit(average_pred)
# Label the rows with the index of the frame that was actually transformed, so
# names and coordinates cannot drift apart.
boiled_down = pd.DataFrame(data=pca.transform(average_pred), index=average_pred.index, columns=["a","b"])


def _scatter_trace(names, color, label):
    """Return the plotly scatter trace for the candidates listed in ``names``."""
    group = boiled_down.loc[names]
    return {
        'x': group.a,
        'y': group.b,
        'text': group.index,
        'marker': {'color': color, 'size': 6},
        'mode': 'markers',
        'name': label,
    }


fig = {
    'data': [
        _scatter_trace(progressive_candidates, 'rgb(251,169,46)', 'Progressive Senators'),
        _scatter_trace(all_others, 'rgb(0,138,147)', 'All Others'),
        _scatter_trace(republican, 'rgb(143,26,29)', 'Republican'),
    ],
}

plotly.offline.iplot(fig)
