<a href="https://colab.research.google.com/github/sljm12/Programming-exploration/blob/master/2020USElection/LSTM_Trump_Biden_Kamala_Classifier_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Intro ##
This notebook uses LSTM and Dense layers to do speaker identification based on the US election speeches

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("ggplot")

import pandas as pd
import numpy as np

import tensorflow as tf

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Input

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
!wget https://www.dropbox.com/s/fna7obll05a8dmi/2020USElection.zip
!unzip 2020USElection.zip

--2020-10-23 12:08:44--  https://www.dropbox.com/s/fna7obll05a8dmi/2020USElection.zip
Resolving www.dropbox.com (www.dropbox.com)... 162.125.82.1, 2620:100:6032:1::a27d:5201
Connecting to www.dropbox.com (www.dropbox.com)|162.125.82.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/fna7obll05a8dmi/2020USElection.zip [following]
--2020-10-23 12:08:45--  https://www.dropbox.com/s/raw/fna7obll05a8dmi/2020USElection.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucf472704af420100da831cafc2c.dl.dropboxusercontent.com/cd/0/inline/BBwMqLvobXhBdfcYvccS38OEwjfGR0UCY52yHIz1X7xYdqChJ12RrjPup01k4wyo8ZIGJ4SKxt2lVWEQe23JJN8ATFi0LXo9ftpThMrMO_mMmYfOxvfX39fW7ZRyLYxmpCg/file# [following]
--2020-10-23 12:08:45--  https://ucf472704af420100da831cafc2c.dl.dropboxusercontent.com/cd/0/inline/BBwMqLvobXhBdfcYvccS38OEwjfGR0UCY52yHIz1X7xYdqChJ12RrjPup01k4wyo8ZIGJ4SKxt2lVWEQe23JJN8ATFi0L

## Prepping the data ##

In [3]:
df = pd.read_csv("/content/2020USElection-BreakSentence.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,file,text
0,0,Bernie Sanders,2020 Democratic National Convention (DNC) Nigh...,"We must come together to defeat Donald Trump, ..."
1,1,Joe Biden,2020 Democratic National Convention (DNC) Nigh...,I’ll see you on Thursday.
2,2,Kamala Harris,2020 Democratic National Convention (DNC) Nigh...,"In this election, we have a chance to change t..."
3,3,Kamala Harris,2020 Democratic National Convention (DNC) Nigh...,We’re all in this fight together.
4,4,Kamala Harris,2020 Democratic National Convention (DNC) Nigh...,What an awesome responsibility.


In [5]:
df["text"].values

array(['We must come together to defeat Donald Trump, and elect Joe Biden and Kamala Harris as our next President and Vice President.',
       'I’ll see you on Thursday.',
       'In this election, we have a chance to change the course of history.',
       ..., 'We’re going to be in Detroit Monday night.',
       'Come join us Monday night.', 'I’ll see you later.'], dtype=object)

## One Hot Encoding of Labels

In [6]:
names = df["name"].unique()

In [7]:
num_categories = len(names)

In [8]:
i, m = pd.factorize(df["name"])

In [38]:
m

Index(['Bernie Sanders', 'Joe Biden', 'Kamala Harris', 'Donald Trump',
       'Mike Pence'],
      dtype='object')

In [9]:
df["cat_num"]=i

In [10]:
labels = tf.one_hot(i, depth=len(m))

In [11]:
(train_df, others)=train_test_split(df,shuffle=True,test_size=0.4, stratify=df['name'])
(val_df, test_df)=train_test_split(others,shuffle=True,test_size=0.5, stratify=others['name'])

In [12]:
print(len(train_df))
print(len(val_df))
print(len(test_df))

53659
17887
17887


In [13]:
X_train = train_df["text"].astype(str).values
y_train = tf.one_hot(train_df["cat_num"], num_categories)
X_val = val_df["text"].astype(str).values
y_val = tf.one_hot(val_df["cat_num"], num_categories)
X_test = test_df["text"].astype(str).values
y_test= tf.one_hot(test_df["cat_num"], num_categories)

## Tokenizer the text ##

In [14]:
max_len =100
max_features = 20000
batch_size=64
dims=50

## Token ##

In [15]:
t=Tokenizer()
t.fit_on_texts(df["text"].astype(str).values)

In [16]:
#Pad Data
text_encode = t.texts_to_sequences(X_train) #Encode the text
x_train_pad=pad_sequences(maxlen=max_len, sequences=text_encode, padding="post", value=0)

val_encode = t.texts_to_sequences(X_val) #Encode the text
x_val_pad=pad_sequences(maxlen=max_len, sequences=val_encode, padding="post", value=0)

In [17]:
test_encode = t.texts_to_sequences(X_test)
x_test_pad=pad_sequences(maxlen=max_len,sequences=test_encode,padding='post',value=0)

In [18]:
import json
print(X_train[0])
print(text_encode[0])
t_config = t.get_config()

print(t_config.keys())

word_index=json.loads(t_config["word_index"])
index_word=json.loads(t_config["index_word"])
n_words=len(word_index.keys())
print(n_words)

He is going to lower the eligibility age for Medicare from 65 to 60, meaning millions of more people will be eligible for Medicare.
[20, 14, 18, 3, 842, 1, 5777, 1232, 15, 620, 71, 3139, 3, 926, 1497, 267, 5, 53, 22, 40, 23, 2875, 15, 620]
dict_keys(['num_words', 'filters', 'lower', 'split', 'char_level', 'oov_token', 'document_count', 'word_counts', 'word_docs', 'index_docs', 'index_word', 'word_index'])
16445


In [19]:
len(x_train_pad[0])

100

## Complicated Way ##

In [None]:
words = set()

for x in X_train:
  s=text_to_word_sequence(x)
  [words.add(i) for i in s]

words.add("<ENDTok>")

In [None]:
word2idx = {w: i for i, w in enumerate(words)}
idx2word = {i: w for w, i in word2idx.items()}

In [None]:
encode = [[word2idx[tup] for tup in text_to_word_sequence(s)] for s in X_train]

In [None]:
print(len(encoded[0]),len(text_to_word_sequence(X_train[0])))
print(word2idx["<ENDTok>"])
print(encoded[0])

47 47
130783
[196714, 185833, 64440, 111005, 186474, 173023, 61657, 47846, 76658, 98757, 166467, 17638, 121582, 21216, 85786, 155699, 185877, 209583, 14548, 167721, 18175, 190516, 122019, 21667, 16312, 182509, 144284, 19906, 112310, 105622, 21111, 208166, 106211, 64440, 9994, 196605, 64440, 63117, 172630, 79628, 163334, 85048, 7461, 829, 38896, 63136, 168653]


In [None]:
x_train_pad=pad_sequences(maxlen=max_len, sequences=encode, padding="post", value=word2idx["<ENDTok>"])
print(x_train_pad[0])

[196714 185833  64440 111005 186474 173023  61657  47846  76658  98757
 166467  17638 121582  21216  85786 155699 185877 209583  14548 167721
  18175 190516 122019  21667  16312 182509 144284  19906 112310 105622
  21111 208166 106211  64440   9994 196605  64440  63117 172630  79628
 163334  85048   7461    829  38896  63136 168653 130783 130783 130783
 130783 130783 130783 130783 130783 130783 130783 130783 130783 130783
 130783 130783 130783 130783 130783 130783 130783 130783 130783 130783
 130783 130783 130783 130783 130783 130783 130783 130783 130783 130783
 130783 130783 130783 130783 130783 130783 130783 130783 130783 130783
 130783 130783 130783 130783 130783 130783 130783 130783 130783 130783]


In [None]:
n_words=len(words)
print(n_words)

210338


## Model ##

In [25]:
Inp = Input(shape=(max_len,))

#notice the n_words+1 in input_dim, result of using tf tokenizer

x = Embedding(input_dim=n_words+1, output_dim=dims, input_length=max_len)(Inp)
x = Dropout(rate=0.1)(x)
x = Bidirectional(LSTM(units=100, recurrent_dropout=0.1))(x)
#x = Dropout(rate=0.1)(x)

# Note that the output layers are given names.
toxic_prediction = Dense(num_categories, activation="softmax", name='classify')(x)



In [21]:
np.asarray(y_train)

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.]], dtype=float32)

In [22]:
model = Model(Inp, [toxic_prediction])

In [26]:
model.compile(optimizer="adam", 
              loss='categorical_crossentropy',
              metrics=["accuracy"])

In [34]:
history = model.fit( np.array(x_train_pad), 
                    np.array(y_train),
                    batch_size=2048, 
                    epochs=10, 
                    validation_data= (np.array(x_val_pad),
                        np.array(y_val)),
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
history = model.fit( np.array(x_train_pad), 
                    np.array(y_train),
                    batch_size=2048, 
                    epochs=10, 
                    validation_data= (np.array(x_val_pad),
                        np.array(y_val)),
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [33]:
score=model.evaluate(np.array(x_test_pad),np.array(y_test), batch_size=1024, verbose=1)



In [None]:
print("model accuracy:",score[1])

model accuracy: 0.7386102080345154


In [None]:
x = np.expand_dims(x_test_pad[0], axis=0)
x.shape
model.predict(x)

array([[0.03021459, 0.9041168 , 0.01275015, 0.02538419, 0.02753435]],
      dtype=float32)

In [None]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,name,time,text,file,cat_num
296,1611,Bernie Sanders,15:03,Donald Trump and the Republican leadership are...,Bernie Sanders Los Angeles Rally Transcript Be...,0
5780,10213,Donald Trump,56:15,But we got more money than they asked for and ...,Donald Trump ‘Black Economic Empowerment’ Spee...,3
18,183,Joe Biden,01:07:49,The future really rests on investment. We’re g...,2020 Democratic National Convention (DNC) Nigh...,1
4229,8328,Donald Trump,01:13:28,We’ll strongly protect Medicare and Social Sec...,"Donald Trump Newport News, Virginia Campaign R...",3
1992,5235,Donald Trump,01:13:48,"But if you’re not requesting them, when you ge...",Donald Trump Campaign Speech Transcript Vandal...,3


In [None]:
y_test[0]

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([1., 0., 0., 0., 0.], dtype=float32)>

In [None]:
model.save_weights("/content/drive/My Drive/Machine Learning/news_lstm_1.hdf5")

In [32]:
model.save("/content/drive/My Drive/Machine Learning/2020USElectionModel/LSTM30epochs")

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: /content/drive/My Drive/Machine Learning/2020USElectionModel/LSTM30epochs/assets


In [30]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
def predict(text):
  text = [text]
  seq = t.texts_to_sequences(text)
  text_arr = pad_sequences(maxlen=max_len, sequences=seq, padding="post", value=0)
  print(text_arr)
  text_arr.shape
  return model.predict(text_arr)

In [37]:
results = predict("We are going to make Ameriaca great again!")
print(results)

[[ 10  19  18   3  94  49 158   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]]
[[0.00267218 0.02753944 0.00133989 0.96213305 0.00631544]]


In [39]:
arg=np.argmax(results)
print(arg)
print(results[0][arg])
print(m[arg])

3
0.96213305
Donald Trump


In [None]:
24]

'WORLD NEWS'