<a href="https://colab.research.google.com/github/sljm12/Programming-exploration/blob/master/2020USElection/Dense_USE_Embedding_Trump_Biden_Kamala_Classifier_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Intro ##
This notebook uses LSTM and Dense layers to do speaker identification based on the US election speeches

In [1]:
!pip3 install tensorflow_text>=2.0.0rc0

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("ggplot")

import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Input

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [3]:
!wget https://www.dropbox.com/s/fna7obll05a8dmi/2020USElection.zip
!unzip 2020USElection.zip

--2020-10-23 12:02:49--  https://www.dropbox.com/s/fna7obll05a8dmi/2020USElection.zip
Resolving www.dropbox.com (www.dropbox.com)... 162.125.1.1, 2620:100:6016:1::a27d:101
Connecting to www.dropbox.com (www.dropbox.com)|162.125.1.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/fna7obll05a8dmi/2020USElection.zip [following]
--2020-10-23 12:02:49--  https://www.dropbox.com/s/raw/fna7obll05a8dmi/2020USElection.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucf472704af420100da831cafc2c.dl.dropboxusercontent.com/cd/0/inline/BBwMqLvobXhBdfcYvccS38OEwjfGR0UCY52yHIz1X7xYdqChJ12RrjPup01k4wyo8ZIGJ4SKxt2lVWEQe23JJN8ATFi0LXo9ftpThMrMO_mMmYfOxvfX39fW7ZRyLYxmpCg/file# [following]
--2020-10-23 12:02:50--  https://ucf472704af420100da831cafc2c.dl.dropboxusercontent.com/cd/0/inline/BBwMqLvobXhBdfcYvccS38OEwjfGR0UCY52yHIz1X7xYdqChJ12RrjPup01k4wyo8ZIGJ4SKxt2lVWEQe23JJN8ATFi0LXo9

# Prepping the data ##

In [4]:
df = pd.read_csv("/content/2020USElection-BreakSentence.csv")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,file,text
0,0,Bernie Sanders,2020 Democratic National Convention (DNC) Nigh...,"We must come together to defeat Donald Trump, ..."
1,1,Joe Biden,2020 Democratic National Convention (DNC) Nigh...,I’ll see you on Thursday.
2,2,Kamala Harris,2020 Democratic National Convention (DNC) Nigh...,"In this election, we have a chance to change t..."
3,3,Kamala Harris,2020 Democratic National Convention (DNC) Nigh...,We’re all in this fight together.
4,4,Kamala Harris,2020 Democratic National Convention (DNC) Nigh...,What an awesome responsibility.


In [6]:
df["text"].values

array(['We must come together to defeat Donald Trump, and elect Joe Biden and Kamala Harris as our next President and Vice President.',
       'I’ll see you on Thursday.',
       'In this election, we have a chance to change the course of history.',
       ..., 'We’re going to be in Detroit Monday night.',
       'Come join us Monday night.', 'I’ll see you later.'], dtype=object)

## One Hot Encoding of Labels

In [7]:
names = df["name"].unique()

In [8]:
num_categories = len(names)

In [9]:
i, m = pd.factorize(df["name"])

In [10]:
df["cat_num"]=i

In [11]:
labels = tf.one_hot(i, depth=len(m))

In [12]:
(train_df, others)=train_test_split(df,shuffle=True,test_size=0.4, stratify=df['name'])
(val_df, test_df)=train_test_split(others,shuffle=True,test_size=0.5, stratify=others['name'])

In [13]:
print(len(train_df))
print(len(val_df))
print(len(test_df))

53659
17887
17887


In [14]:
X_train = train_df["text"].astype(str).values
y_train = tf.one_hot(train_df["cat_num"], num_categories)
X_val = val_df["text"].astype(str).values
y_val = tf.one_hot(val_df["cat_num"], num_categories)
X_test = test_df["text"].astype(str).values
y_test= tf.one_hot(test_df["cat_num"], num_categories)

# TF Tokenizer

## Tokenizer the text ##

In [15]:
max_len =100
max_features = 20000
batch_size=64
dims=50

## Token ##

In [16]:
t=Tokenizer()
t.fit_on_texts(df["text"].astype(str).values)

In [17]:
#Pad Data
text_encode = t.texts_to_sequences(X_train) #Encode the text
x_train_pad=pad_sequences(maxlen=max_len, sequences=text_encode, padding="post", value=0)

val_encode = t.texts_to_sequences(X_val) #Encode the text
x_val_pad=pad_sequences(maxlen=max_len, sequences=val_encode, padding="post", value=0)

In [18]:
test_encode = t.texts_to_sequences(X_test)
x_test_pad=pad_sequences(maxlen=max_len,sequences=test_encode,padding='post',value=0)

In [19]:
import json
print(X_train[0])
print(text_encode[0])
t_config = t.get_config()

print(t_config.keys())

word_index=json.loads(t_config["word_index"])
index_word=json.loads(t_config["index_word"])
n_words=len(word_index.keys())
print(n_words)

President Donald Trump is the most pro-life president in American history, it’s true.
[45, 184, 103, 14, 1, 164, 989, 206, 45, 8, 76, 177, 26, 317]
dict_keys(['num_words', 'filters', 'lower', 'split', 'char_level', 'oov_token', 'document_count', 'word_counts', 'word_docs', 'index_docs', 'index_word', 'word_index'])
16445


In [20]:
len(x_train_pad[0])

100

## Complicated Way ##

In [None]:
words = set()

for x in X_train:
  s=text_to_word_sequence(x)
  [words.add(i) for i in s]

words.add("<ENDTok>")

In [None]:
word2idx = {w: i for i, w in enumerate(words)}
idx2word = {i: w for w, i in word2idx.items()}

In [None]:
encode = [[word2idx[tup] for tup in text_to_word_sequence(s)] for s in X_train]

In [None]:
print(len(encoded[0]),len(text_to_word_sequence(X_train[0])))
print(word2idx["<ENDTok>"])
print(encoded[0])

47 47
130783
[196714, 185833, 64440, 111005, 186474, 173023, 61657, 47846, 76658, 98757, 166467, 17638, 121582, 21216, 85786, 155699, 185877, 209583, 14548, 167721, 18175, 190516, 122019, 21667, 16312, 182509, 144284, 19906, 112310, 105622, 21111, 208166, 106211, 64440, 9994, 196605, 64440, 63117, 172630, 79628, 163334, 85048, 7461, 829, 38896, 63136, 168653]


In [None]:
x_train_pad=pad_sequences(maxlen=max_len, sequences=encode, padding="post", value=word2idx["<ENDTok>"])
print(x_train_pad[0])

[196714 185833  64440 111005 186474 173023  61657  47846  76658  98757
 166467  17638 121582  21216  85786 155699 185877 209583  14548 167721
  18175 190516 122019  21667  16312 182509 144284  19906 112310 105622
  21111 208166 106211  64440   9994 196605  64440  63117 172630  79628
 163334  85048   7461    829  38896  63136 168653 130783 130783 130783
 130783 130783 130783 130783 130783 130783 130783 130783 130783 130783
 130783 130783 130783 130783 130783 130783 130783 130783 130783 130783
 130783 130783 130783 130783 130783 130783 130783 130783 130783 130783
 130783 130783 130783 130783 130783 130783 130783 130783 130783 130783
 130783 130783 130783 130783 130783 130783 130783 130783 130783 130783]


In [None]:
n_words=len(words)
print(n_words)

210338


# USE Encoding

In [32]:
max_len = 512

In [33]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5", "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"]

In [34]:
use_model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return use_model(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [35]:
x_train_pad = embed(X_train)
x_val_pad = embed(X_val)

In [46]:
x_test_pad = embed(X_test)

In [36]:
x_train_pad.shape

TensorShape([53659, 512])

# Model

In [37]:
Inp = Input(shape=(max_len,))

#notice the n_words+1 in input_dim, result of using tf tokenizer

# x = Embedding(input_dim=n_words+1, output_dim=dims, input_length=max_len)(Inp)
# x = Dropout(rate=0.1)(x)
x = Dense(256)(Inp)
x = Dropout(0.2)(x)
x = Dense(128)(x)
x = Dropout(0.2)(x)
x = Dense(64)(x)
x = Dropout(0.2)(x)
#x = Dropout(rate=0.1)(x)

# Note that the output layers are given names.
toxic_prediction = Dense(num_categories, activation="softmax", name='classify')(x)

In [38]:
np.asarray(y_train)

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.]], dtype=float32)

In [39]:
model = Model(Inp, [toxic_prediction])

In [43]:
model.compile(optimizer="adam", 
              loss='categorical_crossentropy',
              metrics=["accuracy"])

In [45]:
history = model.fit( np.array(x_train_pad), 
                    np.array(y_train),
                    batch_size=2048, 
                    epochs=300, 
                    validation_data= (np.array(x_val_pad),
                        np.array(y_val)),
                    verbose=1)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

In [42]:
history = model.fit( np.array(x_train_pad), 
                    np.array(y_train),
                    batch_size=2048, 
                    epochs=200, 
                    validation_data= (np.array(x_val_pad),
                        np.array(y_val)),
                    verbose=1)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Can't see to get past accuracy 70% just using simple dense layers

In [None]:
y_val

<tf.Tensor: shape=(2173, 5), dtype=float32, numpy=
array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]], dtype=float32)>

In [47]:
score=model.evaluate(np.array(x_test_pad),np.array(y_test), batch_size=1024, verbose=1)



In [48]:
print("model accuracy:",score[1])

model accuracy: 0.6957007646560669


In [None]:
x = np.expand_dims(x_test_pad[0], axis=0)
x.shape
model.predict(x)

array([[0.03021459, 0.9041168 , 0.01275015, 0.02538419, 0.02753435]],
      dtype=float32)

In [None]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,name,time,text,file,cat_num
296,1611,Bernie Sanders,15:03,Donald Trump and the Republican leadership are...,Bernie Sanders Los Angeles Rally Transcript Be...,0
5780,10213,Donald Trump,56:15,But we got more money than they asked for and ...,Donald Trump ‘Black Economic Empowerment’ Spee...,3
18,183,Joe Biden,01:07:49,The future really rests on investment. We’re g...,2020 Democratic National Convention (DNC) Nigh...,1
4229,8328,Donald Trump,01:13:28,We’ll strongly protect Medicare and Social Sec...,"Donald Trump Newport News, Virginia Campaign R...",3
1992,5235,Donald Trump,01:13:48,"But if you’re not requesting them, when you ge...",Donald Trump Campaign Speech Transcript Vandal...,3


In [None]:
y_test[0]

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([1., 0., 0., 0., 0.], dtype=float32)>

In [None]:
model.save_weights("/content/drive/My Drive/Machine Learning/news_lstm_1.hdf5")

In [None]:
def predict(text):
  text = [text]
  seq = t.texts_to_sequences(text)
  text_arr = pad_sequences(maxlen=max_len, sequences=seq, padding="post", value=0)
  print(text_arr)
  text_arr.shape
  return model.predict(text_arr)

In [None]:
results = predict("Rubio Has A Rocky Road Ahead	!")
print(results)

[[ 999   58    4 6964  664  821    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]]
[[1.8750572e-07 6.7876118e-07 2.6042713e-04 3.9953052e-04 3.4411922e-05
  6.7518355e-04 5.7862457e-05 2.0026339e-06 1.2046334e-07 3.4730008e-05
  2.9307179e-04 1.7206936e-07 1.9639631e-06 4.7070738e-09 1.3729482e-07
  3.8334758e-05 1.7999735e-04 1.6748100e-08 3.2059750e-05 7.9951606e-05
  4.2417888e-03 1.1336841e-06 2.2617862e-06 1.2134593e-06 9.9218643e-01
  3.4863144e-05 9.9452533e-05 1.6575746e-06 1.1835338e-05 1.2320362e-06
  1.0100941e-06 2.8239212e-08 1.6465271e-04 1.2510

In [None]:
arg=np.argmax(results)
print(arg)
print(results[0][arg])
print(arg_max_label[arg])

24
0.9921864
POLITICS


In [None]:
24]

'WORLD NEWS'