## **Notebook Contents**
- Import Libraries
- Import Dataframes
- Word Cleaning
- Preprocess Data
- Modeling
- Scores
- Citations

<a name="importlibrarieml"></a>
## **Import Libraries**

In [1]:
# Standard Imports
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score

# NLP Imports
import re
import nltk
# Fetch the NLTK stopwords corpus if it is not already present locally.
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk import word_tokenize
# Set of English stopwords.
# NOTE(review): STOPWORDS is not referenced in the cells visible here — confirm it is still needed.
STOPWORDS = set(stopwords.words('english'))

# Keras Imports
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import tensorflow as tf
from tensorflow import keras
# Credit to https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17 for main inspiration and code

[nltk_data] Downloading package stopwords to /home/dlee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<a name="importdataframeml"></a>
## **Import Dataframes**


In [4]:
# Load the modeling dataset (path is relative to the notebook's working directory).
# Later cells read its 'tweet' and 'label' columns.
df = pd.read_csv('../data/ready_for_modeling.csv')

In [5]:
# Keep only the rows whose label is 0, for manual inspection below.
zeroes = df.loc[df['label'].eq(0)]

In [6]:
# Show the label-0 tweet stored under index label 1323.
zeroes.loc[1323, 'tweet']

'guys the power just went out there goes the of fortnite that was downloading my playstation '

In [7]:
# Show the label-0 tweet stored under index label 34.
zeroes.loc[34, 'tweet']

'im about the same age as you texasdom and can confirm my mom never went out without being dressed coifed and madeup a very feminine very powerful woman she knew how to wield the power of her femininity when women abandoned femininity they abandoned their power '

In [8]:
# Show the label-0 tweet stored under index label 32.
zeroes.loc[32, "tweet"]

'soooo the power just went out for a split second which is like whatever but its the last thing you want when you work at a production house lmaooooo hoping my editor saves early and often as good ole jim hurguy used to preach'

In [9]:
# Preview the first rows of the dataset.
df.head()

Unnamed: 0,date,tweet,user,label
0,2020-01-10 23:44:13+00:00,another earthquake at our hotel i wasnt there ...,TheCheesyCheska,0
1,2020-08-18 19:48:57+00:00,how can you build stronger business continuity...,Workday,1
2,2020-01-10 23:23:25+00:00,power went out and it messed up my streaming p...,FreeZ3KiLLz,1
3,2020-01-10 22:57:12+00:00,the power went out in my house and i keep hear...,vantaepedia,1
4,2020-01-10 22:56:12+00:00,well my houses power went out and im on my pho...,BlueRepublik,1


In [10]:
# (rows, columns) of the loaded dataset.
df.shape

(279325, 4)

In [11]:
# Peek at the tweet texts as a NumPy array; this is the form passed to the tokenizer below.
df['tweet'].values

array(['another earthquake at our hotel i wasnt there but it was a and i felt it at my familys house across the city also the power went out in reparto metropolitano in san juan and they have closed the ',
       'how can you build stronger business continuity through fresh insights check out our guide',
       'power went out and it messed up my streaming pc resetting it now streams might be delayed for tonight sorry friends',
       ...,
       'have you considered big island in hawaii yes still technically part of the united states but on the south shore away from most of the tourists its mostly a lot of very nice artsy folk and aging hippies and the weather is lovely ',
       'the weather is nice sarap mag kape ',
       'im getting myself some food and the weather is so nice rn '],
      dtype=object)

<a name="preprocessml"></a>
## **Preprocessing Data**

In [12]:
# Vocabulary cap passed to the Tokenizer as `num_words`: only the most frequent
# words are kept when texts are converted to integer sequences.
max_words = 15_000

# Fixed length, in tokens, that every tweet is padded/truncated to in the next cell.
# With pad_sequences' defaults, longer sequences keep their LAST 280 tokens
# (truncating='pre') and shorter ones are left-padded with zeros (padding='pre').
# NOTE(review): 280 is a generous bound for tweets; a smaller value would shrink
# the input tensor, but changing it changes the model, so it is left as is.
max_sequence_length = 280

# Length of each word vector; this is the second argument of the Embedding layer.
embedding_dimensions = 100

# Keras Tokenizer: turns each tweet in the corpus into a sequence of integer word ids.
# The filter list is written as a raw string so that `\]` is a literal backslash
# followed by a bracket instead of an invalid escape sequence (which triggers a
# DeprecationWarning/SyntaxWarning in a normal string). The resulting characters
# are exactly the same as before.
# NOTE(review): unlike the Keras default filter list, tab and newline are not
# filtered here — confirm that is intentional.
tokenizer = Tokenizer(num_words=max_words, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)

# Build the vocabulary from every tweet in the corpus.
# NOTE(review): this runs on the full dataset before the train/test split, so the
# vocabulary also reflects test tweets — consider fitting on the training split only.
tokenizer.fit_on_texts(df['tweet'].values)

# word_index is the complete word -> integer id dictionary. It is not capped by
# num_words; the cap is applied only when texts are converted to sequences.
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

Found 144827 unique tokens.


In [13]:
# Encode every tweet in df['tweet'] as a list of integer word ids, then pad/truncate
# each list to max_sequence_length so all rows stack into one 2-D array.
X = pad_sequences(
    tokenizer.texts_to_sequences(df['tweet'].values),
    maxlen=max_sequence_length,
)
print(f'Shape of data tensor: {X.shape}')

Shape of data tensor: (279325, 280)


In [14]:
# One-hot encode the labels: one column per class (column 0 -> label 0, column 1 -> label 1),
# which is the target layout expected by the 2-unit softmax + categorical_crossentropy model.
# dtype is pinned because pandas >= 2.0 returns boolean dummies by default; uint8 keeps
# the array identical to the one recorded in this notebook.
y = pd.get_dummies(df['label'], dtype=np.uint8).values
print('Shape of label tensor:', y.shape)

Shape of label tensor: (279325, 2)


In [15]:
# Inspect the one-hot encoded label array.
y

array([[1, 0],
       [0, 1],
       [0, 1],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]], dtype=uint8)

In [16]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Report the feature/label shapes of both partitions.
print(
    'Train',
    f'{X_train.shape} {y_train.shape}',  # training data
    '=' * 40,
    'Test',
    f'{X_test.shape} {y_test.shape}',  # testing data
    sep='\n',
)

Train
(195527, 280) (195527, 2)
Test
(83798, 280) (83798, 2)


In [17]:
# Number of rows in the padded feature matrix (one per tweet).
len(X)

279325

<a name="modelingml"></a>
## **Modeling**

In [18]:
# Seed NumPy and TensorFlow so weight initialisation, dropout masks and batch
# shuffling are repeatable across a fresh "Restart & Run All" (up to backend
# nondeterminism). 42 matches the random_state used for the train/test split.
np.random.seed(42)
tf.random.set_seed(42)

model = Sequential()  # Instantiate the Sequential Model

# Embedding layer first: maps each token id (vocabulary capped at max_words) to an
# embedding_dimensions-long vector; input_length is the padded sequence length.
model.add(Embedding(max_words, embedding_dimensions, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.10))  # Dropout on the embedded sequence to limit overfitting
model.add(LSTM(100, dropout=0.10, recurrent_dropout=0.10))  # 100 units; input and recurrent dropout to limit overfitting
model.add(Dense(2, activation='softmax'))  # One probability per class, matching the two-column y
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64

# Stop once val_loss has failed to improve by at least 0.001 for 3 consecutive epochs,
# and roll back to the best epoch's weights rather than keeping the weights from the
# final, worse epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.001, restore_best_weights=True)

# validation_split=0.1 holds out 10% of the training rows for the val_loss monitor.
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 280, 100)          1500000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 280, 100)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 2)                 202       
Total params: 1,580,602
Trainable params: 1,580,602
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Score the trained model on the held-out test split; accr holds [loss, accuracy].
accr = model.evaluate(X_test, y_test)
print(f'Test set\n  Loss: {accr[0]:0.3f}\n  Accuracy: {accr[1]:0.3f}')

Test set
  Loss: 0.086
  Accuracy: 0.983


In [21]:
# Example of how the model would classify a Tweet
new_tweet = ["I feel like David is going to try to act surprised when he’s saved. Problem is no one will buy it. Everyone suspects he or kevin have a power."]
# The above text shows a misclassification
# NOTE(review): the training tweets appear lower-cased with punctuation already removed,
# while this raw text only goes through the tokenizer's own filters — confirm whether it
# should receive the same cleaning as the training data before being tokenized.
seq = tokenizer.texts_to_sequences(new_tweet)
padded = pad_sequences(seq, maxlen=max_sequence_length)
pred = model.predict(padded)
labels = [0, 1]
# Take the argmax along the class axis so there is one class index per tweet;
# a flattened argmax would return a wrong index as soon as more than one tweet is passed.
pred_class = np.argmax(pred, axis=1)
print(pred, labels[pred_class[0]])

[[0.00153951 0.99846053]] 1


In [22]:
# Save the entire model as a SavedModel.
# The target is a directory, relative to the notebook's working directory, not a single file.
model.save('saved_model/lstm_rnn_model_')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: saved_model/lstm_rnn_model_/assets


In [23]:
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

2.2.0
