# Modeling

There are three steps to creating this model:

1. **Vectorization**
2. **Train/Validation/Test Split**
3. **Modeling**: We apply a baseline CNN.

In [67]:
import numpy as np
import re
import keras
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras import layers
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Conv1D, LSTM, GlobalMaxPooling1D, InputLayer, Dropout, SpatialDropout1D
from keras.layers.normalization import BatchNormalization
from keras import optimizers
from keras import models
from keras import losses
from keras import metrics
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Import custom functions
from explore import *
from evaluation import *

In [3]:
# Load the cleaned tweets and discard every row that has a missing value
df = pd.read_csv('processed_data/clean_data.csv').dropna()

In [18]:
# Hold out 30% of the tweets as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df['Tweets with no Stopwords'],
    df['Years'],
    test_size=0.3,
    random_state=2,
)

In [20]:
# Build the vocabulary from the training tweets only, so no test-set words leak in.
# - `oov_token` must be a string placeholder; the original passed the boolean True,
#   which ended up as a non-string key in `token.word_index`.
# - `filters` is Keras' default punctuation set; the original had `{"}` instead of
#   `{|}` (so '|' was never stripped) and an invalid `\]` escape sequence.
token = Tokenizer(num_words = 5000, filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower = True,
                  split = ' ', oov_token = '<OOV>')
token.fit_on_texts(X_train)

# Convert each tweet into a list of integer word indices
X_train_seq = token.texts_to_sequences(X_train)
X_test_seq = token.texts_to_sequences(X_test)

In [28]:
# One-hot encode the labels; the binarizer is fitted on the training labels only
encoder = LabelBinarizer()
transformed = encoder.fit_transform(y_train)
transformed_test = encoder.transform(y_test)

# Wrap the encoded arrays in DataFrames for the later train/validation split
y_train_encoded = pd.DataFrame(transformed)
y_test_encoded = pd.DataFrame(transformed_test)

In [21]:
# Vocabulary cap used as the input dimension of the CNN embedding layer
number_of_words = 5000
# Number of distinct entries in the fitted tokenizer's word index
vocabulary_size = len(token.word_index)
# Length every sequence is padded to (helper comes from one of the star imports)
max_length = max_seq_length(X_train_seq)

83

In [30]:
# Pad at the end of each sequence so every sample has length `max_length`
X_train_seq_pad, X_test_seq_pad = (
    pad_sequences(seqs, maxlen=max_length, padding='post')
    for seqs in (X_train_seq, X_test_seq)
)

In [31]:
# Carve a 30% validation set out of the padded training data (seeded for reproducibility)
X_train_emb, X_val_emb, y_train_emb, y_val_emb = train_test_split(
    X_train_seq_pad,
    y_train_encoded,
    test_size=0.3,
    random_state=3,
)

# CNN for Text Analysis

Neural networks analyze text differently from the sparse TF-IDF framework: words are represented as dense, learned vectors rather than sparse counts. Since this is a large dataset, a CNN may be able to pick up intricate patterns. To feed text to a CNN, the padded integer sequences are first passed through Keras' `Embedding()` layer as the first layer of the model.

In [34]:
def define_CNN_model():
    """Build and compile the baseline 1D-CNN text classifier.

    Reads the notebook-level `number_of_words` and `max_length` for the
    embedding layer, and the custom metrics `f1`, `precision_measure` and
    `recall_measure` brought in by the star imports.

    Returns:
        The compiled Keras `Sequential` model (embedding -> Conv1D ->
        global max pooling -> 3-way softmax).
    """
    emb_model = models.Sequential()
    emb_model.add(layers.Embedding(number_of_words, 200, input_length = max_length))
    # No `input_shape` here: this is not the first layer, so its input shape is
    # inferred from the embedding output and the old (200, 1) hint was misleading.
    emb_model.add(layers.Conv1D(50, 3, activation = 'relu'))
    emb_model.add(layers.GlobalMaxPooling1D())
    emb_model.add(layers.Dense(3, activation = 'softmax'))
    emb_model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', 
                      metrics = ['accuracy', f1, precision_measure, recall_measure])
    # Return the model built above; the original returned the undefined name
    # `cnn_model`, which raises NameError on a fresh kernel.
    return emb_model

In [35]:
# Build the CNN and print its architecture
cnn_model = define_CNN_model()
cnn_model.summary()

# Train for 10 epochs, evaluating on the validation split after each epoch
cnn_model_history = cnn_model.fit(
    X_train_emb,
    y_train_emb,
    epochs=10,
    batch_size=64,
    validation_data=(X_val_emb, y_val_emb),
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
# Plot the CNN training curves; the fit history is stored in `cnn_model_history`
# (the original referenced the undefined name `emb_model_history`).
plot_history(cnn_model_history)



[0.8499530553817749, 0.8045334219932556]

In [45]:
# Score the CNN on the held-out test set
y_pred = cnn_model.predict(X_test_seq_pad, batch_size=64, verbose=1)

# Map the predicted class scores back to the original labels (undo one-hot encoding)
y_pred_bool = encoder.inverse_transform(y_pred)

print(classification_report(y_test, y_pred_bool))

              precision    recall  f1-score   support

        2017       0.86      0.90      0.88     85882
        2018       0.84      0.85      0.84    191654
        2019       0.39      0.34      0.36     34874

    accuracy                           0.80    312410
   macro avg       0.70      0.69      0.70    312410
weighted avg       0.80      0.80      0.80    312410



# FastText

In [55]:
def make_fast_text():
    """Assemble and compile the FastText-style classifier.

    The network embeds each token into 3 dimensions, applies spatial dropout,
    max-pools over the sequence axis, applies dropout again and ends in a
    3-way softmax. It reads the notebook-level `max_length` and
    `vocabulary_size`, plus the custom metric functions.

    Returns:
        The compiled Keras `Sequential` model.
    """
    layer_stack = [
        InputLayer((max_length,)),
        Embedding(input_dim=vocabulary_size + 1, output_dim=3, trainable=True),
        SpatialDropout1D(0.5),
        GlobalMaxPooling1D(),
        Dropout(0.5),
        Dense(3, activation='softmax'),
    ]
    fast_text = Sequential(layer_stack)
    fast_text.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy', f1, precision_measure, recall_measure],
    )
    return fast_text

In [60]:
# Build the FastText-style model and print its architecture
fast_text = make_fast_text()
fast_text.summary()

# Train for 10 epochs, evaluating on the validation split after each epoch
fast_text_history = fast_text.fit(
    X_train_emb,
    y_train_emb,
    epochs=10,
    batch_size=64,
    validation_data=(X_val_emb, y_val_emb),
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b9a4a65d08>

In [61]:
# Score the FastText-style model on the held-out test set
y_pred = fast_text.predict(X_test_seq_pad, batch_size=64, verbose=1)

# Map the predicted class scores back to the original labels (undo one-hot encoding)
y_pred_bool = encoder.inverse_transform(y_pred)

print(classification_report(y_test, y_pred_bool))

              precision    recall  f1-score   support

        2017       0.85      0.89      0.87     85882
        2018       0.81      0.94      0.87    191654
        2019       0.00      0.00      0.00     34874

    accuracy                           0.82    312410
   macro avg       0.55      0.61      0.58    312410
weighted avg       0.73      0.82      0.77    312410



# LSTM 

In [73]:
def define_LSTM_model():
    """Build and compile the LSTM text classifier.

    The network embeds each token into 3 dimensions, runs a 125-unit LSTM
    over the sequence, applies dropout and ends in a 3-way softmax. It reads
    the notebook-level `max_length` and `vocabulary_size`, plus the custom
    metric functions.

    Returns:
        The compiled Keras `Sequential` model.
    """
    lstm_model = Sequential()
    lstm_model.add(InputLayer((max_length,)))
    lstm_model.add(Embedding(input_dim = vocabulary_size + 1, output_dim = 3, trainable = True))
    lstm_model.add(LSTM(125))
    lstm_model.add(Dropout(0.5))
    lstm_model.add(Dense(3, activation = 'softmax'))
    lstm_model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', 
                       metrics = ['accuracy', f1, precision_measure, recall_measure])
    # Return the model built above; the original returned the undefined name
    # `model_lstm`, which raises NameError on a fresh kernel.
    return lstm_model

In [74]:
# Build the LSTM and print its architecture
lstm_model = define_LSTM_model()
# The original `lstm_model.summar` was a truncated attribute access that raises AttributeError
lstm_model.summary()
lstm_model.fit(X_train_emb, y_train_emb, validation_data = (X_val_emb, y_val_emb), epochs = 10, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b997087708>

In [75]:
# Get classification report
# Use `lstm_model`, the name the trained model is bound to; `model_lstm` is never defined
y_pred = lstm_model.predict(X_test_seq_pad, batch_size = 64, verbose = 1)
y_pred_bool = encoder.inverse_transform(y_pred) # Undo one-hot encoding
print(classification_report(y_test, y_pred_bool))

              precision    recall  f1-score   support

        2017       0.00      0.00      0.00     85882
        2018       0.61      1.00      0.76    191654
        2019       1.00      0.00      0.00     34874

    accuracy                           0.61    312410
   macro avg       0.54      0.33      0.25    312410
weighted avg       0.49      0.61      0.47    312410



In [76]:
# Print the LSTM architecture; the trained model is bound to `lstm_model`, not `model_lstm`
lstm_model.summary()

Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, 83, 3)             1123671   
_________________________________________________________________
lstm_3 (LSTM)                (None, 125)               64500     
_________________________________________________________________
dropout_20 (Dropout)         (None, 125)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 3)                 378       
Total params: 1,188,549
Trainable params: 1,188,549
Non-trainable params: 0
_________________________________________________________________
