# Modeling

There are three steps to creating this model:

1. **Vectorization**
2. **Train/Validation/Test Split**
3. **Modeling**: We apply a baseline CNN.

In [49]:
import numpy as np
import re
import keras
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras import layers
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Conv1D, LSTM, GlobalMaxPooling1D, InputLayer, Dropout, SpatialDropout1D, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras import optimizers
from keras import models
from keras import losses
from keras import metrics
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Import custom functions
from explore import *
from evaluation import *

In [3]:
# Load the processed tweet data written to processed_data/ into a DataFrame
df = pd.read_csv(filepath_or_buffer = 'processed_data/clean_data.csv')

In [6]:
# Count the missing values in each column
df.isnull().sum()

Tweet                       1830
Years                          0
Lemmatized                  3220
Tweets with no Stopwords    6230
Short Tweets                6893
dtype: int64

In [7]:
# Discard every row that has a missing value in any column
df = df.dropna(axis = 0, how = 'any')

In [24]:
# The distinct year labels; these are the classes the models predict
print(df.Years.unique())

[2018 2017 2019]


# CNN for Text Analysis

Neural networks represent text differently from the sparse TF-IDF framework: each word is mapped to a dense, learned vector. Since this is a large dataset, a CNN may be able to pick up intricate patterns. To feed text into a CNN, the tweets are first tokenized into padded integer sequences, which Keras' `Embedding()` layer then turns into word vectors inside the model.

In [18]:
# Hold out 30% of the stopword-free tweets as the test set; the year is the target label.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, 'Tweets with no Stopwords'],
    df.loc[:, 'Years'],
    test_size = 0.3,
    random_state = 2,
)

In [20]:
# Fit the word index on the training tweets only (so no test vocabulary leaks in) and turn
# both splits into sequences of integer token ids, limited by num_words = 5000.
# - filters: the punctuation set to strip. The previous string had a second '"' where '|'
#   belongs, so pipe characters were never removed; the backslash is now written as '\\'
#   because '\]' is not a valid escape sequence.
# - oov_token: a string placeholder for out-of-vocabulary words (previously the boolean True,
#   which ended up as a non-string key in token.word_index).
token = Tokenizer(num_words = 5000, filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower = True,
                  split = ' ', oov_token = '<OOV>')
token.fit_on_texts(X_train)
X_train_seq = token.texts_to_sequences(X_train)
X_test_seq = token.texts_to_sequences(X_test)

In [28]:
# One-hot encode the year labels. The binarizer learns the classes from the training
# labels and the same fitted encoder is reused for the test labels.
encoder = LabelBinarizer()
transformed = encoder.fit_transform(y_train)
transformed_test = encoder.transform(y_test)

# Wrap the encoded arrays in DataFrames for the later train/validation split
y_train_encoded = pd.DataFrame(transformed)
y_test_encoded = pd.DataFrame(transformed_test)

In [21]:
# max_seq_length is not defined in this notebook (presumably it comes from the explore /
# evaluation star-imports). Judging by its name it reports the length of the longest
# training sequence — TODO confirm against the helper's source.
max_seq_length(X_train_seq)

83

In [29]:
# Length every sequence is padded to: the longest tokenized training tweet.
# Computed from the data instead of hard-coding 83, so it stays correct when the corpus
# or the tokenizer settings change.
max_length = max(len(sequence) for sequence in X_train_seq)

In [22]:
# Number of entries in the tokenizer's word index after fitting on X_train.
# This is not capped by num_words (the value shown below is far larger than 5000).
vocabulary_size = len(token.word_index)
vocabulary_size

374556

In [23]:
# Same value as the num_words cap given to the Tokenizer; used as the input dimension
# of the CNN's Embedding layer.
number_of_words = 5000

In [30]:
# Bring every sequence to max_length, padding after the tokens (padding = 'post'),
# so both splits share one fixed input shape.
X_train_seq_pad = pad_sequences(
    sequences = X_train_seq,
    maxlen = max_length,
    padding = 'post',
)
X_test_seq_pad = pad_sequences(
    sequences = X_test_seq,
    maxlen = max_length,
    padding = 'post',
)

In [31]:
# Split the padded training data once more: 70% to fit the models, 30% as a validation set.
X_train_emb, X_val_emb, y_train_emb, y_val_emb = train_test_split(
    X_train_seq_pad,
    y_train_encoded,
    test_size = 0.3,
    random_state = 3,
)

In [34]:
# Baseline CNN: 200-dimensional learned word embeddings -> 1D convolution (50 filters over
# 3-token windows) -> max over the sequence -> softmax over the three year classes.
emb_model = models.Sequential()
emb_model.add(layers.Embedding(number_of_words, 200, input_length = max_length))
# No input_shape on the convolution: only the first layer of a Sequential model defines the
# input, and this layer actually receives the (max_length, 200) embedding output, not (200, 1).
emb_model.add(layers.Conv1D(50, 3, activation = 'relu'))
emb_model.add(layers.GlobalMaxPooling1D())
emb_model.add(layers.Dense(3, activation = 'softmax'))
emb_model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [35]:
# Validate on the explicit hold-out split (X_val_emb / y_val_emb) rather than carving another
# 30% off the training data with validation_split: the hold-out was otherwise unused during
# training and the model saw less data than intended. This matches the LSTM cell.
emb_model_history = emb_model.fit(X_train_emb, y_train_emb, epochs = 10, batch_size = 64,
                                  validation_data = (X_val_emb, y_val_emb))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
# Loss and accuracy of the CNN on the validation split
emb_model.evaluate(x = X_val_emb, y = y_val_emb)



[0.8499530553817749, 0.8045334219932556]

In [40]:
# Loss and accuracy of the CNN on the held-out test set
emb_model.evaluate(x = X_test_seq_pad, y = y_test_encoded)



[0.8504638075828552, 0.8045485019683838]

In [45]:
# Per-class precision / recall / F1 of the CNN on the test set
y_pred = emb_model.predict(
    X_test_seq_pad,
    batch_size = 64,
    verbose = 1,
)
# Map the softmax outputs back to year labels (undoes the one-hot encoding)
y_pred_bool = encoder.inverse_transform(y_pred)
print(classification_report(y_true = y_test, y_pred = y_pred_bool))

              precision    recall  f1-score   support

        2017       0.86      0.90      0.88     85882
        2018       0.84      0.85      0.84    191654
        2019       0.39      0.34      0.36     34874

    accuracy                           0.80    312410
   macro avg       0.70      0.69      0.70    312410
weighted avg       0.80      0.80      0.80    312410



In [46]:
# Print the CNN's layers with their output shapes and parameter counts
emb_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 83, 200)           1000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 81, 50)            30050     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 153       
Total params: 1,030,203
Trainable params: 1,030,203
Non-trainable params: 0
_________________________________________________________________


# FastText

In [55]:
def make_fast_text():
    """Build (without compiling) the fastText-style classifier.

    The model takes padded token-id sequences of length ``max_length``, embeds them into
    trainable 3-dimensional vectors, max-pools over the sequence and ends in a 3-way
    softmax. Dropout is applied both before and after the pooling step.

    Reads the notebook-level ``max_length`` and ``vocabulary_size``.

    Returns:
        The uncompiled keras ``Sequential`` model.
    """
    return Sequential([
        InputLayer((max_length,)),
        Embedding(input_dim = vocabulary_size + 1, output_dim = 3, trainable = True),
        SpatialDropout1D(0.5),
        GlobalMaxPooling1D(),
        Dropout(0.5),
        Dense(3, activation = 'softmax'),
    ])

In [None]:
# Instantiate the model first: make_fast_text() only returns it, so without this line the
# name fast_text is undefined and the cell raises NameError.
fast_text = make_fast_text()
fast_text.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
# Validate on the explicit hold-out split, as the LSTM cell does, instead of taking a further
# 30% out of the training data with validation_split.
fast_text.fit(X_train_emb, y_train_emb, epochs = 10, verbose = 1,
              validation_data = (X_val_emb, y_val_emb))

Epoch 1/10

In [None]:
# Per-class precision / recall / F1 of the fastText model on the test set
y_pred = fast_text.predict(
    X_test_seq_pad,
    batch_size = 64,
    verbose = 1,
)
# Map the softmax outputs back to year labels (undoes the one-hot encoding)
y_pred_bool = encoder.inverse_transform(y_pred)
print(classification_report(y_true = y_test, y_pred = y_pred_bool))

In [None]:
# Print the fastText model's layers with their output shapes and parameter counts
fast_text.summary()

# LSTM 

In [None]:
def define_lstm_model():
    """Build (without compiling) the bidirectional LSTM classifier.

    The model takes padded token-id sequences of length ``max_length``, embeds them,
    runs a bidirectional LSTM with 125 units per direction and ends in a 3-way softmax.
    Dropout and batch normalization are applied around the recurrent layer.

    Reads the notebook-level ``max_length`` and ``number_of_words``.

    Returns:
        The uncompiled keras ``Sequential`` model.
    """
    model_lstm = Sequential()
    model_lstm.add(InputLayer((max_length,)))
    # The input is a 2D batch of integer token ids, while SpatialDropout1D and the LSTM need
    # 3D (batch, steps, features) input, so the ids have to be embedded first. Dimensions
    # mirror the CNN's embedding: number_of_words ids, 200-dimensional vectors.
    model_lstm.add(Embedding(input_dim = number_of_words, output_dim = 200))
    model_lstm.add(SpatialDropout1D(0.5))
    model_lstm.add(BatchNormalization())
    model_lstm.add(Bidirectional(LSTM(125)))
    model_lstm.add(BatchNormalization())
    model_lstm.add(Dropout(0.5))
    model_lstm.add(Dense(3, activation = 'softmax'))
    return model_lstm

In [None]:
# Instantiate the model first: define_lstm_model() only returns it, so without this line the
# name model_lstm is undefined at notebook level.
model_lstm = define_lstm_model()
# The keyword is `optimizer`; the previous spelling `optimzer` made compile() raise TypeError.
model_lstm.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model_lstm.fit(X_train_emb, y_train_emb, validation_data = (X_val_emb, y_val_emb), epochs = 10, verbose = 1)

In [None]:
# Per-class precision / recall / F1 of the LSTM on the test set
y_pred = model_lstm.predict(
    X_test_seq_pad,
    batch_size = 64,
    verbose = 1,
)
# Map the softmax outputs back to year labels (undoes the one-hot encoding)
y_pred_bool = encoder.inverse_transform(y_pred)
print(classification_report(y_true = y_test, y_pred = y_pred_bool))