In [45]:
#import sys
#!{sys.executable} -m pip install keras==2.4.0
import tensorflow
import keras
import wandb
import pandas as pd
import numpy as np
import statistics as st
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Activation, Conv1D, Flatten, MaxPooling1D
from keras.callbacks import EarlyStopping
from tensorflow.python.keras import regularizers

In [11]:
wandb.init()
config = wandb.config

# Set parameters
config.vocab_size = 1000
config.maxlen = 1000
config.batch_size = 32
config.embedding_dims = 10
config.filters = 16
config.kernel_size = 3
config.hidden_dims = 250
config.epochs = 10

In [25]:
df = pd.read_csv('processed_data/clean_data.csv')
df.dropna()
delete_row = df[df['Years'] == 0].index
df = df.drop(delete_row)
df['Years'].unique()

array([2018, 2017, 2019])

In [26]:
X_train, X_test, y_train, y_test = train_test_split(df['Tweets with no Stopwords'], df['Years'], test_size = 0.33,
                                                   random_state = 2)

In [27]:
tokenizer = text.Tokenizer(num_words = config.vocab_size)
tokenizer.fit_on_texts(X_train.astype(str))
X_train_tokenized = tokenizer.texts_to_matrix(X_train.astype(str))
X_test_tokenized = tokenizer.texts_to_matrix(X_test.astype(str))
X_train_seq = sequence.pad_sequences(X_train_tokenized, maxlen = config.maxlen)
X_test_seq = sequence.pad_sequences(X_test_tokenized, maxlen = config.maxlen)

In [28]:
# One-hot encoding
y_train_encoded = LabelBinarizer().fit_transform(y_train)
y_test_encoded = LabelBinarizer().fit_transform(y_test)

In [36]:
label_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train.ravel())
print(label_weights)
label_weights = {i:label_weights[i] for i in range(len(label_weights))} # Create dictionary
print(label_weights)

[1.21118492 0.5432297  2.99831792]
{0: 1.21118492304933, 1: 0.5432297008596592, 2: 2.9983179213948445}




In [34]:
# Check shapes
print(X_train_seq.shape)
print(X_test_seq.shape)
print(y_train_encoded.shape)
print(y_test_encoded.shape)

(702308, 1000)
(345913, 1000)
(702308, 3)
(345913, 3)


In [49]:
callback = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 3)

def define_cnn_model(dropout_rate):
    model = Sequential()
    model.add(Embedding(config.vocab_size, config.embedding_dims, input_length = config.maxlen))
    model.add(Conv1D(config.filters, config.kernel_size, padding = 'valid', activation = 'relu'))
    model.add(MaxPooling1D())
    model.add(Conv1D(config.filters, config.kernel_size, padding = 'valid', activation = 'relu'))
    model.add(Flatten())
    model.add(Dense(config.hidden_dims, activation = 'relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(3, activation = 'softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [47]:
# Define the drop out grid
dropout_grid = [0.1, 0.5, 0.9]
l1_grid = [2**-5, 2**-6, 2**-7, 2**-8]
l2_grid = [2**-5, 2**-6, 2**-7, 2**-8]
tot = len(dropout_grid) * len(l1_grid) * len(l2_grid)

# Variables for the best result
best_history = [] # place holder
best_ind = 0
best_acc = 0

# Loop through each combination
pos = 0
for ii in dropout_grid:
    for jj in l1_grid:
        for kk in l2_grid:
            pos = pos + 1
            print("Fitting the ", pos, "/", tot , " model")
            # define the model
            curr_model = define_cnn_model(ii, jj, kk)
            
            # train the model
            curr_history = curr_model.fit(X_train_seq, y_train_encoded, epochs = 5, class_weight = label_weights,
                                          batch_size = 64, validation_data = (X_test_seq, y_test_encoded), 
                                          callbacks = [callback], verbose = 1)
            curr_acc = st.mean(curr_history.history['val_acc'])
            
            # save the best result
            if best_acc < curr_acc:
                best_acc = curr_acc
                best_ind = pos - 1
                best_history = curr_history
                
print(best_acc)
print(best_ind)

Fitting the  1 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  2 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  3 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  4 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  5 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  6 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  7 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  8 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/

Epoch 4/5
Epoch 5/5
Fitting the  12 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  13 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  14 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  15 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  16 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 00004: early stopping
Fitting the  17 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  18 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  19 / 48  model
Train on 702308 samples,

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  23 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  24 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  25 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  26 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  27 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 00005: early stopping
Fitting the  28 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  29 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  30 / 48  

Epoch 4/5
Epoch 5/5
Fitting the  33 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  34 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  35 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  36 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  37 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  38 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 00004: early stopping
Fitting the  39 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  40 / 48  model
Train on 702308 samples,

Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  44 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 00005: early stopping
Fitting the  45 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 00005: early stopping
Fitting the  46 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  47 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fitting the  48 / 48  model
Train on 702308 samples, validate on 345913 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.7102173089773272
14


In [None]:
model = define_cnn_model(0.5)
model.summary()
history = model.fit(X_train_seq, y_train_encoded, epochs = 15, class_weight = label_weights, batch_size = 64, 
                    validation_data = (X_test_seq, y_test_encoded), verbose = 1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_53 (Embedding)     (None, 1000, 10)          10000     
_________________________________________________________________
conv1d_105 (Conv1D)          (None, 998, 16)           496       
_________________________________________________________________
max_pooling1d_53 (MaxPooling (None, 499, 16)           0         
_________________________________________________________________
conv1d_106 (Conv1D)          (None, 497, 16)           784       
_________________________________________________________________
flatten_53 (Flatten)         (None, 7952)              0         
_________________________________________________________________
dense_105 (Dense)            (None, 250)               1988250   
_________________________________________________________________
dropout_56 (Dropout)         (None, 250)               0         
__________