In [1]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import string
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import sklearn.metrics
import keras
from keras.datasets import cifar10
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Activation, BatchNormalization, Dropout
from tensorflow.keras import regularizers
from keras.optimizers import SGD, Adam, RMSprop
from sklearn.model_selection import GridSearchCV
from sklearn.utils import class_weight
from keras.wrappers.scikit_learn import KerasClassifier

# reading the data as pandas dataframe
train = pd.read_csv("processed_data/clean_data_201819.csv")
train["Target"] = (train["Years"] == 2018).astype("int")

In [2]:
# print one instance of the data
train.iloc[[0, 1, 2, 3, 1500]]

Unnamed: 0,Tweet,Years,Tweet Length,Lemmatized,Tweets with no Stopwords,Short Tweets,Target
0,big tipper nice double entendre awesome so ...,2018,281,big tipper nice double entendre awesome so ins...,big tipper nice double entendre awesome instea...,big tipper nice double entendre awesome instea...,1
1,...,2018,313,should n't we be suppoing this brave boy who c...,n't suppoing brave boy come forward abuser mon...,n't suppoing brave boy come forward abuser mon...,1
2,smh today s movement is abt women no lo...,2018,284,smh today s movement be abt woman no longer re...,smh today movement abt woman longer remain sil...,smh today movement abt woman longer remain sil...,1
3,cry march protest amp advocate for equali...,2018,285,cry march protest amp advocate for equality be...,cry march protest amp advocate equality litera...,cry march protest amp advocate equality litera...,1
1500,if it staed with what a...,2019,294,if it staed with what american idolize first h...,staed american idolize first hollywood amp fir...,staed american idolize first hollywood amp fir...,0


In [3]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train["Lemmatized"], train["Target"], random_state = 1)
print(X_train.shape)
print(X_train[2])
print(y_train[2])

(1329,)
smh today s movement be abt woman no longer remain silent amp men be hold accountable that be not the case before i don t doubt bc be a womanizer bc amp lewinsky be an adult consensual blow job ford be yo bk attempt rape not anywhere near the same
1


In [4]:
# create bag-of-words with weights using tfid vectoriser
# strip accents and remove stop words during vectorisation
tf=TfidfVectorizer(strip_accents = 'ascii', stop_words='english')

# transform and fit the training set with vectoriser
X_train_tf = tf.fit_transform(X_train).todense()
X_train_tf_3 = X_train_tf[..., None]
#X_train_pd = pd.DataFrame(X_train_tf, columns = tf.get_feature_names())
# transform the test set with vectoriser
X_test_tf = tf.transform(X_test).todense()
X_test_tf_3 = X_test_tf[..., None]
#X_test_pd = pd.DataFrame(X_test_tf, columns = tf.get_feature_names())

y_train_encoded = to_categorical(y_train)
y_test_encoded = to_categorical(y_test)

print(X_train_tf_3.shape, y_train.shape)
print(X_test_tf_3.shape, y_test.shape)

label_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train.ravel())
print(label_weights)
label_weights = {i:label_weights[i] for i in range(len(label_weights))} # Create dictionary
print(label_weights)

(1329, 6812, 1) (1329,)
(444, 6812, 1) (444,)
[2.94026549 0.60244787]
{0: 2.940265486725664, 1: 0.6024478694469628}




In [5]:
# create logistic regression model
logreg = LogisticRegression(verbose=1, random_state=0, penalty='l2', solver='newton-cg')
# train model on  vectorised training data
model = logreg.fit(X_train_tf, y_train)
# evaluate model performance on the test set
pred = model.predict(X_test_tf)
print(pred)
print(y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
1447    1
1286    1
858     1
1578    0
351     1
       ..
1539    0
1594    0
771     1
353     1

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s finished


In [6]:
import statistics as st
st.mean(pred - y_test)
st.mean(y_test)

0.8445945945945946

In [12]:
# Defining A CNN Model
def define_base_model(dropout_rate, l1_value, l2_value):
    model = Sequential()
    model.add(Conv1D(64, (2), input_shape=(6812, 1), kernel_regularizer = regularizers.l1_l2(l1 = l1_value, l2 = l2_value), 
                     activation= 'relu', name = "Conv1")) 
    model.add(MaxPooling1D(2, name = "MaxP1"))
 
    model.add(Conv1D(64, (2), name = "Conv2", activation='relu'))
    model.add(MaxPooling1D(2, name = "MaxP2"))
 
    model.add(Conv1D(64, (2), name = "Conv3", activation='relu'))
    model.add(MaxPooling1D(2, name = "MaxP3"))
    
    model.add(Conv1D(64, (2), name = "Conv4", kernel_regularizer = regularizers.l1_l2(l1 = l1_value, l2 = l2_value),
                     activation='relu'))
    model.add(MaxPooling1D(2, name = "MaxP4"))
    
    model.add(Conv1D(64, (2), name = "Conv5", activation='relu'))
    model.add(MaxPooling1D(2, name = "MaxP5"))
    
    model.add(Conv1D(64, (2), name = "Conv6", kernel_regularizer = regularizers.l1_l2(l1 = l1_value, l2 = l2_value),
                       activation='relu'))
    model.add(MaxPooling1D(2, name = "MaxP6"))
    
    model.add(Conv1D(64, (2), name = "Conv7", activation='relu'))
    model.add(MaxPooling1D(2, name = "MaxP7"))
 
    model.add(Flatten(name = "Flat1"))
    model.add(Dense(1024, activation='relu', name = "Dense1"))
    model.add(Dropout(dropout_rate))
    model.add(Dense(2, activation='softmax', name = "Output"))
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Define the drop out grid
dropout_grid = [0.1, 0.5, 0.9]
l1_grid = [2**-5, 2**-6, 2**-7, 2**-8]
l2_grid = [2**-5, 2**-6, 2**-7, 2**-8]
tot = len(dropout_grid) * len(l1_grid) * len(l2_grid)

# Variables for the best result
scores = []
best_history = [] # place holder
best_ind = 0
best_acc = 0

# Loop through each combination
pos = 0
for ii in dropout_grid:
    for jj in l1_grid:
        for kk in l2_grid:
            pos = pos + 1
            print("Fitting the ", pos, "/", tot , " model")
            # define the model
            curr_model = define_base_model(ii, jj, kk)
            
            # train the model
            curr_history = curr_model.fit(X_train_tf_3, y_train_encoded, epochs = 8, 
                                          validation_data = (X_test_tf_3, y_test_encoded), verbose = 1)
            curr_acc = st.mean(curr_history.history['val_acc'][5:10])
                        
            # get prediction report
            y_pred = curr_model.predict(val_X, batch_size=64, verbose=0)
            y_pred_bool = np.argmax(y_pred, axis=1)
            scores.append(classification_report(val_y, y_pred_bool))
            
            # save the best result
            if best_acc < curr_acc:
                best_acc = curr_acc
                best_ind = pos - 1
                best_history = curr_history
                
print(best_acc)
print(best_ind)

Fitting the  1 / 48  model
Epoch 1/8
Epoch 2/8

In [169]:
curr_history = base_model.fit(X_train_tf_3, y_train_encoded, epochs = 10, batch_size = 100,
                              validation_data = (X_test_tf_3, y_test_encoded), class_weight = label_weights, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
