In [14]:
# Global imports

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import plotly.express as px

import warnings

warnings.filterwarnings('ignore')

In [15]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [16]:
import os

root_folder = "/gdrive/MyDrive/corona_tweet_sentiment"
data_folder = os.path.join(root_folder, "data")

In [25]:
df = pd.read_csv(data_folder+'/final.csv', encoding= "utf-8")

In [26]:
df.head()

Unnamed: 0,Sentiment,text
0,Neutral,menyrbie phil_gahan chrisitv
1,Positive,advice talk neighbours family exchange phone n...
2,Positive,coronavirus australia woolworths give elderly ...
3,Positive,food stock one empty please dont panic enough ...
4,Negative,ready go supermarket covid outbreak im paranoi...


In [27]:
df["Sentiment"] = df["Sentiment"].map({"Neutral":0, "Positive":1, "Negative":2})

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44955 entries, 0 to 44954
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentiment  44955 non-null  int64 
 1   text       44944 non-null  object
dtypes: int64(1), object(1)
memory usage: 702.5+ KB


We can see that there are some null values in text columns, let's remove the rows

In [30]:
df.dropna(inplace=True)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44944 entries, 0 to 44954
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentiment  44944 non-null  int64 
 1   text       44944 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


In [32]:
df.groupby('Sentiment').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
Sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,8321,8260,online shopping way covid restrictions,7
1,19592,19528,get protected face masks stock regular prices,4
2,17031,16966,oil prices falling amid covid recession beginn...,6


Let's also remove duplicate rows

In [36]:
df = df.drop_duplicates()
df.groupby('Sentiment').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
Sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,8260,8260,new york largest plant supplies sanitizer u ca...,1
1,19528,19528,sellers cashing panic sell facemasks sanitizer...,1
2,16966,16966,lagos state asked people stock covid lockdown ...,1


In [48]:
def get_num_words_per_tweet(sample_texts):
    num_words = [len(s.split()) for s in sample_texts]
    return np.median(num_words)

n  = get_num_words_per_tweet(df["text"])

print(len(df)/n)

2355.4736842105262


# Handling Class Imbalance Problem

**Approach 1**: Under-sample both positive and negative instances

**Approach 2**: Over-sample neutral instances

In [37]:
df_neutral = df[df["Sentiment"]==0]
df_neutral.shape

(8260, 2)

In [38]:
df_positive = df[df["Sentiment"]==1]
df_negative = df[df["Sentiment"]==2]

## Under-sampling

In [39]:
df_pos_downsampled =  df_positive.sample(df_neutral.shape[0])
df_neg_downsampled = df_negative.sample(df_neutral.shape[0])

In [40]:
df_under = pd.concat([df_neutral, df_pos_downsampled , df_neg_downsampled])
df_under.shape

(24780, 2)

# Splitting the dataset

In [56]:
from sklearn.model_selection import train_test_split

def split_dataset(df):
    X_train, X_test, y_train, y_test = train_test_split(df["text"], df["Sentiment"], stratify=df["Sentiment"],test_size=0.15)
    return X_train, X_test, y_train, y_test

In [57]:
X_train, X_test, y_train, y_test = split_dataset(df)

In [144]:
X_under_train, X_under_test, y_under_train, y_under_test = split_dataset(df_under)

In [58]:
len(X_train), len(X_test)

(38040, 6714)

In [47]:
X_train.head()

31464    seen bbcbreaking covid_ talking investing mill...
4319     reminder illegal business increase prices esse...
37940    shocking simulation shows singlecough spreadin...
2733     maybe everyone celebrateourcinemas people work...
33187    duke energy tampa electric lakeland electric a...
Name: text, dtype: object

In [135]:
train_texts = list(X_train)
test_texts = list(X_test)
train_labels = np.array(y_train)
test_labels = np.array(y_test)

In [145]:
train_under_texts = list(X_under_train)
test_under_texts = list(X_under_test)
train_under_labels = np.array(y_under_train)
test_under_labels = np.array(y_under_test)

# Modelling

In NLP, there are generally two types of models:

1. Sequence Models(Use word ordering information)
2. Bag-of-Words Models(N-grams)

Quoting Google([link](https://developers.google.com/machine-learning/guides/text-classification/step-2-5#:~:text=From%20our%20experiments%2C%20we%20have%20observed%20that%20the%20ratio%20of%20%E2%80%9Cnumber%20of%20samples%E2%80%9D%20(S)%20to%20%E2%80%9Cnumber%20of%20words%20per%20sample%E2%80%9D%20(W)%20correlates%20with%20which%20model%20performs%20well.))

> From our experiments, we have observed that the ratio of “number of samples” (S) to “number of words per sample” (W) correlates with which model performs well.

> When the value for this ratio is small (<1500), small multi-layer perceptrons that take n-grams as input (which we'll call Option 2) perform better or at least as well as sequence models. MLPs are simple to define and understand, and they take much less compute time than sequence models. 

> When the value for this ratio is large (>= 1500), use a sequence model (Option 1).


**Our Case**: The ratio is ~2355. So we will use sequence models

In [76]:
import tensorflow as tf

In [91]:
from tensorflow.keras import models
from tensorflow.keras import initializers
from tensorflow.keras import regularizers

from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SeparableConv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import GlobalAveragePooling1D

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing import text

In [72]:
TOP_K = 20000
MAX_SEQUENCE_LENGTH = 500

def sequence_vectorize(train_texts, val_texts):
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    return x_train, x_val, tokenizer.word_index

In [122]:
def _get_last_layer_units_and_activation(num_classes):
   
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation

In [123]:
def sepcnn_model(blocks,
                 filters,
                 kernel_size,
                 embedding_dim,
                 dropout_rate,
                 pool_size,
                 input_shape,
                 num_classes,
                 num_features,
                 use_pretrained_embedding=False,
                 is_embedding_trainable=False,
                 embedding_matrix=None):
   
    op_units, op_activation = _get_last_layer_units_and_activation(3)
    model = models.Sequential()

    # Add embedding layer. If pre-trained embedding is used add weights to the
    # embeddings layer and set trainable to input is_embedding_trainable flag.
    if use_pretrained_embedding:
        model.add(Embedding(input_dim=num_features,
                            output_dim=embedding_dim,
                            input_length=input_shape[0],
                            weights=[embedding_matrix],
                            trainable=is_embedding_trainable))
    else:
        model.add(Embedding(input_dim=num_features,
                            output_dim=embedding_dim,
                            input_length=input_shape[0]))

    for _ in range(blocks-1):
        model.add(Dropout(rate=dropout_rate))
        model.add(SeparableConv1D(filters=filters,
                                  kernel_size=kernel_size,
                                  activation='relu',
                                  bias_initializer='random_uniform',
                                  depthwise_initializer='random_uniform',
                                  padding='same'))
        model.add(SeparableConv1D(filters=filters,
                                  kernel_size=kernel_size,
                                  activation='relu',
                                  bias_initializer='random_uniform',
                                  depthwise_initializer='random_uniform',
                                  padding='same'))
        model.add(MaxPooling1D(pool_size=pool_size))

    model.add(SeparableConv1D(filters=filters * 2,
                              kernel_size=kernel_size,
                              activation='relu',
                              bias_initializer='random_uniform',
                              depthwise_initializer='random_uniform',
                              padding='same'))
    model.add(SeparableConv1D(filters=filters * 2,
                              kernel_size=kernel_size,
                              activation='relu',
                              bias_initializer='random_uniform',
                              depthwise_initializer='random_uniform',
                              padding='same'))
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(rate=dropout_rate))
    model.add(Dense(op_units, activation=op_activation))
    return model

In [81]:
METRICS = [
           tf.keras.metrics.Accuracy(name='accuracy'),
           tf.keras.metrics.Precision(name='precision'),
           tf.keras.metrics.Recall(name='recall')
]

In [146]:
def train_sequence_model(data,
                         learning_rate=1e-3,
                         epochs=10,
                         batch_size=128,
                         blocks=2,
                         filters=64,
                         dropout_rate=0.2,
                         embedding_dim=200,
                         kernel_size=3,
                         pool_size=3):
   
    # Get the data.
    train_texts, train_labels, val_texts, val_labels = data
    num_classes = 3

    # Vectorize texts.
    x_train, x_val, word_index = sequence_vectorize(
            train_texts, val_texts)

    # Number of features will be the embedding input dimension. Add 1 for the
    # reserved index 0.
    num_features = min(len(word_index) + 1, TOP_K)

    # Create model instance.
    model = sepcnn_model(blocks=blocks,
                                     filters=filters,
                                     kernel_size=kernel_size,
                                     embedding_dim=embedding_dim,
                                     dropout_rate=dropout_rate,
                                     pool_size=pool_size,
                                     input_shape=x_train.shape[1:],
                                     num_classes=num_classes,
                                     num_features=num_features)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=METRICS)

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=3)]

    # Train and validate model.
    history = model.fit(
            x_train,
            train_labels,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(x_val, val_labels),
            verbose=2,  # Logs once per epoch.
            batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=history['val_accuracy'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('sepcnn_under_model.h5')
    return history['val_accuracy'][-1], history['val_loss'][-1]

In [136]:
from tensorflow.keras.utils import to_categorical
train_labels = to_categorical(train_labels, 3)
test_labels = to_categorical(test_labels, 3)
train_labels.shape

(38040, 3)

In [147]:
train_under_labels = to_categorical(train_under_labels, 3)
test_under_labels = to_categorical(test_under_labels, 3)
train_under_labels.shape

(21063, 3)

In [137]:
train_labels[0]

array([0., 0., 1.], dtype=float32)

In [138]:
data = train_texts, train_labels, test_texts, test_labels

In [143]:
train_sequence_model(data)

Epoch 1/10
298/298 - 28s - loss: 1.0042 - accuracy: 1.1917e-04 - precision: 0.7053 - recall: 0.1976 - val_loss: 0.8993 - val_accuracy: 0.0000e+00 - val_precision: 0.6920 - val_recall: 0.2064 - 28s/epoch - 93ms/step
Epoch 2/10
298/298 - 26s - loss: 0.7485 - accuracy: 0.0000e+00 - precision: 0.7328 - recall: 0.5463 - val_loss: 0.6092 - val_accuracy: 0.0000e+00 - val_precision: 0.8070 - val_recall: 0.7081 - 26s/epoch - 88ms/step
Epoch 3/10
298/298 - 26s - loss: 0.4660 - accuracy: 1.3144e-04 - precision: 0.8654 - recall: 0.8096 - val_loss: 0.5475 - val_accuracy: 4.9648e-05 - val_precision: 0.8344 - val_recall: 0.7745 - 26s/epoch - 88ms/step
Epoch 4/10
298/298 - 26s - loss: 0.3394 - accuracy: 0.0012 - precision: 0.9082 - recall: 0.8763 - val_loss: 0.5748 - val_accuracy: 8.9366e-04 - val_precision: 0.8221 - val_recall: 0.7748 - 26s/epoch - 87ms/step
Epoch 5/10
298/298 - 26s - loss: 0.2723 - accuracy: 0.0041 - precision: 0.9264 - recall: 0.9035 - val_loss: 0.6135 - val_accuracy: 0.0022 - val_

(0.004567570053040981, 0.6549374461174011)

Let's train with undersampled data

In [148]:
data_under = train_under_texts, train_under_labels, test_under_texts, test_under_labels

In [149]:
train_sequence_model(data_under)

Epoch 1/10
165/165 - 23s - loss: 1.0988 - accuracy: 0.0011 - precision: 0.8110 - recall: 0.1883 - val_loss: 1.0987 - val_accuracy: 0.0000e+00 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - 23s/epoch - 139ms/step
Epoch 2/10
165/165 - 16s - loss: 1.0989 - accuracy: 0.0000e+00 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: 1.0987 - val_accuracy: 0.0000e+00 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - 16s/epoch - 94ms/step
Epoch 3/10
165/165 - 15s - loss: 1.0989 - accuracy: 0.0000e+00 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: 1.0986 - val_accuracy: 0.0000e+00 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - 15s/epoch - 89ms/step
Epoch 4/10
165/165 - 15s - loss: 1.0988 - accuracy: 0.0000e+00 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: 1.0987 - val_accuracy: 0.0000e+00 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - 15s/epoch - 89ms/step
Epoch 5/10
165/165 - 15s - loss: 1.0988 - accuracy: 0.0000e+00 - precision: 0.0000e+00 

(0.0, 1.0986825227737427)

No better accuracy