## To Vaccinate or Not to Vaccinate: Analysing social media sentiment towards vaccines

Although it may be many months before we see COVID-19 vaccines available on a global scale, it is important to monitor public sentiment towards vaccinations now and especially in the future when COVID-19 vaccines are offered to the public. The anti-vaccination sentiment could pose a serious threat to the global efforts to get COVID-19 under control in the long term.

The objective of this challenge is to develop a machine learning model to assess if a Twitter post related to vaccinations is positive, neutral, or negative. 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K

import os

In [None]:
# Load the three competition files from ./raw_data: the labelled training
# tweets, the tweets to be scored, and the submission template.
train_df, test_df, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        './raw_data/Train.csv',
        './raw_data/Test.csv',
        './raw_data/SampleSubmission.csv',
    )
)

### A quick look at some tweets

Let's take a look at what pro-vaccination, neutral, and anti-vaccination tweets look like.

In [None]:
# Display the first rows of the training frame for a quick visual check.
train_df.head()

In [None]:
# Neutral example: text of the first training tweet whose label is 0
train_df.loc[train_df['label'] == 0, 'safe_text'].iloc[0]

In [None]:
# Pro-vaccination example: text of the first training tweet whose label is 1
train_df.loc[train_df['label'] == 1, 'safe_text'].iloc[0]

In [None]:
# Anti-vaccination example: text of the first training tweet whose label is -1
train_df.loc[train_df['label'] == -1, 'safe_text'].iloc[0]

In [None]:
# Keep only the rows whose label is one of the three sentiment classes
# (-1, 0, 1); rows carrying any other label value are discarded.
train_df = train_df.loc[train_df['label'].isin([-1, 0, 1])]

In [None]:
# Bar chart of how many training tweets fall into each sentiment class.
# Every bar is coloured by the class it represents (not by its position in
# the frequency ranking), so the three classes are visually distinct.
class_colors = {-1: 'red', 0: 'gray', 1: 'green'}
label_counts = train_df.label.value_counts()

plt.figure(figsize=(9,4))
plt.title('Class Distributions')
label_counts.plot(
    kind='bar',
    # Fall back to black for any label value outside the three known classes.
    color=[class_colors.get(label, 'black') for label in label_counts.index],
)

In [None]:
# Display the first rows again, now that the label filter has been applied.
train_df.head()

### Text Preprocessing:
* Remove training tweets with missing values (e.g. a null label) and fill null tweets in the test set with a fixed placeholder sentence

In [None]:
# Drop training rows that contain any missing value. The result is reassigned
# instead of using inplace=True because train_df is a filtered slice of the
# raw frame, and in-place edits on such a slice can trigger pandas'
# SettingWithCopyWarning.
train_df = train_df.dropna()

# Test rows are kept rather than dropped: a missing tweet is replaced with a
# fixed placeholder sentence. Only the text column is filled, so no other
# column can be overwritten with text.
test_df['safe_text'] = test_df['safe_text'].fillna(
    'am ok with it as long as its not dangerous')

In [None]:
# Features are the tweet texts, targets are the sentiment labels.
# No train/validation split is made at this point; the full training set is
# kept together.
X, y = train_df['safe_text'], train_df['label']

### Word Embeddings

In [None]:
# Raw tweet texts as plain Python lists
test_corpus = test_df.safe_text.tolist()
train_corpus = X.tolist()

# Fit the word -> integer-id mapping on the training tweets only
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_corpus)
vocab_size = len(tokenizer.word_counts)
print(f'Train Vocabulary size: {vocab_size}')

# Number of whitespace-separated tokens in every training tweet, summarised
# at a few percentiles (presumably to guide the choice of padding length).
seq_lengths = np.array([len(tweet.split()) for tweet in train_corpus])
percentile_points = [75, 80, 90, 95, 99, 100]
print([(p, np.percentile(seq_lengths, p)) for p in percentile_points])

In [None]:
# Every tweet is encoded to exactly this many token ids.
max_seqlen = 33

# Train encodings: words -> integer ids, then padded so that all sequences
# share one length (pad_sequences also cuts longer sequences to maxlen).
train_encodings = tokenizer.texts_to_sequences(train_corpus)
train_encodings = tf.keras.preprocessing.sequence.pad_sequences(
    train_encodings, maxlen=max_seqlen)
labels = np.array(y)

# Train dataset of (encoded tweet, label) pairs
dataset = tf.data.Dataset.from_tensor_slices(
    (train_encodings, labels))

# Test encodings, using the tokenizer fitted on the training corpus
test_encodings = tokenizer.texts_to_sequences(test_corpus)
test_encodings = tf.keras.preprocessing.sequence.pad_sequences(
    test_encodings, maxlen=max_seqlen)

# Placeholder targets so the test dataset has the same (features, label)
# structure as the training one. The array is sized from the encodings, not
# from a fixed row count, so it always matches the test file that was loaded.
test_labels = np.zeros(len(test_encodings))

# Test dataset
test_dataset = tf.data.Dataset.from_tensor_slices(
    (test_encodings, test_labels))


In [None]:
# Creating train and test batches

# Train/validation split and batch creation.
# The data is shuffled ONCE, in a fixed order, before being split. With the
# default reshuffle_each_iteration=True a new order would be drawn on every
# pass over the data, so take() and skip() would select different rows each
# epoch and validation tweets would leak into training. A buffer covering the
# whole dataset gives a uniform shuffle; the seed makes the split reproducible.
dataset = dataset.shuffle(
    len(train_corpus), seed=42, reshuffle_each_iteration=False)

# Hold out one sixth of the training tweets for validation
val_size = (len(train_corpus)) // 6
val_dataset = dataset.take(val_size)
train_dataset = dataset.skip(val_size)

batch_size = 64
# Reshuffle only the training rows on every epoch; the split itself is fixed.
train_dataset = train_dataset.shuffle(1000).batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
print(val_size)

# Batching the test dataset (not shuffled, so row order is preserved)
test_batched = test_dataset.batch(batch_size)

### Building the model

The model is a 6-layer NN:

* An Embedding layer (to generate word embeddings)
* 2 stacked bidirectional LSTM layers
* 2 hidden Dense layers with the `relu` activation function
* An output Dense layer

In [None]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    The element-wise difference is squared, averaged over all elements and
    square-rooted, using Keras backend ops.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [None]:
# Size of the learned word vectors
embedding_dim = 33

model = tf.keras.Sequential([
    # Token ids -> dense vectors. The input dimension is vocab_size + 1
    # because Keras' Tokenizer numbers words from 1 and 0 is the pad value.
    layers.Embedding(vocab_size+1, embedding_dim),
    # First recurrent layer returns the full sequence so that a second
    # recurrent layer can be stacked on top of it.
    layers.Bidirectional(
        layers.LSTM(max_seqlen, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(8)),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    # Single linear unit: the sentiment is predicted as one real number
    layers.Dense(1)
])

model.build(input_shape=(batch_size, max_seqlen))
# The network regresses a single score against targets in {-1, 0, 1} and is
# tracked with RMSE, so it is trained with mean squared error. Categorical
# cross-entropy expects a probability vector over classes and gives no
# meaningful training signal for a single linear output.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.MeanSquaredError(),
              metrics=[rmse])

model.summary()

In [None]:
# File that receives the checkpointed weights
best_model_file = os.path.join('./', "best_model.h5")

# Halve the learning rate (never going below 1e-5) once the validation RMSE
# has gone 3 epochs without improving.
learning_rate_reduction = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_rmse',
    factor=0.5,
    patience=3,
    min_lr=0.00001,
    verbose=1,
)

# Stop training after 5 epochs without improvement of the callback's default
# monitored quantity (val_loss according to the Keras docs).
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5)

# Save weights only, and only when the callback's monitored quantity improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    best_model_file,
    save_best_only=True,
    save_weights_only=True,
)

callbacks = [checkpoint, early_stopping, learning_rate_reduction]

# NOTE(review): nothing in this cell loads best_model_file back into the
# model, so after fit() the model holds the weights of the last epoch run.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=20,
    callbacks=callbacks,
)

## Making predictions

In [None]:
# Raw model outputs for the batched test tweets
predictions = model.predict(test_batched)

# Clip the scores to the valid label range [-1, 1] in one vectorised call
predictions = np.clip(predictions, -1, 1)

# The final Dense(1) layer yields one column per row; flatten to a 1-D array
# so the submission column receives exactly one value per test tweet.
sub['label'] = predictions.ravel()

In [None]:
# Display the first rows of the submission frame with the predicted labels.
sub.head()

In [None]:
# Validation RMSE of the final training epoch. It is read by key rather than
# by position in the history dict, because the position of an entry shifts
# whenever a metric, or a callback that logs extra values such as the
# learning rate, is added or removed.
RMSE = history.history['val_rmse'][-1]
RMSE

In [None]:
# Make sure the output folder exists (no error if it is already there), then
# write the submission file, tagged with the validation RMSE.
os.makedirs('./submissions', exist_ok=True)
sub.to_csv(f"./submissions/sub_nn_{RMSE}.csv", index=False)