# Importing Dependencies

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.utils as ku
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style

# Reading in Training data

In [None]:
# Load the labelled training set; the 'label' and 'tweet' columns are used below.
data = pd.read_csv('../input/twitter-sentiment-analysis-hatred-speech/train.csv')

In [None]:
# Preview the first rows of the training frame.
data.head()

# Data Analysis and pre-processing

In [None]:
'''Bar chart of the number of tweets in each class (label 0 vs label 1)'''
style.use('ggplot')
# Count the labels once and reuse the result for both bars.
class_counts = data['label'].value_counts()
plt.bar(['0', '1'], height=[class_counts[0], class_counts[1]], color='b')
plt.show()

In [None]:
'''Comparing the average tweet length (in characters) of the two classes'''
# Tweets labelled 0.
temp_data = data.loc[data['label'] == 0, 'tweet']
len_avg0 = sum(map(len, temp_data)) / len(temp_data)

# Tweets labelled 1 (temp_data is reused as scratch storage).
temp_data = data.loc[data['label'] == 1, 'tweet']
len_avg1 = sum(map(len, temp_data)) / len(temp_data)

plt.bar(['0', '1'], [len_avg0, len_avg1])


In [None]:
'''Collecting every training tweet into a plain list for the tokenizer'''
corpus = list(data['tweet'])

In [None]:
'''Fitting a tokenizer that assigns a unique integer index to every word in the corpus'''
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# One extra slot on top of the vocabulary: index 0 is treated as padding
# further down, so it never appears in word_index.
total_words = 1 + len(tokenizer.word_index)

In [None]:
'''Preparing data for training by padding the sequences uniformly'''
# Convert each tweet into its list of word indices from tokenizer.word_index.
input_sequences = tokenizer.texts_to_sequences(corpus)

# The padded width must be measured in *tokens*, not characters: taking
# len() of the raw tweet strings would give the character count, which is
# several times larger than any tokenised tweet and would only add leading
# zeros that the model has to process.
max_seq_len = max(len(token_list) for token_list in input_sequences)

# Left-pad every sequence with zeros up to max_seq_len; the result is a
# 2-D NumPy array with one row per tweet.
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

In [None]:
# Sanity check: there should be exactly one padded sequence per tweet.
print(len(input_sequences), len(data['tweet']))

In [None]:
# Class labels as a standalone NumPy array, in the same row order as the tweets.
labels = data['label'].to_numpy(copy=True)
len(labels)

In [None]:
'''Splitting the data in training and validation set'''
split = int(.2 * len(corpus))  # 20% for validation

# Draw the validation rows WITHOUT replacement. Sampling with replacement
# (e.g. np.random.randint) yields repeated indices, which duplicates rows in
# the validation set and leaves more than 80% of the data in the training
# set, because np.delete removes each distinct index only once.
rand_row_num = np.random.choice(len(corpus), size=split, replace=False)

# Validation set: the sampled rows.
X_test = input_sequences[rand_row_num]
y_test = labels[rand_row_num]

# Training set: everything that was not sampled.
X_train = np.delete(input_sequences, rand_row_num, axis=0)
y_train = np.delete(labels, rand_row_num, axis=0)

In [None]:
# Number of samples left for training after the validation rows were removed.
len(X_train)

In [None]:
'''Converting integer class labels to one-hot (categorical) vectors'''
# Apply the same two-class encoding to the training and validation labels.
y_train, y_test = (
    ku.to_categorical(sparse_labels, num_classes=2)
    for sparse_labels in (y_train, y_test)
)

# Model Building & Architecture

In [None]:
# Network used for the learning-rate sweep, assembled layer by layer:
# embedding -> bidirectional LSTM -> dropout -> LSTM -> dense -> 2-way softmax.
model = tf.keras.Sequential()
model.add(layers.Embedding(total_words, 5, input_length=max_seq_len))
# return_sequences=True so the following LSTM receives the full sequence.
model.add(layers.Bidirectional(layers.LSTM(8, return_sequences=True)))
model.add(layers.Dropout(0.2))
model.add(layers.LSTM(8))
model.add(layers.Dense(8, activation='relu'))
# Two output units to match the one-hot encoded labels.
model.add(layers.Dense(2, activation='softmax'))

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
'''Callback that multiplies the learning rate by 10 every epoch, used to search for a good learning rate'''


def _lr_for_epoch(epoch):
    """Return the learning rate for the given epoch index: 1e-6 * 10 ** epoch."""
    return 1e-6 * 10 ** epoch


lr_scheduler = tf.keras.callbacks.LearningRateScheduler(_lr_for_epoch)

In [None]:
# Adam is created with its default learning rate here; the lr_scheduler
# callback passed to fit() below supplies the per-epoch learning rate.
# Categorical cross-entropy matches the one-hot labels from to_categorical.
model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.CategoricalCrossentropy(),
             metrics=['accuracy'])

In [None]:
# Learning-rate sweep: 5 epochs with lr_scheduler attached; the returned
# history is read by the lr-vs-loss plot.
history = model.fit(X_train, y_train, epochs=5, batch_size=64, callbacks=[lr_scheduler])

In [None]:
'''Visualizing the learning rate vs training loss plot to decide the value of the learning rate'''
loss = history.history['loss']
# The scheduler callback records the learning rate in the history; older
# tf.keras releases store it under 'lr', newer Keras releases under
# 'learning_rate'. A KeyError is raised if neither is present.
if 'lr' in history.history:
    lr = history.history['lr']
else:
    lr = history.history['learning_rate']

style.use('ggplot')
# The learning rate grows tenfold per epoch, so plot it on a log-scaled x axis.
# (Passing bare scalars to semilogx only plots isolated, invisible points.)
plt.semilogx(lr, loss, 'r')
plt.title('lr vs training loss')
plt.xlabel('lr')
plt.ylabel('training_loss')
plt.show()

In [None]:
# Build a fresh, untrained copy of the same architecture for the final
# training run, so no weights carry over from the previous model object.
model = tf.keras.Sequential()
model.add(layers.Embedding(total_words, 5, input_length=max_seq_len))
# return_sequences=True so the following LSTM receives the full sequence.
model.add(layers.Bidirectional(layers.LSTM(8, return_sequences=True)))
model.add(layers.Dropout(0.2))
model.add(layers.LSTM(8))
model.add(layers.Dense(8, activation='relu'))
# Two output units to match the one-hot encoded labels.
model.add(layers.Dense(2, activation='softmax'))

In [None]:
# Compile with Adam at a fixed learning rate of 1e-2 and categorical
# cross-entropy, which matches the one-hot labels from to_categorical.
model.compile(optimizer=tf.keras.optimizers.Adam(1e-2), loss=tf.keras.losses.CategoricalCrossentropy(),
             metrics=['accuracy'])

In [None]:
# Final training run: 5 epochs, batch size 64, no callbacks and no
# validation data, so history only contains training metrics.
history = model.fit(X_train, y_train, batch_size=64, epochs=5)

In [None]:
'''Plotting training accuracy and training loss per epoch (training metrics only, no validation curves)'''
train_acc = history.history['accuracy']
train_loss = history.history['loss']
# Shared x axis: one tick per recorded epoch.
epochs = np.arange(len(train_acc))

plt.figure(figsize=(10, 6))
plt.plot(epochs, train_acc, 'b', label='training acc')
plt.plot(epochs, train_loss, 'r', label='training loss')
plt.legend(loc='best')
plt.show()

# Evaluating the model's performance on the validation set

In [None]:
# Loss and accuracy on the held-out validation rows (y_test is one-hot
# encoded at this point, assuming the cells are run in order).
model.evaluate(X_test, y_test, batch_size=64)

In [None]:
# Restore the integer (non one-hot) validation labels by indexing the label
# array with the sampled validation row numbers.
y_test = labels[rand_row_num]

## Creating the encoder dict to understand the pattern in the wrong predictions

In [None]:
# Inverse of tokenizer.word_index: maps each integer index back to its word.
encoder_dict = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))

In [None]:
# Per-class scores for every validation tweet.
preds = model.predict(X_test)
# Predicted class = position of the highest score in each row.
pre = np.array([np.argmax(scores) for scores in preds])
# How many validation tweets were assigned to each class.
np.unique(pre, return_counts=True)

In [None]:
'''Analysing wrong predictions on the validation set (every misclassified tweet, not only false negatives)'''
preds = model.predict(X_test)
# y_test must hold integer class labels here (not one-hot vectors) so that it
# can be compared with the argmax of each prediction row.
for scores, actual, sequence in zip(preds, y_test, X_test):
    if np.argmax(scores) == actual:
        continue  # correctly classified, nothing to inspect
    # Rebuild the tweet text from its word indices; index 0 is padding.
    tweet = ' '.join(encoder_dict[word] for word in sequence if word != 0)
    print(tweet)
    print(f'predicted: {scores}\nactual: {actual}')
    print('--------------------------------------------------')

# Final predictions on Test data

In [None]:
# Load the test set; its 'tweet' column is tokenised below.
test_data = pd.read_csv('../input/twitter-sentiment-analysis-hatred-speech/test.csv')

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Encode each test tweet with the tokenizer fitted on the training corpus.
test_tweets = [
    tokenizer.texts_to_sequences([raw_tweet])[0]
    for raw_tweet in test_data['tweet']
]
# Left-pad to the same width the model was trained on.
test_tweets = np.array(pad_sequences(test_tweets, maxlen=max_seq_len, padding='pre'))

In [None]:
# Per-class scores for every tweet in the test set.
pred = model.predict(test_tweets)

# Predicted class per test tweet = position of the highest score in each row.
# This must be derived from `pred` (test-set output), not from `preds`, which
# holds the validation-set predictions computed earlier.
pre = np.argmax(pred, axis=1)

# Pair every *test* tweet with its predicted label. The tweets come from
# test_data (not the training frame), and the predictions go into their own
# column rather than being used as the frame's index.
final_preds = pd.DataFrame({'tweet': test_data['tweet'], 'label': pre})

In [None]:
# Preview the first rows of the final predictions frame.
final_preds.head()