<a href="https://colab.research.google.com/github/saatweek/Coronavirus_tweets_NLP_Text_Classification/blob/master/Accuracy_Comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Attach Google Drive to this Colab runtime; the CSV paths used later in the
# notebook live under this mount point.

from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [4]:
#importing all the dependencies

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
!pip install ktrain
import ktrain
from ktrain import text
ktrain.__version__


#training file and test file paths

train_csv_path = "/content/drive/My Drive/Colab Notebooks/Personal Projects/Corona tweets/Corona_NLP_train.csv"
test_csv_path = '/content/drive/My Drive/Colab Notebooks/Personal Projects/Corona tweets/Corona_NLP_test.csv'


#reading the training and testing files through pandas
#
# DataFrame.dropna() returns a NEW frame and leaves the original untouched, so
# its result has to be assigned back for the filtering to take effect.
# The drop is restricted to the last two columns because those are the only
# columns the rest of the notebook reads (via .iloc[:, -2] and .iloc[:, -1]);
# a missing value in any other column must not discard an otherwise usable row.

training_dataframe = pd.read_csv(train_csv_path, encoding = "ISO-8859-1")
training_dataframe = training_dataframe.dropna(subset = list(training_dataframe.columns[-2:]))

# NOTE(review): the test file is read with pandas' default encoding while the
# training file uses ISO-8859-1 -- confirm this difference is intentional.
testing_dataframe = pd.read_csv(test_csv_path)
testing_dataframe = testing_dataframe.dropna(subset = list(testing_dataframe.columns[-2:]))

#building plain Python lists for the training and testing data
#
# Sentences come from the second-to-last column and are copied as-is.
# Labels come from the last column; every space is removed from each label
# so that a multi-word label becomes a single token.

training_sentences = list(training_dataframe.iloc[:, -2])
training_labels = [
    raw_label.replace(' ', '') for raw_label in training_dataframe.iloc[:, -1]
]

testing_sentences = list(testing_dataframe.iloc[:, -2])
testing_labels = [
    raw_label.replace(' ', '') for raw_label in testing_dataframe.iloc[:, -1]
]


#Hyperparameters

# -- text vectorisation --
vocab_size = 10000        # vocabulary cap for the sentence tokenizer / Embedding input size
oov_tok = '<OOV>'         # placeholder token for out-of-vocabulary words
max_length = 64           # every sequence is padded/truncated to this many tokens
trunc_type = 'pre'        # sequences that are too long lose tokens from the front
pad_type = 'post'         # sequences that are too short are padded at the end

# -- model / training --
embedding_dims = 16       # width of the learned word embeddings
num_labels = 5            # number of label classes
num_epochs = 10           # epochs used for every model's fit call

# --- Sentences -------------------------------------------------------------
# Word-level tokenizer limited to `vocab_size` entries; words outside the
# vocabulary are replaced by `oov_tok`. It is fitted on the training text only.
sentence_tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok)
sentence_tokenizer.fit_on_texts(training_sentences)
sentence_word_index = sentence_tokenizer.word_index

training_sequence = sentence_tokenizer.texts_to_sequences(training_sentences)
testing_sequence = sentence_tokenizer.texts_to_sequences(testing_sentences)

# pad_sequences already returns a NumPy array, so no extra np.array() is needed.
padded_training = pad_sequences(training_sequence, maxlen = max_length, padding = pad_type, truncating = trunc_type)
padded_testing = pad_sequences(testing_sequence, maxlen = max_length, padding = pad_type, truncating = trunc_type)

# --- Labels ----------------------------------------------------------------
# The label tokenizer deliberately has NO oov_token and NO num_words cap.
# With Tokenizer(num_words = num_labels, oov_token = ...) the OOV token takes
# index 1 and the labels take indices 2..num_labels + 1; texts_to_sequences
# then rewrites every index >= num_words to the OOV index, which silently
# merges the least frequent classes into a single class.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(training_labels)
label_word_index = label_tokenizer.word_index

# Fail loudly if the data does not contain exactly the expected classes,
# because the models below have a fixed-size softmax output.
if len(label_word_index) != num_labels:
    raise ValueError(
        "expected %d distinct labels but found %d: %s"
        % (num_labels, len(label_word_index), sorted(label_word_index))
    )

# Tokenizer indices start at 1; shift them to 0..num_labels - 1 so they are
# valid class ids for sparse_categorical_crossentropy.
# Each label is a single token, so both arrays have one column.
# NOTE(review): a test label never seen in training would yield an empty
# sequence here -- this assumes both files share the same label set.
training_label_sequence = np.array(label_tokenizer.texts_to_sequences(training_labels)) - 1
testing_label_sequence = np.array(label_tokenizer.texts_to_sequences(testing_labels)) - 1


#_________________________________________________________________________________________________________________#

#EMBEDDING MODEL
# Learned word embeddings averaged over the sequence, followed by a small
# dense layer and a 5-way softmax classifier.
embedding_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dims),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation = 'relu'),
    tf.keras.layers.Dense(5, activation = 'softmax'),
]
embedding_model = tf.keras.models.Sequential(embedding_layers)

# Labels are integer class ids, hence the sparse variant of the loss.
embedding_model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy'],
)

# The test split is passed as validation data, so the per-epoch val_* metrics
# are computed on it.
embedding_history = embedding_model.fit(
    padded_training,
    training_label_sequence,
    epochs = num_epochs,
    validation_data = (padded_testing, testing_label_sequence),
)


#BERT MODEL
# Fine-tunes DistilBERT through ktrain on the raw sentences and string labels.
categories = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']

# NOTE(review): the labels passed below had their spaces removed earlier in the
# notebook, while `categories` still contains spaces -- confirm ktrain treats
# this as intended.
# NOTE(review): the preprocessing log captured in this notebook reports a
# 99th-percentile sequence length of 52-54 tokens, far below maxlen=350.
trn, val, preproc = text.texts_from_array(
    x_train=training_sentences,
    y_train=training_labels,
    x_test=testing_sentences,
    y_test=testing_labels,
    class_names=categories,
    preprocess_mode='distilbert',
    maxlen=350,
)

# Prints the list of classifier names that ktrain's text module offers.
text.print_text_classifiers()

bert_model = text.text_classifier('distilbert', train_data=trn, preproc=preproc)
learner = ktrain.get_learner(
    bert_model,
    train_data=trn,
    val_data=val,
    batch_size=6,
)

# One-cycle learning-rate policy with a maximum learning rate of 1e-5.
bert_max_lr = 1e-5
bert_history = learner.fit_onecycle(bert_max_lr, num_epochs)


#Bidirectional LSTM
# Two stacked bidirectional LSTM layers on top of learned embeddings; the first
# returns the full sequence so the second can consume it.

LSTM_model = tf.keras.models.Sequential()
LSTM_model.add(tf.keras.layers.Embedding(vocab_size, embedding_dims))
LSTM_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
LSTM_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
LSTM_model.add(tf.keras.layers.Dense(64, activation = 'relu'))
LSTM_model.add(tf.keras.layers.Dense(5, activation = 'softmax'))

# Labels are integer class ids, hence the sparse variant of the loss.
LSTM_model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy'],
)

# The test split is passed as validation data.
LSTM_history = LSTM_model.fit(
    padded_training,
    training_label_sequence,
    epochs = num_epochs,
    validation_data = (padded_testing, testing_label_sequence),
)

#1-D Convolution
# A single Conv1D layer (128 filters, kernel size 5) over learned embeddings,
# averaged over the sequence and classified by a dense head.

convolution_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dims),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation = 'relu'),
    tf.keras.layers.Dense(5, activation = 'softmax'),
]
convolution_model = tf.keras.models.Sequential(convolution_layers)

# Labels are integer class ids, hence the sparse variant of the loss.
convolution_model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy'],
)

# The test split is passed as validation data.
convolution_history = convolution_model.fit(
    padded_training,
    training_label_sequence,
    epochs = num_epochs,
    validation_data = (padded_testing, testing_label_sequence),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
preprocessing train...
language: en
train sequence lengths:
	mean : 31
	95percentile : 48
	99percentile : 52




Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 33
	95percentile : 50
	99percentile : 54


task: text classification
fasttext: a fastText-like model [http://arxiv.org/pdf/1607.01759.pdf]
logreg: logistic regression using a trainable Embedding layer
nbsvm: NBSVM model [http://www.aclweb.org/anthology/P12-2018]
bigru: Bidirectional GRU with pretrained fasttext word vectors [https://fasttext.cc/docs/en/crawl-vectors.html]
standard_gru: simple 2-layer GRU with randomly initialized embeddings
bert: Bidirectional Encoder Representations from Transformers (BERT) from keras_bert [https://arxiv.org/abs/1810.04805]
distilbert: distilled, smaller, and faster BERT from Hugging Face transformers [https://arxiv.org/abs/1910.01108]
Is Multi-Label? False
maxlen is 350
done.


begin training using onecycle policy with max lr of 1e-05...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoc

In [6]:
import plotly.graph_objects as go

# Plot the per-epoch training accuracy of every model on one figure.

# Each history stores one accuracy value per completed epoch.
y1 = embedding_history.history['accuracy']
y2 = LSTM_history.history['accuracy']
y3 = convolution_history.history['accuracy']
y4 = bert_history.history['accuracy']

accuracy_curves = [
    ('Embedding accuracy', y1),
    ('LSTM accuracy', y2),
    ('1-D Convolution accuracy', y3),
    ('DistilBERT accuracy', y4),
]

# The epoch axis is derived from the recorded histories instead of a
# hard-coded 100 points, so x and y always have matching lengths whatever
# number of epochs was trained. Epochs are numbered from 1.
longest_run = max(len(curve_values) for _, curve_values in accuracy_curves)
x = np.arange(1, longest_run + 1)

fig = go.Figure()
for curve_name, curve_values in accuracy_curves:
    fig.add_trace(go.Scatter(x = x[:len(curve_values)], y = curve_values, name = curve_name))
fig.update_layout(xaxis_title = 'Epochs', yaxis_title = 'accuracy', title = 'Accuracy of Model on Training Data')
fig.show()