# Import libraries

In [None]:
# modeling
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.layers as L
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

# result
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
# graph settings
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases reject the old names, so use whichever name this install has.
_grid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    None,
)
if _grid_style is not None:
    plt.style.use(_grid_style)
plt.rcParams['figure.figsize'] = (12,5)    # default figure size

# data wrangling
import numpy as np 
import pandas as pd

# corpus
from nltk.corpus import stopwords

# string manipulation
import re
import spacy
import collections

In [None]:
# Fetch the NLTK "stopwords" corpus, which `stopwords.words('english')` reads
# in the cleaning step further down.
import nltk
nltk.download("stopwords")

# Dataset

In [None]:
# Directory holding the two CSV files (Kaggle input layout).
# Set DATA_DIR = '.' to read local copies of the files instead.
DATA_DIR = '../input/covid-19-nlp-text-classification'

# The files are read as latin1, as in the original notebook.
train = pd.read_csv(DATA_DIR + '/Corona_NLP_train.csv', encoding='latin1')
test = pd.read_csv(DATA_DIR + '/Corona_NLP_test.csv', encoding='latin1')
train.head()

## Dataset size

In [None]:
# Report how many rows each split contains.
n_train, n_test = len(train), len(test)
print(f'Examples in train data: {n_train}')
print(f'Examples in test data: {n_test}')

## Missing values

In [None]:
# Count the missing values in each column of the training frame.
train.isna().sum()

In [None]:
# Drop the Location column because it is not used later in the notebook.
# errors="ignore" keeps this cell re-runnable once the column is already gone.
train = train.drop(columns=["Location"], errors="ignore")
train

# Class distribution

In [None]:
# Per-class row counts of the Sentiment column for each split.
dist_train, dist_test = (
    frame['Sentiment'].value_counts() for frame in (train, test)
)

def ditribution_plot(x, y, name):
    """Draw a seaborn bar plot of `y` against `x`, titled `name`, and show it."""
    axes = sns.barplot(x=x, y=y)
    axes.set_title(name)
    plt.show()

In [None]:
# Class balance of the training split.
ditribution_plot(dist_train.index, dist_train.values, 'Class Distribution train')

In [None]:
# Class balance of the test split.
ditribution_plot(dist_test.index, dist_test.values, 'Class Distribution test')

# Data preprocessing

In [None]:
import wordcloud
from wordcloud import WordCloud
# Word cloud over the raw (uncleaned) training tweets.
allWords = ' '.join(train['OriginalTweet'])
wordCloud = WordCloud(
    width=500,
    height=300,
    random_state=21,
    max_font_size=110,
).generate(allWords)

plt.figure(figsize=(10, 8))
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Simple length statistics per tweet.
# NOTE: `dtf` is another name for `train` (no copy), so the new columns are
# added to `train` itself.
dtf = train
tweet_words = dtf["OriginalTweet"].apply(lambda text: str(text).split(" "))

dtf['word_count'] = tweet_words.apply(lambda tokens: len(tokens))
dtf['char_count'] = tweet_words.apply(lambda tokens: sum(map(len, tokens)))
# Number of "."-separated pieces == number of "." characters + 1.
dtf['sentence_count'] = dtf["OriginalTweet"].apply(lambda text: str(text).count(".") + 1)
dtf['avg_word_length'] = dtf['char_count'] / dtf['word_count']
dtf['avg_sentence_lenght'] = dtf['word_count'] / dtf['sentence_count']
dtf.head()

In [None]:
# Independent copies of the tweet text (features) and sentiment (target).
X, y = (train[column].copy() for column in ('OriginalTweet', 'Sentiment'))

## Cleaning

In [None]:
def data_cleaner(tweet, ignore_words=None):
    """Strip noise from a single tweet and drop stop words.

    Removes, in order: URLs, HTML tags, hashtags, mentions and digit runs,
    then filters out stop words and collapses whitespace.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    ignore_words : collection of str, optional
        Lower-case words to drop. When omitted, the notebook-level
        `stop_words` collection is used.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if ignore_words is None:
        ignore_words = stop_words   # notebook-level stop-word collection

    tweet = re.sub(r'http\S+', ' ', tweet)   # remove urls
    tweet = re.sub(r'<.*?>', ' ', tweet)     # remove html tags
    # Hashtags and mentions go before digits: stripping digits first would turn
    # "#2020vision" into "# vision", which the hashtag pattern no longer matches.
    tweet = re.sub(r'#\w+', ' ', tweet)      # remove hashtags
    tweet = re.sub(r'@\w+', ' ', tweet)      # remove mentions
    tweet = re.sub(r'\d+', ' ', tweet)       # remove digits

    # Compare in lower case so capitalised stop words ("The", "I") are removed
    # too; the words that are kept retain their original casing.
    return " ".join(
        word for word in tweet.split() if word.lower() not in ignore_words
    )

# A set gives constant-time membership tests; data_cleaner checks every word
# of every tweet against this collection.
stop_words = set(stopwords.words('english'))
X_cleaned = X.apply(data_cleaner)
X_cleaned.head()

In [None]:
# Word cloud over the cleaned training tweets.
allWords1 = ' '.join(X_cleaned)
wordCloud1 = WordCloud(
    width=500,
    height=300,
    random_state=21,
    max_font_size=110,
).generate(allWords1)

plt.figure(figsize=(10, 8))
plt.imshow(wordCloud1, interpolation="bilinear")
plt.axis('off')
plt.show()

## Tokenizing

In [None]:
# Fit the vocabulary on the cleaned training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_cleaned)
vocab_size = 1 + len(tokenizer.word_index)
print(f"Vocabulary size: {vocab_size}")

# Turn every tweet into a list of integer word ids and show one example.
X = tokenizer.texts_to_sequences(X_cleaned)
print("\nExample:\n")
print(f"Sentence:\n{X_cleaned[6]}")
print(f"\nAfter tokenizing :\n{X[6]}")

# Pad at the end of each sequence so all rows share one length.
X = pad_sequences(X, padding='post')
print(f"\nAfter padding :\n{X[6]}")

## Feature encoding

In [None]:
# Collapse the five sentiment strings into three integer classes.
encoding = {
    'Extremely Negative': 0,
    'Negative': 0,
    'Neutral': 1,
    'Positive': 2,
    'Extremely Positive': 2
}

# Class names, ordered by their integer id.
labels = ['Negative', 'Neutral', 'Positive']

# Map from the source column rather than replacing in place:
# - the cell can be re-run without corrupting y;
# - the result has an explicit integer dtype instead of relying on pandas'
#   object-to-int downcasting inside replace();
# - a label missing from `encoding` becomes NaN and makes astype() fail
#   loudly instead of leaving a stray string in the targets.
y = train['Sentiment'].map(encoding).astype('int64')

In [None]:
# Display the label Series as it stands after the encoding step.
y

# Model building and training

In [None]:
# Reset Keras' global state before building the model.
tf.keras.backend.clear_session()

# hyperparameters
EPOCHS = 2
BATCH_SIZE = 32
embedding_dim = 54
units = 256

# Embedding -> bidirectional GRU -> max over time -> small dense head.
model = tf.keras.Sequential()
model.add(L.Embedding(vocab_size, embedding_dim, input_length=X.shape[1]))
model.add(L.Bidirectional(L.GRU(units, return_sequences=True)))
model.add(L.GlobalMaxPool1D())
model.add(L.Dropout(0.4))
model.add(L.Dense(64, activation="relu"))
model.add(L.Dropout(0.4))
model.add(L.Dense(3))   # no activation: one raw logit per class

# The last layer emits logits, so the loss is configured with from_logits=True.
model.compile(
    optimizer='adam',
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Train with the EPOCHS / BATCH_SIZE hyperparameters defined alongside the
# model; 12% of the training rows are held out for validation.
history = model.fit(X, y, epochs=EPOCHS, validation_split=0.12, batch_size=BATCH_SIZE)

The model starts to overfit after 2 epochs, so training is stopped there.

In [None]:
def history_plot(history):
    """Show two figures from a Keras History: loss curves, then accuracy curves.

    Each figure plots the training series and the validation series stored in
    `history.history` against the epoch index.
    """
    # (train key, validation key, train label, validation label, title, y label)
    panels = (
        ('loss', 'val_loss', 'train loss', 'validation loss', 'Model Loss', 'loss'),
        ('accuracy', 'val_accuracy', 'train accuracy', 'validation accuracy',
         'Model Accuracy', 'accuracy'),
    )
    for train_key, val_key, train_label, val_label, title, y_label in panels:
        plt.plot(history.history[train_key], label=train_label)
        plt.plot(history.history[val_key], label=val_label)
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel('epoch')
        plt.legend()
        plt.show()

In [None]:
# Plot the loss and accuracy curves recorded during model.fit.
history_plot(history)

# Evaluation

## Preprocessing test data

In [None]:
# Run the test tweets through the same cleaning, the tokenizer fitted on the
# training data, and padding to the training sequence length.
X_test = test['OriginalTweet'].apply(data_cleaner)
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, padding='post', maxlen=X.shape[1])

# Map from the source column (re-runnable, explicit integer dtype); a label
# missing from `encoding` becomes NaN and makes astype() fail loudly.
y_test = test['Sentiment'].map(encoding).astype('int64')

## Accuracy and loss

In [None]:
# Loss and accuracy on the held-out test split.
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {loss}')
print(f'Test Accuracy: {acc}')

## Confusion matrix

In [None]:
# Predicted class id = position of the largest output along the last axis.
test_outputs = model.predict(X_test)
pred = np.argmax(test_outputs, axis=-1)

In [None]:
# Pin the class ids so the matrix always has one row/column per entry in
# `labels`, even if a class is absent from both y_test and pred.
class_ids = list(range(len(labels)))
conf = confusion_matrix(y_test, pred, labels=class_ids)

cm = pd.DataFrame(conf, index=labels, columns=labels)

# scikit-learn puts true classes on rows and predicted classes on columns.
ax = sns.heatmap(cm, annot=True, fmt="d")
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

## Classification report

In [None]:
# Pass the class ids explicitly so `target_names` lines up with them even if
# a class is missing from both y_test and pred.
print(classification_report(
    y_test,
    pred,
    labels=list(range(len(labels))),
    target_names=labels,
))