In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Import necessary libraries**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import re
import string
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku 
import tensorflow as tf

In [None]:
# Locations of the train and test CSV files inside the Kaggle input directory.
train_path = "/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv"
test_path = "/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv"

# Both files are decoded with the latin_1 codec.
train_data, test_data = (
    pd.read_csv(csv_path, encoding='latin_1') for csv_path in (train_path, test_path)
)

**1. Go through the data - shape, columns, values, different labels**

In [None]:
# Peek at the first row to see which columns the raw training file provides
train_data.head(1)

In [None]:
# Check the columns, their dtypes and how many non-null values each one holds
train_data.info()

In [None]:
# Check the size of the dataset as a (rows, columns) tuple
train_data.shape

**2. Handle missing values**

In [None]:
# Count the null (NaN) values in each column
train_data.isna().sum()

**Observation:** The "Location" field has null values, but we only need the "OriginalTweet" and "Sentiment" columns for this task, so the remaining features make no difference.

**3. Get only desired columns**

In [None]:
# Keep only the tweet text and its label. The explicit .copy() makes this an
# independent frame, so later cells that assign columns on train_data do not
# write into a slice of the raw frame (pandas' SettingWithCopyWarning).
train_data = train_data[['OriginalTweet', 'Sentiment']].copy()
train_data.head()

**4. Percentage of sentiments appearance**

In [None]:
# Sorted array of the distinct sentiment categories found in the training labels
np.unique(train_data['Sentiment'])

**Observation:** The sentiments are divided into 5 categories: 'Extremely Negative', 'Extremely Positive', 'Negative', 'Neutral', and 'Positive'

In [None]:
# Horizontal bar chart of how many training tweets fall into each sentiment.
train_count_ax = sns.countplot(data=train_data, y='Sentiment')
train_count_ax.set_title("Number of Sentiments in train data")

In [None]:
# Horizontal bar chart of how many test tweets fall into each sentiment.
test_count_ax = sns.countplot(data=test_data, y='Sentiment')
test_count_ax.set_title("Number of Sentiments in test data")

In [None]:
# Share of each sentiment class in the train data
labels = ['Extremely Negative', 'Extremely Positive', 'Negative', 'Neutral', 'Positive']
# One pass over the column instead of five boolean filters; reindex keeps the
# counts in the same order as `labels` and yields 0 for a class that is absent.
sizes = train_data['Sentiment'].value_counts().reindex(labels, fill_value=0).tolist()
# `data=` is only consulted when arguments are given as column-name strings,
# which is not the case here, so it is omitted.
plt.pie(sizes, labels=labels, autopct='%1.2f%%', shadow=True, startangle=90)
plt.title("Sentiments percentages in train data")
plt.axis("equal")

**Observation:** Most of the tweets are Positive (approx. 27.75%), followed by Negative (24.10%).

In [None]:
# Share of each sentiment class in the test data
labels = ['Extremely Negative', 'Extremely Positive', 'Negative', 'Neutral', 'Positive']
# One pass over the column instead of five boolean filters; reindex keeps the
# counts in the same order as `labels` and yields 0 for a class that is absent.
sizes = test_data['Sentiment'].value_counts().reindex(labels, fill_value=0).tolist()
# `data=` is only consulted when arguments are given as column-name strings,
# which is not the case here, so it is omitted.
plt.pie(sizes, labels=labels, autopct='%1.2f%%', shadow=True, startangle=90)
plt.title("Sentiments percentages in test data")
plt.axis("equal")

In [None]:
# Show the full text of every column in DataFrame output. None means "no limit";
# the former -1 sentinel was deprecated in pandas 1.0 and is no longer accepted
# by newer pandas releases.
pd.set_option('display.max_colwidth', None)

**5. Remove unwanted text**

In [None]:
# English stop-word list from NLTK's stopwords corpus, used by the cleaning
# function below. NOTE(review): the corpus must already be available to NLTK
# (e.g. via nltk.download("stopwords")) -- confirm for the target environment.
stop_words = stopwords.words("english")

In [None]:
# clean unwanted text like stopwords, @(Mention), https(url), #(Hashtag), punctuations
def removeUnwantedText(text, stopword_list=None):
    """Strip tweet noise from ``text`` and return the cleaned string.

    Steps, in order: URLs, @mentions, #hashtags and HTML tags are each replaced
    by a space; whitespace-separated words found in the stop-word list are
    dropped (compared case-insensitively); finally every character in
    ``string.punctuation`` is deleted.

    Args:
        text: Raw tweet text (a ``str``).
        stopword_list: Optional iterable of lowercase stop words. When omitted,
            the module-level ``stop_words`` is used.

    Returns:
        The cleaned text, with the remaining words joined by single spaces.
    """
    if stopword_list is None:
        stopword_list = stop_words
    # Set membership is O(1) per word instead of scanning the list every time.
    stopword_set = set(stopword_list)

    # remove urls
    text = re.sub(r'http\S+', ' ', text)

    # remove mentions
    text = re.sub(r'@\w+', ' ', text)

    # remove hashtags
    text = re.sub(r'#\w+', ' ', text)

    # remove html tags -- the raw-string prefix belongs outside the quotes;
    # the pattern 'r<.*?>' only matched a tag preceded by a literal "r"
    text = re.sub(r'<.*?>', ' ', text)

    # remove stop words; lower-casing the probe lets "The" match "the"
    kept_words = [word for word in text.split() if word.lower() not in stopword_set]
    text = ' '.join(kept_words)

    # remove punctuation in a single pass
    return text.translate(str.maketrans('', '', string.punctuation))

In [None]:
# Run the same cleaning routine over the tweet column of both splits.
for frame in (train_data, test_data):
    frame['OriginalTweet'] = frame['OriginalTweet'].apply(removeUnwantedText)

**6. Machine learning models only understand numeric input, so transform the data**

In [None]:
# Sentiments are categories, so map them to integer codes with a LabelEncoder
label_encoder = LabelEncoder()
train_data['Encoded_Sentiment'] = label_encoder.fit_transform(train_data['Sentiment'])
# Reuse the mapping learned from the training labels: re-fitting on the test
# labels would silently renumber the classes whenever the two label sets differ.
test_data['Encoded_Sentiment'] = label_encoder.transform(test_data['Sentiment'])

In [None]:
# Show the first rows to check the new Encoded_Sentiment column next to the tweets
train_data.head()

**Different Sentiments with values**
1. Extremely Negative --> 0 
2. Extremely Positive --> 1
3. Negative --> 2
4. Neutral --> 3
5. Positive --> 4

In [None]:
# Inputs are the tweet texts; targets are the integer sentiment codes.
x_train, y_train = train_data['OriginalTweet'], train_data['Encoded_Sentiment']
x_test, y_test = test_data['OriginalTweet'], test_data['Encoded_Sentiment']

**Let us handle the original tweets**

In [None]:
# Longest tweet in each split, measured in whitespace-separated words. These
# values end up as the padding length for sequences of word indices, so they
# must be word counts; the character count used previously is several times
# larger and only adds padding to every sequence.
train_sequence_length = int(x_train.str.split().str.len().max())
test_sequence_length = int(x_test.str.split().str.len().max())

In [None]:
# Use the longer of the two splits as the common sequence length
max_sequence_len = max(train_sequence_length, test_sequence_length)
max_sequence_len

**7. Preprocess the data**

In [None]:
# Learn the vocabulary from the training tweets (each word gets an integer id)
train_tokenizer = Tokenizer()
train_tokenizer.fit_on_texts(x_train)

# Number of known words plus one; this is later used as the embedding's input size
total_words = 1 + len(train_tokenizer.word_index)

# Turn every tweet into its list of word ids, then pad at the front so that all
# sequences share the same length
train_tokens = train_tokenizer.texts_to_sequences(x_train)
train_input_sequences = pad_sequences(
    train_tokens,
    maxlen=max_sequence_len,
    padding='pre',
)

In [None]:
# Encode the test tweets with the vocabulary learned from the training tweets.
# A tokenizer fitted separately on the test set builds its own word -> id
# mapping, so its ids would not line up with the ids the model is trained on.
# Words that never occurred in training are dropped by texts_to_sequences
# (train_tokenizer is created without an oov_token).
# The old name is kept as an alias for any later cell that still refers to it.
test_tokenizer = train_tokenizer

# create input sequences using list of tokens
test_tokens = test_tokenizer.texts_to_sequences(x_test)

# Make all text of same length
test_input_sequences = pad_sequences(test_tokens, maxlen=max_sequence_len, padding='pre')

In [None]:
# One-hot encode the integer codes of both splits; there are 5 different sentiments
train_labels, test_labels = (
    ku.to_categorical(codes, num_classes=5) for codes in (y_train, y_test)
)

**8. Model creation**

In [None]:
# Embedding -> bidirectional LSTM -> average over time steps -> dense head
# ending in a 5-way softmax (one output per sentiment class)
model = Sequential()
model.add(Embedding(total_words, 16, input_length=max_sequence_len))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dense(5, activation='softmax'))

# Labels are one-hot vectors, hence categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()


In [None]:
# Train for 20 epochs. The test split is passed as validation data, so the
# val_* entries recorded in `history` are measured on it.
history = model.fit(
    x=train_input_sequences,
    y=train_labels,
    validation_data=(test_input_sequences, test_labels),
    epochs=20,
    verbose=1,
)

In [None]:
# Per-epoch metrics recorded by model.fit
metrics = history.history
acc, val_acc = metrics['accuracy'], metrics['val_accuracy']
loss, val_loss = metrics['loss'], metrics['val_loss']

epochs = range(len(acc))

# One figure per metric: (title, [(values, line format, legend label), ...]).
# 'bo' draws the training values as dots, 'b' draws the validation values as a line.
curve_sets = [
    ('Training and validation accuracy',
     [(acc, 'bo', 'Training accuracy'), (val_acc, 'b', 'Validation accuracy')]),
    ('Training and validation loss',
     [(loss, 'bo', 'Training Loss'), (val_loss, 'b', 'Validation Loss')]),
]

for position, (title, curves) in enumerate(curve_sets):
    if position:
        # every set after the first gets a fresh figure
        plt.figure()
    for values, fmt, label in curves:
        plt.plot(epochs, values, fmt, label=label)
    plt.title(title)
    plt.legend()

plt.show()