In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![image.png](attachment:ccaf1b90-dade-46f0-a108-e0c3622759b6.png)

# Import Dataset

In [None]:
# Load the training split. The file is decoded as Latin-1
# (presumably because it is not valid UTF-8 -- verify against the dataset).
train_csv_path = "/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv"
df = pd.read_csv(train_csv_path, encoding='latin1')
df.head()

In [None]:
# Dimensions of the loaded training frame as (rows, columns).
df.shape

# Unique Values and Counts of Sentiments

In [None]:
# Distinct sentiment labels present in the loaded data.
df["Sentiment"].unique()

In [None]:
# Bar chart of the number of tweets per sentiment class.
# The column is passed through the `x=` keyword: recent seaborn releases make
# the first positional parameter `data`, so a bare positional Series is no
# longer interpreted as the variable to count.
sns.countplot(x=df["Sentiment"]);

In [None]:
# Keep only the tweet text and its sentiment label; all other columns are dropped.
# .copy() makes the result an independent frame, so the later in-place
# `df.loc[..., "Sentiment"] = ...` assignments do not write into a slice of the
# original frame (which would raise pandas' SettingWithCopyWarning).
df = df[["OriginalTweet", "Sentiment"]].copy()

In [None]:
# Preview the frame after it was reduced to the tweet text and sentiment columns.
df.head()

# Visualizing unique Sentiments

In [None]:
# Pie chart showing the share of each of the five original sentiment classes.
labels = ['Extremely Negative', 'Extremely Positive', 'Negative', 'Neutral', 'Positive']

# Count all classes in a single pass instead of filtering the frame once per class.
# reindex keeps the slice order aligned with `labels` and yields 0 for an absent class.
sizes = df['Sentiment'].value_counts().reindex(labels, fill_value=0).tolist()

# `data=` is only useful when arguments are given as column names; here the
# values are passed directly, so it is omitted.
plt.pie(sizes, labels=labels, autopct='%1.2f%%', shadow=True, startangle=90)
plt.title("Sentiments percentages in train data")
plt.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
plt.show()

# Combining similar Sentiments

In [None]:
# Fold "Extremely Negative" into the plain "Negative" class.
negative_mask = df["Sentiment"].isin(["Extremely Negative", "Negative"])
df.loc[negative_mask, "Sentiment"] = "Negative"

In [None]:
# Fold "Extremely Positive" into the plain "Positive" class.
positive_mask = df["Sentiment"].isin(["Extremely Positive", "Positive"])
df.loc[positive_mask, "Sentiment"] = "Positive"

In [None]:
# Distinct labels after the merge cells; only three sentiments should remain.
df["Sentiment"].unique()

In [None]:
# Bar chart of the number of tweets per merged sentiment class.
# The column is passed through the `x=` keyword: recent seaborn releases make
# the first positional parameter `data`, so a bare positional Series is no
# longer interpreted as the variable to count.
sns.countplot(x=df["Sentiment"]);

In [None]:
# Pie chart showing the share of each of the three merged sentiment classes.
labels = ['Negative', 'Neutral', 'Positive']

# Count all classes in a single pass instead of filtering the frame once per class.
# reindex keeps the slice order aligned with `labels` and yields 0 for an absent class.
sizes = df['Sentiment'].value_counts().reindex(labels, fill_value=0).tolist()

# `data=` is only useful when arguments are given as column names; here the
# values are passed directly, so it is omitted.
plt.pie(sizes, labels=labels, autopct='%1.2f%%', shadow=True, startangle=90)
plt.title("Sentiments percentages in train data")
plt.axis("equal")  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
# Feature series (tweet text) and target series (sentiment label).
X, y = df["OriginalTweet"], df["Sentiment"]

# Data Preprocessing

# Importing NLP libraries

In [None]:
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import one_hot

In [None]:
# Independent (deep) copy of the tweet text to preprocess; X itself is left untouched.
messages = X.copy(deep=True)

In [None]:
# Number of tweets that will be preprocessed.
messages.shape

In [None]:
# Peek at the first few tweets before any cleaning.
messages.head()

In [None]:
# Single Porter stemmer instance, reused for every token when the corpus is built.
ps=PorterStemmer()

In [None]:
# Build the cleaned corpus: one lower-cased, stop-word-free, stemmed string per tweet.
#
# The clean-up steps are chained on the same string so that each one actually
# contributes to the result. URLs, HTML tags, @mentions and #hashtags are
# stripped first, while their marker characters are still present; only then
# is everything that is not an ASCII letter blanked out.
# NOTE: hashtags are removed together with their word (pattern `#\w+`), as the
# original comments intended.

# Build the stop-word set once: set membership is O(1), and the word list is
# not rebuilt for every single token.
stop_words = set(stopwords.words("english"))

# Patterns are compiled once, outside the loop.
url_pattern = re.compile(r'http\S+')
html_tag_pattern = re.compile(r'<.*?>')
mention_pattern = re.compile(r'@\w+')
hashtag_pattern = re.compile(r'#\w+')
non_letter_pattern = re.compile(r'[^a-zA-Z]')

corpus = []
# Iterate over the values directly so the loop does not depend on the Series
# having a default 0..n-1 index.
for message in messages:
    text = str(message)
    # remove urls
    text = url_pattern.sub(' ', text)
    # remove html tags
    text = html_tag_pattern.sub(' ', text)
    # remove mentions
    text = mention_pattern.sub(' ', text)
    # remove hashtags
    text = hashtag_pattern.sub(' ', text)
    # remove everything other than letters
    text = non_letter_pattern.sub(' ', text)
    # lower-case and tokenise on whitespace
    tokens = text.lower().split()
    # drop stop words, stem what is left
    review = [ps.stem(word) for word in tokens if word not in stop_words]
    # join the tokens back into a single string
    corpus.append(" ".join(review))

# Encoding the dependent variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the sentiment strings with integer class ids.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

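**Sentiment classes and their encoded values**

The "Extremely Negative" and "Extremely Positive" classes were merged into "Negative" and "Positive" above, so the label encoder sees only three classes (numbered in alphabetical order):

Negative --> 0

Neutral --> 1

Positive --> 2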

In [None]:
# Vocabulary size: passed to the word-hashing step as the index range and,
# plus one, used as the Embedding layer's input dimension.
voc_size=5000

In [None]:
# Map every word of each cleaned tweet to an integer index bounded by voc_size.
# This is a hashing trick rather than a true one-hot encoding, so distinct
# words can collide on the same index.
#
# one_hot() hashes with Python's built-in hash(), which is salted per process
# for strings, so the indices would change between kernel restarts. Using
# hashing_trick() with the stable "md5" hash keeps the encoding reproducible;
# all other defaults (filters, lower-casing, split) are the same as one_hot().
from tensorflow.keras.preprocessing.text import hashing_trick

onehot = [hashing_trick(words, voc_size, hash_function="md5") for words in corpus]

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Bring every sequence to a fixed length of 305: shorter sequences are padded
# with zeros at the front (padding="pre"); pad_sequences' default truncation
# applies to longer ones.
embedded_docs=pad_sequences(onehot,padding="pre",maxlen=305)

# Model Creation

In [None]:
# Import everything from tensorflow.keras. Mixing the standalone `keras`
# package with `tensorflow.keras` can combine incompatible layer/model classes,
# and `keras.utils.np_utils` no longer exists in newer Keras releases;
# `to_categorical` lives in `tensorflow.keras.utils`.
from tensorflow.keras.layers import (
    Embedding, LSTM, Dense, Dropout, Bidirectional,
    GlobalAveragePooling1D, Input, GlobalMaxPool1D,
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [None]:
# Classifier: embedding -> bidirectional LSTM -> max-pool over time -> 3-way softmax,
# with 50% dropout after each of the first three stages.
model = Sequential()
model.add(Embedding(voc_size + 1, 305, input_length=len(embedded_docs[0])))
model.add(Dropout(0.5))
# return_sequences=True keeps the per-timestep outputs for the pooling layer.
model.add(Bidirectional(LSTM(200, return_sequences=True)))
model.add(Dropout(0.5))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Sanity check: number of padded sequences next to the shape of the label array.
len(embedded_docs),y.shape

In [None]:
# One-hot encode the integer labels into 3 columns, matching the 3-unit softmax output.
y = to_categorical(y, num_classes=3)

In [None]:
#Creating new independent and dependent variables
import numpy as np

# Materialise the padded sequences and the one-hot labels as NumPy arrays.
X_final, y_final = np.array(embedded_docs), np.array(y)

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples; the fixed random_state makes the split repeatable.
split = train_test_split(X_final, y_final, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split

# Training the Model

In [None]:
# Train for 5 epochs in batches of 64; the held-out split is used as validation data.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test, y_test),
)

# Visualising the results

In [None]:
# Per-epoch curves recorded by model.fit().
metrics = history.history
acc, val_acc = metrics['accuracy'], metrics['val_accuracy']
loss, val_loss = metrics['loss'], metrics['val_loss']

epochs = range(len(acc))

# (title, training values, training label, validation values, validation label)
curves = [
    ('Training and validation accuracy', acc, 'Training accuracy', val_acc, 'Validation accuracy'),
    ('Training and validation loss', loss, 'Training Loss', val_loss, 'Validation Loss'),
]

for position, (title, train_values, train_label, val_values, val_label) in enumerate(curves):
    # The first chart uses the implicit current figure; each later chart gets its own.
    if position > 0:
        plt.figure()
    plt.plot(epochs, train_values, 'bo', label=train_label)
    plt.plot(epochs, val_values, 'b', label=val_label)
    plt.title(title)
    plt.legend()

plt.show()