In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing necessary libraries
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics import confusion_matrix,classification_report
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the training split (decoded as ISO-8859-1) and preview the first rows.
train=pd.read_csv(
    '/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv',
    encoding='ISO-8859-1',
)
train.head(n=5)

In [None]:
# Load the test split and preview the first rows.
# NOTE(review): this file is read with pandas' default encoding, unlike the
# training file (ISO-8859-1) - confirm that the difference is intended.
test=pd.read_csv(
    '/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv',
)
test.head(n=5)

In [None]:
# (rows, columns) of the train and test frames, shown as a pair
(train.shape, test.shape)

In [None]:
# Column names, non-null counts and dtypes of the training frame
train.info()

In [None]:
# Column names, non-null counts and dtypes of the test frame
test.info()

In [None]:
# Number of missing values per column in the training frame
train.isna().sum()

In [None]:
# Number of missing values per column in the test frame
test.isna().sum()

*Since we are only interested in the tweets and their sentiment, we remove the other columns from the train and test sets.*

In [None]:
# Metadata columns that are not used by the sentiment model
unused_columns=['TweetAt','UserName','ScreenName','Location']
# Drop them in place from both frames
for frame in (train,test):
    frame.drop(columns=unused_columns,inplace=True)

In [None]:
# Missing values per remaining column in the training frame
train.isna().sum()

In [None]:
# Missing values per remaining column in the test frame
test.isna().sum()

In [None]:
# Use the shorter column name 'Tweet' in both frames (renamed in place)
column_renames={'OriginalTweet':'Tweet'}
for frame in (train,test):
    frame.rename(columns=column_renames,inplace=True)

In [None]:
# Number of training tweets per sentiment label
train.loc[:,'Sentiment'].value_counts()

In [None]:
# Number of test tweets per sentiment label
test.loc[:,'Sentiment'].value_counts()

*Both the train and test sets have 5 classes rather than the 3 classes we want (Positive, Neutral and Negative). We therefore relabel Extremely Positive tweets as Positive and Extremely Negative tweets as Negative.*

In [None]:
# Collapse the two "Extremely ..." labels into their base class so that only
# Positive / Neutral / Negative remain.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection (train.Sentiment) is a chained in-place operation that
# pandas warns about and that does not modify the frame under Copy-on-Write.
extreme_to_base={'Extremely Positive': 'Positive','Extremely Negative': 'Negative'}
train['Sentiment']=train['Sentiment'].replace(extreme_to_base)
test['Sentiment']=test['Sentiment'].replace(extreme_to_base)

In [None]:
# Share of each sentiment label in the training set, in percent
train['Sentiment'].value_counts(normalize=True).mul(100)

*The training dataset has 43.84% positive tweets, 37.41% negative tweets and 18.74% neutral tweets.*

In [None]:
# Share of each sentiment label in the test set, in percent
test['Sentiment'].value_counts(normalize=True).mul(100)

*The test dataset has 42.99% negative tweets, 40.70% positive tweets and 16.29% neutral tweets.*

***The test dataset has a larger share of negative tweets and a smaller share of positive tweets than the training dataset. The share of neutral tweets differs only slightly between the two sets.***

In [None]:
#mapping of sentiment values
# Only the Sentiment column is encoded. A frame-wide replace would also rewrite
# any other cell (e.g. a tweet) whose whole value happens to be one of the label
# words, and it relies on pandas' deprecated implicit downcasting.
sentiment_to_id={'Negative':0,'Neutral':1,'Positive':2}

def _encode_sentiment(labels):
    """Return `labels` with the three class names replaced by their integer ids.

    Values that are not a known class name are passed through unchanged, so
    re-running the cell on already-encoded labels is harmless; the final cast to
    int64 raises if any unexpected string label is still present.
    """
    return labels.map(lambda label: sentiment_to_id.get(label, label)).astype('int64')

train['Sentiment']=_encode_sentiment(train['Sentiment'])
test['Sentiment']=_encode_sentiment(test['Sentiment'])

In [None]:
# Remove fully duplicated rows from both frames, keeping the first occurrence
train, test = (
    train.drop_duplicates(keep='first'),
    test.drop_duplicates(keep='first'),
)

In [None]:
# Corpora needed by the preprocessing step: the stopword list, plus the WordNet
# data that WordNetLemmatizer looks up ('omw-1.4' is additionally required by
# some NLTK releases). Downloads are skipped when the data is already present.
for resource in ('stopwords','wordnet','omw-1.4'):
    nltk.download(resource)

In [None]:
#for lemmatization
lemma=WordNetLemmatizer()
# English stopwords, built once here instead of once per token inside the function
STOP_WORDS=frozenset(stopwords.words('english'))

#function for preprocessing
def preprocessing(review_text):
    """Clean one tweet and return it as a space-separated string of lemmas.

    Steps, in order:
      1. replace URLs (anything starting with ``http``) by a space;
      2. replace every character that is not an ASCII letter by a space, which
         removes digits and punctuation (including hyphens);
      3. lowercase and split on whitespace;
      4. drop English stopwords and lemmatize the remaining tokens.

    Parameters
    ----------
    review_text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet; an empty string if no token survives.
    """
    review_text=re.sub(r'http\S+',' ',review_text) #removing the url
    # '[^a-zA-Z]' rather than '[^a-z-A-Z]': the extra '-' made hyphens survive
    review_text=re.sub('[^a-zA-Z]',' ',review_text) #removing numbers and punctuation
    tokens=review_text.lower().split()   #converting all characters into lowercase
    return " ".join(lemma.lemmatize(token) for token in tokens
                    if token not in STOP_WORDS) #removing stopwords

In [None]:
# Clean every tweet in both splits with the preprocessing function
train['Tweet']=train['Tweet'].apply(preprocessing)
test['Tweet']=test['Tweet'].apply(preprocessing)

In [None]:
# Training tweets grouped by encoded sentiment (2 = positive, 0 = negative, 1 = neutral)
positive=train.loc[train['Sentiment']==2,'Tweet']
negative=train.loc[train['Sentiment']==0,'Tweet']
neutral=train.loc[train['Sentiment']==1,'Tweet']

In [None]:
#displaying top 20 words having highest frequency amongst all three classes
color=['Accent','Paired','Pastel1']
splitedData=[positive,negative,neutral]
# Same order as splitedData; used to title each figure so the three plots can
# be told apart.
class_names=['Positive','Negative','Neutral']

for class_name, tweets, cmap in zip(class_names, splitedData, color):
    fig, ax = plt.subplots(figsize=(10,10))
    # Join all tweets of the class into one string, split it into words and
    # keep the 20 most frequent ones.
    top_words = pd.Series(' '.join(tweets).split()).value_counts().head(20)
    top_words.plot(kind='bar', colormap=cmap, ax=ax)
    ax.set(title='Top 20 words in ' + class_name + ' tweets',
           xlabel='Word', ylabel='Frequency')

*Coronavirus is the most common word in all three classes.*

In [None]:
# Features are the cleaned tweets, targets are the encoded sentiment labels
x_train, y_train = train['Tweet'], train['Sentiment']
x_test, y_test = test['Tweet'], test['Sentiment']

In [None]:
# Longest training tweet measured in whitespace-separated tokens.
# len(x) on the raw string counts characters, which made every sequence be
# padded to several times the length of the longest tweet.
# Test tweets with more tokens than this are truncated by pad_sequences.
max_len = int(x_train.str.split().str.len().max())
max_len

In [None]:
#converting text to numeric using tokenizer with padding sequences to max length
tokenizer=Tokenizer()
tokenizer.fit_on_texts(x_train)   # vocabulary is built from the training tweets only
# one extra slot on top of the word index, for the 0 used as padding value
vocab_length = 1 + len(tokenizer.word_index)

# texts -> integer sequences -> fixed-length arrays padded at the end
x_train = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=max_len, padding='post')
x_test = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=max_len, padding='post')

In [None]:
# Report the two sizes that determine the embedding layer's input
for label, value in (("Vocab length:", vocab_length),
                     ("Max sequence length:", max_len)):
    print(label, value)

In [None]:
#making use of bidirectional LSTM with dropout
embedding_dim=20
model1 = Sequential([
    # The embedding layer. mask_zero=True marks index 0 (the padding value) as
    # masked so the LSTM skips padded timesteps instead of reading them as tokens.
    Embedding(vocab_length,embedding_dim,input_length=max_len,mask_zero=True),
    Bidirectional(LSTM(50,dropout=0.5)),   # Our LSTM layer
    Dense(32,activation='relu'),
    Dropout(0.5),
    Dense(3,activation='softmax'),         # one output per sentiment class
])

# categorical_crossentropy expects one-hot encoded targets
model1.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
#converting target into categorical array
# to_categorical comes from tensorflow.keras.utils (imports cell), the same
# package as every other Keras import here, rather than from standalone keras.
# One-hot encode the integer labels into arrays with 3 columns (one per class).
y_train = to_categorical(y_train, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)

In [None]:
# Train for 5 epochs, holding out 20% of the training data for validation
model1.fit(
    x=x_train,
    y=y_train,
    epochs=5,
    validation_split=0.2,
)

In [None]:
# Loss and accuracy on the test set
model1.evaluate(x=x_test, y=y_test)

In [None]:
# Predicted class = index of the highest softmax probability
pred = np.argmax(model1.predict(x_test),axis=-1)
# y_test is one-hot encoded, so argmax recovers the integer labels.
# labels=[0,1,2] pins the matrix to 3x3 even if a class is never seen/predicted.
class_names = ['Negative','Neutral','Positive']
cm = confusion_matrix(np.argmax(y_test,1),pred,labels=[0,1,2])
plt.figure(figsize=(10,5))
# fmt='d' prints the counts as plain integers instead of scientific notation
ax = sns.heatmap(cm,annot=True,fmt='d',xticklabels=class_names,yticklabels=class_names)
ax.set(xlabel='Predicted label',ylabel='True label',title='Confusion matrix (test set)');

In [None]:
# Per-class precision, recall and F1 on the test set
true_labels = np.argmax(y_test,axis=1)   # integer labels recovered from the one-hot rows
report = classification_report(true_labels,pred,target_names=['Negative','Neutral','Positive'])
print(report)

*If you like this notebook then upvote and share it.*

*Do provide your valuable feedback.*

*Thank you.*