In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [56]:
# opening the file
df = pd.read_csv('Corona_NLP.csv', encoding='latin1')
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [57]:
df.isnull().sum()

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [58]:
# removing the null from the data
df1 = df.dropna()
df1


Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
5,3804,48756,"ÃT: 36.319708,-82.363649",16-03-2020,As news of the regionÂs first confirmed COVID...,Positive
6,3805,48757,"35.926541,-78.753267",16-03-2020,Cashier at grocery store was sharing his insig...,Positive
...,...,...,...,...,...,...
41147,44946,89898,"Brooklyn, NY",14-04-2020,YÂall really shitting that much more at home?...,Negative
41149,44948,89900,"Toronto, Ontario",14-04-2020,Still shocked by the number of #Toronto superm...,Negative
41150,44949,89901,OHIO,14-04-2020,I never that weÂd be in a situation &amp; wor...,Positive
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral


In [59]:
# null values removed
df1.isnull().sum()

UserName         0
ScreenName       0
Location         0
TweetAt          0
OriginalTweet    0
Sentiment        0
dtype: int64

In [60]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [61]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\motap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [62]:
sw = stopwords.words('english')
lm = WordNetLemmatizer()

In [63]:
df1['Sentiment'].value_counts()

Sentiment
Positive              9110
Negative              7763
Neutral               6172
Extremely Positive    5273
Extremely Negative    4249
Name: count, dtype: int64

In [64]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\motap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [65]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\motap\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [66]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\motap\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### preprocessing the covid tweets

In [67]:
msg = []
for i in df1['OriginalTweet']:
    t = re.sub('[^A-Za-z0-9]',' ',i)  # removal of punctuations
    t = t.lower()                     # conversion of words into lower case
    t = word_tokenize(t)              # tokenization of words
    t= [i for i in t if i not in sw]  # removing the stop words
    t = [lm.lemmatize(i) for i in t]  # lemmatization of words
    t = " ".join(t)                   # joining the sentences
    msg.append(t)

In [68]:
print(msg[:10])

['menyrbie phil gahan chrisitv http co ifz9fan2pa http co xx6ghgfzcc http co i2nlzdxno8', 'advice talk neighbour family exchange phone number create contact list phone number neighbour school employer chemist gp set online shopping account po adequate supply regular med order', 'coronavirus australia woolworth give elderly disabled dedicated shopping hour amid covid 19 outbreak http co binca9vp8p', 'news region first confirmed covid 19 case came sullivan county last week people flocked area store purchase cleaning supply hand sanitizer food toilet paper good tim dodson report http co cfxch7a2lu', 'cashier grocery store sharing insight covid 19 prove credibility commented civics class know talking http co iefdnehgdo', 'supermarket today buy toilet paper rebel toiletpapercrisis covid 19 http co evxkqlidaz', 'due covid 19 retail store classroom atlanta open walk business class next two week beginning monday march 16 continue process online phone order normal thank understanding http co kw

### Transform the words into vectors using Count Vectorizer

In [69]:
from sklearn.feature_extraction.text import CountVectorizer

In [70]:
cv = CountVectorizer(max_features=2000)
sm = cv.fit_transform(msg).toarray()

In [71]:
print(len(cv.get_feature_names_out()))

2000


In [72]:
sm.shape

(32567, 2000)

In [73]:
print(len(sm[0]))
print(len(sm[87]))

2000
2000


### Converting the 'Extremely Positive' and 'Extremely Negative' Sentiments to 'Positive' and 'Negative' sentiments respectively


In [74]:
df2 = df1.replace(to_replace ="Extremely Positive",
                 value ="Positive")

In [75]:
df3 =df2.replace(to_replace ="Extremely Negative",
                 value ="Negative")

In [76]:
df3.head(10)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
5,3804,48756,"ÃT: 36.319708,-82.363649",16-03-2020,As news of the regionÂs first confirmed COVID...,Positive
6,3805,48757,"35.926541,-78.753267",16-03-2020,Cashier at grocery store was sharing his insig...,Positive
7,3806,48758,Austria,16-03-2020,Was at the supermarket today. Didn't buy toile...,Neutral
8,3807,48759,"Atlanta, GA USA",16-03-2020,Due to COVID-19 our retail store and classroom...,Positive
9,3808,48760,"BHAVNAGAR,GUJRAT",16-03-2020,"For corona prevention,we should stop to buy th...",Negative
10,3809,48761,"Makati, Manila",16-03-2020,All month there hasn't been crowding in the su...,Neutral
11,3810,48762,"Pitt Meadows, BC, Canada",16-03-2020,"Due to the Covid-19 situation, we have increas...",Positive


In [77]:
df3['Sentiment'].value_counts()

Sentiment
Positive    14383
Negative    12012
Neutral      6172
Name: count, dtype: int64

### converting the positive ,negative and neutral text into binary

In [78]:
from sklearn.preprocessing import LabelEncoder

In [79]:
lb = LabelEncoder()

In [80]:
df3['Sentiment'] = lb.fit_transform(df3['Sentiment'])

In [81]:
df3['Sentiment'].value_counts()

Sentiment
2    14383
0    12012
1     6172
Name: count, dtype: int64

In [82]:
# denoting postive tweets with - 2
# denoting negative tweets with - 0
# denoting neutral tweets with - 1


In [83]:
x = sm
y = df3['Sentiment']
print(type(x))
print(type(y))

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


### splitting the 20% of data for test data and 80% for training data

In [84]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20)

In [85]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(26053, 2000)
(6514, 2000)
(26053,)
(6514,)


### now using Multinomial Naïve Bayes Classification and predicting the sentiment

In [86]:
from sklearn.naive_bayes import MultinomialNB

In [87]:
m1 = MultinomialNB()

In [88]:
m1.fit(x_train,y_train)

In [89]:
print("training score",m1.score(x_train,y_train))
print("testing score",m1.score(x_test,y_test))

training score 0.7096303688634706
testing score 0.6799201719373656


In [90]:
ypred_m1 = m1.predict(x_test)
print("The predicted sentiment", ypred_m1)

The predicted sentiment [1 2 2 ... 2 2 0]


### Computing Confusion matrix and classification report for Multinomial Naïve Bayes Classification


In [91]:
from sklearn.metrics import confusion_matrix,classification_report


In [92]:
cm = confusion_matrix(ypred_m1,y_test)
cm

array([[1703,  247,  521],
       [ 282,  739,  301],
       [ 454,  280, 1987]], dtype=int64)

In [93]:
print(classification_report(y_test,ypred_m1))

              precision    recall  f1-score   support

           0       0.69      0.70      0.69      2439
           1       0.56      0.58      0.57      1266
           2       0.73      0.71      0.72      2809

    accuracy                           0.68      6514
   macro avg       0.66      0.66      0.66      6514
weighted avg       0.68      0.68      0.68      6514



### now using logistic regression and predicting the sentiment

In [100]:
from sklearn.linear_model import LogisticRegression

In [101]:
m3 = LogisticRegression()

In [102]:
# prediction
m3.fit(x_train,y_train) # ml model will be trained on training data
ypred_m3 = m3.predict(x_test) # ml model trained on training data is used to generate the predictions on the test data
print("The predicted sentiment",ypred_m3)

The predicted sentiment [1 2 2 ... 2 2 0]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [103]:
#  checking  accuracy
print("training score",m3.score(x_train,y_train))
print("testing score",m3.score(x_test,y_test))

training score 0.8354508118067018
testing score 0.7789376727049432


In [104]:
# Computing Confusion matrix and classification report for logisticregression model.

In [105]:
cm2 = confusion_matrix(ypred_m3,y_test)
cm2

array([[1894,  174,  337],
       [ 223,  914,  206],
       [ 322,  178, 2266]], dtype=int64)

In [106]:
print(classification_report(y_test,ypred_m3))

              precision    recall  f1-score   support

           0       0.79      0.78      0.78      2439
           1       0.68      0.72      0.70      1266
           2       0.82      0.81      0.81      2809

    accuracy                           0.78      6514
   macro avg       0.76      0.77      0.77      6514
weighted avg       0.78      0.78      0.78      6514

