# MAJOR PROJECT

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("Corona_NLP.csv",encoding='latin1') #reading dataset

In [3]:
df.shape

(41157, 6)

In [4]:
import sklearn
import re

In [5]:
df.isnull().sum()

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [6]:
df.pop('Location') #removing null values

0                              London
1                                  UK
2                           Vagabonds
3                                 NaN
4                                 NaN
                     ...             
41152    Wellington City, New Zealand
41153                             NaN
41154                             NaN
41155                             NaN
41156    i love you so much || he/him
Name: Location, Length: 41157, dtype: object

In [7]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk import word_tokenize as wt
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to C:\Users\IITM
[nltk_data]     mech\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\IITM
[nltk_data]     mech\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\IITM
[nltk_data]     mech\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\IITM
[nltk_data]     mech\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
df.columns

Index(['UserName', 'ScreenName', 'TweetAt', 'OriginalTweet', 'Sentiment'], dtype='object')

In [9]:
sw = stopwords.words('English')
lem = WordNetLemmatizer()

In [10]:
msg = []                                            #pre processing
for i in df['OriginalTweet']:
    t = i.lower()
    t = re.sub('[^a-zA-Z]',' ',t)
    t = wt(t)
    t = [i for i in t if i not in sw]
    t = [lem.lemmatize(i) for i in t]
    t=' '.join(t)
    msg.append(t)

In [11]:
df['Sentiment'].replace("Extremely Positive","Positive")           #converting extremely positive and negative sentiments
df['Sentiment'].replace("Extremely Negative","Negative")

0         Neutral
1        Positive
2        Positive
3        Positive
4        Negative
           ...   
41152     Neutral
41153    Negative
41154    Positive
41155     Neutral
41156    Negative
Name: Sentiment, Length: 41157, dtype: object

In [12]:
df.dtypes

UserName          int64
ScreenName        int64
TweetAt          object
OriginalTweet    object
Sentiment        object
dtype: object

In [13]:
cv = CountVectorizer(max_features=1500)    #count vectorizer
res = cv.fit_transform(msg).toarray()
print(res)
print(cv.get_feature_names())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]




In [14]:
df = pd.DataFrame(res,columns = cv.get_feature_names())
df.head()

Unnamed: 0,able,absolutely,access,according,account,across,act,action,activity,actually,...,wuhan,year,yes,yesterday,yet,york,young,youtube,yr,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
print(len(cv.get_feature_names()))
print(cv.get_feature_names())

1500




In [16]:
x = res
y = df['sentiment']
print(type(x))
print(type(y))

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


In [17]:
print(x.shape)
print(y.shape)

(41157, 1500)
(41157,)


In [18]:
from sklearn.model_selection import train_test_split

In [19]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25)   #splitting data
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(30867, 1500)
(10290, 1500)
(30867,)
(10290,)


In [20]:
import warnings
warnings.filterwarnings('ignore')

In [21]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()

In [22]:
from sklearn.naive_bayes import MultinomialNB

In [23]:
mnb = MultinomialNB()
mnb.fit(x_train,y_train)

MultinomialNB()

In [24]:
print('Training score',mnb.score(x_train,y_train))
print('Testing score',mnb.score(x_test,y_test))

Training score 0.9898921177957042
Testing score 0.989310009718173


In [25]:
y_pred_mnb = mnb.predict(x_test)
print(y_pred_mnb)

[0 0 0 ... 0 0 0]


In [26]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [27]:
cm_mnb = confusion_matrix(y_test,y_pred_mnb)
print(cm_mnb)
print(classification_report(y_test,y_pred_mnb))

[[10152   100     3     0     0]
 [    5    28     0     0     0]
 [    0     0     0     0     0]
 [    0     1     0     0     0]
 [    0     1     0     0     0]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     10255
           1       0.22      0.85      0.34        33
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1

    accuracy                           0.99     10290
   macro avg       0.24      0.37      0.27     10290
weighted avg       1.00      0.99      0.99     10290



In [28]:
from sklearn.svm import SVC

In [29]:
mr = SVC(kernel='rbf')
mr.fit(x_train,y_train)

SVC()

In [30]:
print('Training score',mr.score(x_train,y_train))
print('Testing score',mr.score(x_test,y_test))

Training score 0.9998380147082645
Testing score 0.9994169096209913


In [31]:
y_pred_mr = mr.predict(x_test)
print(y_pred_mr)

[0 0 0 ... 0 0 0]


In [32]:
cm_mr = confusion_matrix(y_test,y_pred_mr)
print(cm_mr)
print(classification_report(y_test,y_pred_mr))

[[10255     0     0     0]
 [    4    29     0     0]
 [    0     1     0     0]
 [    0     1     0     0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10255
           1       0.94      0.88      0.91        33
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1

    accuracy                           1.00     10290
   macro avg       0.48      0.47      0.48     10290
weighted avg       1.00      1.00      1.00     10290



In [33]:
from sklearn.neighbors import KNeighborsClassifier

In [34]:
m2 = KNeighborsClassifier(n_neighbors=15)
m2.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=15)

In [35]:
print('Training score',m2.score(x_train,y_train))
print('Testing score',m2.score(x_test,y_test))

Training score 0.9970842647487608
Testing score 0.9969873663751215


In [36]:
y_pred_m2 = m2.predict(x_test)
print(y_pred_m2)

[0 0 0 ... 0 0 0]


In [37]:
print(confusion_matrix(y_test,y_pred_m2))
print(classification_report(y_test,y_pred_m2))

[[10255     0     0     0]
 [   29     4     0     0]
 [    0     1     0     0]
 [    0     1     0     0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10255
           1       0.67      0.12      0.21        33
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1

    accuracy                           1.00     10290
   macro avg       0.42      0.28      0.30     10290
weighted avg       1.00      1.00      1.00     10290



## RESULT :
 THE MODEL WITH BEST ACCURACY IS SVM CLASSIFIER