# Twitter Sentiment Analysis in NLP

## import libraries

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import warnings

warnings.filterwarnings('ignore')

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## load data

In [3]:
df = pd.read_csv("Twitter Sentiments.csv")
df

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


## Exploratory data analysis

In [4]:
df.shape

(31962, 3)

In [5]:
df['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


## Remove all words except alphabets and also remove stopwords(this,is,that etc) and aply stemming

In [7]:
ps = PorterStemmer()
corpus = []

for i in range(len(df)):
    print(i)
    rev = re.sub("[^a-zA-Z]",' ',df['tweet'][i])
    rev = rev.lower()
    rev = rev.split()
    rev = [ps.stem(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = ' '.join(rev)
    corpus.append(rev)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
26962
26963
26964
26965
26966
26967
26968
26969
26970
26971
26972
26973
26974
26975
26976
26977
26978
26979
26980
26981
26982
26983
26984
26985
26986
26987
26988
26989
26990
26991
26992
26993
26994
26995
26996
26997
26998
26999
27000
27001
27002
27003
27004
27005
27006
27007
27008
27009
27010
27011
27012
27013
27014
27015
27016
27017
27018
27019
27020
27021
27022
27023
27024
27025
27026
27027
27028
27029
27030
27031
27032
27033
27034
27035
27036
27037
27038
27039
27040
27041
27042
27043
27044
27045
27046
27047
27048
27049
27050
27051
27052
27053
27054
27055
27056
27057
27058
27059
27060
27061
27062
27063
27064
27065
27066
27067
27068
27069
27070
27071
27072
27073
27074
27075
27076
27077
27078
27079
27080
27081
27082
27083
27084
27085
27086
27087
27088
27089
27090
27091
27092
27093
27094
27095
27096
27097
27098
27099
27100
27101
27102
27103
27104
27105
27106
27107
27108
27109
27110
27111
27112
27113
27114
27115
27116
27117

In [8]:
corpus

['user father dysfunct selfish drag kid dysfunct run',
 'user user thank lyft credit use caus offer wheelchair van pdx disapoint getthank',
 'bihday majesti',
 'model love u take u time ur',
 'factsguid societi motiv',
 'huge fan fare big talk leav chao pay disput get allshowandnogo',
 'user camp tomorrow user user user user user user user danni',
 'next school year year exam think school exam hate imagin actorslif revolutionschool girl',
 'love land allin cav champion cleveland clevelandcavali',
 'user user welcom gr',
 'ireland consum price index mom climb previou may blog silver gold forex',
 'selfish orlando standwithorlando pulseshoot orlandoshoot biggerproblem selfish heabreak valu love',
 'get see daddi today day gettingf',
 'user cnn call michigan middl school build wall chant tcot',
 'comment australia opkillingbay seashepherd helpcovedolphin thecov helpcovedolphin',
 'ouch junior angri got junior yugyoem omg',
 'thank paner thank posit',
 'retweet agre',
 'friday smile around

## Convert a collection of text documents to a matrix of token counts

Machines cannot understand characters and words. So when dealing with text data we need to represent it in numbers to be understood by the machine. Countvectorizer is a method to convert text to numerical data. To show you how it works let’s take an example:

text = [‘Hello my name is james, this is my python notebook’]

The text is transformed to a sparse matrix as shown below.
Hello  is   james   my    name    python      notebook   this
 1     2      1      2     1        1             1        1

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(corpus).toarray()

In [10]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
X.shape

(31962, 5000)

## divide data into train and test

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size = 0.20, random_state = 0)


In [13]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((25569, 5000), (6393, 5000), (25569,), (6393,))

## define model

In [14]:
from sklearn.naive_bayes import MultinomialNB

In [15]:
mnb = MultinomialNB().fit(X_train,y_train)

## Test model

In [16]:
pred = mnb.predict(X_test)

In [17]:
pred

array([0, 0, 0, ..., 0, 0, 0])

In [18]:
y_test

13667    0
22090    0
21397    0
26091    0
23671    0
        ..
10840    0
2337     0
20627    0
15583    1
28627    0
Name: label, Length: 6393, dtype: int64

## Check accuracy and confusion matrix

In [28]:

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print(accuracy_score(y_test,pred)*100)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

94.46269357109338
[[5754  231]
 [ 123  285]]
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      5985
           1       0.55      0.70      0.62       408

    accuracy                           0.94      6393
   macro avg       0.77      0.83      0.79      6393
weighted avg       0.95      0.94      0.95      6393



## Difference between test data and predicted data

In [20]:
import numpy as np
fnl = pd.DataFrame(np.c_[y_test,pred],columns=['Actual','Predicted'])
fnl

Unnamed: 0,Actual,Predicted
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
6388,0,0
6389,0,0
6390,0,0
6391,1,0


## Save a model

In [21]:
import joblib

In [22]:
joblib.dump(mnb , 'Twitter_Sentiment_Analysis.pkl')

['Twitter_Sentiment_Analysis.pkl']

## Load a model

In [23]:
model = joblib.load('Twitter_Sentiment_Analysis.pkl')

## Test a model

In [30]:
def test_model(test_sentence):   # 0=spam , 1=ham
    rev = re.sub('[^a-zA-Z]',' ',test_sentence)
    rev = rev.lower()
    rev = rev.split()
    rev = [ps.stem(word) for word in rev if word not in set(stopwords.words('english'))]
    rev = ' '.join(rev)
    
    rev = cv.transform([rev]).toarray()
    
    output = model.predict(rev)[0]
    
    if output == 0:
        print("Positive Analysis")
    else:
        print("Negative Analysis")

In [29]:
test_model("This is a very interesting post")

Positive Analysis


In [26]:
test_model("user can call michigan middl school build wall chant tcot")

Negative Analysis


In [27]:
test_model("user see Golden temple with condemn act")

Negative Analysis


# Thank You!!!!!!!!!!!!!!!!!!