In [1]:
#import required libraries.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re


In [2]:
#Read the .csv file using pandas
df = pd.read_csv("Twitter_Data.csv")

In [3]:
#get no. of rows and columns
df.shape

(162980, 2)

In [4]:
#To list first 5 records with header
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [5]:
#To check if any non-null value present in the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


### Clean the data

In [6]:
# Clean HTML tags present in dataset
def strip_tags(text):         
    pattern = re.compile('<.*?>')   
    result = re.sub(pattern,'', str(text))
    return result  

In [7]:
#lower the string
def to_lower(str):
    return str.lower()

In [8]:
#apply clean html
df["clean_text"] = df["clean_text"].apply(strip_tags)

In [9]:
df["clean_text"]

0         when modi promised “minimum government maximum...
1         talk all the nonsense and continue all the dra...
2         what did just say vote for modi  welcome bjp t...
3         asking his supporters prefix chowkidar their n...
4         answer who among these the most powerful world...
                                ...                        
162975    why these 456 crores paid neerav modi not reco...
162976    dear rss terrorist payal gawar what about modi...
162977    did you cover her interaction forum where she ...
162978    there big project came into india modi dream p...
162979    have you ever listen about like gurukul where ...
Name: clean_text, Length: 162980, dtype: object

In [10]:
#apply string lower to each sentence
df["clean_text"] = df["clean_text"].apply(to_lower)

In [11]:
df["clean_text"]

0         when modi promised “minimum government maximum...
1         talk all the nonsense and continue all the dra...
2         what did just say vote for modi  welcome bjp t...
3         asking his supporters prefix chowkidar their n...
4         answer who among these the most powerful world...
                                ...                        
162975    why these 456 crores paid neerav modi not reco...
162976    dear rss terrorist payal gawar what about modi...
162977    did you cover her interaction forum where she ...
162978    there big project came into india modi dream p...
162979    have you ever listen about like gurukul where ...
Name: clean_text, Length: 162980, dtype: object

In [12]:
df['clean_text'][15:20]

15    vote such party and leadershipwho can take fas...
16                   vote modi who has not created jobs
17    through our vote ensure govt need and deserve ...
18    dont play with the words was talking about the...
19    didn’ write chowkidar does mean ’ anti modi tr...
Name: clean_text, dtype: object

### Allow only characters

In [13]:
# Keep only Alphabetiic characters in dataset by using ascii tables in HEX
def clean_data(text):     
    result = re.sub('([^\x61-\x7A ])+', '', text)
    return result  

In [14]:
df["clean_text"] = df["clean_text"].apply(clean_data)

In [15]:
df['clean_text'][15:20]

15    vote such party and leadershipwho can take fas...
16                   vote modi who has not created jobs
17    through our vote ensure govt need and deserve ...
18    dont play with the words was talking about the...
19    didn write chowkidar does mean  anti modi try ...
Name: clean_text, dtype: object

In [16]:
df.head()

Unnamed: 0,clean_text,category
0,when modi promised minimum government maximum ...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [17]:
df.iloc[19].clean_text

'didn write chowkidar does mean  anti modi try visit the plz not all who haven used are anti '

### Remove stop words

In [18]:
#import Natural language Toolkit library
import nltk

In [19]:
#Remove stopwords like "a","the","an" which can be ignored
from nltk.corpus import stopwords

In [20]:
#split the sentence and check if stopwords present in it, if not stop word then append it to another temp list till. 
#And clear the temp list after use
def remove_stopwords(text):
    temp = []
    for i in text.split():        
        if i not in stopwords.words('english'):
            temp.append(i)            
    result = temp[:]
    temp.clear()
    return result

In [21]:
#apply remove stop words function
df["clean_text"] = df["clean_text"].apply(remove_stopwords)

In [22]:
df.head()

Unnamed: 0,clean_text,category
0,"[modi, promised, minimum, government, maximum,...",-1.0
1,"[talk, nonsense, continue, drama, vote, modi]",0.0
2,"[say, vote, modi, welcome, bjp, told, rahul, m...",1.0
3,"[asking, supporters, prefix, chowkidar, names,...",1.0
4,"[answer, among, powerful, world, leader, today...",1.0


### Stem the words

In [23]:
#import library for stemming
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()



In [24]:
#the words like "asked", "asking", "ask" will be stem to "ask" only
def stem_words(text):
    temp = []
    for i in text:
        temp.append(ps.stem(i))       
    result = temp[:]
    temp.clear()
    return result
    

In [25]:
df["clean_text"]

0         [modi, promised, minimum, government, maximum,...
1             [talk, nonsense, continue, drama, vote, modi]
2         [say, vote, modi, welcome, bjp, told, rahul, m...
3         [asking, supporters, prefix, chowkidar, names,...
4         [answer, among, powerful, world, leader, today...
                                ...                        
162975    [crores, paid, neerav, modi, recovered, congre...
162976    [dear, rss, terrorist, payal, gawar, modi, kil...
162977                    [cover, interaction, forum, left]
162978    [big, project, came, india, modi, dream, proje...
162979    [ever, listen, like, gurukul, discipline, main...
Name: clean_text, Length: 162980, dtype: object

In [26]:
df["clean_text"] = df["clean_text"].apply(stem_words)

In [27]:
#asking has been converted into "ask" on line 3
df.head()


Unnamed: 0,clean_text,category
0,"[modi, promis, minimum, govern, maximum, gover...",-1.0
1,"[talk, nonsens, continu, drama, vote, modi]",0.0
2,"[say, vote, modi, welcom, bjp, told, rahul, ma...",1.0
3,"[ask, support, prefix, chowkidar, name, modi, ...",1.0
4,"[answer, among, power, world, leader, today, t...",1.0


In [28]:
#Now again join words present into list to the sentence
def list_to_sentence(wl):
    return " ".join(wl)

In [29]:
#apply stem function
df["clean_text"] = df["clean_text"].apply(list_to_sentence)

In [69]:
df.head()

Unnamed: 0,clean_text,category
0,modi promis minimum govern maximum govern expe...,-1
1,talk nonsens continu drama vote modi,0
2,say vote modi welcom bjp told rahul main campa...,1
3,ask support prefix chowkidar name modi great s...,1
4,answer among power world leader today trump pu...,1


In [70]:
df["clean_text"]

0         modi promis minimum govern maximum govern expe...
1                      talk nonsens continu drama vote modi
2         say vote modi welcom bjp told rahul main campa...
3         ask support prefix chowkidar name modi great s...
4         answer among power world leader today trump pu...
                                ...                        
162975    crore paid neerav modi recov congress leader h...
162976    dear rss terrorist payal gawar modi kill plu m...
162977                            cover interact forum left
162978    big project came india modi dream project happ...
162979    ever listen like gurukul disciplin maintain ev...
Name: clean_text, Length: 162980, dtype: object

### Convert words to colums

#### To Have more accuracy we can increase the max_features

In [31]:
#convert all data into columns in such a way that words will be the column
#for this process import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=500)#most used 500 words

In [32]:
X = cv.fit_transform(df['clean_text']).toarray()

In [33]:
#we have 162980 sentences with words 97041
X.shape

(162980, 500)

In [66]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

#### Clean category column having float values

In [34]:
df.iloc[:,-1]

0        -1.0
1         0.0
2         1.0
3         1.0
4         1.0
         ... 
162975   -1.0
162976   -1.0
162977    0.0
162978    0.0
162979    1.0
Name: category, Length: 162980, dtype: float64

In [35]:
df["category"]

0        -1.0
1         0.0
2         1.0
3         1.0
4         1.0
         ... 
162975   -1.0
162976   -1.0
162977    0.0
162978    0.0
162979    1.0
Name: category, Length: 162980, dtype: float64

In [36]:
df["category"].isna().sum()

7

### Replace null values with 0

In [37]:
df.isnull().sum()

clean_text    0
category      7
dtype: int64

In [38]:
df["category"].isna().sum()

7

In [39]:
df["category"] = df["category"].replace(np.nan, 0)

In [40]:
df["category"].isna().sum()

0

In [41]:
#convert to integer
def float_to_int(a):
    return int(a)

In [42]:
df["category"] = df["category"].apply(float_to_int)

In [43]:
df["category"]

0        -1
1         0
2         1
3         1
4         1
         ..
162975   -1
162976   -1
162977    0
162978    0
162979    1
Name: category, Length: 162980, dtype: int64

In [44]:
y=df.iloc[:,-1].values

In [45]:
y.shape

(162980,)

### Process the data
#### use 20% of data as test data to process

Now, We have X and y values to process the data

In [46]:
from sklearn.model_selection import train_test_split
#process 20% test data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [47]:
X_train.shape

(130384, 500)

In [48]:
y_train.shape

(130384,)

In [49]:
y_test.shape

(32596,)

#### Apply Naive Bayes to check accuracy

In [50]:
#import Naive bases major classes to check the accuracy
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [51]:
#Create object of each class
gObj = GaussianNB()
mObj = MultinomialNB()
bObj = BernoulliNB()

In [52]:
#Pass the data to train
gObj.fit(X_train, y_train)
mObj.fit(X_train, y_train)
bObj.fit(X_train, y_train)

BernoulliNB()

In [53]:
#Create predict: The predict obj shape should be same as test obj shape
y_pred_gauss = gObj.predict(X_test)
y_pred_gauss.shape


(32596,)

In [54]:
y_pred_multinom = mObj.predict(X_test)
y_pred_multinom.shape

(32596,)

In [55]:
y_pred_berno = bObj.predict(X_test)
y_pred_berno.shape

(32596,)

### Scores

In [56]:
from sklearn.metrics import accuracy_score

In [57]:
#accuracy score for Gaussian
print("Gaussian", accuracy_score(y_test,y_pred_gauss))

Gaussian 0.6801754816541907


In [58]:
#accuracy score for Multinomial
print("Multinomial", accuracy_score(y_test,y_pred_multinom))

Multinomial 0.6761565836298933


In [59]:
#accuracy score for BernoulliNB
print("BernoulliNB", accuracy_score(y_test,y_pred_berno))

BernoulliNB 0.6964965026383605


### Output

In [60]:
#Crosschek - Output for positive
data = {
  "text": ["Discussion was very good"]
}    
dfx = pd.DataFrame(data)
Xcheck = cv.transform(dfx['text']).toarray()
print(bObj.predict(Xcheck)[0])
print(mObj.predict(Xcheck)[0])
print(gObj.predict(Xcheck)[0])

1
1
1


In [61]:
#Crosschek - Output for Negative text
#Result : it works
data = {
  "text": ["No one should check this. Worst experience. Harmful to childrens"]
}    
dfx = pd.DataFrame(data)
Xcheck = cv.transform(dfx['text']).toarray()
print(bObj.predict(Xcheck)[0])
print(mObj.predict(Xcheck)[0])
print(gObj.predict(Xcheck)[0])

0
1
0


In [62]:
#Crosschek - Output for Negative text
#Result : it works
data = {
  "text": ["The analysis is bad"]
}    
dfx = pd.DataFrame(data)
Xcheck = cv.transform(dfx['text']).toarray()
print(bObj.predict(Xcheck)[0])
print(mObj.predict(Xcheck)[0])
print(gObj.predict(Xcheck)[0])

-1
-1
-1


In [63]:
#Crosschek - Output for neutral
#Result : Multinomial fails
data = {
  "text": ["we are coming"]
}    
dfx = pd.DataFrame(data)
Xcheck = cv.transform(dfx['text']).toarray()
print(bObj.predict(Xcheck)[0])
print(mObj.predict(Xcheck)[0])
print(gObj.predict(Xcheck)[0])

0
1
0


In [64]:
#Crosschek - Output for neutral
#Result : None worked
data = {
  "text": ["The analysis is not bad nor good"]
}    
dfx = pd.DataFrame(data)
Xcheck = cv.transform(dfx['text']).toarray()
print(bObj.predict(Xcheck)[0])
print(mObj.predict(Xcheck)[0])
print(gObj.predict(Xcheck)[0])

1
1
-1
