In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [2]:
tweets = pd.read_csv("Disaster_tweets_NB.csv")
tweets

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [3]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
for i in tweets.target:

    if i == 1 :
        tweets["target"] = tweets["target"].replace([1], "real tweet")
    elif i == 0:
        tweets["target"] = tweets["target"].replace([0], "Fake tweet")

In [5]:
tweets.head(50)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,real tweet
1,4,,,Forest fire near La Ronge Sask. Canada,real tweet
2,5,,,All residents asked to 'shelter in place' are ...,real tweet
3,6,,,"13,000 people receive #wildfires evacuation or...",real tweet
4,7,,,Just got sent this photo from Ruby #Alaska as ...,real tweet
5,8,,,#RockyFire Update => California Hwy. 20 closed...,real tweet
6,10,,,#flood #disaster Heavy rain causes flash flood...,real tweet
7,13,,,I'm on top of the hill and I can see a fire in...,real tweet
8,14,,,There's an emergency evacuation happening now ...,real tweet
9,15,,,I'm afraid that the tornado is coming to our a...,real tweet


In [6]:
# cleaning data 
import re
stop_words = []
# Load the custom built Stopwords
with open("stopwords_en.txt","r") as sw:
    stop_words = sw.read()

In [7]:
stop_words = stop_words.split("\n")

In [8]:
stop_words

['a',
 "a's",
 'able',
 'about',
 'above',
 'according',
 'accordingly',
 'across',
 'actually',
 'after',
 'afterwards',
 'again',
 'against',
 "ain't",
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'appear',
 'appreciate',
 'appropriate',
 'are',
 "aren't",
 'around',
 'as',
 'aside',
 'ask',
 'asking',
 'associated',
 'at',
 'available',
 'away',
 'awfully',
 'b',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'believe',
 'below',
 'beside',
 'besides',
 'best',
 'better',
 'between',
 'beyond',
 'both',
 'brief',
 'but',
 'by',
 'c',
 "c'mon",
 "c's",
 'came',
 'can',
 "can't",
 'cannot',
 'cant',
 'cause',
 'causes',
 'certain',
 'certainly',
 'changes',
 'clearly',
 'co',
 'com',
 'come',
 'c

In [9]:
def cleaning_text(i):
    i = re.sub("[^A-Za-z" "]+"," ",i).lower()
    i = re.sub("[0-9" "]+"," ",i)
    w = []
    for word in i.split(" "):
        if len(word)>3:
            w.append(word)
    return (" ".join(w))

In [10]:
tweets.text = tweets.text.apply(cleaning_text)
tweets.text

0              deeds reason this earthquake allah forgive
1                      forest fire near ronge sask canada
2       residents asked shelter place being notified o...
3       people receive wildfires evacuation orders cal...
4       just sent this photo from ruby alaska smoke fr...
                              ...                        
7608    giant cranes holding bridge collapse into near...
7609    aria ahrary thetawniest control wild fires cal...
7610                           volcano hawaii http zdtoyd
7611    police investigating after bike collided with ...
7612    latest more homes razed northern california wi...
Name: text, Length: 7613, dtype: object

In [11]:
# removing empty rows
tweets = tweets.loc[tweets.text != " ",:]

In [12]:
# CountVectorizer
# Convert a collection of text documents to a matrix of token counts

# splitting data into train and test data sets 
from sklearn.model_selection import train_test_split

tweets_train, tweets_test = train_test_split(tweets, test_size = 0.2)

In [13]:
# creating a matrix of token counts for the entire text document 
def split_into_words(i):
    return [word for word in i.split(" ")]

In [14]:
# Defining the preparation of email texts into word count matrix format - Bag of Words
tweets_bow = CountVectorizer(analyzer = split_into_words).fit(tweets.text)
tweets_bow

CountVectorizer(analyzer=<function split_into_words at 0x0000026FB2292C10>)

In [15]:
# Defining BOW for all messages
all_tweets_matrix = tweets_bow.transform(tweets.text)
all_tweets_matrix

<7613x19280 sparse matrix of type '<class 'numpy.int64'>'
	with 74628 stored elements in Compressed Sparse Row format>

In [16]:
# For training messages
train_tweets_matrix = tweets_bow.transform(tweets_train.text)
train_tweets_matrix

<6090x19280 sparse matrix of type '<class 'numpy.int64'>'
	with 59841 stored elements in Compressed Sparse Row format>

In [17]:
# For testing messages
test_tweets_matrix = tweets_bow.transform(tweets_test.text)


In [18]:
# Learning Term weighting and normalizing on entire emails
tfidf_transformer = TfidfTransformer().fit(all_tweets_matrix)

# Preparing TFIDF for train emails
train_tfidf = tfidf_transformer.transform(train_tweets_matrix)
train_tfidf.shape # (row, column)

(6090, 19280)

In [19]:
# Preparing TFIDF for test emails
test_tfidf = tfidf_transformer.transform(test_tweets_matrix)
test_tfidf.shape #  (row, column)

(1523, 19280)

# **Preparing a naive bayes model on training data set**

In [20]:
from sklearn.naive_bayes import MultinomialNB as MB

In [21]:
tweets_train

Unnamed: 0,id,keyword,location,text,target
6495,9287,sunk,,silverstar felt romero cared mental stuff sunk...,Fake tweet
6285,8978,storm,NC || OR,cream cupcake wars storm content sara,Fake tweet
7605,10864,,,flip side walmart there bomb everyone evacuate...,real tweet
1456,2099,casualty,,casualty insurance jobs against hunt willinghe...,Fake tweet
1472,2121,catastrophe,,ultimate preparedness library http jyabyz prep...,real tweet
...,...,...,...,...,...
1449,2089,casualty,"Boulder, CO",greenharvard documenting climate change first ...,real tweet
833,1211,blizzard,lakewood colorado,fairx playoverwatch blizzardcs please blizzard...,Fake tweet
1647,2377,collapsed,"Kingston, Jamaica",another entity forced close montego result col...,real tweet
1383,1994,bush%20fires,,thing sure there will never bush fires scotlan...,Fake tweet


In [23]:
# Multinomial Naive Bayes
classifier_mb = MB()
classifier_mb.fit(train_tfidf, tweets_train.target)

MultinomialNB()

In [24]:
# Evaluation on Test Data
test_pred_m = classifier_mb.predict(test_tfidf)
accuracy_test_m = np.mean(test_pred_m == tweets_test.target)
accuracy_test_m

0.793827971109652

In [25]:
test_pred_m

array(['Fake tweet', 'real tweet', 'real tweet', ..., 'Fake tweet',
       'Fake tweet', 'Fake tweet'], dtype='<U10')

In [27]:
from sklearn.metrics import accuracy_score
accuracy_score(test_pred_m, tweets_test.target) 

0.793827971109652

In [28]:
pd.crosstab(test_pred_m, tweets_test.target)

target,Fake tweet,real tweet
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Fake tweet,805,253
real tweet,61,404


In [30]:
# Training Data accuracy
train_pred_m = classifier_mb.predict(train_tfidf)
accuracy_train_m = np.mean(train_pred_m == tweets_train.target)
accuracy_train_m

0.9057471264367816

In [31]:
train_pred_m

array(['Fake tweet', 'Fake tweet', 'Fake tweet', ..., 'real tweet',
       'Fake tweet', 'Fake tweet'], dtype='<U10')

## Multinomial Naive Bayes changing default alpha for laplace smoothing
## if alpha = 0 then no smoothing is applied and the default alpha parameter is 
##  the smoothing process mainly solves the emergence of zero probability problem in the dataset.

In [33]:
classifier_mb_lap = MB(alpha = 3)
classifier_mb_lap.fit(train_tfidf, tweets_train.target)

MultinomialNB(alpha=3)

In [34]:
# Evaluation on Test Data after applying laplace
test_pred_lap = classifier_mb_lap.predict(test_tfidf)
accuracy_test_lap = np.mean(test_pred_lap == tweets_test.target)
accuracy_test_lap

0.7806959947472094

In [35]:
test_pred_lap

array(['Fake tweet', 'real tweet', 'real tweet', ..., 'Fake tweet',
       'Fake tweet', 'Fake tweet'], dtype='<U10')

In [37]:
from sklearn.metrics import accuracy_score
accuracy_score(test_pred_lap, tweets_test.target) 

0.7806959947472094

In [39]:
pd.crosstab(test_pred_lap, tweets_test.target)

target,Fake tweet,real tweet
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Fake tweet,834,302
real tweet,32,355


In [40]:
# Training Data accuracy
train_pred_lap = classifier_mb_lap.predict(train_tfidf)
accuracy_train_lap = np.mean(train_pred_lap == tweets_train.target)
accuracy_train_lap

0.861576354679803

In [43]:
train_pred_lap

array(['Fake tweet', 'Fake tweet', 'Fake tweet', ..., 'real tweet',
       'Fake tweet', 'Fake tweet'], dtype='<U10')

In [41]:
train_pred_lap

array(['Fake tweet', 'Fake tweet', 'Fake tweet', ..., 'real tweet',
       'Fake tweet', 'Fake tweet'], dtype='<U10')