**Real or Not? NLP with Disaster Tweets**

This is a Kaggle project that aims to determine whether a tweet describes a real disaster or not.


In [88]:
#import libraries for data cleaning
import pandas as pd
import numpy as np
import seaborn as sns


#import libraries for Language processing
import re, string
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn

#import libraries for Modelling
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  import pandas.util.testing as tm


In [2]:
# Load the competition CSV files into the working directory.
# Data can be downloaded from https://www.kaggle.com/c/nlp-getting-started/data
# The upload helper is only importable on Google Colab. On any other kernel,
# place twi-train.csv and twi-test.csv next to this notebook and the cell
# becomes a no-op instead of failing with ImportError.
try:
    from google.colab import files
except ImportError:
    files = None

# Result of the upload step; an empty dict when the upload is skipped.
uploaded = files.upload() if files is not None else {}

Saving twi-test.csv to twi-test.csv
Saving twi-train.csv to twi-train.csv


In [64]:
# Read the train and test CSVs, parsing 'keyword' and 'location' as strings
train, test = (
    pd.read_csv(csv_name, dtype={"keyword": str, "location": str})
    for csv_name in ('twi-train.csv', 'twi-test.csv')
)


**Preliminary Data Analysis**

In [65]:
# Preview the first rows of the training frame
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [66]:
# Summary statistics for the numeric columns of the training frame
train.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [67]:
# Column dtypes and non-null counts (reveals the missing keyword/location values)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


**Data Cleaning**

In [68]:
# Drop the non-useful 'location' column, then discard rows with no keyword
data = train.drop(columns=['location'])
keyword = data['keyword'].isna()  # boolean mask: True where the keyword is missing

# Keep only rows that have a keyword and renumber them from 0
data = data.loc[~keyword].reset_index(drop=True)


In [71]:
# Preview the frame after dropping 'location' and the rows without a keyword
data.head()

Unnamed: 0,id,keyword,text,target
0,48,ablaze,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
1,49,ablaze,We always try to bring the heavy. #metal #RT h...,0
2,50,ablaze,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
3,52,ablaze,Crying out for more! Set me ablaze,0
4,53,ablaze,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


In [72]:
def remove_URL(text):
    """Return ``text`` with every ``http(s)://...`` and ``www....`` link removed."""
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [73]:
def remove_html(text):
    """Strip every ``<...>`` tag-like span from ``text`` (shortest match)."""
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)

In [74]:
# Code points treated as emoji / pictographic symbols, compiled once.
# Every range below lies inside the ranges used previously, so nothing new is
# removed. What changed is that the single catch-all range U+24C2..U+1F251
# (which also spans CJK, Hangul, kana and other scripts) is replaced by the
# symbol blocks it was meant to cover.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs, flags
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled M
    "\U00002B05-\U00002B07\U00002B1B\U00002B1C\U00002B50\U00002B55"  # arrows, squares, star, circle
    "\U00003030\U0000303D\U00003297\U00003299"  # emoji that live in CJK symbol blocks
    "\U0000FE0F"             # variation selector-16 that trails many emoji
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Ordinary letters of any script (including CJK, Hangul and kana) are
    left untouched.
    """
    return _EMOJI_PATTERN.sub(r'', text)

In [75]:
def remove_punct(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted."""
    # Mapping a code point to None makes str.translate drop that character
    drop_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_map)

In [76]:
# Run the cleaning steps over the tweet text in order:
# URLs -> HTML tags -> emoji -> punctuation
data['text'] = (
    data['text']
    .apply(remove_URL)
    .apply(remove_html)
    .apply(remove_emoji)
    .apply(remove_punct)
)

In [82]:
# Dropping null values.
# dropna has to be applied to the frame: calling it in place on the selected
# 'text' column leaves `data` itself unchanged. The index is renumbered so
# later positional lookups stay contiguous.
data = data.dropna(subset=['text']).reset_index(drop=True)

# Converting all text to lower case
data['text'] = data['text'].str.lower()


In [83]:
# Preview the text column after cleaning and lower-casing
data.head()

Unnamed: 0,id,keyword,text,target
0,48,ablaze,bbcmtd wholesale markets ablaze,1
1,49,ablaze,we always try to bring the heavy metal rt,0
2,50,ablaze,africanbaze breaking newsnigeria flag set abla...,1
3,52,ablaze,crying out for more set me ablaze,0
4,53,ablaze,on plus side look at the sky last night it was...,0


In [26]:
# Build the cleaned corpus: letters only, English stopwords removed, words stemmed.
ps = PorterStemmer()
# Build the stopword set once; rebuilding it for every word is pure overhead.
stop_words = set(stopwords.words('english'))

corpus = []
for tweet in data['text']:
    # Replace every non-letter with a space, lowercase, then split on whitespace
    words = re.sub("[^a-zA-Z]", ' ', tweet).lower().split()
    # Drop stopwords, stem the remaining words and re-join them into one string
    corpus.append(' '.join(ps.stem(word) for word in words if word not in stop_words))

In [87]:
# Inspect the cleaned tweets at positions 1 through 9 of the corpus
corpus[1:10]

['forest fire near la rong sask canada',
 'resid ask shelter place notifi offic evacu shelter place order expect',
 'peopl receiv wildfir evacu order california',
 'got sent photo rubi alaska smoke wildfir pour school',
 'rockyfir updat california hwi close direct due lake counti fire cafir wildfir',
 'flood disast heavi rain caus flash flood street manit colorado spring area',
 'im top hill see fire wood',
 'there emerg evacu happen build across street',
 'im afraid tornado come area']

**Modelling**




In [28]:
# Fit a TF-IDF vectorizer on the cleaned text; only its vocabulary size is used below
Tfidf_vect = TfidfVectorizer(max_features=80000)
Tfidf_vect.fit(data['text'])
uniqueWords = Tfidf_vect.vocabulary_
# Cap the bag-of-words vocabulary at that size
cv = CountVectorizer(max_features = len(uniqueWords))
# Create the bag-of-words model: X holds the per-tweet word counts for `corpus`.
# toarray() returns a plain ndarray; todense() returns np.matrix, which
# current scikit-learn estimators refuse as input.
X = cv.fit_transform(corpus).toarray()
y = data['target'].values


In [29]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4110,
)

Gaussian Naive Bayes

In [91]:
# Gaussian Naive Bayes: fit on the training split (fit() returns the estimator)
GNB = GaussianNB().fit(X_train, y_train)

# Predict labels for the held-out split
gnb_predict = GNB.predict(X_test)

# Per-class precision / recall / F1, followed by the confusion matrix
print(classification_report(y_test, gnb_predict))
print(confusion_matrix(y_test, gnb_predict))



              precision    recall  f1-score   support

           0       0.76      0.46      0.58       857
           1       0.54      0.82      0.65       666

    accuracy                           0.62      1523
   macro avg       0.65      0.64      0.61      1523
weighted avg       0.67      0.62      0.61      1523

[[396 461]
 [123 543]]


K-Nearest Neighbors

In [92]:
# k-NN: 7 neighbours, distance-weighted votes, brute-force neighbour search
knn = KNeighborsClassifier(
    n_neighbors=7,
    weights='distance',
    algorithm='brute',
).fit(X_train, y_train)

# Evaluate on the held-out split
knn_predict = knn.predict(X_test)
print(classification_report(y_test, knn_predict))
print(confusion_matrix(y_test, knn_predict))

              precision    recall  f1-score   support

           0       0.66      0.97      0.78       857
           1       0.90      0.35      0.50       666

    accuracy                           0.70      1523
   macro avg       0.78      0.66      0.64      1523
weighted avg       0.76      0.70      0.66      1523

[[831  26]
 [435 231]]


Decision Tree

In [54]:
# Decision tree. A fixed random_state (same seed as the train/test split)
# makes the fitted tree, and therefore the reported scores, reproducible.
dtree = DecisionTreeClassifier(random_state=4110)
dtree.fit(X_train,y_train)

# Evaluate on the held-out split
dtree_predict = dtree.predict(X_test)
print(classification_report(y_test,dtree_predict))
print(confusion_matrix(y_test,dtree_predict))

              precision    recall  f1-score   support

           0       0.76      0.84      0.80       857
           1       0.77      0.67      0.71       666

    accuracy                           0.76      1523
   macro avg       0.76      0.75      0.76      1523
weighted avg       0.76      0.76      0.76      1523

[[721 136]
 [223 443]]


Random Forest

In [57]:
# Random forest of 50 trees. A fixed random_state (same seed as the train/test
# split) makes the bootstrap samples and feature choices, and therefore the
# reported scores, reproducible.
rfc = RandomForestClassifier(n_estimators=50, random_state=4110)
rfc.fit(X_train, y_train)

# Evaluate on the held-out split
rfc_predict = rfc.predict(X_test)
print(classification_report(y_test,rfc_predict))
print(confusion_matrix(y_test,rfc_predict))

              precision    recall  f1-score   support

           0       0.77      0.90      0.83       857
           1       0.84      0.65      0.73       666

    accuracy                           0.79      1523
   macro avg       0.80      0.78      0.78      1523
weighted avg       0.80      0.79      0.79      1523

[[772  85]
 [232 434]]


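Of the four models compared, the Random Forest classifier gives the best accuracy (0.79), ahead of the Decision Tree (0.76), K-Nearest Neighbors (0.70) and Gaussian Naive Bayes (0.62).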