This notebook uses a voting ensemble technique to classify tweets as disaster or non-disaster tweets.

The Classification steps:
* Preprocessing
    * remove urls, stopwords, punctuation, and small words
    * translate emojis   
* Feature Extraction and Analysis
    * use unigram to explore frequent words on each class
    * use word cloud as explanatory analysis for unigram output
    * use bigram to explore frequent bigrams on each class
    * use bar graph to visualize bigram results
    * apply unigram analysis on both location and keyword columns
    * use bar graph to visualize unigram of location and keyword columns
    * format feature vector for both training and testing data
* Model Implementation
    * use Naive Bayes, Support vector machine, K nearest neighbor, and logistic regression algorithms
    * split training data (0.33 for testing) and test it on each algorithm 
    * apply cross validation on each algorithm
    * use a classification report to evaluate each run
    * apply voting ensemble technique on the four algorithms
    * apply the test feature vectors to each algorithm separately, then to the voting ensemble

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#load necessary libraries
import nltk
from nltk.corpus import stopwords
import re, string
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from emoji import UNICODE_EMOJI
import emoji
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
# Read the competition's train and test CSV files into dataframes.
train_df = pd.read_csv(
    '/kaggle/input/nlp-getting-started/train.csv',
    encoding="utf-8",
)
test_df = pd.read_csv(
    '/kaggle/input/nlp-getting-started/test.csv',
    encoding="utf-8",
)
# Preview the first 10 rows of the training data.
train_df.head(n=10)

In [None]:
# Preview the first 10 rows of the test data.
test_df.head(n=10)

# Preprocessing

In [None]:
#preprocessing train and test data
#load stopwords set
stopwrds = set(stopwords.words('english'))

# Patterns are compiled once and shared by both dataframes.
HASHTAG_RE = re.compile(r"#(\w+)")
# Matches both http: and https: links up to the next whitespace.
URL_RE = re.compile(r"https?:\S+")
# re.escape keeps characters such as "\", "]", "^" and "-" literal inside the
# character class. Without it the "\]" pair is read as an escaped bracket and
# a literal backslash is never removed.
PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + ']')
# Tokens shorter than this are dropped as "small words".
MIN_TOKEN_LEN = 3


def clean_tweet(text):
    """Return the cleaned, space-joined tokens of an already demojized tweet.

    Steps: remove urls, lower-case and tokenize, strip punctuation from every
    token, then drop stop words and tokens shorter than MIN_TOKEN_LEN (which
    also discards tokens left empty by the punctuation step).
    """
    tokens = nltk.word_tokenize(URL_RE.sub('', text).strip().lower())
    stripped = (PUNCT_RE.sub('', token) for token in tokens)
    return ' '.join(token for token in stripped
                    if token not in stopwrds and len(token) >= MIN_TOKEN_LEN)


def add_text_features(frame):
    """Add the "hashtags", "clean_text" and "len_text" columns to frame in place."""
    #translate emojis to text
    demojized = frame["text"].apply(emoji.demojize)
    #extract hashtags (lower-cased, without "#") as a space separated string
    frame["hashtags"] = frame["text"].apply(lambda x: ' '.join(HASHTAG_RE.findall(x.lower())))
    frame["clean_text"] = demojized.apply(clean_tweet)
    #length feature: whitespace separated tokens, counted before urls are removed
    frame["len_text"] = demojized.apply(lambda x: len(x.split()))


for frame in (train_df, test_df):
    add_text_features(frame)
    #convert keyword and location to lower case, leaving missing values untouched
    for column in ("keyword", "location"):
        frame[column] = frame[column].apply(lambda x: x if str(x).lower() == "nan" else x.lower())

#visualize data
train_df.head(10)

In [None]:
# Preview the first 10 rows of the preprocessed test data.
test_df.head(n=10)

# Analysis 

**Data Statistics**

In [None]:
# Number of rows in each dataset.
print("length of train data", len(train_df))
print("length of test data", len(test_df))

# Number of distinct location / keyword values (missing values count as one value).
print("Checking train location column values", train_df["location"].nunique(dropna=False))
print("Checking train keyword column values", train_df["keyword"].nunique(dropna=False))
print("Checking test location column values", test_df["location"].nunique(dropna=False))
print("Checking test keyword column values", test_df["keyword"].nunique(dropna=False))

# Class balance of the training labels.
print("disaster tweets", int((train_df["target"] == 1).sum()))
print("non-disaster tweets", int((train_df["target"] == 0).sum()))

**Graphical analysis**

In [None]:
fig, (keyword_ax, location_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: the 20 most frequent keywords in the training data.
top_keywords = train_df["keyword"].value_counts().head(20)
top_keywords.plot(kind="bar", title="Unique Keywords", ax=keyword_ax)

# Right panel: the 20 most frequent locations in the training data.
top_locations = train_df["location"].value_counts().head(20)
top_locations.plot(kind="bar", title="Unique Locations", ax=location_ax)

plt.show()

In [None]:
fig, (disaster_ax, nondisaster_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: the 20 most frequent keywords among disaster tweets.
disaster_keywords = train_df.loc[train_df["target"] == 1, "keyword"]
disaster_keywords.value_counts().head(20).plot(
    kind="bar", title="Disaster tweets keywords", ax=disaster_ax)

# Right panel: the 20 most frequent keywords among non disaster tweets.
nondisaster_keywords = train_df.loc[train_df["target"] == 0, "keyword"]
nondisaster_keywords.value_counts().head(20).plot(
    kind="bar", title="Non-Disaster tweets keywords", ax=nondisaster_ax)

plt.show()

In [None]:
fig, (disaster_ax, nondisaster_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: the 20 most frequent locations among disaster tweets.
disaster_locations = train_df.loc[train_df["target"] == 1, "location"]
disaster_locations.value_counts().head(20).plot(
    kind="bar", title="Disaster tweets Locations", ax=disaster_ax)

# Right panel: the 20 most frequent locations among non disaster tweets.
nondisaster_locations = train_df.loc[train_df["target"] == 0, "location"]
nondisaster_locations.value_counts().head(20).plot(
    kind="bar", title="Non-Disaster tweets Locations", ax=nondisaster_ax)

plt.show()

In [None]:
import seaborn as sns

# Distribution of tweet length (whitespace separated tokens) for disaster tweets.
disaster_tweets = train_df[train_df["target"] == 1]
sns.countplot(x="len_text", data=disaster_tweets)

In [None]:
# Distribution of tweet length (whitespace separated tokens) for non disaster tweets.
nondisaster_tweets = train_df[train_df["target"] == 0]
sns.countplot(x="len_text", data=nondisaster_tweets)

# Feature Extraction

In [None]:
fig, (disaster_ax, nondisaster_ax) = plt.subplots(1, 2, figsize=(15, 15))

# Unigram frequency distribution over all cleaned disaster tweets joined into one string.
disaster_words = ' '.join(train_df.loc[train_df["target"] == 1, "clean_text"])
disaster_unigram = nltk.FreqDist(nltk.word_tokenize(disaster_words))

# Word cloud (at most 100 words) of the disaster unigram frequencies.
disaster_wc = WordCloud(width=800, height=400, max_words=100).generate_from_frequencies(disaster_unigram)
disaster_ax.set_title("Disaster Unigram Frequency Distribution")
disaster_ax.imshow(disaster_wc, interpolation="bilinear")

# Unigram frequency distribution over all cleaned non disaster tweets joined into one string.
nondisaster_words = ' '.join(train_df.loc[train_df["target"] == 0, "clean_text"])
nondisaster_unigram = nltk.FreqDist(nltk.word_tokenize(nondisaster_words))

# Word cloud (at most 100 words) of the non disaster unigram frequencies.
nondisaster_wc = WordCloud(width=800, height=400, max_words=100).generate_from_frequencies(nondisaster_unigram)
nondisaster_ax.set_title("Non Disaster Unigram Frequency Distribution")
nondisaster_ax.imshow(nondisaster_wc, interpolation="bilinear")

In [None]:
def count_tweet_bigrams(tweets):
    """Return a FreqDist of the bigrams found inside each cleaned tweet.

    Bigrams are counted tweet by tweet. Joining all tweets into one string
    first would pair the last word of one tweet with the first word of the
    next, creating bigrams that never occur in any tweet and inflating the
    number of distinct bigrams.
    """
    counts = nltk.FreqDist()
    for tweet in tweets:
        # Skip tweets that are empty after cleaning: they contain no bigrams.
        if tweet.strip():
            counts.update(nltk.bigrams(nltk.word_tokenize(tweet)))
    return counts


def plot_top_bigrams(ax, bigram_counts, title, top_n=30):
    """Draw a horizontal bar chart of the top_n most common bigrams on ax."""
    pairs, frequencies = zip(*bigram_counts.most_common(top_n))
    labels = [first + "," + second for (first, second) in pairs]
    ax.barh(labels, frequencies)
    ax.set_title(title)


fig, (disaster_ax, nondisaster_ax) = plt.subplots(1, 2, figsize=(15, 10))

#Bigram Frequency distribution for disaster tweets
disaster_bigram = count_tweet_bigrams(train_df.loc[train_df["target"] == 1, "clean_text"])
plot_top_bigrams(disaster_ax, disaster_bigram, "Disaster Bigram BarGraph")

#Bigram Frequency distribution for non disaster tweets
nondisaster_bigram = count_tweet_bigrams(train_df.loc[train_df["target"] == 0, "clean_text"])
plot_top_bigrams(nondisaster_ax, nondisaster_bigram, "Non Disaster Bigram BarGraph")

plt.show()


In [None]:
fig, (disaster_ax, nondisaster_ax) = plt.subplots(1, 2, figsize=(15, 15))

# Unigram frequency distribution over the hashtags of all disaster tweets.
disaster_tags = ' '.join(train_df.loc[train_df["target"] == 1, "hashtags"])
disaster_unigram_hash = nltk.FreqDist(nltk.word_tokenize(disaster_tags))

# Word cloud (at most 100 words) of the disaster hashtag frequencies.
disaster_wc = WordCloud(width=800, height=400, max_words=100).generate_from_frequencies(disaster_unigram_hash)
disaster_ax.set_title("Disaster Unigram Frequency Distribution hashtags")
disaster_ax.imshow(disaster_wc, interpolation="bilinear")

# Unigram frequency distribution over the hashtags of all non disaster tweets.
nondisaster_tags = ' '.join(train_df.loc[train_df["target"] == 0, "hashtags"])
nondisaster_unigram_hash = nltk.FreqDist(nltk.word_tokenize(nondisaster_tags))

# Word cloud (at most 100 words) of the non disaster hashtag frequencies.
nondisaster_wc = WordCloud(width=800, height=400, max_words=100).generate_from_frequencies(nondisaster_unigram_hash)
nondisaster_ax.set_title("Non Disaster Unigram Frequency Distribution hashtags")
nondisaster_ax.imshow(nondisaster_wc, interpolation="bilinear")

**Convert Tweet to train feature vector**

In [None]:
def ngram_score(ngrams, freq_dist):
    """Sum the counts of the n-grams present in freq_dist, divided by its number of distinct entries."""
    return sum(freq_dist.get(gram) for gram in ngrams if gram in freq_dist) / len(freq_dist)


def unigram_score(text, freq_dist):
    """Score the word tokens of text against a unigram frequency distribution."""
    return ngram_score(nltk.word_tokenize(text), freq_dist)


def bigram_score(text, freq_dist):
    """Score the consecutive token pairs of text against a bigram frequency distribution."""
    return ngram_score(nltk.bigrams(nltk.word_tokenize(text)), freq_dist)


def category_score(value, freq_dist):
    """Count of value in freq_dist (0 when unseen or missing), divided by its number of distinct entries."""
    seen = value in freq_dist and str(value).lower() != "nan"
    return (freq_dist.get(value) if seen else 0) / len(freq_dist)


# Per-class frequency distributions of the keyword and location columns.
is_disaster = train_df["target"] == 1
is_nondisaster = train_df["target"] == 0
key_disas = nltk.FreqDist(train_df.loc[is_disaster, "keyword"])
key_nondisas = nltk.FreqDist(train_df.loc[is_nondisaster, "keyword"])
loc_disas = nltk.FreqDist(train_df.loc[is_disaster, "location"])
loc_nondisas = nltk.FreqDist(train_df.loc[is_nondisaster, "location"])

# Tweet text scored against the disaster / non disaster unigram counts.
train_df["unigram_disas"] = train_df["clean_text"].apply(unigram_score, freq_dist=disaster_unigram)
train_df["unigram_nondisas"] = train_df["clean_text"].apply(unigram_score, freq_dist=nondisaster_unigram)

# Hashtags scored against the disaster / non disaster hashtag counts.
train_df["unigram_disas_hash"] = train_df["hashtags"].apply(unigram_score, freq_dist=disaster_unigram_hash)
train_df["unigram_nondisas_hash"] = train_df["hashtags"].apply(unigram_score, freq_dist=nondisaster_unigram_hash)

# Tweet text scored against the disaster / non disaster bigram counts.
train_df["bigram_disas"] = train_df["clean_text"].apply(bigram_score, freq_dist=disaster_bigram)
train_df["bigram_nondisas"] = train_df["clean_text"].apply(bigram_score, freq_dist=nondisaster_bigram)

# Keyword and location scored against their per-class counts.
train_df["key_disas"] = train_df["keyword"].apply(category_score, freq_dist=key_disas)
train_df["key_nondisas"] = train_df["keyword"].apply(category_score, freq_dist=key_nondisas)
train_df["loc_disas"] = train_df["location"].apply(category_score, freq_dist=loc_disas)
train_df["loc_nondisas"] = train_df["location"].apply(category_score, freq_dist=loc_nondisas)
train_df.head(5)



In [None]:
# Columns that make up the model input for the training dataset.
train_feature_columns = [
    'unigram_disas', 'unigram_nondisas',
    'unigram_disas_hash', 'unigram_nondisas_hash',
    'bigram_disas', 'bigram_nondisas',
    'key_disas', 'key_nondisas',
    'loc_disas', 'loc_nondisas',
]
train_feature_vectors = train_df[train_feature_columns]
train_feature_vectors.head(n=5)

**Convert Tweet to test feature vector**

In [None]:
# (feature column, source column, class frequency distribution) for every unigram feature.
unigram_specs = [
    ("unigram_disas", "clean_text", disaster_unigram),
    ("unigram_nondisas", "clean_text", nondisaster_unigram),
    ("unigram_disas_hash", "hashtags", disaster_unigram_hash),
    ("unigram_nondisas_hash", "hashtags", nondisaster_unigram_hash),
]
for column, source, freq in unigram_specs:
    # Sum the class counts of the known tokens, divided by the number of distinct tokens of the class.
    test_df[column] = test_df[source].apply(
        lambda text, freq=freq: sum(
            freq[token] for token in nltk.word_tokenize(text) if token in freq
        ) / len(freq)
    )

# (feature column, class frequency distribution) for every bigram feature.
bigram_specs = [
    ("bigram_disas", disaster_bigram),
    ("bigram_nondisas", nondisaster_bigram),
]
for column, freq in bigram_specs:
    # Same scoring over consecutive token pairs; tweets that are empty after cleaning score 0.
    test_df[column] = test_df["clean_text"].apply(
        lambda text, freq=freq: (
            sum(freq[pair] for pair in nltk.bigrams(nltk.word_tokenize(text)) if pair in freq) / len(freq)
            if text.strip() != '' else 0
        )
    )


In [None]:
# (feature column, source column, class frequency distribution) for the keyword and location features.
category_specs = [
    ("key_disas", "keyword", key_disas),
    ("key_nondisas", "keyword", key_nondisas),
    ("loc_disas", "location", loc_disas),
    ("loc_nondisas", "location", loc_nondisas),
]
for column, source, freq in category_specs:
    # Class count of the value (0 when unseen or missing), divided by the number of distinct values of the class.
    test_df[column] = test_df[source].apply(
        lambda value, freq=freq: (
            freq.get(value) if (value in freq and str(value).lower() != "nan") else 0
        ) / len(freq)
    )


In [None]:
# Columns that make up the model input for the testing dataset.
test_feature_columns = [
    'unigram_disas', 'unigram_nondisas',
    'unigram_disas_hash', 'unigram_nondisas_hash',
    'bigram_disas', 'bigram_nondisas',
    'key_disas', 'key_nondisas',
    'loc_disas', 'loc_nondisas',
]
test_feature_vectors = test_df[test_feature_columns]
test_feature_vectors.head(n=5)

# Model Building & Training

In [None]:
# Hold out 33% of the labelled data for evaluation; the fixed seed keeps the split reproducible.
Y = train_df['target']
X_train, X_test, y_train, y_test = train_test_split(
    train_feature_vectors,
    Y,
    test_size=0.33,
    random_state=42,
)

In [None]:
#Naive Bayes Classifier
nb_clf = GaussianNB()
#train classifier on the training part of the split
nb_clf.fit(X_train, y_train)
print(nb_clf.get_params())
#accuracy on the held-out part of the split
print("split training score", nb_clf.score(X_test, y_test))

#5-fold cross validation over all labelled data; cross_validate fits clones,
#so nb_clf itself stays fitted on X_train only
print("NB cross validation scores", cross_validate(nb_clf, train_feature_vectors, Y, cv=5))
#report on the held-out rows only: including rows the model was fitted on
#would make the metrics look better than they are
print(classification_report(y_test, nb_clf.predict(X_test)))

In [None]:
#Support Vector Machine
#probability=True enables predict_proba (needed for soft voting); the probability
#calibration uses an internally shuffled cross validation, so the seed is fixed
#to keep the probabilities reproducible
svm_clf = SVC(probability=True, random_state=42)
#train classifier on the training part of the split
svm_clf.fit(X_train, y_train)
print(svm_clf.get_params())
#accuracy on the held-out part of the split
print("split training score", svm_clf.score(X_test, y_test))

#5-fold cross validation over all labelled data; cross_validate fits clones,
#so svm_clf itself stays fitted on X_train only
print("SVM cross validation scores", cross_validate(svm_clf, train_feature_vectors, Y, cv=5))
#report on the held-out rows only: including rows the model was fitted on
#would make the metrics look better than they are
print(classification_report(y_test, svm_clf.predict(X_test)))

In [None]:
#K nearest neighbor
knn_clf = KNeighborsClassifier()
#train classifier on the training part of the split
knn_clf.fit(X_train, y_train)
print(knn_clf.get_params())
#accuracy on the held-out part of the split
print("split training score", knn_clf.score(X_test, y_test))

#5-fold cross validation over all labelled data; cross_validate fits clones,
#so knn_clf itself stays fitted on X_train only
print("KNN cross validation scores", cross_validate(knn_clf, train_feature_vectors, Y, cv=5))
#report on the held-out rows only: including rows the model was fitted on
#would make the metrics look better than they are
print(classification_report(y_test, knn_clf.predict(X_test)))

In [None]:
#Logistic Regression
logReg_clf = LogisticRegression()
#train classifier on the training part of the split
logReg_clf.fit(X_train, y_train)
print(logReg_clf.get_params())
#accuracy on the held-out part of the split
print("split training score", logReg_clf.score(X_test, y_test))

#5-fold cross validation over all labelled data; cross_validate fits clones,
#so logReg_clf itself stays fitted on X_train only
print("LR cross validation score", cross_validate(logReg_clf, train_feature_vectors, Y, cv=5))
#report on the held-out rows only: including rows the model was fitted on
#would make the metrics look better than they are
print(classification_report(y_test, logReg_clf.predict(X_test)))

# Model Testing

In [None]:
# Predicted labels of the Naive Bayes model for the test feature vectors.
nb_test_predictions = nb_clf.predict(test_feature_vectors)
nb_test_predictions

In [None]:
# Predicted labels of the SVM model for the test feature vectors.
svm_test_predictions = svm_clf.predict(test_feature_vectors)
svm_test_predictions

In [None]:
# Predicted labels of the KNN model for the test feature vectors.
knn_test_predictions = knn_clf.predict(test_feature_vectors)
knn_test_predictions

In [None]:
# Predicted labels of the logistic regression model for the test feature vectors.
logreg_test_predictions = logReg_clf.predict(test_feature_vectors)
logreg_test_predictions

# Voting Classifier on NB,SVM,LR,and KNN models

In [None]:
from sklearn.ensemble import VotingClassifier

#soft voting: the ensemble picks the class with the highest combined predicted probability
voting_clf = VotingClassifier(
    estimators=[('NB', nb_clf), ('SVM', svm_clf), ('KNN', knn_clf), ('LogReg', logReg_clf)],
    voting='soft',
)
#5-fold cross validation over all labelled data, as done for the four base models;
#this is the estimate of how the ensemble performs on rows it was not fitted on
print("Voting cross validation scores", cross_validate(voting_clf, train_feature_vectors, Y, cv=5))

#fit the final ensemble on all labelled data
voting_clf.fit(train_feature_vectors, Y)
#NOTE: the score and report below are computed on the same rows the ensemble was
#fitted on, so they are training metrics and not an estimate of test performance
print("Voting score", voting_clf.score(train_feature_vectors, Y))
print(classification_report(Y, voting_clf.predict(train_feature_vectors)))


In [None]:
# Predict the test labels with the voting ensemble.
vals = voting_clf.predict(test_feature_vectors)

# Fill the sample submission frame with the test ids and the predicted labels.
submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv').assign(
    id=test_df['id'],
    target=vals,
)
submission.head(n=10)

In [None]:
# Write the predictions to the working directory without the dataframe index.
submission.to_csv(path_or_buf='sample_submission.csv', index=False)