In [156]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import nltk
import string
import warnings
from wordcloud import WordCloud, STOPWORDS
import pickle

# plotly imports
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

pd.set_option('display.max_colwidth', 200)
warnings.filterwarnings('ignore', category=DeprecationWarning)

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim
from gensim.models.doc2vec import LabeledSentence
from tqdm import tqdm
tqdm.pandas(desc='progress-bar')

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [157]:
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,text,choose_one,tidy_tweet,polarity,review_len,word_count
0,Just happened a terrible car crash,Relevant,just happened a terrible car crash,-1.0,34,6
1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,Relevant,our deeds are the reason of this earthquake may allah forgive us all,0.0,69,13
2,"Heard about #earthquake is different cities, stay safe everyone.",Relevant,heard about earthquake is different cities stay safe everyone,0.25,64,9
3,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",Relevant,there is a forest fire at spot pond geese are fleeing across the street i cannot save them all,0.0,96,19
4,Forest fire near La Ronge Sask. Canada,Relevant,forest fire near la ronge sask canada,0.1,38,7


In [158]:
# Tokenizing
def tokenize(text):
    tweet_tokens = re.split('\W+', text)
    return tweet_tokens

df['tidy_tweet_tokens'] = df['tidy_tweet'].apply(lambda x: tokenize(x))

In [159]:
# Removing stopwords
stopwords = nltk.corpus.stopwords.words('english')

In [160]:
def remove_stopwords(tokens):
    text = [word for word in tokens if word not in stopwords]
    return text

df['tidy_tweet_wo_stopw'] = df['tidy_tweet_tokens'].apply(lambda x: remove_stopwords(x))

In [161]:
# Stemming
stemmer = nltk.PorterStemmer()

def stemming(tokens):
    res = [stemmer.stem(word) for word in tokens]
    return res

df['tidy_tweet_stems'] = df['tidy_tweet_wo_stopw'].apply(lambda x: stemming(x))

In [162]:
df.head()

Unnamed: 0,text,choose_one,tidy_tweet,polarity,review_len,word_count,tidy_tweet_tokens,tidy_tweet_wo_stopw,tidy_tweet_stems
0,Just happened a terrible car crash,Relevant,just happened a terrible car crash,-1.0,34,6,"[just, happened, a, terrible, car, crash]","[happened, terrible, car, crash]","[happen, terribl, car, crash]"
1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,Relevant,our deeds are the reason of this earthquake may allah forgive us all,0.0,69,13,"[our, deeds, are, the, reason, of, this, earthquake, may, allah, forgive, us, all]","[deeds, reason, earthquake, may, allah, forgive, us]","[deed, reason, earthquak, may, allah, forgiv, us]"
2,"Heard about #earthquake is different cities, stay safe everyone.",Relevant,heard about earthquake is different cities stay safe everyone,0.25,64,9,"[heard, about, earthquake, is, different, cities, stay, safe, everyone, ]","[heard, earthquake, different, cities, stay, safe, everyone, ]","[heard, earthquak, differ, citi, stay, safe, everyon, ]"
3,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",Relevant,there is a forest fire at spot pond geese are fleeing across the street i cannot save them all,0.0,96,19,"[there, is, a, forest, fire, at, spot, pond, geese, are, fleeing, across, the, street, i, cannot, save, them, all]","[forest, fire, spot, pond, geese, fleeing, across, street, cannot, save]","[forest, fire, spot, pond, gees, flee, across, street, cannot, save]"
4,Forest fire near La Ronge Sask. Canada,Relevant,forest fire near la ronge sask canada,0.1,38,7,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, rong, sask, canada]"


In [163]:
df.shape

(10876, 9)

In [164]:
# Stitching together
tokens = []
for i,d in enumerate(df['tidy_tweet_stems']):
    tokens.append(' '.join(d))

In [165]:
df['tidy_tweet_final'] = tokens

In [166]:
# Removing "Can't Decide" rows
df.drop(df[df['choose_one'] == "Can't Decide"].index, inplace=True)

In [167]:
df = df.reset_index(drop=True)

In [170]:
df.choose_one.value_counts()

Not Relevant    6187
Relevant        4673
Name: choose_one, dtype: int64

In [171]:
def encoder(x):
    if x == 'Relevant':
        return 1
    if x == 'Not Relevant':
        return 0

In [172]:
df['choose_one'] = df['choose_one'].apply(lambda x: encoder(x))

In [173]:
df.choose_one.value_counts()

0    6187
1    4673
Name: choose_one, dtype: int64

In [174]:
df.head()

Unnamed: 0,text,choose_one,tidy_tweet,polarity,review_len,word_count,tidy_tweet_tokens,tidy_tweet_wo_stopw,tidy_tweet_stems,tidy_tweet_final
0,Just happened a terrible car crash,1,just happened a terrible car crash,-1.0,34,6,"[just, happened, a, terrible, car, crash]","[happened, terrible, car, crash]","[happen, terribl, car, crash]",happen terribl car crash
1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,our deeds are the reason of this earthquake may allah forgive us all,0.0,69,13,"[our, deeds, are, the, reason, of, this, earthquake, may, allah, forgive, us, all]","[deeds, reason, earthquake, may, allah, forgive, us]","[deed, reason, earthquak, may, allah, forgiv, us]",deed reason earthquak may allah forgiv us
2,"Heard about #earthquake is different cities, stay safe everyone.",1,heard about earthquake is different cities stay safe everyone,0.25,64,9,"[heard, about, earthquake, is, different, cities, stay, safe, everyone, ]","[heard, earthquake, different, cities, stay, safe, everyone, ]","[heard, earthquak, differ, citi, stay, safe, everyon, ]",heard earthquak differ citi stay safe everyon
3,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",1,there is a forest fire at spot pond geese are fleeing across the street i cannot save them all,0.0,96,19,"[there, is, a, forest, fire, at, spot, pond, geese, are, fleeing, across, the, street, i, cannot, save, them, all]","[forest, fire, spot, pond, geese, fleeing, across, street, cannot, save]","[forest, fire, spot, pond, gees, flee, across, street, cannot, save]",forest fire spot pond gees flee across street cannot save
4,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,0.1,38,7,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, rong, sask, canada]",forest fire near la rong sask canada


In [175]:
df.to_pickle('final-data.pickle')