In [1]:
import nltk
import pandas as pd
from pandas_profiling import ProfileReport
import string
import re
import numpy as np
import sys
import os

wn=nltk.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
train_data=pd.read_csv("data/train.csv")

## Data Cleaning 

In [2]:
def clean_text(text):
    text="".join([i.lower() for i in text if i not in string.punctuation])
    #text=re.sub('@\w+','',text)
    tokens=[i for i in re.split('\W+',text) if  i !='']
    tokens=[i for i in tokens if i not in stopwords]
    return tokens
    
    
def clean_text2(text):
    text="".join([i.lower() for i in text if i not in string.punctuation]) # remove punctuation
    #text=re.sub('@\w+','',text) #remove words starting with @ these are usually usernames
    text=re.sub('[0-9]+','',text) #remove everything starting with numbers
    text=re.sub(r'http\S+', '', text)
    tokens=[i for i in re.split('\W+',text) if  i !=''] 
    tokens=[i for i in tokens if i not in stopwords] # remove stopwords
    text=" ".join([wn.lemmatize(word) for word in tokens])
    return text


def removeusername(x):
    return re.sub('@\w+','',x)

def removeemoji(x):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', x) # no emoji

In [3]:
train_data['Rmusername']=train_data['text'].apply(lambda x : removeusername(x))

train_data['Remoji']=train_data['Rmusername'].apply(lambda x : removeemoji(x))

train_data['Cleaned_text']=train_data['Remoji'].apply(lambda x :clean_text2(x))

DV=pd.DataFrame(train_data['target'])

In [4]:
train_data

Unnamed: 0,id,keyword,location,text,target,Rmusername,Remoji,Cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,Our Deeds are the Reason of this #earthquake M...,deed reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,All residents asked to 'shelter in place' are ...,resident asked shelter place notified officer ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or...","13,000 people receive #wildfires evacuation or...",people receive wildfire evacuation order calif...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,Just got sent this photo from Ruby #Alaska as ...,got sent photo ruby alaska smoke wildfire pour...
...,...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,Two giant cranes holding a bridge collapse int...,Two giant cranes holding a bridge collapse int...,two giant crane holding bridge collapse nearby...
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,The out of control wild fires in California ...,The out of control wild fires in California ...,control wild fire california even northern par...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,utckm volcano hawaii
7611,10872,,,Police investigating after an e-bike collided ...,1,Police investigating after an e-bike collided ...,Police investigating after an e-bike collided ...,police investigating ebike collided car little...


## Count vectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(ngram_range=(1,2))
cv1=cv.fit(train_data['Cleaned_text'])
X_CV=cv1.transform(train_data['Cleaned_text'])
print(X_CV.shape)
#print(cv.get_feature_names())
Xcv_train=pd.DataFrame(X_CV.toarray())
Xcv_train.columns=cv.get_feature_names()

(7613, 57104)


In [8]:
Xcv_train.columns[Xcv_train.sum()>200]

Index(['amp', 'fire', 'get', 'im', 'like', 'new', 'one', 'via'], dtype='object')

In [9]:
Xcv_train.columns[Xcv_train.sum()==1]

Index(['aa ayyo', 'aa battery', 'aa near', 'aaaa', 'aaaa ok', 'aaaaaaallll',
       'aaaaaaallll ûªm', 'aaaaaand', 'aaaaaand there', 'aaarrrgghhh',
       ...
       'ûóher', 'ûóher upper', 'ûókody', 'ûókody vine', 'ûónegligence',
       'ûónegligence firework', 'ûótech', 'ûótech business', 'ûówe',
       'ûówe work'],
      dtype='object', length=46166)

In [10]:
Xcv_train.drop(Xcv_train.columns[Xcv_train.sum()==1],axis=1,inplace=True) # Removing words low frequency words
Xcv_train.drop(['amp', 'dont', 'get', 'im', 'like', 'new', 'one','people', 'via'],axis=1,inplace=True) #removing high frequency words
print(Xcv_train.shape)
#print(Xcv_train.columns)

(7613, 10929)


## Training RandomForestClassifier

In [15]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xcv_train, DV, test_size=0.2,
                                                    random_state=0)


In [16]:
param={
    'max_depth':[10,20,50,100],
    'n_estimators':[50,100,150]
    

}

In [18]:
rf = RandomForestClassifier()
clf = RandomizedSearchCV(rf, param, random_state=0)
rf_model=clf.fit(X_train,y_train.values.ravel())
randomforest_model2=pd.DataFrame(clf.cv_results_).sort_values('mean_test_score', ascending=False)

In [19]:
randomforest_model2

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,40.961664,0.259803,0.170313,0.002575,150,100,"{'n_estimators': 150, 'max_depth': 100}",0.779146,0.749589,0.765189,0.77422,0.760263,0.765681,0.010419,1
3,27.274104,0.243921,0.123703,0.001182,100,100,"{'n_estimators': 100, 'max_depth': 100}",0.775041,0.743842,0.768473,0.77422,0.760263,0.764368,0.011541,2
8,14.174195,0.866907,0.083299,0.00868,50,100,"{'n_estimators': 50, 'max_depth': 100}",0.777504,0.747947,0.76601,0.771757,0.756979,0.764039,0.010522,3
5,24.310394,0.610912,0.114521,0.000263,150,50,"{'n_estimators': 150, 'max_depth': 50}",0.746305,0.728243,0.737274,0.743842,0.738095,0.738752,0.006261,4
7,16.08703,0.096608,0.08735,0.00072,100,50,"{'n_estimators': 100, 'max_depth': 50}",0.738916,0.727422,0.745484,0.738916,0.740558,0.738259,0.00593,5
0,8.162378,0.15707,0.061582,0.000483,50,50,"{'n_estimators': 50, 'max_depth': 50}",0.743842,0.716749,0.738095,0.747947,0.732348,0.735796,0.010885,6
9,3.89343,0.228324,0.051733,0.001931,50,20,"{'n_estimators': 50, 'max_depth': 20}",0.695402,0.667488,0.695402,0.678982,0.697865,0.687028,0.011868,7
2,7.191709,0.010842,0.063609,0.000413,100,20,"{'n_estimators': 100, 'max_depth': 20}",0.691297,0.661741,0.698686,0.671593,0.697865,0.684236,0.014902,8
4,5.710839,0.079857,0.065977,0.002301,150,10,"{'n_estimators': 150, 'max_depth': 10}",0.651888,0.632184,0.66092,0.633826,0.662562,0.648276,0.012998,9
6,3.823422,0.002383,0.055458,0.000831,100,10,"{'n_estimators': 100, 'max_depth': 10}",0.650246,0.633005,0.656814,0.626437,0.649425,0.643186,0.011478,10


## Training XGBClassifier

In [21]:
from xgboost import XGBClassifier

for i in [10,20,50]:
    for j in [50,100,150]:
        xgb=XGBClassifier(max_depth=i,n_estimators=j)
        xgbmodel=xgb.fit(X_train,y_train.values.ravel())
        
        y_pred=xgbmodel.predict(X_test)
        
        print("Max depth: {}, N_estimator: {}, Recall : {}, Precision : {}, Accuracy : {}, F1-score : {}".format(i,j,round(recall_score(y_test,y_pred),3),round(precision_score(y_test,y_pred),3),round(accuracy_score(y_test,y_pred),3),round(f1_score(y_test,y_pred),3)))

In [22]:
xgb=XGBClassifier(max_depth=50,n_estimators=50)
xgbmodel=xgb.fit(X_train,y_train.values.ravel())

y_pred=xgbmodel.predict(X_test)
print("Max depth: {}, N_estimator: {}, Recall : {}, Precision : {}, Accuracy : {}, F1-score : {}".format(20,50,round(recall_score(y_test,y_pred),3),round(precision_score(y_test,y_pred),3),round(accuracy_score(y_test,y_pred),3),round(f1_score(y_test,y_pred),3)))


Max depth: 20, N_estimator: 50, Recall : 0.666, Precision : 0.794, Accuracy : 0.788, F1-score : 0.724


In [23]:
test_data=pd.read_csv("data/test.csv")

test_data['Rmusername']=test_data['text'].apply(lambda x : removeusername(x))

test_data['Remoji']=test_data['Rmusername'].apply(lambda x : removeemoji(x))

test_data['Cleaned_text']=test_data['Remoji'].apply(lambda x :clean_text2(x))


X_CV_test=cv1.transform(test_data['Cleaned_text'])
print(X_CV_test.shape)

#print(cv1.get_feature_names())


X_CV_test=pd.DataFrame(X_CV_test.toarray())
X_CV_test.columns=cv1.get_feature_names()
print(X_CV_test.shape)

X_CV_test=X_CV_test[Xcv_train.columns]
print(X_CV_test.shape)




(3263, 57104)
(3263, 57104)
(3263, 10929)


In [32]:
y_pred=xgbmodel.predict(X_CV_test)

resultdic=dict()
resultdic['id']=test_data['id'].values
resultdic['text']=test_data['text'].values
resultdic['Prediction']=y_pred
final_result=pd.DataFrame(resultdic)


In [33]:
final_result[1:10]

Unnamed: 0,id,text,Prediction
1,2,"Heard about #earthquake is different cities, s...",1
2,3,"there is a forest fire at spot pond, geese are...",1
3,9,Apocalypse lighting. #Spokane #wildfires,1
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1
5,12,We're shaking...It's an earthquake,1
6,21,They'd probably still show more life than Arse...,0
7,22,Hey! How are you?,0
8,27,What a nice hat?,0
9,29,Fuck off!,0


In [None]:
final_result.to_csv("testpredictions.csv",index=False)