# Natural Language Processing with Disaster Tweets

In [1]:
import pandas as pd

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


## EDA

In [2]:
df_train['keyword'].value_counts()

fatalities               45
deluge                   42
armageddon               42
harm                     41
body%20bags              41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [3]:
len(df_train[df_train['keyword'].isna() == False])

7552

In [4]:
df_train[df_train['keyword'].isna() == False]

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...,...
7578,10830,wrecked,,@jt_ruff23 @cameronhacker and I wrecked you both,0
7579,10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0
7580,10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0
7581,10833,wrecked,Lincoln,@engineshed Great atmosphere at the British Li...,0


In [5]:
target_lens = len(df_train[df_train['target'] == 0]),len(df_train[df_train['target'] == 1])
target_lens

(4342, 3271)

In [6]:
loc_perc_nontarget = len(df_train[(df_train['location'].isna() == False) & (df_train['target'] == 0)]) / target_lens[0]
loc_perc_nontarget

0.6642100414555504

In [7]:
loc_perc_target = len(df_train[(df_train['location'].isna() == False) & (df_train['target'] == 1)]) / target_lens[1]
loc_perc_target

0.671354325894222

This tells us that whether or not a tweet contains a location doesn't necessarily differ from disaster tweets to non-disaster tweets, so that shouldn't necessarily be taken into account in our model.

In [8]:
key_perc_nontarget = len(df_train[(df_train['keyword'].isna() == False) & (df_train['target'] == 0)]) / target_lens[0]
key_perc_nontarget

0.9956241363426992

In [9]:
key_perc_target = len(df_train[(df_train['keyword'].isna() == False) & (df_train['target'] == 1)]) / target_lens[1]
key_perc_target

0.9871598899419138

Once again, this tells us that whether or not a tweet contains a keyword doesn't necessarily differ from disaster tweets to non-disaster tweets, so that shouldn't necessarily be taken into account into our model.

## Baseline Model

In [10]:
corpus = df_train['text'].fillna('')
corpus

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

vec = TfidfVectorizer(norm=None)
vec.fit(corpus)
tf_idf_sparse = pd.DataFrame.sparse.from_spmatrix(vec.transform(corpus))

X_train = tf_idf_sparse
y_train = df_train['target']

pipeline = make_pipeline(
    KNeighborsClassifier()
)

mean_accuracy = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='accuracy').mean()
mean_accuracy

0.5730988373496677

In [12]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy='most_frequent')
cross_val_score(dummy_clf, X_train, y_train, cv=10, scoring='accuracy').mean()

0.5703401726558162

## Iteration 1: Model Tuning

In [14]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    pipeline,
    param_grid={#'kneighborsclassifier__n_neighbors': range(1,50),
                'kneighborsclassifier__metric': ['euclidean','minkowski','cosine']
               },
    scoring='f1_macro',
    cv=5
)

#cross_val_score(grid_search,X=X_train,y=y_train,scoring='accuracy',cv=10).mean()

In [15]:
grid_search.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('kneighborsclassifier',
                                        KNeighborsClassifier())]),
             param_grid={'kneighborsclassifier__metric': ['euclidean',
                                                          'minkowski',
                                                          'cosine']},
             scoring='f1_macro')

In [16]:
grid_search.best_params_

{'kneighborsclassifier__metric': 'cosine'}

In [17]:
grid_search = GridSearchCV(
    pipeline,
    param_grid={'kneighborsclassifier__n_neighbors': range(1,20),
                'kneighborsclassifier__metric': ['euclidean','minkowski','cosine']
               },
    scoring='f1_macro',
    cv=5
)

In [18]:
grid_search.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('kneighborsclassifier',
                                        KNeighborsClassifier())]),
             param_grid={'kneighborsclassifier__metric': ['euclidean',
                                                          'minkowski',
                                                          'cosine'],
                         'kneighborsclassifier__n_neighbors': range(1, 20)},
             scoring='f1_macro')

In [19]:
grid_search.best_params_

{'kneighborsclassifier__metric': 'cosine',
 'kneighborsclassifier__n_neighbors': 11}

In [14]:
model = KNeighborsClassifier(n_neighbors=11,metric='cosine')
model.fit(X_train,y_train)

KNeighborsClassifier(metric='cosine', n_neighbors=11)

In [15]:
cross_val_score(model, X_train, y_train, cv=10, scoring='f1_macro').mean()

0.6824634661127833

In [16]:
cross_val_score(pipeline, X_train, y_train, cv=10, scoring='f1_macro').mean()

0.3785480521728119

In [17]:
cross_val_score(dummy_clf, X_train, y_train, cv=10, scoring='f1_macro').mean()

0.363195271101306

In [18]:
from sklearn.model_selection import GridSearchCV

grid_search_accuracy = GridSearchCV(
    pipeline,
    param_grid={'kneighborsclassifier__n_neighbors': range(1,20),
                'kneighborsclassifier__metric': ['euclidean','minkowski','cosine']
               },
    scoring='accuracy',
    cv=5
)

In [19]:
grid_search_accuracy.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('kneighborsclassifier',
                                        KNeighborsClassifier())]),
             param_grid={'kneighborsclassifier__metric': ['euclidean',
                                                          'minkowski',
                                                          'cosine'],
                         'kneighborsclassifier__n_neighbors': range(1, 20)},
             scoring='accuracy')

In [20]:
grid_search_accuracy.best_params_

{'kneighborsclassifier__metric': 'cosine',
 'kneighborsclassifier__n_neighbors': 11}

In [24]:
cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy').mean()

0.7013016441275984

In [21]:
def predict_disaster(tweet):
    return model.predict(vec.transform([tweet]))[0]

In [22]:
submission = pd.read_csv('sample_submission.csv')
submission['target'] = df_test['text'].apply(predict_disaster)
submission.to_csv('submission_final.csv',index=False)