# Disaster Tweets - NLP for EXTREME Beginners

Predict which Tweets are about real disasters and which ones are not





In [None]:

import numpy as np 
import pandas as pd 
import os

from sklearn import model_selection as sk_model_selection
from sklearn.feature_extraction import text as sk_fe_text
from sklearn import svm as sk_svm
from sklearn import metrics as sk_metrics

<a id="1"></a>
<h2 style='background:#FA497A; border:0; color:white'><center>Data Loading</center></h2>

In [None]:
# Load the three competition CSV (comma-separated values) files with pandas:
# the training data, the test data and the sample submission.
df_train, df_test, df_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Report the size of the training frame as (rows, columns).
print(f'df_train shape: {df_train.shape}')
# Preview the first 5 rows; as the last expression of the cell it is rendered as a table.
df_train.head(n=5)

df_train shape: (7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Count the missing (NaN, i.e. "not available") entries in each column.
missing_per_column = df_train.isna().sum()
missing_per_column

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [None]:
# Uncomment the next line to read the built-in documentation for DataFrame.isna
#help(df_train.isna)

<a id="2"></a>
<h2 style='background:#FA497A; border:0; color:white'><center>TF-IDF preprocessing</center></h2>


In [None]:
# Features are the "text" column and labels are the "target" column of df_train.
# Both are kept as pandas Series here; the text is vectorized in a later cell.
X_train, y_train = df_train["text"], df_train["target"]

In [None]:
# TfidfVectorizer converts raw text into numeric TF-IDF vectors.
# For an easy introduction to TF-IDF please visit https://www.capitalone.com/tech/machine-learning/understanding-tf-idf/
tfidf = sk_fe_text.TfidfVectorizer(stop_words = 'english')
# Always start from the raw text in df_train rather than from X_train: this cell
# rebinds X_train to the vectorized matrix, so reading X_train here would make a
# second run of the cell feed that matrix back into the vectorizer and fail.
train_text = df_train["text"]
print(train_text)
# fit_transform learns the vocabulary and idf weights from the training text and
# returns the TF-IDF-weighted document-term matrix: a sparse matrix of shape
# (n_samples, n_features). It is equivalent to fit() followed by transform().
X_train = tfidf.fit_transform(train_text)
print(X_train)

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object
  (0, 15499)	0.3982041761665894
  (0, 7572)	0.510614541608432
  (0, 6311)	0.3541772503550105
  (0, 5429)	0.510614541608432
  (0, 1844)	0.44106404207379424
  (1, 16431)	0.5020741962299169
  (1, 16087)	0.5020741962299169
  (1, 12979)	0.32208070183508675
  (1, 10967)	0.3694302900637648
  (1, 7563)	0.3138500244679633
  (1, 3

<a id="3"></a>
<h2 style='background:#FA497A; border:0; color:white'><center>SVM Training</center></h2>


Using GridSearchCV to find the best parameters for SVM

In [None]:
# Hyper-parameter grid to explore: every combination of C and gamma is tried
# (3 values of C x 4 values of gamma = 12 candidates).
# C is the regularization parameter; gamma is the RBF kernel coefficient and
# decides how much curvature we allow in the decision boundary.
parameters = dict(
    C=[0.01, 0.1, 1],
    gamma=[0.7, 1, 'auto', 'scale'],
)

# SVC is scikit-learn's Support Vector Classifier. Whether the decision boundary is
# linear or non-linear is determined by the kernel argument (here the non-linear
# 'rbf' kernel), not by the class name.
base_svc = sk_svm.SVC(
    kernel='rbf',
    class_weight='balanced',
    random_state=42,
)

# GridSearchCV implements the usual estimator API: fitting it evaluates every
# combination in the grid with 5-fold cross-validation, scores each one with F1,
# and keeps the best combination. n_jobs=-1 runs the candidates in parallel.
# NOTE(review): if X_train comes from a TfidfVectorizer fitted on all training rows,
# the validation folds have already influenced the idf weights, so the reported
# score may be slightly optimistic.
model = sk_model_selection.GridSearchCV(
    estimator=base_svc,
    param_grid=parameters,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
model.fit(X_train, y_train)

print(f'Best parameters: {model.best_params_}')
print(f'Mean cross-validated F1 score of the best_estimator: {model.best_score_:.3f}')


Best parameters: {'C': 1, 'gamma': 0.7}
Mean cross-validated F1 score of the best_estimator: 0.591


## Testing

In [None]:
# Pull the "text" column out of the test set and display it.
X_test = df_test.loc[:, "text"]
X_test

0                      Just happened a terrible car crash
1       Heard about #earthquake is different cities, s...
2       there is a forest fire at spot pond, geese are...
3                Apocalypse lighting. #Spokane #wildfires
4           Typhoon Soudelor kills 28 in China and Taiwan
                              ...                        
3258    EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259    Storm in RI worse than last hurricane. My city...
3260    Green Line derailment in Chicago http://t.co/U...
3261    MEG issues Hazardous Weather Outlook (HWO) htt...
3262    #CityofCalgary has activated its Municipal Eme...
Name: text, Length: 3263, dtype: object

<a id="100"></a>
<h2 style='background:#FA497A; border:0; color:white'><center>Submission</center></h2>

In [None]:
# Vectorize the test text with the already-fitted vectorizer (transform only, no
# re-fitting), so the test matrix uses the same vocabulary as the training matrix.
X_test = tfidf.transform(df_test["text"])

# Predict a label for every test row with the fitted grid-search model.
y_test_pred = model.predict(X_test)

In [None]:
# Store the predictions in the "target" column of the submission frame, then write
# the frame to submission.csv without the index column.
df_submission["target"] = y_test_pred
df_submission.to_csv(path_or_buf="submission.csv", index=False)

In [None]:
# Display the submission frame as the cell output for a final visual check.
df_submission