In [2]:
import pandas as pd
import numpy as np
import spacy
import re
from nltk.corpus import stopwords

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [4]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [5]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [6]:
test_df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [8]:
train_df = train_df.drop(['id','keyword','location'], axis=1)

In [9]:
nlp = spacy.load('en_core_web_sm')

In [10]:
def data_cleaning(text):
    processed_text = []
    doc = nlp(text)
    for token in doc:
        if token.is_bracket or token.is_stop or token.like_email or token.like_url:
            continue
        elif token.is_alpha:
            processed_text.append(token.lemma_.lower())
        elif token.like_num:
            processed_text.append(token.text)
    return " ".join(processed_text)

In [11]:
train_df['text'] = train_df['text'].apply(data_cleaning)

In [12]:
train_df

Unnamed: 0,text,target
0,deed reason earthquake allah forgive,1
1,forest fire near la ronge sask canada,1
2,resident ask shelter place notify officer evac...,1
3,"13,000 people receive wildfire evacuation orde...",1
4,got send photo ruby alaska smoke wildfire pour...,1
...,...,...
7608,giant crane hold bridge collapse nearby home,1
7609,control wild fire california northern state tr...,1
7610,km s volcano hawaii,1
7611,police investigate e bike collide car little p...,1


In [13]:
train_df['vector'] = train_df['text'].apply(lambda x: nlp(x).vector)

In [14]:
train_df

Unnamed: 0,text,target,vector
0,deed reason earthquake allah forgive,1,"[-0.89539003, -0.5874631, 0.08199149, -0.11371..."
1,forest fire near la ronge sask canada,1,"[0.07400574, -0.65609676, 0.013855304, -0.0177..."
2,resident ask shelter place notify officer evac...,1,"[-0.117496446, -0.99588853, -0.09731466, 0.196..."
3,"13,000 people receive wildfire evacuation orde...",1,"[-0.0027969608, -0.56024474, -0.19756572, 0.53..."
4,got send photo ruby alaska smoke wildfire pour...,1,"[-0.11239079, -0.8295834, -0.023216102, 0.2934..."
...,...,...,...
7608,giant crane hold bridge collapse nearby home,1,"[0.31862608, -0.64759594, 0.1749067, 0.1417640..."
7609,control wild fire california northern state tr...,1,"[0.034349207, -0.72486347, 0.3763735, -0.00739..."
7610,km s volcano hawaii,1,"[-0.29387802, -0.42656365, 0.15048648, 0.29720..."
7611,police investigate e bike collide car little p...,1,"[-0.18011016, -0.75524974, 0.18318109, 0.01033..."


**Independent and dependent variables**

In [15]:
df = train_df['vector'].tolist()
y = train_df['target']

In [16]:
df = pd.DataFrame(df)

In [17]:
df.isna().sum()

0     6
1     6
2     6
3     6
4     6
     ..
91    6
92    6
93    6
94    6
95    6
Length: 96, dtype: int64

In [18]:
y

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [19]:
df['target'] = y

In [20]:
df.dropna(inplace=True)

In [21]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,target
0,-0.895390,-0.587463,0.081991,-0.113714,-0.014137,-0.064646,0.452483,1.026274,-0.082400,0.298462,...,-0.556970,-0.417347,0.113828,0.142716,0.111567,0.696028,0.721950,-0.108424,0.060130,1
1,0.074006,-0.656097,0.013855,-0.017742,-0.134844,0.016132,0.271434,0.336150,0.213258,-0.019220,...,-0.570109,-0.703247,0.863128,-0.240738,0.066787,1.015142,0.689716,-0.029478,0.553038,1
2,-0.117496,-0.995889,-0.097315,0.196346,-0.073871,0.036694,0.537511,1.121843,-0.324463,0.024909,...,-0.586454,-1.263514,0.349022,0.121524,0.388857,0.729882,0.797178,-0.350199,-0.139567,1
3,-0.002797,-0.560245,-0.197566,0.532752,0.576862,-0.314657,0.334603,0.795719,-0.068185,-0.115510,...,-0.508185,-0.876154,0.347084,0.124063,-0.259225,0.937639,0.716441,0.048756,-0.071078,1
4,-0.112391,-0.829583,-0.023216,0.293478,-0.582142,0.161157,0.121242,0.424783,0.063690,-0.101462,...,-0.394347,-0.752432,0.826523,-0.008746,-0.137176,0.792115,0.773200,0.030032,0.229144,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0.318626,-0.647596,0.174907,0.141764,-0.314060,0.072563,0.249417,0.509106,0.468901,-0.135762,...,-0.493622,-1.189895,1.171815,-0.142032,0.041124,0.931174,0.840161,-0.085842,0.152812,1
7609,0.034349,-0.724863,0.376373,-0.007399,-0.242269,0.137670,0.013584,0.379860,0.346273,-0.512194,...,-0.311138,-1.155591,1.439204,0.004883,0.172133,1.265559,0.895384,0.035129,0.255190,1
7610,-0.293878,-0.426564,0.150486,0.297206,0.033403,-0.429485,0.713133,1.422183,1.013595,0.447605,...,0.321616,-0.619466,0.036494,-0.448501,0.139824,1.152019,0.066802,-0.474980,0.770276,1
7611,-0.180110,-0.755250,0.183181,0.010334,-0.290267,-0.145843,0.498589,0.845360,-0.119036,-0.024466,...,-0.529604,-0.787873,0.393475,-0.135648,0.137705,0.880755,0.782657,-0.241021,0.429154,1


In [22]:
X = df.iloc[:,:-1]
y = df['target']

In [23]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,-0.895390,-0.587463,0.081991,-0.113714,-0.014137,-0.064646,0.452483,1.026274,-0.082400,0.298462,...,-0.008982,-0.556970,-0.417347,0.113828,0.142716,0.111567,0.696028,0.721950,-0.108424,0.060130
1,0.074006,-0.656097,0.013855,-0.017742,-0.134844,0.016132,0.271434,0.336150,0.213258,-0.019220,...,0.169486,-0.570109,-0.703247,0.863128,-0.240738,0.066787,1.015142,0.689716,-0.029478,0.553038
2,-0.117496,-0.995889,-0.097315,0.196346,-0.073871,0.036694,0.537511,1.121843,-0.324463,0.024909,...,-0.048081,-0.586454,-1.263514,0.349022,0.121524,0.388857,0.729882,0.797178,-0.350199,-0.139567
3,-0.002797,-0.560245,-0.197566,0.532752,0.576862,-0.314657,0.334603,0.795719,-0.068185,-0.115510,...,-0.213854,-0.508185,-0.876154,0.347084,0.124063,-0.259225,0.937639,0.716441,0.048756,-0.071078
4,-0.112391,-0.829583,-0.023216,0.293478,-0.582142,0.161157,0.121242,0.424783,0.063690,-0.101462,...,0.158607,-0.394347,-0.752432,0.826523,-0.008746,-0.137176,0.792115,0.773200,0.030032,0.229144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0.318626,-0.647596,0.174907,0.141764,-0.314060,0.072563,0.249417,0.509106,0.468901,-0.135762,...,0.136646,-0.493622,-1.189895,1.171815,-0.142032,0.041124,0.931174,0.840161,-0.085842,0.152812
7609,0.034349,-0.724863,0.376373,-0.007399,-0.242269,0.137670,0.013584,0.379860,0.346273,-0.512194,...,0.267460,-0.311138,-1.155591,1.439204,0.004883,0.172133,1.265559,0.895384,0.035129,0.255190
7610,-0.293878,-0.426564,0.150486,0.297206,0.033403,-0.429485,0.713133,1.422183,1.013595,0.447605,...,-0.253790,0.321616,-0.619466,0.036494,-0.448501,0.139824,1.152019,0.066802,-0.474980,0.770276
7611,-0.180110,-0.755250,0.183181,0.010334,-0.290267,-0.145843,0.498589,0.845360,-0.119036,-0.024466,...,-0.174900,-0.529604,-0.787873,0.393475,-0.135648,0.137705,0.880755,0.782657,-0.241021,0.429154


In [24]:
y

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7607, dtype: int64

**Train-Test splitt**

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

**Scalling**

In [27]:
from sklearn.preprocessing import MinMaxScaler

In [28]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)

In [29]:
X_test = scaler.transform(X_test)

In [30]:
X_train

array([[0.78015634, 0.49084987, 0.73520523, ..., 0.50196738, 0.60739949,
        0.37212321],
       [0.74470213, 0.42755234, 0.47951727, ..., 0.39256869, 0.48134214,
        0.46901389],
       [0.78753899, 0.45439096, 0.58358209, ..., 0.43277843, 0.41036072,
        0.39927737],
       ...,
       [0.60159221, 0.27664141, 0.70444831, ..., 0.30980856, 0.45451838,
        0.37770822],
       [0.66980138, 0.57335868, 0.57163737, ..., 0.46992664, 0.58804344,
        0.45331886],
       [0.76695359, 0.3707581 , 0.55909839, ..., 0.45000782, 0.35410569,
        0.34590893]])

In [31]:
y_train

3774    1
260     0
2915    1
5667    0
4817    1
       ..
4934    1
3264    1
1653    1
2607    0
2732    0
Name: target, Length: 6085, dtype: int64

**SVM Classifier**

In [32]:
from sklearn.svm import SVC

In [33]:
svm = SVC()
svm.fit(X_train,y_train)

SVC()

In [34]:
y_pred = svm.predict(X_test)

In [35]:
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.6800262812089356

In [36]:
y_pred

array([0, 1, 1, ..., 0, 0, 1], dtype=int64)

In [37]:
cf = confusion_matrix(y_test,y_pred)
cf

array([[684, 204],
       [283, 351]], dtype=int64)

In [38]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.71      0.77      0.74       888
           1       0.63      0.55      0.59       634

    accuracy                           0.68      1522
   macro avg       0.67      0.66      0.66      1522
weighted avg       0.68      0.68      0.68      1522



**RandomForestClassifier**

In [39]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
rf = RandomForestClassifier()

In [43]:
rf.fit(X_train,y_train)

RandomForestClassifier()

In [65]:
y_pred_rf = rf.predict(X_test)

In [66]:
accuracy_rf = accuracy_score(y_test,y_pred_rf)
accuracy_rf

0.6944809461235217

In [67]:
cf_rf = confusion_matrix(y_test,y_pred_rf)
cf_rf

array([[731, 157],
       [308, 326]], dtype=int64)

In [68]:
print(classification_report(y_test,y_pred_rf))

              precision    recall  f1-score   support

           0       0.70      0.82      0.76       888
           1       0.67      0.51      0.58       634

    accuracy                           0.69      1522
   macro avg       0.69      0.67      0.67      1522
weighted avg       0.69      0.69      0.69      1522



In [40]:
from sklearn.model_selection import GridSearchCV

In [41]:
def best_model(model):
    print(model.best_score_)    
    print(model.best_params_)
    print(model.best_estimator_)

In [42]:
param_grid = {'max_depth': [9, 10, 12, 15],'n_estimators':[100,150]}  # 'max_features': [64,76,88,96]
RandFor_grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, refit=True, verbose=0)
RandFor_grid.fit(X_train,y_train)
best_model(RandFor_grid)

0.6862777321281841
{'max_depth': 12, 'n_estimators': 150}
RandomForestClassifier(max_depth=12, n_estimators=150)


In [43]:
y_grid_rf = RandFor_grid.predict(X_test)

In [44]:
accuracy_rf = accuracy_score(y_test,y_grid_rf)
accuracy_rf

0.695137976346912

In [45]:
cf_rf = confusion_matrix(y_test,y_grid_rf)
cf_rf

array([[704, 184],
       [280, 354]], dtype=int64)

In [46]:
print(classification_report(y_test,y_grid_rf))

              precision    recall  f1-score   support

           0       0.72      0.79      0.75       888
           1       0.66      0.56      0.60       634

    accuracy                           0.70      1522
   macro avg       0.69      0.68      0.68      1522
weighted avg       0.69      0.70      0.69      1522



In [47]:
y_grid_rf

array([0, 1, 1, ..., 0, 0, 1], dtype=int64)

**XGBoost Classifier**

In [48]:
from xgboost import XGBClassifier

In [49]:
xgb = XGBClassifier()
xgb.fit(X_train,y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [50]:
y_pred_xgb = xgb.predict(X_test)

In [51]:
accuracy_xgb = accuracy_score(y_test,y_pred_xgb)
accuracy_xgb

0.6668856767411301

In [52]:
cf_xgb = confusion_matrix(y_test,y_pred_xgb)
cf_xgb

array([[665, 223],
       [284, 350]], dtype=int64)

In [53]:
print(classification_report(y_test,y_pred_xgb))

              precision    recall  f1-score   support

           0       0.70      0.75      0.72       888
           1       0.61      0.55      0.58       634

    accuracy                           0.67      1522
   macro avg       0.66      0.65      0.65      1522
weighted avg       0.66      0.67      0.66      1522



In [55]:
param_grid = {'max_depth': [9,12,15], 'gamma': [0.01,0.001,0.001],'min_child_weight':[1,5,10], 'learning_rate': [0.05,0.1, 0.2, 0.3], 'n_estimators':[100,150]}
xgb_grid = GridSearchCV(XGBClassifier(), param_grid, cv=5, refit=True, verbose=0)
xgb_grid.fit(X_train,y_train)
best_model(xgb_grid)

0.7018898931799507
{'gamma': 0.001, 'learning_rate': 0.05, 'max_depth': 15, 'min_child_weight': 5, 'n_estimators': 150}
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.001, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=15, max_leaves=None,
              min_child_weight=5, missing=nan, monotone_constraints=None,
              n_estimators=150, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)


In [56]:
y_grid_xgb = xgb_grid.predict(X_test)

In [57]:
accuracy_xgb_grid = accuracy_score(y_test,y_grid_xgb)
accuracy_xgb_grid

0.695137976346912

In [58]:
cf_xgb_grid = confusion_matrix(y_test,y_grid_xgb)
cf_xgb_grid

array([[707, 181],
       [283, 351]], dtype=int64)

In [59]:
print(classification_report(y_test,y_grid_xgb))

              precision    recall  f1-score   support

           0       0.71      0.80      0.75       888
           1       0.66      0.55      0.60       634

    accuracy                           0.70      1522
   macro avg       0.69      0.67      0.68      1522
weighted avg       0.69      0.70      0.69      1522



**Predicting Test set**

In [66]:
test_X

array([[0.62460347, 0.61565903, 0.46367274, ..., 0.38170516, 0.48221029,
        0.33350597],
       [0.65018535, 0.46583835, 0.74057679, ..., 0.41925785, 0.34973092,
        0.37163187],
       [0.73578255, 0.3850607 , 0.56606055, ..., 0.44212285, 0.43244476,
        0.41126395],
       ...,
       [0.62548315, 0.37680865, 0.4188104 , ..., 0.36881009, 0.47925582,
        0.3948884 ],
       [0.57764481, 0.43482179, 0.55934776, ..., 0.37599631, 0.44876776,
        0.39062741],
       [0.5934712 , 0.45656041, 0.64172883, ..., 0.32415138, 0.42598102,
        0.44843877]])

In [61]:
test_df = test_df.drop(['id','keyword','location'], axis=1)
test_df['text'] = test_df['text'].apply(data_cleaning)

In [62]:
test_df['vector'] = test_df['text'].apply(lambda x: nlp(x).vector)

In [63]:
test_X = test_df['vector'].tolist()

In [64]:
test_X = pd.DataFrame(test_X)

In [65]:
test_X = scaler.transform(test_X)

**Predicting test data**

In [68]:
test_y = xgb_grid.predict(test_X)

In [69]:
test_y

array([1, 0, 1, ..., 1, 1, 1])

In [74]:
submission = pd.read_csv('sample_submission.csv')

In [75]:
submission['target'] = test_y

In [77]:
submission.to_csv('sample_submission.csv',index=False)