Kaggle Natural Language Processing competition entry. 

In [84]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

#### Load data into dataframes

In [85]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
y_train = train['target']
train.drop(['target'], axis=1, inplace=True)
df = pd.concat((train, test), axis=0)

#### Preliminary analysis

In [88]:
df.head(3)

Unnamed: 0,id,keyword,location,text
0,1,,,Our Deeds are the Reason of this #earthquake M...
1,4,,,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...


In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10876 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        10876 non-null  int64 
 1   keyword   10789 non-null  object
 2   location  7238 non-null   object
 3   text      10876 non-null  object
dtypes: int64(1), object(3)
memory usage: 424.8+ KB


In [90]:
y_train.value_counts() # Dataset is somewhat unbalanced

0    4342
1    3271
Name: target, dtype: int64

In [91]:
df['location'].unique().shape # Too many missing and unique values. Will not use the location column.

(4522,)

In [92]:
df.drop(['id', 'location'], axis=1, inplace=True)

In [93]:
# %20 is space in UTF-8
df['keyword'].unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [96]:
# OneHotEncoder can't handle NaN values. 
df.fillna('missing', inplace=True)

In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10876 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   keyword  10876 non-null  object
 1   text     10876 non-null  object
dtypes: object(2)
memory usage: 254.9+ KB


In [99]:
X_train = df[:len(train)]
X_test = df[len(train):]

In [100]:
enc = OneHotEncoder(sparse=True, handle_unknown='error')

In [101]:
# OneHotEncoder accepts 2D arrays. Convert keyword column to numpy array and reshape.
train_keywords_arr = X_train['keyword'].to_numpy(copy=True).reshape(-1,1)
enc.fit(train_keywords_arr)
train_keywords = enc.transform(train_keywords_arr)
test_keywords_arr = X_test['keyword'].to_numpy(copy=True).reshape(-1,1)
test_keywords = enc.transform(test_keywords_arr)

In [102]:
train_keywords.shape

(7613, 222)

In [103]:
test_keywords.shape

(3263, 222)

#### Bag-of-Words

In [104]:
vect = CountVectorizer(min_df=5, dtype=np.float64)
vect.fit(X_train['text'])

CountVectorizer(dtype=<class 'numpy.float64'>, min_df=5)

In [105]:
vect.vocabulary_

{'our': 1744,
 'are': 208,
 'the': 2417,
 'reason': 1967,
 'of': 1700,
 'this': 2432,
 'earthquake': 780,
 'may': 1513,
 'allah': 151,
 'us': 2582,
 'all': 150,
 'forest': 984,
 'fire': 947,
 'near': 1647,
 'la': 1358,
 'canada': 441,
 'residents': 2009,
 'asked': 230,
 'to': 2462,
 'shelter': 2168,
 'in': 1246,
 'place': 1823,
 'being': 304,
 'by': 421,
 'officers': 1706,
 'no': 1668,
 'other': 1742,
 'evacuation': 844,
 'or': 1736,
 'orders': 1739,
 'expected': 866,
 '13': 17,
 'people': 1793,
 'wildfires': 2698,
 'california': 427,
 'just': 1328,
 'got': 1068,
 'sent': 2142,
 'photo': 1805,
 'from': 1005,
 'alaska': 145,
 'as': 226,
 'smoke': 2227,
 'into': 1276,
 'school': 2108,
 'update': 2574,
 'hwy': 1225,
 '20': 30,
 'closed': 525,
 'both': 371,
 'due': 769,
 'lake': 1364,
 'county': 591,
 'flood': 964,
 'disaster': 712,
 'heavy': 1144,
 'rain': 1941,
 'causes': 467,
 'flash': 958,
 'flooding': 965,
 'streets': 2317,
 'colorado': 542,
 'springs': 2272,
 'areas': 210,
 'on': 172

In [106]:
train_text = vect.transform(X_train['text'])
test_text = vect.transform(X_test['text'])

In [107]:
train_text.shape

(7613, 2795)

In [108]:
test_text.shape

(3263, 2795)

In [109]:
# Concatenate sparse matrices created from keyword and text columns
X_train = hstack((train_keywords, train_text))
X_test = hstack((test_keywords, test_text))

#### Logistic Regression

In [110]:
# Adding class weights to Logistic Regression model, to account for unbalance in the data, 
# decreased the model performance slightly.

param_grid = {'C':[0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='f1')
grid.fit(X_train, y_train)
print("Best cross validation score:{:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Best cross validation score:0.61
Best parameters:  {'C': 0.1}


#### XGBoost

In [114]:
x = xgb.XGBClassifier(learning_rate=0.02, n_estimators=1000, objective='binary:logistic', 
                      eval_metric='logloss', use_label_encoder=False)
param_grid = {'max_depth':[4, 5, 6, 7]}
grid = GridSearchCV(x, param_grid, cv=5, scoring='f1')
grid.fit(X_train, y_train)
print("Best cross validation score:{:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Best cross validation score:0.56
Best parameters:  {'max_depth': 7}


#### Make prediction

In [29]:
logreg = LogisticRegression(max_iter=1000, C=0.1).fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [30]:
y_pred

array([1, 0, 1, ..., 1, 1, 0])

In [31]:
submission = pd.DataFrame(y_pred, columns=['target'], index=test['id'])

In [32]:
submission.head(3)

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,0
3,1


In [314]:
submission.to_csv('submission.csv')