In [49]:
import pandas as pd
import numpy as np
import re

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import Pipeline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Disaster Tweets

This notebook is dedicated to solving Kaggle's [Natural Language Processing with Disaster Tweets](https://www.kaggle.com/c/nlp-getting-started/overview) challenge. This a **supervised binary classification** task in which the features are tweet information, and the target is a value of 1 if the tweet is about a real disaster, and 0 if not. As an exercise, **I will only be using the text of the tweets to make predictions.** This may affect accuracy, but the point of the exercise as far as we're concerned is to see how much can be gleaned exclusively from the text of the tweets.

This notebook will eventually feature several different models, and thus runs the risk of being somewhat long. With this in mind, here at the top I will keep a running list of the models used along with the current best score of that model on Kaggle's test data. (Note that the evaluation metric for this challenge is the **F1 score, not accuracy**.) The models will be listed in descending order of highest-achieved F1 score.

1. [_K_-nearest neighbors](#knn) (Bag of words): **0.79037**
2. Decision Tree (Bag of words): **0.76616**


## A first look at the data

First we import the training/validation data. I've stored these locally in my `raw_data` folder; they can be downloaded from Kaggle [here](https://www.kaggle.com/c/nlp-getting-started/data).

In [37]:
train_data = pd.read_csv('raw_data/train.csv')
X_test = pd.read_csv('raw_data/test.csv')
sample_submission = pd.read_csv('raw_data/sample_submission.csv')

In [3]:
train_data.shape

(7613, 5)

In [4]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
train_data['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

We'll divide the training data into a 70-30 train-validate split.

In [6]:
X_tv = train_data.drop(columns = ['target'])
y_tv = train_data['target']

X_train, X_val, y_train, y_val = train_test_split(X_tv, y_tv, test_size = .3, random_state = 42)

In [7]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((5329, 4), (2284, 4), (5329,), (2284,))

## Bag-of-words models

### Defining functions for cleaning data

First we define a function `text_clean()` that cleans a given bit of text by removing stopwords, hyperlinks, and punctuation; renders the remaining text all lowercase; and lemmatizes this text (i.e., reduces each word to its root form).

In [8]:
lemmatizer = WordNetLemmatizer()

In [9]:
def text_clean(text):
    out_text = text
    stop_words = stopwords.words('english')
    
    # Remove hyperlinks
    out_text = re.sub(r'http\S+', '', out_text)
    
    # Remove punctuation
    for punct in punctuation:
        out_text = out_text.replace(punct, '')
        
    out_text = out_text.lower()
    out_text_token = word_tokenize(out_text)
    out_text_token = [lemmatizer.lemmatize(word) for word in out_text_token if word not in stop_words]
    
    out_text = ' '.join(out_text_token)
    
    return out_text

In [10]:
X_train.head()

Unnamed: 0,id,keyword,location,text
1186,1707,bridge%20collapse,,Ashes 2015: AustraliaÛªs collapse at Trent Br...
4071,5789,hail,"Carol Stream, Illinois",GREAT MICHIGAN TECHNIQUE CAMP\nB1G THANKS TO @...
5461,7789,police,Houston,CNN: Tennessee movie theater shooting suspect ...
5787,8257,rioting,,Still rioting in a couple of hours left until ...
7445,10656,wounds,Lake Highlands,Crack in the path where I wiped out this morni...


In [11]:
print(f"TEXT IN:\n{X_train['text'][7445]}\n\nTEXT OUT:\n{text_clean(X_train['text'][7445])}")

TEXT IN:
Crack in the path where I wiped out this morning during beach run. Surface wounds on left elbow and right knee. http://t.co/yaqRSximph

TEXT OUT:
crack path wiped morning beach run surface wound left elbow right knee


Next we define a function `transform_func` that takes the feature dataset and transforms it in the following ways:

1. It first removes all features but the text. (For the time being we will **only** make use of a tweet's text, leaving open the possibility that future developments will incorporate other features as well.)
2. It applies the above `text_clean` function to all rows to clean the text data.

`transform_func` will then be used as the first part of our bag-of-words machine learning pipeline.

In [12]:
def transform_func(feature_data):
    # Trim to just the text (feature_data['text']) and apply text_clean
    return feature_data['text'].apply(text_clean)

Finally, we create an `sklearn` `FunctionTransformer` called `text_clean_tran`. This essentially makes `transform_func` into something that can be integrated into an `sklearn` pipeline:

In [13]:
text_clean_tran = FunctionTransformer(transform_func)

### BOW Model 1: _K_-nearest neighbors <a id = 'knn'></a>

The first model we will test will be a _k_-nearest neighbors model. First we build the basic pipeline. The pipeline will look as follows:

1. Transform the data using `text_clean_tran`.
2. Vectorize data using either a pure count vectorizer or TF-IDF (depending on results of cross-validation).
3. Apply a _k_-nearest neighbors model (with number of neighbors and possible weighting determined by grid search).

Let's start with a very basic _k_-nearest-neighbors pipeline, using a pure count vectorizer and `sklearn`'s default _k_-neighbors settings (5 neighbors, unweighted), and see how it does via cross-validation on the training data:

In [53]:
knn_pipe = Pipeline([
    ('text_cleaning', text_clean_tran),
    ('vectorize', CountVectorizer()),
    ('knn', KNeighborsClassifier())
])

Now let's try a 5-fold cross-validation on the training data. Note that we really care about the F1 score instead of accuracy, since the evaluation for the Kaggle challenge is based on F1 score. However, I'll provide both.

In [60]:
knn_cv = cross_validate(estimator = knn_pipe, X = X_train, y = y_train, scoring = ['f1', 'accuracy'], cv = 5)
print(f"MEAN F1: {round(knn_cv['test_f1'].mean(), 4)}")
print(f"MEAN ACCURACY: {round(knn_cv['test_accuracy'].mean(), 4)}")

MEAN F1: 0.4211
MEAN ACCURACY: 0.6731


So on our most basic _k_-neighbors model, we achieve an F1 of about 0.42. To see how much we can improve on that, we will grid search to find the best hyperparameters.

Note that for reasons of runtime, the grid search below is only the last in a series of grid searches that I performed, testing a variety of ranges for the number of neighbors in our _k_-neighbors model.



In [18]:
knn_params = {
    'vectorize' : [CountVectorizer(), TfidfVectorizer()],
    'knn__n_neighbors' : list(range(15,20)),
    'knn__weights' : ['uniform', 'distance'],
}

In [19]:
knn_grid = GridSearchCV(estimator = knn_pipe, param_grid = knn_params, scoring = 'f1')

In [20]:
knn_grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('text_cleaning',
                                        FunctionTransformer(func=<function transform_func at 0x7f0424fcb4d0>)),
                                       ('vectorize', CountVectorizer()),
                                       ('knn', KNeighborsClassifier())]),
             param_grid={'knn__n_neighbors': [15, 16, 17, 18, 19],
                         'knn__weights': ['uniform', 'distance'],
                         'vectorize': [CountVectorizer(), TfidfVectorizer()]},
             scoring='f1')

In [23]:
print(f'BEST SCORE (F1): {round(knn_grid.best_score_, 3)}')
print(f'BEST VECTORIZATION METHOD : {knn_grid.best_params_["vectorize"]}')
print(f'BEST NUMBER OF NEIGHBORS: {knn_grid.best_params_["knn__n_neighbors"]}')
print(f'BEST WEIGHTING (UNIFORM VS. DISTANCE) : {knn_grid.best_params_["knn__weights"]}')

BEST SCORE (F1): 0.726
BEST VECTORIZATION METHOD : TfidfVectorizer()
BEST NUMBER OF NEIGHBORS: 16
BEST WEIGHTING (UNIFORM VS. DISTANCE) : distance


In plain English, the best settings of hyperparameters seem to be:
1. The data are vectorized via **TF-IDF**.
2. The **16** nearest neighbors vote on classification.
3. The neighbors' votes are **weighted by distance**, so that closer neighbors have a greater say in determining the classification of the data point.

Now to properly test this model, we create a new pipeline `knn_final` with these hyperparameter settings, train it on the training data (`X_train`, `y_train`), and test it on the validation data (`X_val`, `y_val`).

In [24]:
knn_final = Pipeline([
    ('text_cleaning', text_clean_tran),
    ('vectorize', TfidfVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors = 16, weights = 'distance'))
])

In [25]:
knn_final.fit(X_train, y_train)

Pipeline(steps=[('text_cleaning',
                 FunctionTransformer(func=<function transform_func at 0x7f0424fcb4d0>)),
                ('vectorize', TfidfVectorizer()),
                ('knn',
                 KNeighborsClassifier(n_neighbors=16, weights='distance'))])

In [26]:
y_val_pred = knn_final.predict(X_val)

In [28]:
val_score = f1_score(y_true = y_val, y_pred = y_val_pred)
val_score

0.7192393736017897

So it turns out that this model gives us an F1 of about .72 for the validation set. By way of comparison, if we just guess 1 ("This is a real disaster") every time, we get an F1 of .59:

In [35]:
f1_score(y_true = y_val, y_pred = np.ones(len(y_val), dtype = int))

0.5944615384615385

Finally, we fit this model on the training and validation set together, in order to then run it on the test set:

In [36]:
knn_final.fit(X_tv, y_tv)

Pipeline(steps=[('text_cleaning',
                 FunctionTransformer(func=<function transform_func at 0x7f0424fcb4d0>)),
                ('vectorize', TfidfVectorizer()),
                ('knn',
                 KNeighborsClassifier(n_neighbors=16, weights='distance'))])

In [38]:
knn_predictions = knn_final.predict(X_test)

We then make a data frame that fits Kaggle's required format, and ship that out as a .csv:

In [46]:
knn_pred_df = pd.DataFrame({'id' : X_test['id'], 'target' : knn_predictions})
knn_pred_df.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1


In [47]:
knn_pred_df.to_csv('knn_pred.csv', sep = ',', index = False)

And according to Kaggle, this lands us with an F1 score of **0.79037**. Not bad!

### BOW Model 2: Decision Tree

In [50]:
tree_pipe = Pipeline([
    ('text_cleaning', text_clean_tran),
    ('vectorize', CountVectorizer()),
    ('tree', DecisionTreeClassifier())
])

In [61]:
tree_cv = cross_validate(estimator = tree_pipe, X = X_train, y = y_train, scoring = ['f1', 'accuracy'], cv = 5)
print(f"MEAN F1: {round(tree_cv['test_f1'].mean(), 4)}")
print(f"MEAN ACCURACY: {round(tree_cv['test_accuracy'].mean(), 4)}")

MEAN F1: 0.713
MEAN ACCURACY: 0.7553


In [64]:
tree_pipe.get_params()

{'memory': None,
 'steps': [('text_cleaning',
   FunctionTransformer(func=<function transform_func at 0x7f0424fcb4d0>)),
  ('vectorize', CountVectorizer()),
  ('tree', DecisionTreeClassifier())],
 'text_cleaning': FunctionTransformer(func=<function transform_func at 0x7f0424fcb4d0>),
 'text_cleaning__accept_sparse': False,
 'text_cleaning__check_inverse': True,
 'text_cleaning__func': <function __main__.transform_func>,
 'text_cleaning__inv_kw_args': None,
 'text_cleaning__inverse_func': None,
 'text_cleaning__kw_args': None,
 'text_cleaning__validate': False,
 'tree': DecisionTreeClassifier(),
 'tree__ccp_alpha': 0.0,
 'tree__class_weight': None,
 'tree__criterion': 'gini',
 'tree__max_depth': None,
 'tree__max_features': None,
 'tree__max_leaf_nodes': None,
 'tree__min_impurity_decrease': 0.0,
 'tree__min_samples_leaf': 1,
 'tree__min_samples_split': 2,
 'tree__min_weight_fraction_leaf': 0.0,
 'tree__random_state': None,
 'tree__splitter': 'best',
 'vectorize': CountVectorizer(),
 'v

In [65]:
tree_params = {
    'vectorize' : [CountVectorizer(), TfidfVectorizer()],
    'tree__min_samples_split' : [2, 6, 10],
}

In [66]:
tree_grid = GridSearchCV(estimator = tree_pipe, param_grid = tree_params, scoring = 'f1')

In [67]:
tree_grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('text_cleaning',
                                        FunctionTransformer(func=<function transform_func at 0x7f0424fcb4d0>)),
                                       ('vectorize', CountVectorizer()),
                                       ('tree', DecisionTreeClassifier())]),
             param_grid={'tree__min_samples_split': [2, 6, 10],
                         'vectorize': [CountVectorizer(), TfidfVectorizer()]},
             scoring='f1')

In [68]:
print(f'BEST SCORE (F1): {round(tree_grid.best_score_, 3)}')
print(f'BEST VECTORIZATION METHOD : {tree_grid.best_params_["vectorize"]}')
print(f'BEST MIN. SAMPLES TO SPLIT: {tree_grid.best_params_["tree__min_samples_split"]}')

BEST SCORE (F1): 0.719
BEST VECTORIZATION METHOD : CountVectorizer()
BEST MIN. SAMPLES TO SPLIT: 6


In [69]:
tree_final = Pipeline([
    ('text_cleaning', text_clean_tran),
    ('vectorize', CountVectorizer()),
    ('tree', DecisionTreeClassifier(min_samples_split = 6))
])

In [70]:
tree_final.fit(X_train, y_train)

Pipeline(steps=[('text_cleaning',
                 FunctionTransformer(func=<function transform_func at 0x7f0424fcb4d0>)),
                ('vectorize', CountVectorizer()),
                ('tree', DecisionTreeClassifier(min_samples_split=6))])

In [72]:
y_val_pred = tree_final.predict(X_val)
val_score = f1_score(y_true = y_val, y_pred = y_val_pred)
val_score

0.7061288632792039

In [73]:
tree_final.fit(X_tv, y_tv)

Pipeline(steps=[('text_cleaning',
                 FunctionTransformer(func=<function transform_func at 0x7f0424fcb4d0>)),
                ('vectorize', CountVectorizer()),
                ('tree', DecisionTreeClassifier(min_samples_split=6))])

In [74]:
tree_predictions = tree_final.predict(X_test)

In [75]:
tree_pred_df = pd.DataFrame({'id' : X_test['id'], 'target' : tree_predictions})
tree_pred_df.to_csv('tree_pred.csv', sep = ',', index = False)