In [72]:
import pandas as pd
import numpy as np
import re

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import Pipeline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Disaster Tweets

This notebook is dedicated to solving Kaggle's [Natural Language Processing with Disaster Tweets](https://www.kaggle.com/c/nlp-getting-started/overview) challenge. This is a **supervised binary classification** task in which the features are tweet information, and the target is a value of 1 if the tweet is about a real disaster, and 0 if not.

## A first look at the data

First we import the training/validation data. I've stored these locally in my `raw_data` folder; they can be downloaded from Kaggle [here](https://www.kaggle.com/c/nlp-getting-started/data).

In [3]:
# Folder holding the Kaggle competition CSVs; change it here if the data lives elsewhere.
DATA_DIR = 'raw_data'

# Labelled tweets (used for training/validation), unlabelled test tweets,
# and the sample submission file provided by the competition.
train_data = pd.read_csv(f'{DATA_DIR}/train.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [4]:
train_data.shape

(7613, 5)

In [5]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
train_data['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

We'll divide the training data into a 70-30 train-validate split.

In [7]:
# Separate the label column from the remaining feature columns.
y_tv = train_data['target']
X_tv = train_data.drop(columns='target')

# Hold out 30% of the labelled rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_tv,
    y_tv,
    test_size=0.3,
    random_state=42,
)

In [8]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((5329, 4), (2284, 4), (5329,), (2284,))

## Bag-of-words models

### Defining functions for cleaning data

First we define a function `text_clean()` that cleans a given bit of text by removing stopwords, hyperlinks, and punctuation; renders the remaining text all lowercase; and lemmatizes this text (i.e., reduces each word to its root form).

In [9]:
# Single WordNet lemmatizer instance, created once and reused by text_clean().
lemmatizer = WordNetLemmatizer()

In [17]:
# Translation table that deletes every character in string.punctuation in a single pass.
_PUNCT_TABLE = str.maketrans('', '', punctuation)

# Matches "http" followed by any run of non-whitespace characters (covers https too).
_URL_PATTERN = re.compile(r'http\S+')

# Filled on first use by _english_stop_words() so the corpus is read only once.
_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, loading them on first call only."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def text_clean(text):
    """Normalize one piece of text for bag-of-words modelling.

    Steps, in order: strip hyperlinks, delete punctuation, lowercase,
    tokenize, drop English stopwords, lemmatize each remaining token, and
    re-join the tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw text (e.g. a tweet).

    Returns
    -------
    str
        The cleaned, space-separated tokens.
    """
    stop_words = _english_stop_words()

    # Remove hyperlinks
    out_text = _URL_PATTERN.sub('', text)

    # Remove punctuation
    out_text = out_text.translate(_PUNCT_TABLE)

    out_text = out_text.lower()

    # NOTE: punctuation is already gone at this point, so a contraction such as
    # "don't" arrives here as "dont" and survives the stopword filter
    # (presumably because the stopword list spells it with the apostrophe).
    tokens = word_tokenize(out_text)
    kept_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
    ]

    return ' '.join(kept_tokens)

In [18]:
X_train.head()

Unnamed: 0,id,keyword,location,text
1186,1707,bridge%20collapse,,Ashes 2015: AustraliaÛªs collapse at Trent Br...
4071,5789,hail,"Carol Stream, Illinois",GREAT MICHIGAN TECHNIQUE CAMP\nB1G THANKS TO @...
5461,7789,police,Houston,CNN: Tennessee movie theater shooting suspect ...
5787,8257,rioting,,Still rioting in a couple of hours left until ...
7445,10656,wounds,Lake Highlands,Crack in the path where I wiped out this morni...


In [28]:
# Show one training tweet before and after cleaning.
sample_tweet = X_train.loc[7445, 'text']
print(f"TEXT IN:\n{sample_tweet}\n\nTEXT OUT:\n{text_clean(sample_tweet)}")

TEXT IN:
Crack in the path where I wiped out this morning during beach run. Surface wounds on left elbow and right knee. http://t.co/yaqRSximph

TEXT OUT:
crack path wiped morning beach run surface wound left elbow right knee


Next we define a function `transform_func` that takes the feature dataset and transforms it in the following ways:

1. It first removes all features but the text. (For the time being we will **only** make use of a tweet's text, leaving open the possibility that future developments will incorporate other features as well.)
2. It applies the above `text_clean` function to all rows to clean the text data.

`transform_func` will then be used as the first part of our bag-of-words machine learning pipeline.

In [29]:
def transform_func(feature_data):
    """Reduce the feature frame to its cleaned tweet text.

    Selects the 'text' column of ``feature_data`` and runs ``text_clean`` on
    every entry, returning the resulting Series. All other columns are ignored.
    """
    tweet_text = feature_data['text']
    cleaned_text = tweet_text.apply(text_clean)
    return cleaned_text

In [36]:
print(transform_func(X_train))

1186    ash 2015 australiaûªs collapse trent bridge a...
4071    great michigan technique camp b1g thanks bmurp...
5461    cnn tennessee movie theater shooting suspect k...
5787                 still rioting couple hour left class
7445    crack path wiped morning beach run surface wou...
                              ...                        
5226    eganator2000 arent many obliteration server al...
5390    panic attack bc dont enough money drug alcohol...
860     omron hem712c automatic blood pressure monitor...
7603    official say quarantine place alabama home pos...
7270     moved england five year ago today whirlwind time
Name: text, Length: 5329, dtype: object


Finally, we create an `sklearn` `FunctionTransformer` called `text_clean_tran`. This essentially makes `transform_func` into something that can be integrated into an `sklearn` pipeline:

In [30]:
# Wrap transform_func in a FunctionTransformer so it can be used as a Pipeline step.
text_clean_tran = FunctionTransformer(transform_func)

### BOW Model \#1: _K_-nearest neighbors

The first model we will test will be a _k_-nearest neighbors model. First we build the basic pipeline. The pipeline will look as follows:

1. Transform the data using `text_clean_tran`.
2. Vectorize data using either a pure count vectorizer or TF-IDF (depending on results of cross-validation).
3. Apply a _k_-nearest neighbors model (with number of neighbors and possible weighting determined by grid search).

Let's start with a very basic _k_-nearest-neighbors pipeline, using a pure count vectorizer and `sklearn`'s default _k_-neighbors settings (5 neighbors, unweighted), and see how it does via cross-validation on the training data:

In [67]:
# Baseline pipeline: clean the text, count tokens, then classify with default KNN settings.
knn_pipe = Pipeline(
    steps=[
        ('text_cleaning', text_clean_tran),
        ('vectorize', CountVectorizer()),
        ('knn', KNeighborsClassifier()),
    ]
)

Now let's try a 5-fold cross-validation on the training data:

In [68]:
# 5-fold cross-validation of the baseline pipeline on the training split, scored by accuracy.
knn_cv = cross_validate(knn_pipe, X_train, y_train, scoring='accuracy', cv=5)

mean_accuracy_pct = round(100 * knn_cv['test_score'].mean(), 1)
print(f"MEAN ACCURACY: {mean_accuracy_pct}%")

MEAN ACCURACY: 67.3%


Our mean accuracy in our cross-validation was about 67.3%. To give a sense of how good this is, if one were to just guess the most common target value in the training set--`0`, i.e., not an actual disaster--every time, one would obtain an accuracy upon cross-validation of about 56.7%:

In [58]:
y_train.value_counts()

0    3024
1    2305
Name: target, dtype: int64

In [54]:
# Accuracy (in %) obtained by always predicting the most frequent class in the training labels.
majority_count = max(y_train.value_counts())
round(100 * majority_count / len(y_train), 1)

56.7

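So even the most basic KNN model is a significant improvement on a naive model that just guesses "No" every time. But while 67.3% accuracy is nothing to sneeze at, we'd of course like to improve upon this as much as possible. To that end, we run a grid search over the choice of vectorizer, the number of neighbors, and the neighbor weighting, this time scoring candidates by F1.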



In [93]:
# Search space: which vectorizer to use, plus k and the neighbor weighting for the KNN step.
knn_params = dict(
    vectorize=[CountVectorizer(), TfidfVectorizer()],
    knn__n_neighbors=list(range(15, 20)),
    knn__weights=['uniform', 'distance'],
)

In [99]:
# Exhaustive search over knn_params, ranking candidates by F1 score.
knn_grid = GridSearchCV(knn_pipe, knn_params, scoring='f1')

In [100]:
knn_grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('text_cleaning',
                                        FunctionTransformer(func=<function transform_func at 0x7fde925e7ef0>)),
                                       ('vectorize', CountVectorizer()),
                                       ('knn', KNeighborsClassifier())]),
             param_grid={'knn__n_neighbors': [15, 16, 17, 18, 19],
                         'knn__weights': ['uniform', 'distance'],
                         'vectorize': [CountVectorizer(), TfidfVectorizer()]},
             scoring='f1')

In [101]:
knn_grid.best_params_

{'knn__n_neighbors': 16,
 'knn__weights': 'distance',
 'vectorize': TfidfVectorizer()}

In [102]:
knn_grid.best_score_

0.7262806011070915

In [103]:
# Build the final pipeline from the grid search's winning settings rather than
# copying them by hand, so it stays in sync if the search is re-run or the
# grid changes. Requires knn_grid to have been fitted already.
best_knn_params = knn_grid.best_params_

knn_final = Pipeline([
    ('text_cleaning', text_clean_tran),
    ('vectorize', best_knn_params['vectorize']),
    ('knn', KNeighborsClassifier(
        n_neighbors = best_knn_params['knn__n_neighbors'],
        weights = best_knn_params['knn__weights'],
    ))
])

In [104]:
# 5-fold cross-validation of the tuned pipeline, recording both F1 and accuracy per fold.
knn_final_cv = cross_validate(
    knn_final,
    X_train,
    y_train,
    scoring=['f1', 'accuracy'],
    cv=5,
)

In [107]:
# Average the per-fold scores of the tuned pipeline and report them.
mean_accuracy = knn_final_cv['test_accuracy'].mean()
mean_f1 = knn_final_cv['test_f1'].mean()
print(f"MEAN ACCURACY: {mean_accuracy}\nMEAN F1: {mean_f1}")

MEAN ACCURACY: 0.7830748090796184
MEAN F1: 0.7262806011070915
