In [77]:
import pandas as pd
import numpy as np
import re

from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

# Disaster Tweets

This notebook is dedicated to solving Kaggle's [Natural Language Processing with Disaster Tweets](https://www.kaggle.com/c/nlp-getting-started/overview) challenge. This is a **supervised binary classification** task in which the features are tweet information, and the target is a value of 1 if the tweet is about a real disaster, and 0 if not.

## A first look at the data

First we import the training/validation data. I've stored these locally in my `raw_data` folder; they can be downloaded from Kaggle [here](https://www.kaggle.com/c/nlp-getting-started/data).

In [4]:
# Read the three Kaggle CSV files from the local raw_data folder, in order.
train_data, test_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'raw_data/train.csv',
        'raw_data/test.csv',
        'raw_data/sample_submission.csv',
    )
)

In [5]:
train_data.shape

(7613, 5)

In [6]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
# Check the class balance of the binary target (1 = real disaster, 0 = not).
train_data['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

We'll divide the training data into a 70-30 train-validate split. And for now let's assume we'll **only** be using the text of the tweet and not the other information.

In [79]:
# Separate the label column from the features, then hold out 30% for validation.
y_tv = train_data['target']
X_tv = train_data.drop('target', axis=1)

X_train, X_val, y_train, y_val = train_test_split(
    X_tv,
    y_tv,
    test_size=0.3,
    random_state=42,  # fixed seed so the split is reproducible
)

In [80]:
# Sanity-check the sizes of the four pieces produced by the split.
tuple(split.shape for split in (X_train, X_val, y_train, y_val))

((5329, 4), (2284, 4), (5329,), (2284,))

## Defining a function for cleaning the data

In [81]:
# Single shared lemmatizer instance; text_clean calls lemmatizer.lemmatize on each token.
lemmatizer = WordNetLemmatizer()

In [82]:
# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCT_TABLE = str.maketrans('', '', punctuation)

# Lazily filled cache so the stop-word corpus is loaded once, not once per tweet.
_STOP_WORD_CACHE = {}


def _clean_stop_words():
    """Return the English stop words, punctuation-stripped, as a frozenset.

    text_clean removes punctuation *before* filtering stop words, so a stop
    word such as "don't" shows up in the token stream as "dont". Stripping the
    same punctuation from the stop-word list makes the two sides comparable;
    without it, contractions are never filtered out.

    NOTE(review): depending on the installed stop-word list, a stripped
    contraction may coincide with an ordinary word (e.g. "we'll" -> "well").
    Those tokens were already indistinguishable after punctuation removal.
    """
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(
            word.translate(_PUNCT_TABLE) for word in stopwords.words('english')
        )
    return _STOP_WORD_CACHE['english']


def text_clean(text):
    """Normalise a single tweet into a space-separated string of lemmas.

    Steps, in order: drop hyperlinks, delete ASCII punctuation, lowercase,
    tokenize, discard English stop words, lemmatize the remaining tokens.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (may be empty).
    """
    stop_words = _clean_stop_words()

    # Remove hyperlinks
    out_text = re.sub(r'http\S+', '', text)

    # Remove punctuation (single pass instead of one replace per character)
    out_text = out_text.translate(_PUNCT_TABLE)

    out_text = out_text.lower()
    out_text_token = word_tokenize(out_text)
    # Set membership keeps the stop-word check O(1) per token.
    out_text_token = [
        lemmatizer.lemmatize(word)
        for word in out_text_token
        if word not in stop_words
    ]

    return ' '.join(out_text_token)

In [83]:
X_train.head()

Unnamed: 0,id,keyword,location,text
1186,1707,bridge%20collapse,,Ashes 2015: AustraliaÛªs collapse at Trent Br...
4071,5789,hail,"Carol Stream, Illinois",GREAT MICHIGAN TECHNIQUE CAMP\nB1G THANKS TO @...
5461,7789,police,Houston,CNN: Tennessee movie theater shooting suspect ...
5787,8257,rioting,,Still rioting in a couple of hours left until ...
7445,10656,wounds,Lake Highlands,Crack in the path where I wiped out this morni...


In [84]:
# Show one raw tweet, looked up by its index label.
print(X_train.loc[7445, 'text'])

Crack in the path where I wiped out this morning during beach run. Surface wounds on left elbow and right knee. http://t.co/yaqRSximph


In [85]:
# Show the same tweet after cleaning, for a before/after comparison.
print(text_clean(X_train.loc[7445, 'text']))

crack path wiped morning beach run surface wound left elbow right knee


In [None]:
def transformfunc(feature_data):
    # Trim to just the text
    out_data = feature_data['text']
    out_data = out