In [50]:
import sys
sys.path.append('..')
import re

import numpy as np
import pickle
import pandas as pd
import sqlalchemy
import nltk
nltk.download(['punkt', 'wordnet'])
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

[nltk_data] Downloading package punkt to /home/sgm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sgm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [51]:
def load_data(database_filepath):
    """Load messages and their category labels from a SQLite database.

    Reads the ``disaster_messages`` table and splits it into the message
    text and the label columns, which are taken to be every column from
    positional index 4 onward.

    Args:
        database_filepath: Path to the SQLite database file.

    Returns:
        A tuple ``(X, y, categories)`` where ``X`` is a copy of the
        ``message`` column, ``y`` is a DataFrame of the label columns with
        each column passed through ``pd.to_numeric``, and ``categories`` is
        the Index of label column names.
    """
    engine = sqlalchemy.create_engine("sqlite:///%s" % database_filepath)
    try:
        df = pd.read_sql_table('disaster_messages', engine)
    finally:
        # Close pooled connections so the database file is not left open
        # for the rest of the kernel session.
        engine.dispose()

    categories = df.columns[4:].copy()
    X = df.message.copy()
    # ``apply`` already builds a new frame, so no extra copy is needed.
    y = df[categories].apply(pd.to_numeric)
    return X, y, categories

In [52]:
# Load the message text (X), the label frame (y) and the label names from the project database.
X,y,categories = load_data('../data/DisasterResponse.db')

In [53]:
# Preview the first few raw messages.
X.head()

0    Weather update - a cold front from Cuba that c...
1              Is the Hurricane over or is it not over
2                      Looking for someone but no name
3    UN reports Leogane 80-90 destroyed. Only Hospi...
4    says: west side of Haiti, rest of the country ...
Name: message, dtype: object

In [54]:
# Check the dtype of every label column after the pd.to_numeric conversion in load_data.
y.dtypes

related                   int64
request                   int64
offer                     int64
aid_related               int64
medical_help              int64
medical_products          int64
search_and_rescue         int64
security                  int64
military                  int64
child_alone               int64
water                     int64
food                      int64
shelter                   int64
clothing                  int64
money                     int64
missing_people            int64
refugees                  int64
death                     int64
other_aid                 int64
infrastructure_related    int64
transport                 int64
buildings                 int64
electricity               int64
tools                     int64
hospitals                 int64
shops                     int64
aid_centers               int64
other_infrastructure      int64
weather_related           int64
floods                    int64
storm                     int64
fire    

In [55]:
# List the label column names returned by load_data.
categories

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [56]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across kernel restarts so results are reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
# Raw string so that "\(" and "\)" reach the regex engine verbatim instead of
# being (invalid) string escapes; compiled once rather than on every call.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def tokenize(text):
    """Split a message into normalized tokens.

    Every URL matched by the URL pattern is replaced with the literal
    ``urlplaceholder``; the text is then split with ``word_tokenize`` and
    each token is lemmatized, lower-cased and stripped of surrounding
    whitespace.

    Args:
        text: The raw message string.

    Returns:
        A list of cleaned token strings, in their original order.
    """
    # Substitute each match in a single pass. Replacing the matched strings
    # one by one with str.replace mangles the text when one URL is a prefix
    # of another (the longer URL would become "urlplaceholder/...").
    text = _URL_PATTERN.sub("urlplaceholder", text)

    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(tok).lower().strip()
        for tok in word_tokenize(text)
    ]

In [61]:
def build_model(verbose=False):
    """Assemble the text-classification pipeline wrapped in a grid search.

    The pipeline chains a ``CountVectorizer`` (using ``tokenize``), a
    ``TfidfTransformer`` and a ``MultiOutputClassifier`` around an
    ``MLPClassifier``. The grid covers the TF-IDF norm, the hidden layer
    layout and the initial learning rate.

    Args:
        verbose: When true, print the name of every tunable pipeline
            parameter, one per line.

    Returns:
        An unfitted ``GridSearchCV`` over the pipeline.
    """
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(MLPClassifier())),
    ]
    text_clf = Pipeline(steps)

    if verbose:
        for param_name in text_clf.get_params():
            print(param_name)

    hidden_layouts = [(50,), (50, 25), (50, 25, 10)]
    initial_rates = [0.001, 0.01]
    search_space = {
        'tfidf__norm': ['l2', 'l1'],
        'clf__estimator__hidden_layer_sizes': hidden_layouts,
        'clf__estimator__learning_rate_init': initial_rates,
    }

    return GridSearchCV(text_clf, search_space)

In [62]:
# Build the unfitted grid search; False suppresses the parameter-name listing.
model = build_model(False)

In [None]:
# Run the grid search on the training split. Every parameter combination in the
# grid is fitted per cross-validation fold, so this cell can take a long time.
model.fit(X_train, Y_train)

