In [1]:
# import libraries
import sys
import re
import pickle
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from random import randrange

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
# load data from database
database_file = '../data/DisasterResponse.db'
table_name = database_file.split('.')[2].split('/')[-1]

engine = create_engine('sqlite:///{}'.format(database_file))
df = pd.read_sql(table_name, engine)

df.head(4)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Defining feature and target dataframes
X,Y = df['message'], df.iloc[:,4:]

# Target labels
category_names = Y.columns

# Mapping any values except 0/1 for each column with randomly selected 0/1
for column in category_names:
    Y[column] = Y[column].map(lambda x: randrange(0,2) if x > 1 or x < 0 else x)

print(X, Y)

0        Weather update - a cold front from Cuba that c...
1                  Is the Hurricane over or is it not over
2                          Looking for someone but no name
3        UN reports Leogane 80-90 destroyed. Only Hospi...
4        says: west side of Haiti, rest of the country ...
                               ...                        
26211    The training demonstrated how to enhance micro...
26212    A suitable candidate has been selected and OCH...
26213    Proshika, operating in Cox's Bazar municipalit...
26214    Some 2,000 women protesting against the conduc...
26215    A radical shift in thinking came about as a re...
Name: message, Length: 26216, dtype: object        related  request  offer  aid_related  medical_help  medical_products  \
0            1        0      0            0             0                 0   
1            1        0      0            1             0                 0   
2            1        0      0            0             0             

In [4]:
"""Write a tokenization function to process your text data"""

def tokenize(text):
    
    # Convert text to lowercase and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    # Tokenize words
    words = word_tokenize(text)
    
    # remove stop words
    stopwords_ = stopwords.words("english")
    words = [word for word in words if word not in stopwords_]
    
    # extract root form of words
    words = [WordNetLemmatizer().lemmatize(word, pos='v') for word in words]

    return words

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [6]:
def fit_and_predict(pipeline, X_train, X_test, Y_train):
    pipeline.fit(X_train, Y_train)
    
    # Predictions on test for the pipeline provided
    Y_pred = pipeline.predict(X_test)
    
    return Y_pred

In [14]:
def test_model(Y_pred, Y_test):
    Y_pred = pd.DataFrame(Y_pred, columns=[Y_test.columns])

    print(classification_report(Y_test.values, y_pred, target_names=category_names))
    
    # print accuracy score
    print('Accuracy: {}'.format(np.mean(Y_test.values == y_pred)))

In [8]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultiOutputClassifier(RandomForestClassifier())),
])

In [9]:
# fit and predict y_pred on pipeline
y_pred = fit_and_predict(pipeline, X_train, X_test, Y_train)

In [10]:
test_model(y_pred, Y_test)

Accuracy: 0.9482080493676466


In [11]:
# Improved model
pipeline_improved = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultiOutputClassifier(OneVsRestClassifier(LinearSVC())))
                    ])


# parameters added
parameters = {'vect__ngram_range': ((1, 1), (1, 2)),
              'vect__max_df': (0.75, 1.0),
            }

# create model
model = GridSearchCV(estimator=pipeline, param_grid=parameters, verbose=3, cv=3)

In [12]:
y_pred_improved = fit_and_predict(model, X_train, X_test, Y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] vect__max_df=0.75, vect__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vect__max_df=0.75, vect__ngram_range=(1, 1), score=0.263, total= 7.8min
[CV] vect__max_df=0.75, vect__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.8min remaining:    0.0s


[CV]  vect__max_df=0.75, vect__ngram_range=(1, 1), score=0.264, total= 7.8min
[CV] vect__max_df=0.75, vect__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 15.6min remaining:    0.0s


[CV]  vect__max_df=0.75, vect__ngram_range=(1, 1), score=0.258, total= 7.8min
[CV] vect__max_df=0.75, vect__ngram_range=(1, 2) .....................
[CV]  vect__max_df=0.75, vect__ngram_range=(1, 2), score=0.267, total=19.4min
[CV] vect__max_df=0.75, vect__ngram_range=(1, 2) .....................
[CV]  vect__max_df=0.75, vect__ngram_range=(1, 2), score=0.262, total=20.9min
[CV] vect__max_df=0.75, vect__ngram_range=(1, 2) .....................
[CV]  vect__max_df=0.75, vect__ngram_range=(1, 2), score=0.262, total=21.1min
[CV] vect__max_df=1.0, vect__ngram_range=(1, 1) ......................
[CV]  vect__max_df=1.0, vect__ngram_range=(1, 1), score=0.265, total= 3.8min
[CV] vect__max_df=1.0, vect__ngram_range=(1, 1) ......................
[CV]  vect__max_df=1.0, vect__ngram_range=(1, 1), score=0.266, total= 4.1min
[CV] vect__max_df=1.0, vect__ngram_range=(1, 1) ......................
[CV]  vect__max_df=1.0, vect__ngram_range=(1, 1), score=0.258, total= 4.1min
[CV] vect__max_df=1.0, vect__ng

[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 128.1min finished


In [15]:
test_model(y_pred_improved, Y_test)

                        precision    recall  f1-score   support

               related       0.83      0.95      0.89      4991
               request       0.85      0.51      0.63      1045
                 offer       0.00      0.00      0.00        28
           aid_related       0.75      0.69      0.72      2660
          medical_help       0.74      0.08      0.14       535
      medical_products       0.81      0.08      0.14       328
     search_and_rescue       0.89      0.05      0.09       167
              security       1.00      0.02      0.03       125
              military       0.50      0.05      0.09       211
           child_alone       0.00      0.00      0.00         0
                 water       0.90      0.41      0.56       425
                  food       0.83      0.54      0.66       705
               shelter       0.83      0.34      0.48       563
              clothing       0.67      0.08      0.14       103
                 money       0.88      

In [17]:
def save_model(model, model_filepath):
    
    """
    This function saves the model to a Python pickle file

    Args:
    model: Trained model
    model_filepath: Location to save the model

    Returns:
    none - Saves the model to pickle file
    """

    # save model to pickle file
    pickle.dump(model, open(model_filepath, 'wb'))
    
    print('Trained model saved!')

In [18]:
save_model(model, 'classifier.pkl')

Trained model saved!
