# ML Pipeline Preparation

### 1. Import libraries and load data from database
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
# import libraries
import nltk
nltk.download(['averaged_perceptron_tagger', 'wordnet'])
from nltk import pos_tag
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from helpers import report_classification
import numpy as np
import pandas as pd
import pickle
import re

from sqlalchemy import create_engine

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Pull the cleaned messages table out of the SQLite database.
engine = create_engine('sqlite:///web_app/data/DisasterResponse.db')
df = pd.read_sql_table(table_name='DisasterResponseData', con=engine)

# Feature: the raw message text. Targets: the last 36 columns of the table.
X, Y = df['message'], df[df.columns[-36:]]

### 2. Write a tokenization function to process text data

In [3]:
def tokenize(text):
    """
    Transforms a text to clean tokens, where every token is a word converted to lower case,
    passed to a part-of-speech tagger and lemmatized accordingly.
    Words recognized as English stopwords (after lemmatization) are omitted.
    
    Input:
        text (str)
        
    Output:
        clean_tokens (list): list of clean tokens (words converted to lower case and lemmatized)
        
    """
    
    # First letter of the Penn Treebank tag returned by pos_tag -> WordNet POS code.
    # Adjective tags are JJ/JJR/JJS, so the key is 'J' (no Treebank tag starts with 'A').
    wordnet_pos = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}
    
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()
    
    # Load the stopword list once per call; a set gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))

    tokens = tokenizer.tokenize(text.lower())
    
    clean_tokens = []
    
    for word, tag in pos_tag(tokens):
        pos = wordnet_pos.get(tag[:1])
        # Words whose tag has no WordNet equivalent are kept as they are.
        clean_token = lemmatizer.lemmatize(word, pos=pos) if pos else word
            
        if clean_token not in stop_words:
            clean_tokens.append(clean_token)
        
    return clean_tokens

### 3. Build a machine learning pipeline
This machine learning pipeline takes the `message` column as input and outputs classification results for the 36 category columns in the dataset.

In [4]:
# Token counts -> TF-IDF weighting -> multi-output wrapper around a random forest.
pipeline = Pipeline(
    steps=[
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(estimator=RandomForestClassifier(warm_start=True))),
    ],
    verbose=True,
)

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline in batches

In [5]:
# Hold out part of the data for evaluation. Neither test_size nor random_state is passed,
# so scikit-learn's defaults apply and the split is not reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [6]:
# Fit the three pipeline steps (vect, tfidf, clf) on the training split.
pipeline.fit(X_train, Y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total= 4.3min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.1s
[Pipeline] ............... (step 3 of 3) Processing clf, total=22.0min


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

### 5. Test model
Report the precision, recall and F1 score for each output category of the dataset, along with the overall accuracy score.

In [7]:
# Predict all categories for the held-out messages.
Y_pred = pipeline.predict(X_test)

# report_classification comes from the project's helpers module; judging by the output below
# it prints per-category precision/recall/F1 and yields an overall accuracy figure.
report_classification(Y_test, Y_pred)

related
	Precision: 0.81%
	Recall: 0.82%
	F1 Score: 0.81%

request
	Precision: 0.88%
	Recall: 0.89%
	F1 Score: 0.88%

offer
	Precision: 0.99%
	Recall: 0.99%
	F1 Score: 0.99%

aid_related
	Precision: 0.79%
	Recall: 0.79%
	F1 Score: 0.79%

medical_help
	Precision: 0.90%
	Recall: 0.92%
	F1 Score: 0.89%

medical_products
	Precision: 0.94%
	Recall: 0.95%
	F1 Score: 0.93%

search_and_rescue
	Precision: 0.97%
	Recall: 0.97%
	F1 Score: 0.96%

security
	Precision: 0.97%
	Recall: 0.98%
	F1 Score: 0.97%



  _warn_prf(average, modifier, msg_start, len(result))


military
	Precision: 0.96%
	Recall: 0.97%
	F1 Score: 0.95%

child_alone
	Precision: 1.00%
	Recall: 1.00%
	F1 Score: 1.00%

water
	Precision: 0.96%
	Recall: 0.96%
	F1 Score: 0.95%

food
	Precision: 0.94%
	Recall: 0.94%
	F1 Score: 0.93%

shelter
	Precision: 0.93%
	Recall: 0.93%
	F1 Score: 0.92%

clothing
	Precision: 0.99%
	Recall: 0.99%
	F1 Score: 0.98%

money
	Precision: 0.97%
	Recall: 0.98%
	F1 Score: 0.97%

missing_people
	Precision: 0.97%
	Recall: 0.99%
	F1 Score: 0.98%

refugees
	Precision: 0.96%
	Recall: 0.97%
	F1 Score: 0.96%

death
	Precision: 0.96%
	Recall: 0.96%
	F1 Score: 0.94%

other_aid
	Precision: 0.82%
	Recall: 0.86%
	F1 Score: 0.81%

infrastructure_related
	Precision: 0.89%
	Recall: 0.93%
	F1 Score: 0.90%

transport
	Precision: 0.94%
	Recall: 0.96%
	F1 Score: 0.94%

buildings
	Precision: 0.95%
	Recall: 0.96%
	F1 Score: 0.94%

electricity
	Precision: 0.97%
	Recall: 0.98%
	F1 Score: 0.97%

tools
	Precision: 0.99%
	Recall: 0.99%
	F1 Score: 0.99%

hospitals
	Precision: 0.98%


0.9477036924015868

### 6. Improve model
Use grid search to find better parameters. 

In [8]:
# Inspect the pipeline's parameter names and current values; the names are the keys
# a grid-search parameter grid can refer to.
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                   lowercase=True, max_df=1.0, max_features=None, min_df=1,
                   ngram_range=(1, 1), preprocessor=None, stop_words=None,
                   strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=<function tokenize at 0x7ffab08ce950>,
                   vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                          ccp_alpha=0.0,
                                                          class_weight=None,
                                                          criterion='gini',
                                                          max_depth=No

In [9]:
# Search over which base classifier sits inside the multi-output wrapper.
parameters = {'clf__estimator': (MultinomialNB(), RandomForestClassifier(warm_start=True))}

# 3-fold cross-validation, run sequentially, with the most detailed progress logging.
cv = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=3, n_jobs=1, verbose=10)
cv

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [10]:
# Run the grid search on the training split (2 candidates x 3 folds = 6 fits, see log below).
cv.fit(X_train, Y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] clf__estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] .............. (step 1 of 3) Processing vect, total= 2.8min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.6s
[CV]  clf__estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), score=0.162, total= 4.3min
[CV] clf__estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.3min remaining:    0.0s


[Pipeline] .............. (step 1 of 3) Processing vect, total= 2.8min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.6s
[CV]  clf__estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), score=0.170, total= 4.2min
[CV] clf__estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  8.5min remaining:    0.0s


[Pipeline] .............. (step 1 of 3) Processing vect, total= 2.8min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=   0.6s
[CV]  clf__estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), score=0.161, total= 4.2min
[CV] clf__estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=True) 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 12.7min remaining:    0.0s


[Pipeline] .............. (step 1 of 3) Processing vect, total= 2.8min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=10.8min
[CV]  clf__estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=True), score=0.260, total=15.6min
[CV] clf__estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                     

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 28.4min remaining:    0.0s


[Pipeline] .............. (step 1 of 3) Processing vect, total= 2.8min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.1s
[Pipeline] ............... (step 3 of 3) Processing clf, total=10.9min
[CV]  clf__estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=True), score=0.268, total=15.7min
[CV] clf__estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                     

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 44.1min remaining:    0.0s


[Pipeline] .............. (step 1 of 3) Processing vect, total= 2.8min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.0s
[Pipeline] ............... (step 3 of 3) Processing clf, total=10.8min
[CV]  clf__estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=True), score=0.261, total=15.6min


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 59.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 59.7min finished


[Pipeline] .............. (step 1 of 3) Processing vect, total= 4.2min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.1s
[Pipeline] ............... (step 3 of 3) Processing clf, total=22.4min


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [11]:
# Show the pipeline selected by the grid search.
cv.best_estimator_

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

In [12]:
# Show the parameter combination that scored best during cross-validation.
cv.best_params_

{'clf__estimator': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=True)}

In [13]:
# Keep the selected pipeline under its own name for evaluation and export.
pipeline_2 = cv.best_estimator_

In [14]:
# NOTE(review): the grid-search log shows one more full pipeline fit after the 6 CV fits,
# so best_estimator_ appears to be already trained on X_train; this call trains it again
# (~27 min) - confirm whether the extra fit is needed.
pipeline_2.fit(X_train, Y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total= 4.2min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.1s
[Pipeline] ............... (step 3 of 3) Processing clf, total=22.7min


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

### 7. Test model
Show the precision, recall and overall accuracy of the tuned model. 

In [15]:
# Predict with the pipeline chosen by the grid search.
Y_pred_2 = pipeline_2.predict(X_test)

# Same project helper as in section 5, so both reports are directly comparable.
report_classification(Y_test, Y_pred_2)

related
	Precision: 0.81%
	Recall: 0.83%
	F1 Score: 0.81%

request
	Precision: 0.89%
	Recall: 0.89%
	F1 Score: 0.88%

offer
	Precision: 0.99%
	Recall: 0.99%
	F1 Score: 0.99%



  _warn_prf(average, modifier, msg_start, len(result))


aid_related
	Precision: 0.79%
	Recall: 0.79%
	F1 Score: 0.79%

medical_help
	Precision: 0.89%
	Recall: 0.92%
	F1 Score: 0.89%

medical_products
	Precision: 0.94%
	Recall: 0.95%
	F1 Score: 0.93%

search_and_rescue
	Precision: 0.97%
	Recall: 0.97%
	F1 Score: 0.96%

security
	Precision: 0.97%
	Recall: 0.98%
	F1 Score: 0.97%

military
	Precision: 0.96%
	Recall: 0.97%
	F1 Score: 0.95%

child_alone
	Precision: 1.00%
	Recall: 1.00%
	F1 Score: 1.00%

water
	Precision: 0.95%
	Recall: 0.96%
	F1 Score: 0.95%

food
	Precision: 0.93%
	Recall: 0.94%
	F1 Score: 0.93%

shelter
	Precision: 0.93%
	Recall: 0.93%
	F1 Score: 0.92%

clothing
	Precision: 0.98%
	Recall: 0.99%
	F1 Score: 0.98%

money
	Precision: 0.97%
	Recall: 0.98%
	F1 Score: 0.97%

missing_people
	Precision: 0.97%
	Recall: 0.99%
	F1 Score: 0.98%

refugees
	Precision: 0.95%
	Recall: 0.97%
	F1 Score: 0.95%

death
	Precision: 0.95%
	Recall: 0.96%
	F1 Score: 0.94%

other_aid
	Precision: 0.82%
	Recall: 0.86%
	F1 Score: 0.80%

infrastructure_relat

0.9477418370460787

### 8. Export model as a pickle file

In [16]:
# Serialize the tuned pipeline so the web app can load it later.
with open('web_app/models/classifier.pkl', mode='wb') as model_file:
    pickle.dump(pipeline_2, model_file)

### 9. Use this notebook to complete `train_classifier.py`

In [17]:
%%writefile web_app/models/train_classifier.py
import pickle
import re
import sys

import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sqlalchemy import create_engine

nltk.download(['averaged_perceptron_tagger', 'wordnet', 'stopwords'])


def load_data(database_filepath):
    """
    Reads the 'DisasterResponseData' table from a SQLite database file.

    Input:
        database_filepath (str): path of the SQLite file, appended to the 'sqlite:///' URL prefix

    Output:
        X: the 'message' column of the table
        y: the last 36 columns of the table
        category_names: the column labels of those 36 columns
    """
    connection_url = 'sqlite:///' + database_filepath
    table = pd.read_sql_table('DisasterResponseData', create_engine(connection_url))

    # The category columns are taken by position: the final 36 columns of the table.
    category_names = table.columns[-36:]

    return table['message'], table[category_names], category_names


def tokenize(text):
    """
    Transforms a text to clean tokens, where every token is a word converted to lower case,
    passed to a part-of-speech tagger and lemmatized accordingly.
    Words recognized as English stopwords (after lemmatization) are omitted.
    
    Input:
        text (str)
        
    Output:
        clean_tokens (list): list of clean tokens (words converted to lower case and lemmatized)
        
    """
    
    # First letter of the Penn Treebank tag returned by pos_tag -> WordNet POS code.
    # Adjective tags are JJ/JJR/JJS, so the key is 'J' (no Treebank tag starts with 'A').
    wordnet_pos = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}
    
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()
    
    # Load the stopword list once per call; a set gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))

    tokens = tokenizer.tokenize(text.lower())
    
    clean_tokens = []
    
    for word, tag in pos_tag(tokens):
        pos = wordnet_pos.get(tag[:1])
        # Words whose tag has no WordNet equivalent are kept as they are.
        clean_token = lemmatizer.lemmatize(word, pos=pos) if pos else word
            
        if clean_token not in stop_words:
            clean_tokens.append(clean_token)
        
    return clean_tokens


def build_model():
    """
    Assembles the (unfitted) classification pipeline. Takes no arguments.

    Steps, in order:
        1. vect  - CountVectorizer that uses this module's `tokenize` as its tokenizer
        2. tfidf - TfidfTransformer applied to the resulting count matrix
        3. clf   - MultiOutputClassifier wrapping a RandomForestClassifier
                   created with warm_start=True

    Output:
        model (sklearn.pipeline.Pipeline): the pipeline, built with verbose=True
    """
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier(warm_start=True))),
    ]

    return Pipeline(steps, verbose=True)


def evaluate_model(model, X_test, Y_test, category_names):
    """
    Predicts on the test inputs and prints weighted precision, recall and F1 score 
    for every category, followed by the overall accuracy of the model.
    
    Input:
        model: fitted estimator whose predict(X_test) returns a 2-D array with one column per category
        X_test: test inputs, passed unchanged to model.predict
        Y_test (pandas.DataFrame): true labels for X_test, one column per category
        category_names: column labels of Y_test, in the same order as the prediction columns
        
    Output:
        Prints out the following format
            feature_name
                Precision: __%
                Recall: __%
                F1 Score: __%
                
                ...
                
                Accuracy Score: __%
                
        And also returns the full value of accuracy as a fraction between 0 and 1
        (the share of individual label predictions that match Y_test).
    """
    
    Y_pred = model.predict(X_test)
    
    avg = 'weighted'
    for idx, col in enumerate(category_names):
        set_Y_pair = (Y_test[col], Y_pred[:, idx])
        # The '%' format type multiplies by 100, so a score of 0.81 prints as 81.00%.
        rep_col = "{}\n\tPrecision: {:.2%}\n\tRecall: {:.2%}\n\tF1 Score: {:.2%}\n".format(
            col,
            precision_score(*set_Y_pair, average=avg),
            recall_score(*set_Y_pair, average=avg),
            f1_score(*set_Y_pair, average=avg))
        print(rep_col)
        
    # Compare the true labels with the predictions; Y_pred is a plain array (see the
    # column slicing above), so only Y_test needs .values.
    accuracy = np.mean(Y_test.values == Y_pred)
    print('Accuracy Score: {:.2%}'.format(accuracy))

    return accuracy


def save_model(model, model_filepath):
    """
    Pickles the given model to disk so it can be reloaded later.

    Input:
        model: the object to serialize (any picklable object)
        model_filepath (str): destination path; the file is opened in binary write mode
    """
    with open(model_filepath, mode='wb') as pickle_file:
        pickle.dump(model, pickle_file)


def main():
    """
    Command-line entry point: load the data, train the model, evaluate it and save it.

    Expects exactly two command-line arguments (database path, then pickle output path).
    With any other argument count it only prints a usage message.
    """
    # Guard clause: anything other than "script db_path model_path" gets the usage text.
    if len(sys.argv) != 3:
        print('Please provide the filepath of the disaster messages database '
              'as the first argument and the filepath of the pickle file to '
              'save the model to as the second argument. \n\nExample: python '
              'train_classifier.py ../data/DisasterResponse.db classifier.pkl')
        return

    database_filepath, model_filepath = sys.argv[1:]

    print('Loading data...\n    DATABASE: {}'.format(database_filepath))
    X, Y, category_names = load_data(database_filepath)
    # 80/20 train/test split.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

    print('Building model...')
    model = build_model()

    print('Training model...')
    model.fit(X_train, Y_train)

    print('Evaluating model...')
    evaluate_model(model, X_test, Y_test, category_names)

    print('Saving model...\n    MODEL: {}'.format(model_filepath))
    save_model(model, model_filepath)

    print('Trained model saved!')


# Run the training workflow only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

Overwriting web_app/models/train_classifier.py
