# ML Pipeline Preparation

### 1. Import libraries and load data from database
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
# import libraries
import nltk
nltk.download(['averaged_perceptron_tagger', 'wordnet'])
from nltk import pos_tag
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from helpers impr
import numpy as np
import pandas as pd
import pickle
import re

from sqlalchemy import create_engine

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/faustina/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/faustina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# load data from database
engine = create_engine('sqlite:///web_app/data/DisasterResponse.db')
df = pd.read_sql_table('DisasterResponseData', engine)
X = df['message']
Y = df[df.columns[-36:]]

### 2. Write a tokenization function to process text data

In [3]:
def tokenize(text):
    """
    Transforms a text to clean tokens, where every token is a word converted to lower case,
    passed to a part-of-speech tagger and lemmatized accordingly.
    Words recognized as stopwords are ommitted.
    
    Input:
        text (str)
        
    Output:
        clean_tokens (list): list of clean tokens (words converted to lower case and lemmatized)
        
    """
    
    tokenizer = RegexpTokenizer('\w+')
    lemmatizer = WordNetLemmatizer()

    tokens = tokenizer.tokenize(text.lower())
    
    clean_tokens = []
    
    for word, tag in pos_tag(tokens):
        if tag[0] in ['A', 'R', 'N', 'V']:
            tag = tag[0].lower()
            clean_token = lemmatizer.lemmatize(word, pos=tag)
        else:
            clean_token = word
            
        if clean_token not in stopwords.words('english'):
            clean_tokens.append(clean_token)
        
    return clean_tokens

### 3. Build a machine learning pipeline
This machine pipeline takes in the `message` column as input and outputs classification results on the other 36 categories in the dataset.

In [4]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier(warm_start=True))),
], verbose=True)

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline in batches

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [6]:
pipeline.fit(X_train, Y_train)

[Pipeline] .............. (step 1 of 3) Processing vect, total= 4.4min
[Pipeline] ............. (step 2 of 3) Processing tfidf, total=   0.1s
[Pipeline] ............... (step 3 of 3) Processing clf, total=23.9min


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

### 5. Test model
Report the precision, recall, f1 score for each output category of the dataset, and overall accuracy score.

In [8]:
Y_pred = pipeline.predict(X_test)

report_classification(Y_test, Y_pred)

related
	Precision: 0.81%
	Recall: 0.82%
	F1 Score: 0.81%

request
	Precision: 0.90%
	Recall: 0.90%
	F1 Score: 0.89%

offer
	Precision: 0.99%
	Recall: 1.00%
	F1 Score: 0.99%

aid_related
	Precision: 0.77%
	Recall: 0.78%
	F1 Score: 0.77%

medical_help
	Precision: 0.91%
	Recall: 0.93%
	F1 Score: 0.90%

medical_products
	Precision: 0.95%
	Recall: 0.96%
	F1 Score: 0.94%



  _warn_prf(average, modifier, msg_start, len(result))


search_and_rescue
	Precision: 0.96%
	Recall: 0.97%
	F1 Score: 0.96%

security
	Precision: 0.96%
	Recall: 0.98%
	F1 Score: 0.97%

military
	Precision: 0.96%
	Recall: 0.97%
	F1 Score: 0.96%

child_alone
	Precision: 1.00%
	Recall: 1.00%
	F1 Score: 1.00%

water
	Precision: 0.96%
	Recall: 0.96%
	F1 Score: 0.95%

food
	Precision: 0.94%
	Recall: 0.94%
	F1 Score: 0.94%

shelter
	Precision: 0.93%
	Recall: 0.94%
	F1 Score: 0.93%

clothing
	Precision: 0.98%
	Recall: 0.98%
	F1 Score: 0.98%

money
	Precision: 0.98%
	Recall: 0.98%
	F1 Score: 0.97%

missing_people
	Precision: 0.99%
	Recall: 0.99%
	F1 Score: 0.98%

refugees
	Precision: 0.95%
	Recall: 0.96%
	F1 Score: 0.95%

death
	Precision: 0.96%
	Recall: 0.96%
	F1 Score: 0.95%

other_aid
	Precision: 0.83%
	Recall: 0.87%
	F1 Score: 0.82%

infrastructure_related
	Precision: 0.88%
	Recall: 0.94%
	F1 Score: 0.90%

transport
	Precision: 0.95%
	Recall: 0.96%
	F1 Score: 0.94%

buildings
	Precision: 0.95%
	Recall: 0.95%
	F1 Score: 0.94%

electricity
	Precis

0.948767504153528

### 6. Improve model
Use grid search to find better parameters. 

In [9]:
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                   lowercase=True, max_df=1.0, max_features=None, min_df=1,
                   ngram_range=(1, 1), preprocessor=None, stop_words=None,
                   strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=<function tokenize at 0x7f05ea935c20>,
                   vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                          ccp_alpha=0.0,
                                                          class_weight=None,
                                                          criterion='gini',
                                                          max_depth=No

In [None]:
parameters = {}

cv = GridSearchCV(pipeline, parameters)
cv

In [None]:
cv.fit(X_train, Y_train)

In [None]:
cv.best_estimator_

In [None]:
cv.best_params_

### 7. Test model
Show the precision, recall and overall accuracy of the tuned model. 

In [None]:
Y_pred = cv.predict(X_test)

report_classification(Y_test, Y_pred)

In [None]:
cv.best_params_

In [None]:
cv.best_estimator_.get_params()

### 8. Improve model


### 9. Export your model as a pickle file

In [None]:
with open('web_app/models/classifier.pkl', 'wb') as f:
    pickle.dump(cv, f)

### 10. Use this notebook to complete `train_classifier.py`

In [None]:
%%writefile web_app/models/train_classifier.py