# ML Pipeline Preparation

### 1. Import libraries and load data from database
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [5]:
# import libraries
import nltk
nltk.download(['averaged_perceptron_tagger', 'wordnet'])
from nltk import pos_tag
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import numpy as np
import pandas as pd
import pickle
import re

from sqlalchemy import create_engine

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/faustina/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/faustina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# load data from database
engine = create_engine('sqlite:///web_app/data/DisasterResponse.db')
df = pd.read_sql_table('DisasterResponseData', engine)
X = df['message']
Y = df[df.columns[-36:]]

### 2. Write a tokenization function to process text data

In [8]:
def tokenize(text):
    """
    Transforms a text to clean tokens, where every token is a word converted to lower case,
    passed to a part-of-speech tagger and lemmatized accordingly.
    Words recognized as stopwords are ommitted.
    
    Input:
        text (str)
        
    Output:
        clean_tokens (list): list of clean tokens (words converted to lower case and lemmatized)
        
    """
    
    tokenizer = RegexpTokenizer('\w+')
    lemmatizer = WordNetLemmatizer()

    tokens = tokenizer.tokenize(text.lower())
    
    clean_tokens = []
    
    for word, tag in pos_tag(tokens):
        if tag[0] in ['A', 'R', 'N', 'V']:
            tag = tag[0].lower()
            clean_token = lemmatizer.lemmatize(word, pos=tag)
        else:
            clean_token = word
            
        if clean_token not in stopwords.words('english'):
            clean_tokens.append(clean_token)
        
    return clean_tokens

### 3. Build a machine learning pipeline
This machine pipeline takes in the `message` column as input and outputs classification results on the other 36 categories in the dataset.

In [9]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier())),
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline in batches

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [158]:
def batch_processing(model=None, X_subset=None, Y_subset=None, n_batches = 101):
    """
    Takes in X and Y subsets and fits a model with it. Returns the model after processing all batches.
    """
    
    assert X_subset.shape[0] == Y_subset.shape[0]
    
    batch_size = X_subset.shape[0] // n_batches

    it = itertools.count(step=batch_size)

    for _ in range(n_batches):
        start = next(it)
        end = start + batch_size
        
        model.fit(X_subset[start : end], Y_subset[start : end])
        
        print(f"{end} trained examples. {round(end / X_subset.shape[0] * 100, 2)}%")

        if (end + batch_size) > X_subset.shape[0]:
            assert (end + len(X_subset[end:])) == X_subset.shape[0]
            model.fit(X_subset[end:], Y_subset[end:])
            print(f"{end + len(X_subset[end:])} trained examples. {round((end + len(X_subset[end:])) / X_subset.shape[0] * 100, 2)}%")
    
    return model            

In [159]:
batch_processing(pipeline, X_train, Y_train)

194 trained examples. 0.99%
388 trained examples. 1.97%
582 trained examples. 2.96%
776 trained examples. 3.95%
970 trained examples. 4.93%
1164 trained examples. 5.92%
1358 trained examples. 6.91%
1552 trained examples. 7.89%
1746 trained examples. 8.88%
1940 trained examples. 9.87%
2134 trained examples. 10.85%
2328 trained examples. 11.84%
2522 trained examples. 12.83%
2716 trained examples. 13.81%
2910 trained examples. 14.8%
3104 trained examples. 15.79%
3298 trained examples. 16.77%
3492 trained examples. 17.76%
3686 trained examples. 18.75%
3880 trained examples. 19.73%
4074 trained examples. 20.72%
4268 trained examples. 21.71%
4462 trained examples. 22.69%
4656 trained examples. 23.68%
4850 trained examples. 24.67%
5044 trained examples. 25.65%
5238 trained examples. 26.64%
5432 trained examples. 27.63%
5626 trained examples. 28.62%
5820 trained examples. 29.6%
6014 trained examples. 30.59%
6208 trained examples. 31.58%
6402 trained examples. 32.56%
6596 trained examples. 33.5

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

### 5. Test model
Report the precision, recall, f1 score for each output category of the dataset, and overall accuracy score.

In [13]:
def report_classification(y_test, y_pred):
    """
    Takes in Y_test and Y_pred and returns precision, recall and F1 score 
    for every feature in the dataset, and the overall accuracy of the model.
    
    Input:
        Y_test (pandas.core.series.Series): a subset of Y with the purpose of testing the model
        Y_pred (pandas.core.series.Series): predictions made with X_test by the model
        
    Output:
        Prints out the following format
            feature_name
                Precision: __%
                Recall: __%
                F1 Score: __%
                
                ...
                
                Accuracy Score: __%
                
        And also returns the full value of accuracy.
    """
    
    for idx, col in enumerate(y_test):
        set_y_pair = (y_test[col], y_pred[:, idx])
        avg='weighted'
        rep_col = "{}\n\tPrecision: {:.2f}%\n\tRecall: {:.2f}%\n\tF1 Score: {:.2f}%\n".format(col,
                                                                                 precision_score(*set_y_pair, average=avg), 
                                                                                 recall_score(*set_y_pair, average=avg), 
                                                                                 f1_score(*set_y_pair, average=avg))
        print(rep_col)
        
    print('Accuracy Score: {:.2f}%'.format(np.mean(y_test.values == y_pred)))

    return np.mean(y_test.values == y_pred)

In [14]:
Y_pred = pipeline.predict(X_test)

report_classification(Y_test, Y_pred)

related
	Precision: 0.82%
	Recall: 0.77%
	F1 Score: 0.67%

request
	Precision: 0.84%
	Recall: 0.84%
	F1 Score: 0.78%

offer
	Precision: 0.99%
	Recall: 0.99%
	F1 Score: 0.99%

aid_related
	Precision: 0.68%
	Recall: 0.62%
	F1 Score: 0.53%

medical_help
	Precision: 0.85%
	Recall: 0.92%
	F1 Score: 0.88%

medical_products
	Precision: 0.90%
	Recall: 0.95%
	F1 Score: 0.92%

search_and_rescue
	Precision: 0.94%
	Recall: 0.97%
	F1 Score: 0.96%



  _warn_prf(average, modifier, msg_start, len(result))


security
	Precision: 0.97%
	Recall: 0.98%
	F1 Score: 0.97%

military
	Precision: 0.93%
	Recall: 0.97%
	F1 Score: 0.95%

child_alone
	Precision: 1.00%
	Recall: 1.00%
	F1 Score: 1.00%

water
	Precision: 0.88%
	Recall: 0.94%
	F1 Score: 0.90%

food
	Precision: 0.78%
	Recall: 0.88%
	F1 Score: 0.83%

shelter
	Precision: 0.83%
	Recall: 0.91%
	F1 Score: 0.87%

clothing
	Precision: 0.96%
	Recall: 0.98%
	F1 Score: 0.97%

money
	Precision: 0.96%
	Recall: 0.98%
	F1 Score: 0.97%

missing_people
	Precision: 0.98%
	Recall: 0.99%
	F1 Score: 0.98%

refugees
	Precision: 0.93%
	Recall: 0.96%
	F1 Score: 0.95%

death
	Precision: 0.91%
	Recall: 0.95%
	F1 Score: 0.93%

other_aid
	Precision: 0.79%
	Recall: 0.87%
	F1 Score: 0.81%

infrastructure_related
	Precision: 0.88%
	Recall: 0.94%
	F1 Score: 0.91%

transport
	Precision: 0.91%
	Recall: 0.96%
	F1 Score: 0.93%

buildings
	Precision: 0.90%
	Recall: 0.95%
	F1 Score: 0.93%

electricity
	Precision: 0.96%
	Recall: 0.98%
	F1 Score: 0.97%

tools
	Precision: 0.99%
	

0.9278303326213

### 6. Improve model
Use grid search to find better parameters. 

In [15]:
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                   lowercase=True, max_df=1.0, max_features=None, min_df=1,
                   ngram_range=(1, 1), preprocessor=None, stop_words=None,
                   strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=<function tokenize at 0x7fa64324e680>,
                   vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                          ccp_alpha=0.0,
                                                          class_weight=None,
                                                          criterion='gini',
                                                          max_depth=No

In [None]:
parameters = {}

cv = GridSearchCV(pipeline, parameters)
cv

In [None]:
batch_processing(cv, X_train, Y_train, 250)

### 7. Test model
Show the precision, recall and overall accuracy of the tuned model. 

In [None]:
y_pred = cv.predict(X_test)

report_classification(y_test, y_pred)

In [None]:
cv.best_params_

### 8. Improve model


### 9. Export your model as a pickle file

In [None]:
with open('classifier.pkl', 'wb') as f:
    pickle.dump(, f)

### 10. Use this notebook to complete `train_classifier.py`

In [None]:
%%writefile web_app/models/train_classifier.py