# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [3]:
# import libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
# from sklearn.externals import joblib
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.svm import SVC

import nltk
nltk.download(['punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'])
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import sqlite3
from sqlalchemy import create_engine
from warnings import filterwarnings
import pickle
filterwarnings('ignore')
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [4]:
# load data from database
def load_data(db, table_name):
    """
    Load the database and convert it to a
    dataframe
    Args: db - A sqlite database
          table_name - A sqlite table
    Returns - An input array of messages and output 
              array of label, including label names
    """
    engine = create_engine('sqlite:///{}'.format(db))
    table_name = table_name
    df = pd.read_sql_table(table_name, engine)
    X = df['message'].values
    Y = df.iloc[:, 4:].values
    
    return X, Y, df.columns[4:]

### 2. Write a tokenization function to process your text data

In [31]:
def tokenize(text):
    """
    Normalize the input text. Steps performed
    1) Remove punctuations
    1) Tokenize text
    2) Lemmatize text
    3) Strip whitespaces
    4) Convert to lower
    5) Remove stopwords
    
    Args: text - String
    Return - A list of cleaned tokens
    """
    stop_words = stopwords.words("english")
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
    
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    
    clean_tokens = [lemmatizer.lemmatize(tok).lower().strip() for tok in tokens if tok.lower() not in stop_words]

    return clean_tokens

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [136]:
def create_pipeline(clf, tokenize):
    """
    Create a pipeline of a sequence of actions
    to perform on the input
    1) CountVectorizer()
    2) TfIdfTransformer()
    3) MultiOutputClassifier() or OneVsRestClasssifier()
    
    Args: clf - Classifier object
          tokenize - A function to normalize the input
                      text
    Returns - A pipeline object
    """
    pipeline = Pipeline([
                       ('union', FeatureUnion([
                           ('text_pipeline', Pipeline([
                                ('vect', CountVectorizer(
                                                ngram_range=(1, 2),
                                                max_df=0.93,
                                                min_df=7,
                                                tokenizer=tokenize
                                )),
                                ('tfidf', TfidfTransformer()),
                               
                           ])),
                            ('avg_length', AvgResponseLength()),
                            ('starting_verb', StartingVerbExtractor())
                       ])),
                        
                       ('clf', OneVsRestClassifier(clf))
                       ])
    
    return pipeline

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [137]:
def split_data(X, y, test_size=0.2):
    """
    Split the data into train and validation sets
    Args: X - An input array of messages
          y - A array of labels
    Returns X_train - An array of training set of messages
            X_valid - An array of validation set of message
            y_train - An array of training labels
            y_valid - An array of validation labels
    """
    return train_test_split(X, y, test_size=test_size, random_state=2018)
    

In [138]:
X[:3]

array(['Weather update - a cold front from Cuba that could pass over Haiti',
       'Is the Hurricane over or is it not over',
       'Looking for someone but no name'], dtype=object)

In [6]:
db = 'responses.db'
table_name = 'emergency_messages'
X, y, labels = load_data(db, table_name)
# new_X = []
# for message in X:
#     new_X.append(' '.join(tokenize(message)))
X_train, X_valid, y_train, y_valid = split_data(X, y, test_size=0.1)

In [128]:
X.shape

(26206,)

In [106]:
rf = RandomForestClassifier(min_samples_split=5,
                            n_estimators=20,
                            random_state=2018)
# gb = GradientBoostingClassifier(random_state=2018)
# svc = SVC(random_state=2018)
# lg = LogisticRegression(random_state=2018)
# sg = SGDClassifier(random_state=2018)
# mb = MultinomialNB()
model = rf
pipeline = create_pipeline(model, tokenize)
pipeline.fit(X_train, y_train)

TypeError: no supported conversion for types: (dtype('float64'), dtype('O'))

In [9]:
list(zip(labels, np.sum(y, axis=0)))

[('related', 26206),
 ('request', 4474),
 ('offer', 118),
 ('aid_related', 10860),
 ('medical_help', 2084),
 ('medical_products', 1313),
 ('search_and_rescue', 724),
 ('security', 471),
 ('military', 860),
 ('child_alone', 0),
 ('water', 1672),
 ('food', 2923),
 ('shelter', 2314),
 ('clothing', 405),
 ('money', 604),
 ('missing_people', 298),
 ('refugees', 875),
 ('death', 1194),
 ('other_aid', 3446),
 ('infrastructure_related', 1705),
 ('transport', 1201),
 ('buildings', 1333),
 ('electricity', 532),
 ('tools', 159),
 ('hospitals', 283),
 ('shops', 120),
 ('aid_centers', 309),
 ('other_infrastructure', 1151),
 ('weather_related', 7297),
 ('floods', 2155),
 ('storm', 2443),
 ('fire', 282),
 ('earthquake', 2455),
 ('cold', 530),
 ('other_weather', 1376),
 ('direct_report', 5075)]

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [140]:
def display_report(pipeline, X_valid, y_valid, labels=None):
    """
    Make a prediction on the validation set
    and display a classification report showing
    f1_score, precision and recall
    Args: pipeline - A pipeline object
          X_valid - An array of validation set of messages
          y_valid - An array of validation labels
          labels - A list of label names
    """
    preds = pipeline.predict(X_valid)
    print(classification_report(y_valid, preds, target_names=labels))

In [91]:
display_report(pipeline, X_valid, y_valid, labels)

                        precision    recall  f1-score   support

               related       1.00      1.00      1.00      2621
               request       0.80      0.51      0.62       449
                 offer       0.00      0.00      0.00        14
           aid_related       0.74      0.72      0.73      1089
          medical_help       0.68      0.16      0.26       199
      medical_products       0.86      0.22      0.35       138
     search_and_rescue       0.82      0.13      0.22        71
              security       0.00      0.00      0.00        39
              military       0.73      0.13      0.22        85
           child_alone       0.00      0.00      0.00         0
                 water       0.83      0.55      0.66       156
                  food       0.84      0.70      0.76       305
               shelter       0.77      0.50      0.60       227
              clothing       0.92      0.23      0.37        47
                 money       1.00      

### 6. Improve your model
Use grid search to find better parameters. 

Append two transformers to FeatureUnion object inside the existing pipeline

In [141]:
# http://michelleful.github.io/code-blog/2015/06/20/pipelines/
class AvgResponseLength(BaseEstimator, TransformerMixin):
    """ Compute the average length of the resposne"""
    def __init__(self):
        pass
    
    def avg_word_length(self, text):
        return np.mean([len(word.strip()) if len(word.strip()) != 0 else 0 for word in text.split()])
    
    def fit(self, x, y=None):
        return self
    
    def transform(self, x, y=None):
        return pd.DataFrame(pd.Series(x).apply(self.avg_word_length)).fillna(0)
    

In [142]:
# not working as expected. the length of the series is not in 
# sync with the input X
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.c = 0
    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            self.c += 1
            if len(pos_tags) == 0:
                return False
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
            return False

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [13]:
#pipeline.steps[0][1].get_params()

In [14]:
# pipeline.steps[0][1].get_params().update({
#                        'transformer_list': pipeline.steps[0][1].get_params()['transformer_list'].extend([('avg_length', AvgResponseLength())]),
#                        'avg_length': AvgResponseLength()})
# pipeline.steps[0][1].get_params().update({
#                        'transformer_list': pipeline.steps[0][1].get_params()['transformer_list'].extend([('starting_verb', StartingVerbExtractor())]),
#                        'starting_verb': StartingVerbExtractor()})

In [15]:
#pipeline.steps[0][1].get_params()

In [101]:
parameters = {
                'union__text_pipeline__vect__max_features': [9000, 12000],
#                'clf__estimator__n_estimators':[10, 100, 200],
#                'clf__estimator__min_samples_split': [2, 5]
}

# parameters = {
#                 'union__text_pipeline__vect__max_features': [9000, 12000],
#                 'clf__estimator__loss': ['log', 'modified_huber'],
#                 'clf__estimator__penalty': ['l2', 'elasticnet']
# }
gs_pipeline = create_pipeline(model, tokenize)
gs_pipeline.steps[0][1].get_params().update({
                       'transformer_list': gs_pipeline.steps[0][1].get_params()['transformer_list'].extend([('avg_length', AvgResponseLength())]),
                       'avg_length': AvgResponseLength()})
# gs_pipeline.steps[0][1].get_params().update({
#                        'transformer_list': gs_pipeline.steps[0][1].get_params()['transformer_list'].extend([('starting_verb', StartingVerbExtractor())]),
#                        'starting_verb': StartingVerbExtractor()})

In [102]:
gs = GridSearchCV(gs_pipeline, param_grid=parameters, n_jobs=-1, 
                  scoring=make_scorer(f1_score, average='weighted'),
                 cv=3, verbose=4)
gs.fit(X_train, y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] union__text_pipeline__vect__max_features=9000 ...................
[CV]  union__text_pipeline__vect__max_features=9000, score=0.658734925323933, total=10.5min
[CV] union__text_pipeline__vect__max_features=9000 ...................


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 11.9min remaining:    0.0s


[CV]  union__text_pipeline__vect__max_features=9000, score=0.6572394004045591, total=10.5min
[CV] union__text_pipeline__vect__max_features=9000 ...................


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 23.7min remaining:    0.0s


[CV]  union__text_pipeline__vect__max_features=9000, score=0.6612393926281606, total=10.5min
[CV] union__text_pipeline__vect__max_features=12000 ..................


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 35.5min remaining:    0.0s


[CV]  union__text_pipeline__vect__max_features=12000, score=0.658734925323933, total=10.5min
[CV] union__text_pipeline__vect__max_features=12000 ..................
[CV]  union__text_pipeline__vect__max_features=12000, score=0.6572394004045591, total=10.6min
[CV] union__text_pipeline__vect__max_features=12000 ..................
[CV]  union__text_pipeline__vect__max_features=12000, score=0.6612393926281606, total=10.5min


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 71.1min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('text_pipeline', Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0... oob_score=False, random_state=2018, verbose=0,
            warm_start=False),
          n_jobs=1))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'union__text_pipeline__vect__max_features': [9000, 12000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score, average=weighted), verbose=4)

In [103]:
gs.best_params_

{'union__text_pipeline__vect__max_features': 9000}

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [104]:
display_report(gs, X_valid, y_valid, labels)

                        precision    recall  f1-score   support

               related       1.00      1.00      1.00      2621
               request       0.80      0.51      0.62       449
                 offer       0.00      0.00      0.00        14
           aid_related       0.74      0.72      0.73      1089
          medical_help       0.68      0.16      0.26       199
      medical_products       0.86      0.22      0.35       138
     search_and_rescue       0.82      0.13      0.22        71
              security       0.00      0.00      0.00        39
              military       0.73      0.13      0.22        85
           child_alone       0.00      0.00      0.00         0
                 water       0.83      0.55      0.66       156
                  food       0.84      0.70      0.76       305
               shelter       0.77      0.50      0.60       227
              clothing       0.92      0.23      0.37        47
                 money       1.00      

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [None]:
# Failed as the transformer was added in series and not in parallel
# which caused an error as a densor matrix was appended to a sparse
# matrix
#gs_pipeline.steps.insert(2, ('avg_length', AvgResponseLength()))

### 9. Export your model as a pickle file

In [108]:
# https://stackoverflow.com/questions/34143829/sklearn-how-to-save-a-model-created-from-a-pipeline-and-gridsearchcv-using-jobli
def save_model(model, file):
    """
    Pickle the model
    Args: model - Model object
          file - name of pickled object
    Returns None      
    """
    pickle.dump(model, open(file, 'wb'))
    #joblib.dump(model, file)

In [109]:
#save_model(gs.best_estimator_, 'rf_classifier.pkl', compress=1)
save_model(gs.best_estimator_, 'rf_classifier.pkl')

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

In [143]:
s = StartingVerbExtractor()
s.transform(X_train)

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
5,False
6,False
7,False
8,False
9,False


In [144]:
print(s.c)

23585


In [112]:
s = AvgResponseLength()
p = s.transform(X)
len(p)

26206

In [2]:
!zip  rf_classifier.pkl.zip rf_classifier.pkl

  adding: rf_classifier.pkl (deflated 79%)


In [7]:
X[:3]

array(['Weather update - a cold front from Cuba that could pass over Haiti',
       'Is the Hurricane over or is it not over',
       'Looking for someone but no name'], dtype=object)

In [8]:
from collections import Counter

In [33]:
newX = []
for text in X:
    newX += tokenize(text)
setwords = [word for word in (' '.join(newX)).split()]
    

In [34]:
res = Counter(setwords)

In [40]:
dict(res.most_common(50))

{'water': 3037,
 'people': 3010,
 'food': 2896,
 'help': 2652,
 'need': 2480,
 'please': 2048,
 'earthquake': 1913,
 'area': 1655,
 'like': 1529,
 'would': 1490,
 'u': 1473,
 'said': 1351,
 '000': 1254,
 'country': 1247,
 'also': 1114,
 'know': 1113,
 'government': 1045,
 'haiti': 1041,
 'one': 1016,
 'rain': 1005,
 'flood': 978,
 'information': 967,
 'year': 944,
 'sandy': 923,
 'find': 920,
 'house': 915,
 'family': 838,
 'good': 832,
 'tent': 812,
 'relief': 812,
 'urlplaceholder': 807,
 'aid': 801,
 'day': 778,
 'supply': 774,
 'two': 774,
 'affected': 758,
 'health': 752,
 'child': 737,
 'get': 730,
 'thank': 707,
 'many': 692,
 'message': 691,
 'well': 691,
 'region': 690,
 '1': 671,
 'school': 671,
 '2': 637,
 'work': 634,
 'new': 627,
 'since': 623}

In [23]:
newX[:2]

['weather', 'update']