# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [35]:
# import libraries
import pandas as pd
from sqlalchemy import create_engine
import re
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from sklearn import multioutput
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB

from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score

import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, TransformerMixin

import pickle
import re
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rishimadhav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rishimadhav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rishimadhav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# load data from database
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table("labeledmessages", con=engine)
X = df['message']
y = df.drop(columns=['id', 'message', 'original', 'genre'], axis=1)

### 2. Write a tokenization function to process your text data

In [3]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [4]:
def tokenize(text):
    """
    Receives text data and processes it
    Args: Text Data (column of messages)
    Returns: Tokenized text
    """
    # get list of URLS using regex
    detected_urls = re.findall(url_regex, text)

    # replace each url in text string with placeholder
    for url in detected_urls:
        text = text.replace(url, 'urlplaceholder')
    
    # tokenize text
    tokens = word_tokenize(text)

    # initiate lemmatizer
    lemmatizer = WordNetLemmatizer()

    # iterate through each token and lemmatize, normalize, remove leading/trailing white space and return clean tokenized text
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
    
    return clean_tokens

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [15]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier())
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [16]:
# Split data into train and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x7fc990e729d0>)),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier())])

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [17]:
# define a function to evaluate models
def evaluate_model(model, X_test, y_test, label_names, print_reports=False):
    pred = pd.DataFrame(model.predict(X_test), columns=label_names)

    metrics = []
    for col in label_names:
        # Store metrics in a list
        report = classification_report(y_test[col], pred[col])
        scores = report.split('accuracy')[1].split()
        metrics.append([float(scores[i]) for i in [0,4,5,6,10,11,12]])

        # Print classification report
        if print_reports:
            print('-' * 53)
            print(f'Label: {col}')
            print(report)
    
    # Convert metrics list into a Dataframe
    metric_names = ['accuracy', 'macro_avg_precision', 'macro_avg_recall', 'macro_avg_f1', 'weighted_avg_precision', 'weighted_avg_recall', 'weighted_avg_f1']
    return pd.DataFrame(metrics, columns=metric_names, index=label_names)

evaluate_model(pipeline, X_test, y_test, y_test.columns)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,weighted_avg_precision,weighted_avg_recall,weighted_avg_f1
related,0.82,0.79,0.66,0.69,0.81,0.82,0.79
request,0.89,0.89,0.71,0.76,0.89,0.89,0.88
offer,1.0,0.5,0.5,0.5,0.99,1.0,0.99
aid_related,0.75,0.78,0.71,0.72,0.77,0.75,0.73
medical_help,0.92,0.83,0.5,0.48,0.9,0.92,0.88
medical_products,0.95,0.81,0.51,0.5,0.93,0.95,0.93
search_and_rescue,0.97,0.73,0.5,0.5,0.95,0.97,0.95
security,0.98,0.74,0.5,0.5,0.97,0.98,0.97
military,0.97,0.48,0.5,0.49,0.93,0.97,0.95
water,0.94,0.94,0.58,0.63,0.94,0.94,0.93


### 6. Improve your model
Use grid search to find better parameters. 

In [20]:
# Parameter grid to search
parameters = {
    "clf__max_depth" : [4, 5, 10],
    "clf__max_features" : [2, 3],
    "clf__min_samples_leaf" : [3, 4, 5],
    "clf__n_estimators" : [100, 200, 300]
} 

gs = GridSearchCV(pipeline, parameters, cv=4, n_jobs=12, verbose=2)
gs.fit(X_train, y_train)
gs.best_params_

Fitting 4 folds for each of 54 candidates, totalling 216 fits
[CV] END clf__max_depth=4, clf__max_features=2, clf__min_samples_leaf=3, clf__n_estimators=100; total time=  23.1s
[CV] END clf__max_depth=4, clf__max_features=2, clf__min_samples_leaf=3, clf__n_estimators=100; total time=  23.8s
[CV] END clf__max_depth=4, clf__max_features=2, clf__min_samples_leaf=3, clf__n_estimators=100; total time=  23.9s
[CV] END clf__max_depth=4, clf__max_features=2, clf__min_samples_leaf=3, clf__n_estimators=100; total time=  23.9s
[CV] END clf__max_depth=4, clf__max_features=2, clf__min_samples_leaf=3, clf__n_estimators=200; total time=  33.3s
[CV] END clf__max_depth=4, clf__max_features=2, clf__min_samples_leaf=3, clf__n_estimators=200; total time=  33.1s
[CV] END clf__max_depth=4, clf__max_features=2, clf__min_samples_leaf=3, clf__n_estimators=200; total time=  33.8s
[CV] END clf__max_depth=4, clf__max_features=2, clf__min_samples_leaf=3, clf__n_estimators=200; total time=  34.2s
[CV] END clf__max_

{'clf__max_depth': 4,
 'clf__max_features': 2,
 'clf__min_samples_leaf': 3,
 'clf__n_estimators': 100}

Best parameters for a Random Forest Classifier pipeline:
{'clf__max_depth': 4,
 'clf__max_features': 2,
 'clf__min_samples_leaf': 3,
 'clf__n_estimators': 100}


**6.1 Evaluate Pipeline with best parameters**

In [21]:
rndclf = gs.best_estimator_
report_rndclf = evaluate_model(rndclf, X_test, y_test, y_test.columns)
report_rndclf


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Unnamed: 0,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,weighted_avg_precision,weighted_avg_recall,weighted_avg_f1
related,0.76,0.38,0.5,0.43,0.58,0.76,0.66
request,0.83,0.41,0.5,0.45,0.69,0.83,0.75
offer,1.0,0.5,0.5,0.5,0.99,1.0,0.99
aid_related,0.58,0.29,0.5,0.37,0.34,0.58,0.42
medical_help,0.92,0.46,0.5,0.48,0.84,0.92,0.88
medical_products,0.95,0.47,0.5,0.49,0.9,0.95,0.92
search_and_rescue,0.97,0.48,0.5,0.49,0.94,0.97,0.95
security,0.98,0.49,0.5,0.5,0.96,0.98,0.97
military,0.97,0.48,0.5,0.49,0.93,0.97,0.95
water,0.93,0.47,0.5,0.48,0.87,0.93,0.9


### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [23]:
report_rndclf.mean()


accuracy                  0.924286
macro_avg_precision       0.460571
macro_avg_recall          0.500000
macro_avg_f1              0.479143
weighted_avg_precision    0.860000
weighted_avg_recall       0.924286
weighted_avg_f1           0.888000
dtype: float64

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

**Try Naive Bayes and ADA Boost Classifier and check for the above metrics**


***8.1 Naive Bayes Classifier***

In [36]:
# Trying Naive Bayes classifier

from sklearn.naive_bayes import MultinomialNB

pipe_nb = Pipeline([ 
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', multioutput.MultiOutputClassifier(MultinomialNB(alpha=0.01)))
])

pipe_nb.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x7fc990e729d0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=MultinomialNB(alpha=0.01)))])

In [38]:
pipe_nb.get_params().keys()


dict_keys(['memory', 'steps', 'verbose', 'vect', 'tfidf', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__estimator__alpha', 'clf__estimator__class_prior', 'clf__estimator__fit_prior', 'clf__estimator', 'clf__n_jobs'])

In [39]:
# Grid search for parameters
params = {
    'vect__ngram_range' : [(1,1), (1,2)],
    'vect__max_df' : [0.1, 0.2],
    'vect__max_features' : [None, 100, 1000, 10000] 
}

gs_nb = GridSearchCV(pipe_nb, params, cv=4, n_jobs=16, verbose=2)
gs_nb.fit(X_train, y_train)
gs_nb.best_params_

Fitting 4 folds for each of 16 candidates, totalling 64 fits
[CV] END vect__max_df=0.1, vect__max_features=None, vect__ngram_range=(1, 1); total time=  18.5s
[CV] END vect__max_df=0.1, vect__max_features=100, vect__ngram_range=(1, 1); total time=  20.1s
[CV] END vect__max_df=0.1, vect__max_features=None, vect__ngram_range=(1, 1); total time=  23.3s
[CV] END vect__max_df=0.1, vect__max_features=100, vect__ngram_range=(1, 1); total time=  24.4s
[CV] END vect__max_df=0.1, vect__max_features=100, vect__ngram_range=(1, 2); total time=  24.2s
[CV] END vect__max_df=0.1, vect__max_features=None, vect__ngram_range=(1, 1); total time=  25.5s
[CV] END vect__max_df=0.1, vect__max_features=100, vect__ngram_range=(1, 1); total time=  25.7s
[CV] END vect__max_df=0.1, vect__max_features=None, vect__ngram_range=(1, 1); total time=  26.3s
[CV] END vect__max_df=0.1, vect__max_features=100, vect__ngram_range=(1, 1); total time=  26.8s
[CV] END vect__max_df=0.1, vect__max_features=None, vect__ngram_range=(

{'vect__max_df': 0.1, 'vect__max_features': 10000, 'vect__ngram_range': (1, 2)}

In [40]:
#Evaluate NB Classifier Model with best parameters
multinb = gs_nb.best_estimator_
report_multinb = evaluate_model(multinb, X_test, y_test, y_test.columns)
report_multinb

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,weighted_avg_precision,weighted_avg_recall,weighted_avg_f1
related,0.82,0.76,0.7,0.72,0.81,0.82,0.81
request,0.89,0.8,0.81,0.81,0.89,0.89,0.89
offer,0.99,0.58,0.53,0.54,0.99,0.99,0.99
aid_related,0.76,0.75,0.75,0.75,0.76,0.76,0.76
medical_help,0.93,0.77,0.62,0.66,0.91,0.93,0.91
medical_products,0.95,0.72,0.65,0.68,0.94,0.95,0.94
search_and_rescue,0.97,0.86,0.53,0.56,0.96,0.97,0.96
security,0.98,0.62,0.5,0.5,0.97,0.98,0.97
military,0.97,0.75,0.65,0.69,0.96,0.97,0.96
water,0.94,0.78,0.7,0.74,0.94,0.94,0.94


In [41]:
#Calculate Metrics for the NB Model
report_multinb.mean()


accuracy                  0.940571
macro_avg_precision       0.735429
macro_avg_recall          0.623429
macro_avg_f1              0.645429
weighted_avg_precision    0.932286
weighted_avg_recall       0.940571
weighted_avg_f1           0.932571
dtype: float64

***8.2 ADA Boost Classifier***


In [42]:
# Trying ADA Boost classifier

pipe_ada = Pipeline([ 
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', multioutput.MultiOutputClassifier(AdaBoostClassifier(random_state=42)))
])

pipe_ada.fit(X_train, y_train)

pipe_ada.get_params().keys()


dict_keys(['memory', 'steps', 'verbose', 'vect', 'tfidf', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__estimator__algorithm', 'clf__estimator__base_estimator', 'clf__estimator__learning_rate', 'clf__estimator__n_estimators', 'clf__estimator__random_state', 'clf__estimator', 'clf__n_jobs'])

In [51]:
# Grid search for parameters
params = {
    'tfidf__use_idf' : (True, False),
    'clf__estimator__n_estimators' : [50, 100],
    'clf__estimator__random_state' : [42],
    'clf__estimator__learning_rate' : [0.5] 
}

gs_ada = GridSearchCV(pipe_ada, param_grid=params, refit=True, cv=10, n_jobs=-1, verbose=1, return_train_score=True)
gs_ada


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x7fc990e729d0>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=AdaBoostClassifier()))]),
             n_jobs=-1,
             param_grid={'clf__estimator__learning_rate': [0.5],
                         'clf__estimator__n_estimators': [50, 100],
                         'clf__estimator__random_state': [42],
                         'tfidf__use_idf': (True, False)},
             return_train_score=True, verbose=1)

In [52]:
best_ada = gs_ada.fit(X_train, y_train)

#print('Best Model :', gs_ada.best_score_)
print('Params :', best_ada.best_params_)


Fitting 10 folds for each of 4 candidates, totalling 40 fits
Params : {'clf__estimator__learning_rate': 0.5, 'clf__estimator__n_estimators': 100, 'clf__estimator__random_state': 42, 'tfidf__use_idf': True}


In [53]:
# Evaluate ADA Boost Classifier Model with best parameters
gsada = best_ada.best_estimator_
report_gsada = evaluate_model(gsada, X_test, y_test, y_test.columns)
report_gsada


Unnamed: 0,accuracy,macro_avg_precision,macro_avg_recall,macro_avg_f1,weighted_avg_precision,weighted_avg_recall,weighted_avg_f1
related,0.8,0.76,0.62,0.64,0.79,0.8,0.77
request,0.9,0.86,0.75,0.79,0.89,0.9,0.89
offer,1.0,0.5,0.5,0.5,0.99,1.0,0.99
aid_related,0.76,0.77,0.74,0.74,0.76,0.76,0.75
medical_help,0.93,0.79,0.59,0.62,0.91,0.93,0.91
medical_products,0.96,0.84,0.63,0.68,0.95,0.96,0.95
search_and_rescue,0.97,0.87,0.56,0.6,0.97,0.97,0.96
security,0.98,0.66,0.5,0.5,0.97,0.98,0.97
military,0.97,0.83,0.62,0.67,0.96,0.97,0.96
water,0.96,0.87,0.82,0.84,0.96,0.96,0.96


In [54]:
report_gsada.mean()


accuracy                  0.948286
macro_avg_precision       0.801429
macro_avg_recall          0.636286
macro_avg_f1              0.668000
weighted_avg_precision    0.940857
weighted_avg_recall       0.948286
weighted_avg_f1           0.938286
dtype: float64

In [55]:
# Testing with some sample texts
test_text = ['there is a storm and people are trapped']
test = best_ada.predict(test_text)
print(y_train.columns.values[(test.flatten()==1)])


['related' 'aid_related' 'weather_related' 'storm']


In [58]:
test_text = ['we are having an earthquake, buildings are destroyed, victims need food']
test = best_ada.predict(test_text)
print(y_train.columns.values[(test.flatten()==1)])

test_text_1 = ['we are having an earthquake, buildings are destroyed, victims need food']
test_1 = gs_nb.predict(test_text)
print(y_train.columns.values[(test_1.flatten()==1)])

test_text_2 = ['we are having an earthquake, buildings are destroyed, victims need food']
test_2 = gs.predict(test_text)
print(y_train.columns.values[(test_2.flatten()==1)])


['related' 'request' 'aid_related' 'buildings' 'weather_related'
 'earthquake' 'direct_report']
['related' 'aid_related' 'buildings' 'weather_related' 'earthquake'
 'direct_report']
['related']


As we see above, the text predictions for the same message are most accurate for the ADA Boost Classifier, followed by the Multinomial Naive Bayes and finally the Random Classifier.


### 9. Export your model as a pickle file

In [59]:
pickle.dump(best_ada, open('classifier.pkl', 'wb'))


### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

In [None]:
import sys


def load_data(database_filepath):
    pass


def tokenize(text):
    pass


def build_model():
    pass


def evaluate_model(model, X_test, Y_test, category_names):
    pass


def save_model(model, model_filepath):
    pass


def main():
    if len(sys.argv) == 3:
        database_filepath, model_filepath = sys.argv[1:]
        print('Loading data...\n    DATABASE: {}'.format(database_filepath))
        X, Y, category_names = load_data(database_filepath)
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2)

        print('Building model...')
        model = build_model()

        print('Training model...')
        model.fit(X_train, Y_train)

        print('Evaluating model...')
        evaluate_model(model, X_test, Y_test, category_names)

        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('Trained model saved!')

    else:
        print('Please provide the filepath of the disaster messages database '
              'as the first argument and the filepath of the pickle file to '
              'save the model to as the second argument. \n\nExample: python '
              'train_classifier.py ../data/DisasterResponse.db classifier.pkl')


if __name__ == '__main__':
    main()
