# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
# import libraries
import pandas as pd
import numpy as np
import nltk
nltk.download(['punkt', 'wordnet','stopwords'])
from sqlalchemy import create_engine
import sys
import re
import pickle
from nltk.tokenize import word_tokenize, RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, make_scorer, accuracy_score, f1_score, fbeta_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.tree import DecisionTreeClassifier


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# load data from database
import pandas as pd
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table('Disaster_messages', con=engine) 
X = df['message']
Y = df.drop(['message', 'genre', 'id', 'original'], axis = 1)

### 2. Write a tokenization function to process your text data

In [3]:
def tokenize(text):
    """
    Inputs:
    the messages
    
    Returns: A list of words into numbers/root form of the messages words
    """
    
    #Normalizing text by converting everything to lower case:
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    #Tokenize 'text'
    tokenizedwords = word_tokenize(text)
    
    #Normalization of word tokens and removal of stop words
    normalizertokens = PorterStemmer()
    stop_words = stopwords.words("english")
    
    normalizedwords = [normalizertokens.stem(word) for word in tokenizedwords if word not in stop_words]
    
    return normalizedwords

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [4]:
#ML Pipeline for Random Forest Classifier
pipeline_rfc = Pipeline([
    ('vect', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])
print(pipeline_rfc)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...oob_score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=1))])


### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [5]:
#Split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 43)
#Train Pipeline Random Forest Classifier
pipeline_rfc.fit(X_train, Y_train)

# Prediction: the Random Forest Classifier  
Y_pred = pipeline_rfc.predict(X_test)

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [6]:
#Display the results and model them into a dataframe
def get_results(Y_test, Y_pred):
    modelresults = pd.DataFrame(columns=['Category', 'f_score', 'precision', 'recall'])
    num = 0
    for ctg in Y_test.columns:
        precision, recall, f_score, support = precision_recall_fscore_support(Y_test[ctg], Y_pred[:,num], average='weighted')
        modelresults.at[num+1, 'Category'] = ctg 
        modelresults.at[num+1, 'f_score'] = f_score
        modelresults.at[num+1, 'precision'] = precision
        modelresults.at[num+1, 'recall'] = recall  
        num +=1
    print('Aggregated f_score:', modelresults['f_score'].mean())
    print('Aggregated precision:', modelresults['precision'].mean())
    print('Aggregated recall:', modelresults['recall'].mean())
    print('Accuracy:', np.mean(Y_test.values == Y_pred))
    return modelresults

In [7]:
modelresults = get_results(Y_test, Y_pred)
modelresults

Aggregated f_score: 0.933301297972
Aggregated precision: 0.934174348278
Aggregated recall: 0.94515425036
Accuracy: 0.94515425036


  'precision', 'predicted', average, warn_for)


Unnamed: 0,Category,f_score,precision,recall
1,related,0.788222,0.787291,0.802059
2,request,0.875902,0.880498,0.887681
3,offer,0.992284,0.989729,0.994851
4,aid_related,0.748562,0.750607,0.752479
5,medical_help,0.895194,0.911192,0.924104
6,medical_products,0.938507,0.943387,0.954805
7,search_and_rescue,0.965093,0.96837,0.974447
8,security,0.974709,0.966713,0.982838
9,military,0.961093,0.959379,0.971014
10,child_alone,1.0,1.0,1.0


### 6. Improve your model
Use grid search to find better parameters. 

In [8]:
pipeline_rfc.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=<function tokenize at 0x7fd5ea763950>, vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
               max_depth=None, max_features='auto', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
               oob_score=False, random_state=None,

In [9]:
parameters = {'clf__estimator__max_depth': [10, 50, None],
              'clf__estimator__min_samples_leaf':[2, 5, 10]}

cv =  GridSearchCV(pipeline_rfc, parameters)

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [10]:
cv.fit(X_train.as_matrix(), Y_train.as_matrix())
Y_pred = cv.predict(X_test)
modelresults2 = get_results(Y_test, Y_pred)
modelresults2

  """Entry point for launching an IPython kernel.


Aggregated f_score: 0.93008706972
Aggregated precision: 0.934517572646
Aggregated recall: 0.944386176795
Accuracy: 0.944386176795


  'precision', 'predicted', average, warn_for)


Unnamed: 0,Category,f_score,precision,recall
1,related,0.777148,0.792286,0.803204
2,request,0.850824,0.866037,0.872616
3,offer,0.992284,0.989729,0.994851
4,aid_related,0.763481,0.763894,0.763158
5,medical_help,0.892615,0.899445,0.921815
6,medical_products,0.934514,0.945603,0.954043
7,search_and_rescue,0.959658,0.960015,0.972731
8,security,0.974899,0.966719,0.983219
9,military,0.959666,0.968996,0.972159
10,child_alone,1.0,1.0,1.0


In [11]:
cv.best_estimator_

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...oob_score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=1))])

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [12]:
#ML Pipeline for Decision Tree classifier
pipeline_dtc = Pipeline([
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(DecisionTreeClassifier()))
])
print(pipeline_dtc)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
pipeline_dtc.fit(X_train.as_matrix(), Y_train.as_matrix())
Y_pred = pipeline_dtc.predict(X_test)
modelresults3 = get_results(Y_test, Y_pred)
modelresults3

#ML Pipleline for Ada Boost Classifier
pipeline_ada = Pipeline([
        ('vect', CountVectorizer(tokenizer = tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf',  MultiOutputClassifier(AdaBoostClassifier()))
])
print(pipeline_ada)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
pipeline_ada.fit(X_train.as_matrix(), Y_train.as_matrix())
Y_pred = pipeline_ada.predict(X_test)
modelresults4 = get_results(Y_test, Y_pred)
modelresults4

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ion_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
           n_jobs=1))])


  # Remove the CWD from sys.path while we load stuff.


Aggregated f_score: 0.932846751184
Aggregated precision: 0.932001062847
Aggregated recall: 0.93378937375
Accuracy: 0.93378937375
Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...mator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
           n_jobs=1))])




Aggregated f_score: 0.940340041013
Aggregated precision: 0.940786983842
Aggregated recall: 0.948174143017
Accuracy: 0.948174143017


Unnamed: 0,Category,f_score,precision,recall
1,related,0.716844,0.760217,0.778303
2,request,0.889013,0.888542,0.895636
3,offer,0.99131,0.98904,0.993592
4,aid_related,0.757642,0.762152,0.76213
5,medical_help,0.91177,0.911244,0.926305
6,medical_products,0.952261,0.950793,0.958499
7,search_and_rescue,0.968136,0.968803,0.974367
8,security,0.975093,0.970727,0.981996
9,military,0.966685,0.965135,0.970095
10,child_alone,1.0,1.0,1.0


### 9. Export your model as a pickle file

In [13]:
#Exporting model as a pickle file using pickle dump
pickle.dump(cv, open('model_classifier.pkl', 'wb'))

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.