# ML Pipeline Preparation


### Import libraries and load data


In [1]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
# from sklearn.base import BaseEstimator, TransformerMixin

import re
import nltk
nltk.download(['punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'])
from nltk.corpus import stopwords
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(); color='rebeccapurple'
%matplotlib inline  

# display settings
pd.set_option('max_colwidth', -1)
pd.set_option('display.max_columns', None)  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [2]:
# load data from database with `read_sql_table`
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table('messages', engine)

## Split data into training and test sets (with stratified sampling)

In [11]:
# for StratifiedShuffleSlpit to work properly all values > 10 in ['total'] column will be set to 11.
df['total'] = np.where((df['total'] >10), 11, df['total'])

In [12]:
# create testset with stratified sampling according to the category count per message
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 111)
for train_index, test_index in split.split(df, df['total']):
    train = df.loc[train_index]
    test = df.loc[test_index]
    
# safety-check
assert len(test) == round(len(df) * 0.2), 'split got messed up'
assert (round(df['related'].sum() / len(df),3)) == (round(train['related'].sum() / len(train),3)), 'not properly stratisfied'

In [14]:
# remove 'total' column from train and test sets
for set_ in (train, test):
    set_.drop('total', axis=1, inplace=True)
    
# safety-check
assert len(test.columns) == 40

## Split into features an target variables

In [39]:
X_train = train['message'].values
y_train = train.iloc[:, 4:40].values

X_test = test['message'].values
y_test = test.iloc[:, 4:40].values

In [28]:
#check
X_train[:2]

array([ "To determine possible evolution of the eruptive crisis of the Nyiragongo, it's necessary to improve the monitoring of the volcano, in particularly to add seismometers and to install tiltmeters to measure the ground deformation around the volcano.",
       'Over 25 people in line at Starbucks . People need coffee as much as they need food and gas post #Sandy ( @Starbucks ) http : //t.co/T7G3fBJg'], dtype=object)

In [29]:
#check
y_train[:2]

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0],
       [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

### 2. Write a tokenization function to process your text data

In [35]:
def tokenize_text(message):

    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    
    # normalize case and remove punctuation
    message = re.sub(r"[^a-zA-Z0-9]", " ", message.lower())
    # tokenize text
    tokens = word_tokenize(message)
    # lemmatize and remove stop words
    tokens = [lemmatizer.lemmatize(word.strip()) for word in tokens if word not in stop_words]
    # add part-of-speech tags
    tokens = pos_tag(tokens)
    
    return tokens

In [36]:
for message in X_train[:2]:
    tokens = tokenize_text(message)
    print(message)
    print(tokens, '\n')

To determine possible evolution of the eruptive crisis of the Nyiragongo, it's necessary to improve the monitoring of the volcano, in particularly to add seismometers and to install tiltmeters to measure the ground deformation around the volcano.
[('determine', 'NN'), ('possible', 'JJ'), ('evolution', 'NN'), ('eruptive', 'JJ'), ('crisis', 'NN'), ('nyiragongo', 'NN'), ('necessary', 'JJ'), ('improve', 'VB'), ('monitoring', 'NN'), ('volcano', 'NN'), ('particularly', 'RB'), ('add', 'JJ'), ('seismometers', 'NNS'), ('install', 'VBP'), ('tiltmeters', 'NNS'), ('measure', 'VBP'), ('ground', 'NN'), ('deformation', 'NN'), ('around', 'IN'), ('volcano', 'NN')] 

Over 25 people in line at Starbucks . People need coffee as much as they need food and gas post #Sandy ( @Starbucks ) http : //t.co/T7G3fBJg
[('25', 'CD'), ('people', 'NNS'), ('line', 'NN'), ('starbucks', 'VBZ'), ('people', 'NNS'), ('need', 'VBP'), ('coffee', 'NN'), ('much', 'JJ'), ('need', 'NN'), ('food', 'NN'), ('gas', 'NN'), ('post', 'NN

### 3. Build a machine learning pipeline
Use sk-learn's [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) ([example here](https://scikit-learn.org/stable/modules/multiclass.html#multioutput-classification))

In [41]:
# define the classifier, wrapped in MultiOutputClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=1)

# build the pipeline
pipe = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize_text)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(clf, n_jobs=-1, verbose=1)),
              ])

# train the pipeline
pipe.fit(X_train, y_train)

In [40]:
y_pred = pipe.predict(X_test)

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [55]:
category_list = train.iloc[:, 4:40].columns

for col in y_test.T:
    print(classification_report(y_test.T, y_pred.T, target_names=category_list))

ValueError: Unknown label type: (array([[1, 1, 1, ..., 1, 1, 1],
       [0, 0, 1, ..., 1, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 1, 0, 1]]), array([[1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]]))

In [None]:
def get_eval_metrics(actual, predicted, col_names):
    """Calculate evaluation metrics for ML model
    
    Args:
    actual: array. Array containing actual labels.
    predicted: array. Array containing predicted labels.
    col_names: list of strings. List containing names for each of the predicted fields.
       
    Returns:
    metrics_df: dataframe. Dataframe containing the accuracy, precision, recall 
    and f1 score for a given set of actual and predicted labels.
    """
    metrics = []
    
    # Calculate evaluation metrics for each set of labels
    for i in range(len(col_names)):
        accuracy = accuracy_score(actual[:, i], predicted[:, i])
        precision = precision_score(actual[:, i], predicted[:, i])
        recall = recall_score(actual[:, i], predicted[:, i])
        f1 = f1_score(actual[:, i], predicted[:, i])
        
        metrics.append([accuracy, precision, recall, f1])
    
    # Create dataframe containing metrics
    metrics = np.array(metrics)
    metrics_df = pd.DataFrame(data = metrics, index = col_names, columns = ['Accuracy', 'Precision', 'Recall', 'F1'])
      
    return metrics_df

In [None]:
def evaluate_model(model, X_test, Y_test, category_names):

    

    # predict on test data

    y_pred = model.predict(X_test)

    

    accuracy = []



    y_testData = pd.DataFrame(Y_test)

    y_predData = pd.DataFrame(y_pred)



    for col in range(len(y_testData.columns)):

        accuracy.append(accuracy_score(y_testData[col],y_predData[col]))

 		

    target_colums = (category_names.iloc[:,4:].columns).tolist()

	

    acc_score = pd.DataFrame(accuracy,columns=['Accuracy_score'], index=target_colums)

	

    target_names = (category_names.iloc[:,4:].columns).tolist()

	

    output = classification_report(Y_test, y_pred, target_names=target_names)

	

    print("Accuracy for each category:\n")

    print(acc_score)

    

    print("\n\nPrecision,recall, f1-score and support for each category:\n")

    print(output)

### 6. Improve your model
Use grid search to find better parameters. 

In [None]:
parameters = 

cv = 

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

### 9. Export your model as a pickle file

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

- to do: concatenate one hot encoded genre with text
- eventually length and total categories to