# ML Pipeline Preparation


### Import libraries and load data


In [1]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
# from sklearn.base import BaseEstimator, TransformerMixin

import re
import nltk
nltk.download(['punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'])
from nltk.corpus import stopwords
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(); color='rebeccapurple'
%matplotlib inline  

# display settings
pd.set_option('max_colwidth', -1)
pd.set_option('display.max_columns', None)  

import time
from tqdm import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\r2d4\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\r2d4\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\r2d4\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\r2d4\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\r2d4\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\r2d4\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-dat

In [2]:
# load data from database with `read_sql_table`
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table('messages', engine)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25992 entries, 0 to 25991
Data columns (total 39 columns):
id                        25992 non-null int64
message                   25992 non-null object
original                  10021 non-null object
genre                     25992 non-null object
related                   25992 non-null int64
request                   25992 non-null int64
offer                     25992 non-null int64
aid_related               25992 non-null int64
medical_help              25992 non-null int64
medical_products          25992 non-null int64
search_and_rescue         25992 non-null int64
security                  25992 non-null int64
military                  25992 non-null int64
water                     25992 non-null int64
food                      25992 non-null int64
shelter                   25992 non-null int64
clothing                  25992 non-null int64
money                     25992 non-null int64
missing_people            25992 non-null i

## Split data into training and test sets (with stratified sampling)
After EDA (documented in separate `EDA.ipynb`) the decision was taken to use stratfied sampling for splitting into training and test set. The proportion of the different _numbers of active categories per message_ will be preserved.

In [4]:
# create new column with total number of active categories per message
df['total'] = df.iloc[:,4:40].sum(axis=1)
# for StratifiedShuffleSlpit to work properly all values > 10 in ['total'] column will be set to 11. (Kind of oultier removal.)
df['total'] = np.where((df['total'] >10), 11, df['total'])

In [5]:
# create testset with stratified sampling according to the category count per message
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 111)
for train_index, test_index in split.split(df, df['total']):
    train = df.loc[train_index]
    test = df.loc[test_index]
    
# safety-check
assert np.abs(len(test) - (len(df) * 0.2)) <= 1, 'split got messed up'
assert (round(df['related'].sum() / len(df),3)) == (round(train['related'].sum() / len(train),3)), 'not properly stratisfied'

In [6]:
# remove 'total' column from train and test sets
for set_ in (train, test):
    set_.drop('total', axis=1, inplace=True)
    
# safety-check
assert len(test.columns) == 39

## Split into features an target variables

In [7]:
X_train = train['message'].values
Y_train = train.iloc[:, 4:39].values

X_test = test['message'].values
Y_test = test.iloc[:, 4:39].values

In [8]:
#check
X_train[:2]

array(['Some 2,000 women protesting against the conduct of the elections were teargassed as they tried to converge on the local electoral commission offices in the southern oil city of Port Harcourt.',
       'Good evening to all USA soldiers. we still suffer from this great event. We would love to handle your presence of authority. thank you'],
      dtype=object)

In [9]:
#check
Y_train[:2]

array([[1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

### 2. Write a tokenization function to process your text data

In [10]:
def tokenize_text(message):

    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    
    # normalize case and remove punctuation
    message = re.sub(r"[^a-zA-Z0-9]", " ", message.lower())
    # tokenize text
    tokens = word_tokenize(message)
    # lemmatize and remove stop words
    tokens = [lemmatizer.lemmatize(word.strip()) for word in tokens if word not in stop_words]
    # add part-of-speech tags
    tokens = pos_tag(tokens)
    
    return tokens

In [11]:
# check
for message in X_train[:2]:
    tokens = tokenize_text(message)
    print(message)
    print(tokens, '\n')

Some 2,000 women protesting against the conduct of the elections were teargassed as they tried to converge on the local electoral commission offices in the southern oil city of Port Harcourt.
[('2', 'CD'), ('000', 'CD'), ('woman', 'NN'), ('protesting', 'VBG'), ('conduct', 'NN'), ('election', 'NN'), ('teargassed', 'VBD'), ('tried', 'JJ'), ('converge', 'NN'), ('local', 'JJ'), ('electoral', 'JJ'), ('commission', 'NN'), ('office', 'NN'), ('southern', 'JJ'), ('oil', 'NN'), ('city', 'NN'), ('port', 'NN'), ('harcourt', 'NN')] 

Good evening to all USA soldiers. we still suffer from this great event. We would love to handle your presence of authority. thank you
[('good', 'JJ'), ('evening', 'NN'), ('usa', 'NN'), ('soldier', 'NN'), ('still', 'RB'), ('suffer', 'VBZ'), ('great', 'JJ'), ('event', 'NN'), ('would', 'MD'), ('love', 'VB'), ('handle', 'VB'), ('presence', 'NN'), ('authority', 'NN'), ('thank', 'NN')] 



### 3. Build a machine learning pipeline
Use sk-learn's [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) ([example here](https://scikit-learn.org/stable/modules/multiclass.html#multioutput-classification))

In [12]:
# define the classifier, wrapped in MultiOutputClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=1, n_jobs=-1)

# build the pipeline
pipe = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize_text)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(clf, n_jobs=-1)),
              ])

start = time.time()

# train the pipeline
model = pipe.fit(X_train, Y_train)
# predict labels on test_set
Y_pred = model.predict(X_test)

print('Duration: {} seconds'.format(time.time() - start))

Duration: 323.91256976127625 seconds


### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [13]:
def evaluate_multilabel(Y-test, Y-pred):
    """Calculate evaluation metrics for ML model
    
    ARGUMENTS:
    Y-test: Array containing actual labels.
    Y-pred: Array containing predicted labels.
       
    RETURNS:
    metrics_df: Dataframe containing the accuracy, precision, 
    recall and f1 scores.
    """
    
    # create list of strings with target class names
    target_names = train.iloc[:, (-1 * Y-train.shape[1]):].columns
    
    
    # Calculate evaluation metrics for each set of labels
    metrics = []
    for i in range(len(target_names)):
        accuracy = accuracY-score(Y-test[:, i], Y-pred[:, i])
        f1 = f1_score(Y-test[:, i], Y-pred[:, i], average='macro')  # not taking imbalance into account
        precision = precision_score(Y-test[:, i], Y-pred[:, i], average='macro')
        recall = recall_score(Y-test[:, i], Y-pred[:, i], average='macro')

        metrics.append([accuracy, f1, precision, recall])
    
    # Create dataframe containing metrics
    metrics = np.array(metrics)
    metrics_df = pd.DataFrame(
        data = metrics, index = target_names, 
        columns = ['Accuracy', 'F1', 'Precision', 'Recall'],
        )
      
    return metrics_df

In [39]:
def evaluate_multilabel(Y_real, Y_pred):
    """Calculate evaluation metrics for ML model
    
    ARGUMENTS:
    Y_real: Array containing actual labels.
    Y_pred: Array containing predicted labels.
       
    RETURNS:
    metrics_df: Dataframe containing the multilabel
    classification report.
    """
    
    # create list of strings with target class names
    target_names = train.iloc[:, (-1 * Y_train.shape[1]):].columns
    
    # Calculate classification report
    metrics = classification_report(
                Y_real, Y_pred,
                target_names=target_names,
                output_dict=True,
                )

    # Create dataframe, tanspose it
    metrics_df = pd.DataFrame(
                    data = metrics, 
                    ).T
      
    return metrics_df

In [40]:
evaluate_multilabel(Y_test, Y_pred)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Unnamed: 0,f1-score,precision,recall,support
related,0.889883,0.836766,0.950201,3976.0
request,0.603854,0.816602,0.479049,883.0
offer,0.0,0.0,0.0,32.0
aid_related,0.695568,0.73188,0.662689,2179.0
medical_help,0.099602,0.641026,0.053996,463.0
medical_products,0.112211,0.894737,0.059859,284.0
search_and_rescue,0.119048,0.769231,0.064516,155.0
security,0.023256,1.0,0.011765,85.0
military,0.139303,0.7,0.077348,181.0
water,0.429561,0.853211,0.287037,324.0


In [43]:
evaluate_multilabel(Y_train, Y_train_pred)

NameError: name 'Y_train' is not defined

### Test different baseline models

Evaluation of different possible classifiers, Validation with stratified k-fold CV. Models are:
- LogisticRegression 
- GaussianNB (Naïve Bayes) 
- KNeighborsClassifier 
- SVC 
- RandomForestClassifier 
- XGBClassifier

In [None]:
# initialize baseline models in list
classifiers = [
    LogisticRegression(solver='liblinear'),
    KNeighborsClassifier(3), 
    SVC(gamma='auto'), 
    GaussianNB(),
    RandomForestClassifier(n_estimators=10),
    GradientBoostingClassifier(n_estimators=100),
    ]

In [46]:
# for each model, fit to train set and validate with stratified 3-fold CV, store results in nested dict

def evaluate_baseline_models(model_list, X_train, y_train, cv=StratifiedKFold(3), scorer):
    """This Funciont is built on top of sk-learn's cross_validate function. 
       It fits baseline models to train set and validates results with CV.
       
       ARGUMENTS:
       model_list: list of sk-learn model objects
       X_train: training features (df or array)
       y_train: training labels (df or array)
       cv: type of CV, default is StratifiedKFold(3)
       scorer: evaluation metric for validation
       
       RETURNS:
       baseline_results: dataframe with
"""
    # store results of cross_validate in nested dict
    baseline_results = {}
    
    for clf in tqdm(model_list):
    try:  # standard type for data input is df
        clfName = clf.__class__.__name__
        scores = cross_validate(
            clf, X_train, y_train, cv=cv, error_score='raise', 
            n_jobs=-1, scoring=scorer, return_train_score=False, 
            return_estimator=True
        )
    except:  # exception for classifiers that need data as array
        clfName = clf.__class__.__name__ 
        scores = cross_validate(
            clf, X_train.toarray(), y_train, cv=cv, 
            error_score='raise', n_jobs=-1,
            scoring=scorer, return_train_score=False, 
            return_estimator=True
        )
        
    baseline_results[clfName] = scores
    
    # create dataframe with extra metrics (95_conf, total_time)
    baseline_results = pd.DataFrame(baseline_results).T
    baseline_results['95_conf'] = \
        baseline_results['test_score'].apply(lambda x: np.std(x) * 2)
    for col in df.iloc[:,1:]:
        df[col] = df[col].apply(lambda x: np.mean(x))
    df['total_time'] = df['score_time'] + df['fit_time']
    df.columns = ['estimator, test_score', '95_conf', 
                  'total_time', 'fit_time', 'score_time']
    baseline_results = df
    
    return basline_results

SyntaxError: invalid syntax (<ipython-input-46-07bfbc61efdd>, line 3)

In [None]:
   
    
    
    df.columns = ['estimator, test_score', '95_conf', 'total_time', 'fit_time', 'score_time']

### 6. Improve your model
Use grid search to find better parameters. 

In [17]:
parameters = 

cv = 

SyntaxError: invalid syntax (<ipython-input-17-84949413bfaa>, line 1)

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

### 9. Export your model as a pickle file

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

In [None]:
target_names = list(train.iloc[:, (-1 * y_train.shape[1]):].columns)
target_names

- fun evaluate: set columns in col_names to right

- to do: concatenate one hot encoded genre with text
- eventually length and total categories to

In [None]:
y_train.shape