# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [None]:
# import libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator
import pickle
import re
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download(['punkt', 'wordnet'])

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#imblearn
from imblearn.over_sampling import RandomOverSampler

from sklearn.base import BaseEstimator, TransformerMixin
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# load data from database
# Open the local SQLite file `messages.db` and read its `messages` table
# into a DataFrame.
engine = create_engine('sqlite:///messages.db')
df = pd.read_sql_table("messages", con=engine)

In [None]:
# Show the first rows of the loaded table as a quick sanity check.
df.head()

Based on this quick check, most of the data is very imbalanced.

In [None]:
# Feature: the raw message text. Targets: every column that remains once the
# text and metadata columns are removed.
non_target_columns = ['message', 'genre', 'id', 'original']
X = df["message"]
y = df.drop(columns=non_target_columns)

### 2. Write a tokenization function to process your text data

In [None]:
# Pattern `tokenize` uses to find URLs. Written as a raw string so the `\(`
# and `\)` escapes reach the regex engine verbatim instead of being parsed as
# (invalid) Python string escapes, which newer interpreters warn about.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [None]:
def tokenize(text):
    '''
    Turn a raw message into a list of normalized tokens.

    Every URL matched by `url_regex` is replaced with the literal word
    "urlplaceholder" before the text is split into tokens.
    Args: text (str) - the message to process
    Returns: list of lemmatized, lower-cased, whitespace-stripped tokens
    '''
    # swap every detected url for a fixed placeholder
    for link in re.findall(url_regex, text):
        text = text.replace(link, "urlplaceholder")

    # lemmatize each token, then normalize case and surrounding whitespace
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(word).lower().strip() for word in word_tokenize(text)]

### 3. Build a machine learning pipeline
This machine learning pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [None]:
def multi_tester(X, y):
    '''
    Fit one text-classification pipeline per candidate estimator.

    Every pipeline has the same CountVectorizer -> TfidfTransformer front end
    and wraps its estimator (default settings) in a MultiOutputClassifier.
    Each model's name and parameters are printed before it is fitted.
    Args: training data X and y
    returns: list of the fitted pipelines, in the order RandomForest,
             ExtraTrees, GradientBoosting, AdaBoost, SVC
    '''
    candidates = [
        ('RandomForestClassifier', RandomForestClassifier),
        ('ExtraTreesClassifier', ExtraTreesClassifier),
        ('GradientBoostingClassifier', GradientBoostingClassifier),
        ('AdaBoostClassifier', AdaBoostClassifier),
        ('SVC', SVC),
    ]

    # build all pipelines up front, then fit them one after another
    pipelines = [
        (label, Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer()),
            ('clf', MultiOutputClassifier(estimator_cls())),
        ]))
        for label, estimator_cls in candidates
    ]

    model_fits = []
    for label, pipe in pipelines:
        print('Model: ', label)
        print(pipe.get_params())
        model_fits.append(pipe.fit(X, y))

    return model_fits

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.33)

In [None]:
# Fit every candidate pipeline on the training split.
fitted_mdls = multi_tester(X_train, y_train)

### 5. Test your models
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [None]:
# Category names, read as a global by perf_report when it labels the report rows.
target_names = y_train.columns.tolist()

In [None]:
def perf_report(model, X_test, y_test):
    '''
    Print a classification report for each fitted model.

    Input: model  - list of fitted models, in the same order as multi_tester
                    returns them
           X_test - test features passed to each model's predict()
           y_test - true labels; when it has a `columns` attribute (a
                    DataFrame) those names label the report rows, otherwise
                    the notebook-level `target_names` is used
    Output: Prints the Classification report (returns nothing)
    '''
    pip_names = ['RandomForestClassifier', 'ExtraTreesClassifier', 'GradientBoostingClassifier',
                 'AdaBoostClassifier', 'SVC']

    # Take the labels from y_test itself so the report cannot pick up a stale
    # global `target_names` left over from a different label set.
    columns = getattr(y_test, 'columns', None)
    labels = list(columns) if columns is not None else target_names

    for i, fitted in enumerate(model):
        # Fall back to the object's class name if more models are passed than
        # there are hard-coded names.
        name = pip_names[i] if i < len(pip_names) else type(fitted).__name__
        print('______________________________Model______________________________')
        print('______________________________', name, '______________________________')
        y_pred = fitted.predict(X_test)
        print(classification_report(y_test, y_pred, target_names = labels))

In [None]:
# Print one classification report per fitted pipeline on the held-out data.
perf_report(fitted_mdls, X_test, y_test)

- `shops` has very little label diversity, which makes it an edge case, so I will drop it for the optimization.

______________________________ RandomForestClassifier ______________________________


                           precision    recall  f1-score   support
                           
                           
             micro avg       0.80      0.44      0.57     27308
             
             
             macro avg       0.58      0.16      0.21     27308
             
             
          weighted avg       0.74      0.44      0.50     27308
          
          
           samples avg       0.65      0.42      0.46     27308
           

______________________________ ExtraTreesClassifier ______________________________


             micro avg       0.79      0.44      0.56     27308
             
             
             macro avg       0.53      0.15      0.21     27308
             
             
          weighted avg       0.71      0.44      0.49     27308
          
          
           samples avg       0.66      0.42      0.46     27308

______________________________ GradientBoostingClassifier ______________________________


             micro avg       0.76      0.57      0.65     27308
             
             
             macro avg       0.51      0.32      0.38     27308
             
             
          weighted avg       0.72      0.57      0.61     27308
          
          
           samples avg       0.65      0.50      0.52     27308
           
           
______________________________ AdaBoostClassifier ______________________________


             micro avg       0.77      0.58      0.66     27308
             
             
             macro avg       0.58      0.33      0.40     27308
             
             
          weighted avg       0.73      0.58      0.62     27308
          
          
           samples avg       0.63      0.50      0.51     27308
           
______________________________ SVC ______________________________


             micro avg       0.76      0.24      0.36     27308
             
             
             macro avg       0.02      0.03      0.02     27308
             
             
          weighted avg       0.18      0.24      0.21     27308
          
          
           samples avg       0.76      0.32      0.40     27308


### 6. Improve models based on poor target performance elimination
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.
Testing models after dropping poor predictors

In [None]:
# dropping the targets that had the worst performances based on the classification report
# (the reduced frame is a new object; `y` itself is left untouched)
targs_drop = [
    'offer', 'security', 'infrastructure_related', 'tools', 'hospitals',
    'shops', 'aid_centers', 'other_infrastructure', 'fire', 'other_weather',
]
y_min = y.drop(columns=targs_drop)

In [None]:
# Same 67/33 split and seed as before, now against the reduced target frame.
X_train, X_test, y_train, y_test = train_test_split(X, y_min, random_state = 42, test_size = 0.33)

In [None]:
# Refit every candidate pipeline on the reduced set of targets.
fitted_mdls_min = multi_tester(X_train, y_train)

In [None]:
# Refresh the global label list so it matches the reduced target columns.
target_names = y_train.columns.tolist()

In [None]:
# Print one classification report per pipeline trained on the reduced targets.
perf_report(fitted_mdls_min, X_test, y_test)


______________________________ RandomForestClassifier ______________________________

                           precision    recall  f1-score   support 
                           
              micro avg       0.80      0.48      0.60     25330
              
              macro avg       0.72      0.22      0.30     25330
              
           weighted avg       0.78      0.48      0.53     25330
           
            samples avg       0.66      0.44      0.48     25330

______________________________ ExtraTreesClassifier ______________________________

        micro avg       0.79      0.46      0.59     25330
        
        macro avg       0.68      0.20      0.27     25330
        
     weighted avg       0.75      0.46      0.52     25330
     
      samples avg       0.65      0.43      0.47     25330    

______________________________ GradientBoostingClassifier ______________________________

        micro avg       0.78      0.61      0.68     25330
        
        macro avg       0.65      0.43      0.50     25330
        
     weighted avg       0.76      0.61      0.65     25330
     
      samples avg       0.66      0.52      0.54     25330
           
           
______________________________ AdaBoostClassifier ______________________________

        micro avg       0.77      0.61      0.69     25330
        
        macro avg       0.69      0.42      0.51     25330
        
     weighted avg       0.75      0.61      0.66     25330
     
      samples avg       0.64      0.51      0.53     25330
           
______________________________ SVC ______________________________

        micro avg       0.76      0.26      0.38     25330
        
        macro avg       0.03      0.04      0.03     25330
        
     weighted avg       0.19      0.26      0.22     25330
     
      samples avg       0.76      0.33      0.41     25330


### 7. Improve your model
Use grid search to find better parameters. 

I will work on my best-performing model, AdaBoost, using the reduced target data.

In [None]:
# AdaBoost pipeline (same three steps as pipe_4 in multi_tester) that the
# grid search below tunes.
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(AdaBoostClassifier()))
])

In [None]:
# Show the pipeline's parameters; their names are the keys a parameter grid can use.
pipeline.get_params()

In [None]:
# Grid for `pipeline`: toggle IDF weighting and try two ensemble sizes, with
# the AdaBoost seed and learning rate held fixed (single-value lists).
parameters = {'tfidf__use_idf': (True, False),
              'clf__estimator__n_estimators': [50, 100], 
              'clf__estimator__random_state': [42],
             'clf__estimator__learning_rate': [0.5]} 

# 10-fold search over the grid on all available cores; refit=True retrains the
# best combination on the full training data so `cv` can predict directly.
cv = GridSearchCV(pipeline, param_grid = parameters, cv = 10,
                  refit = True, verbose = 1, return_train_score = True, n_jobs = -1)

In [None]:
# Display the configured search object (it is fitted in a later cell).
cv

### 8. Test selected model

In [None]:
# dropping the targets that had the worst performances based on the classification report
# (the reduced frame is a new object; `y` itself is left untouched)
targs_drop = [
    'offer', 'security', 'infrastructure_related', 'tools', 'hospitals',
    'shops', 'aid_centers', 'other_infrastructure', 'fire', 'other_weather',
]
y_min = y.drop(columns=targs_drop)

In [None]:
# 67/33 split with a fixed seed against the reduced target frame.
X_train, X_test, y_train, y_test = train_test_split(X, y_min, random_state = 42, test_size = 0.33)

In [None]:
# Run the grid search on the training split. Later cells call `cv.predict`,
# i.e. `cv` itself is the fitted object; `best_ada` holds whatever fit() returns.
best_ada = cv.fit(X_train, y_train)


# Report the best cross-validation score (printed after 'Best model :') and
# the parameter combination that produced it.
print('Best model :', best_ada.best_score_)
print('Params :', best_ada.best_params_)

In [None]:
# Evaluate the tuned model on the held-out split; rows are labelled with the
# notebook-level `target_names`.
y_pred = best_ada.predict(X_test)
print(classification_report(y_test, y_pred, target_names = target_names))

### 9. Other Approaches


Custom estimators (inspired by: [repo](https://github.com/hnbezz/Portfolio_under_construction/blob/master/Disaster_Response_Pipeline/ML%20Pipeline%20Preparation.ipynb) )

In [None]:
class StartVerbExtractor(BaseEstimator, TransformerMixin):
    '''
    Transformer producing one binary feature per message: 1 when any sentence
    of the message starts with a verb (POS tag VB or VBP) or with the token
    "rt", otherwise 0.

    NOTE(review): nltk.pos_tag and nltk.sent_tokenize need their NLTK data
    packages; the notebook's nltk.download call lists only 'punkt' and
    'wordnet' - confirm the tagger resource is installed.
    '''

    def start_verb(self, text):
        '''
        Args: text (str) - a single message
        Returns: 1 if some sentence starts with a verb or "rt", else 0
        '''
        for sentence in nltk.sent_tokenize(text):
            pos_tags = nltk.pos_tag(tokenize(sentence))
            if not pos_tags:
                # sentence produced no tokens; nothing to inspect
                continue
            first_word, first_tag = pos_tags[0]
            # tokenize() lower-cases every token, so an upper-case 'RT'
            # comparison could never match; compare case-insensitively.
            # ('RT' is presumably the retweet prefix - verify against the data.)
            if first_tag in ('VB', 'VBP') or first_word.lower() == 'rt':
                return 1
        return 0

    def fit(self, X, y=None):
        '''Nothing to learn; returns self so the transformer works inside a Pipeline.'''
        return self

    def transform(self, X):
        '''
        Args: X - iterable of message strings
        Returns: single-column DataFrame holding the 0/1 flag for each message
        '''
        X_tag = pd.Series(X).apply(self.start_verb)
        return pd.DataFrame(X_tag)

In [None]:
def get_text_len(data):
    '''
    Measure the length of every item in `data`.
    Args: data - iterable of strings (anything supporting len())
    Returns: numpy column vector of shape (n_items, 1) holding the lengths
    '''
    lengths = np.array(list(map(len, data)))
    # add a trailing axis so the result is a single-column 2-D array
    return lengths[:, np.newaxis]

In [None]:
# dropping the targets that had the worst performances based on the classification report
# (same list as before plus 'other_aid'; `y` itself is left untouched)
targs_drop = [
    'offer', 'security', 'infrastructure_related', 'tools', 'hospitals',
    'shops', 'aid_centers', 'other_infrastructure', 'fire', 'other_weather',
    'other_aid',
]
y_min = y.drop(columns=targs_drop)
target_names = list(y_min.columns)

In [None]:
#stratifying data
# One multilabel-stratified shuffle split: 33% test, fixed seed.
mlss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.33, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(mlss.split(X, y_min))

# The splitter returns row positions, not index labels: select with .iloc so
# the rows stay correct even if X does not carry a default 0..n-1 index.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
# Labels are taken as plain arrays here; the next cell wraps them in DataFrames.
y_train, y_test = y_min.values[train_index], y_min.values[test_index]

In [None]:
# Wrap the label arrays back into DataFrames so the columns carry the category
# names again (later cells read y_train.columns).
y_train = pd.DataFrame(y_train,columns=target_names)
y_test = pd.DataFrame(y_test,columns=target_names)

In [None]:
# Second pipeline: a FeatureUnion joins the text features (vectorize ->
# TruncatedSVD -> TF-IDF) with the StartVerbExtractor flag, and the combined
# features feed a multi-output AdaBoost classifier.
# NOTE(review): TruncatedSVD runs with default settings *before* the TF-IDF
# step, so TF-IDF is applied to the reduced SVD output rather than to raw
# counts - confirm this ordering is intended.
pipeline_2 = Pipeline([
    ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('best', TruncatedSVD()),
                ('tfidf', TfidfTransformer())])), 
        ('start_verb', StartVerbExtractor())])), 
    ('clf', MultiOutputClassifier(AdaBoostClassifier()))
])

In [None]:
# Show pipeline_2's parameters; nested steps appear under prefixed names such
# as `features__text_pipeline__...`.
pipeline_2.get_params()

In [None]:
# Grid for pipeline_2: toggle IDF weighting inside the nested text pipeline
# and try three ensemble sizes, with the AdaBoost seed and learning rate fixed.
parameters = {'features__text_pipeline__tfidf__use_idf': (True, False),
              'clf__estimator__n_estimators': [100, 200, 300],
              'clf__estimator__random_state': [42],
              'clf__estimator__learning_rate': [0.05]}

# Search over pipeline_2 - the estimator these parameter names belong to.
# Passing the earlier `pipeline` here would fail, because it has no
# `features` step, and the custom-feature pipeline would never be trained.
cv_2 = GridSearchCV(pipeline_2, param_grid = parameters, cv = 10,
                    refit = True, verbose = 1, return_train_score = True, n_jobs = -1)

In [None]:
# Display the configured search object (it is fitted in the next cell).
cv_2

In [None]:
# Run the second grid search on the stratified training split.
best_ada_2 = cv_2.fit(X_train, y_train)


# Report the best cross-validation score (printed after 'Best model :') and
# the parameter combination that produced it.
print('Best model :', best_ada_2.best_score_)
print('Params :', best_ada_2.best_params_)

In [None]:
# Evaluate the second tuned model on the stratified held-out split.
y_pred = best_ada_2.predict(X_test)
print(classification_report(y_test, y_pred, target_names = target_names))

In [None]:
test_text = ['there is a storm and people are trapped']
# Predict with cv_2: it is the search fitted on the current y_train, so its
# output has one entry per y_train column and the boolean mask lines up.
# (`cv` was fitted on an earlier label set with a different column count.)
test = cv_2.predict(test_text)
# Print the names of the categories predicted as 1 for this message.
print(y_train.columns.values[(test.flatten()==1)])

That is a pretty cool prediction; let's try a few more.

In [None]:
test_text = ['we are having an earthquake, buildings are destroyed, victims need clothes']
# Predict with cv_2: it is the search fitted on the current y_train, so its
# output has one entry per y_train column and the boolean mask lines up.
test = cv_2.predict(test_text)
# Print the names of the categories predicted as 1 for this message.
print(y_train.columns.values[(test.flatten()==1)])

In [None]:
test_text = ['there was an accident near the bank and we need an ambulance']
# Predict with cv_2: it is the search fitted on the current y_train, so its
# output has one entry per y_train column and the boolean mask lines up.
test = cv_2.predict(test_text)
# Print the names of the categories predicted as 1 for this message.
print(y_train.columns.values[(test.flatten()==1)])

### 9. Export your model as a pickle file

In [None]:
# Persist the grid-search object. The context manager closes the file handle
# even if pickling raises, instead of leaving it open.
with open('disaster_ada.sav', 'wb') as model_file:
    pickle.dump(cv_2, model_file)

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.