# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
from google.colab import drive
drive.mount('/content/drive')
main = '/content/drive/My Drive/Colab Notebooks/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# data management
import pandas as pd
from sqlalchemy import create_engine
# nlp
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# ML essentials
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import train_test_split
# Text engineering - TruncatedSVD for LSA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
# Classifiers
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# Gridsearch
from sklearn.model_selection import GridSearchCV
# Evaluation metrics
from sklearn.metrics import classification_report, hamming_loss, precision_recall_fscore_support
from sklearn.metrics import accuracy_score, make_scorer, f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 2. Write a tokenization function to process your text data

In [0]:
# load data from database
def load_data(db_path, table_name):
    engine = create_engine(db_path)
    df = pd.read_sql(f"SELECT * FROM {table_name}", engine)

    X = df['message']
    y = df.iloc[:, 4:]
    return X, y

def tokenize(text):
    
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    
    # normalize - remove puncuation & convert to lowercase
    text = re.sub('[^a-zA-Z0-9\s]', '', text.lower())    
    # tokenize
    tokens = word_tokenize(text)    
    # stopwords filter
    tokens = [word.strip() for word in tokens if not word in stop_words]
    # lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return tokens

In [0]:
X, y = load_data(f"sqlite:///{main}distab.db", "DisasterTable")

In [0]:
# resampling due to label imbalances
X, y = resample(X, y)

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [0]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer(tokenizer=tokenize)),
    ('lsa', TruncatedSVD(n_components=100, random_state=42)),
    #('clf', MultiOutputClassifier(RandomForestClassifier(n_estimators=100)))
    ('clf', MultiOutputClassifier(MLPClassifier(learning_rate='constant', learning_rate_init=0.001,
                                                max_iter=500, early_stopping=True, n_iter_no_change=5,
                                                random_state=42, warm_start=True)))
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(...
                                                               beta_2=0.999,
                                                               early_stopping=True,
                                                               epsilon=1e-08,
                                                               

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [0]:
pred = pipeline.predict(X_test)

In [0]:
def evaluate(y_true, y_pred):
    
    result = precision_recall_fscore_support(y_true, y_pred)
    scores = []
    for i, col in enumerate(y_true.columns.values):
        scores.append((result[3][i], result[0][i], result[1][i], result[2][i]))
    
    score_df = pd.DataFrame(index=y_true.columns.values, data=scores, 
                            columns=['Total Positive labels', 'Precision', 'Recall', 'Unweighted F-Score'])
    score_df.sort_values(by='Unweighted F-Score', axis=0, ascending=False, inplace=True)

    acc = accuracy_score(y_true, y_pred)
    loss = hamming_loss(y_true, y_pred)
    print("=====Global Metrics=====\n")
    print("Accuracy: {:.4f}".format(acc))
    print("Hamming Loss: {:.4f}\n".format(loss))
    print("=====Label Metrics=====\n")
    return score_df

In [10]:
evaluate(y_test, pred)

=====Global Metrics=====

Accuracy: 0.3047
Hamming Loss: 0.0522

=====Label Metrics=====



  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Total Positive labels,Precision,Recall,Unweighted F-Score
related,5007,0.870865,0.925305,0.89726
earthquake,606,0.875527,0.684818,0.768519
aid_related,2725,0.765485,0.721101,0.74263
food,730,0.839161,0.657534,0.737327
weather_related,1846,0.807917,0.674431,0.735164
water,414,0.772472,0.664251,0.714286
request,1122,0.808399,0.54902,0.653928
shelter,562,0.765172,0.516014,0.616366
storm,622,0.708779,0.532154,0.607897
direct_report,1263,0.783208,0.494854,0.606502


Looking at the results from the initial evaluation, it looks as though I'm going to run into over-fitting problems for much of the classes. For the class 'related', it seems I may just classify *any* text as related to a disaster. I've added in a resampling function after loading my data, however I will have to use GridSearch to fine-tune the parameters and hope this improves performance.

In [11]:
for i, col in enumerate(y_test.columns):
    print(col)
    print(classification_report(y_test[col], pred[:,i]))

related
              precision    recall  f1-score   support

           0       0.70      0.56      0.62      1547
           1       0.87      0.93      0.90      5007

    accuracy                           0.84      6554
   macro avg       0.78      0.74      0.76      6554
weighted avg       0.83      0.84      0.83      6554

request
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      5432
           1       0.81      0.55      0.65      1122

    accuracy                           0.90      6554
   macro avg       0.86      0.76      0.80      6554
weighted avg       0.89      0.90      0.89      6554

offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6534
           1       0.00      0.00      0.00        20

    accuracy                           1.00      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      1.00      1.00      655

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.96      0.99      0.98      6219
           1       0.70      0.27      0.39       335

    accuracy                           0.96      6554
   macro avg       0.83      0.63      0.68      6554
weighted avg       0.95      0.96      0.95      6554

electricity
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      6415
           1       0.48      0.08      0.14       139

    accuracy                           0.98      6554
   macro avg       0.73      0.54      0.56      6554
weighted avg       0.97      0.98      0.97      6554

tools
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6518
           1       0.00      0.00      0.00        36

    accuracy                           0.99      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      0.99      0.99      6554

h

### 6. Improve your model
Use grid search to find better parameters. 

In [0]:
parameters = {'vect__ngram_range': [(1,1), (1,2)],
              'vect__max_df': [0.7, 1.0],
              'vect__max_features': [None, 5000],
              'lsa__n_components': [50, 100, 200],
              'clf__estimator__learning_rate_init': [0.001, 0.0001],
              'clf__estimator__max_iter': [500],
              'clf__estimator__n_iter_no_change': [5],
              'clf__estimator__warm_start': [True],
              'clf__estimator__early_stopping': [True],
              'clf__estimator__random_state': [42]}

cv = GridSearchCV(pipeline, param_grid=parameters, scoring=make_scorer(f1_score, average='weighted'), n_jobs=4, cv=5, verbose=10)

In [13]:
cv.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:  4.2min
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  6.4min
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed: 10.4min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed: 12.8min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 18.6min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 23.1min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed: 30.7min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 36.7min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed: 45.9min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed: 59.8min
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed: 76.1min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed: 90.4min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed: 96.4min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 102.9min
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed: 110.4min
[Paralle

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        no

### 7. Test your model

In [0]:
tuned_pipe = cv.best_estimator_

In [0]:
predicted_labels = tuned_pipe.predict(X_test)


The tuned model returned by the GridSearch only achieved a 3% increase in overall accuracy. I have a feeling that some over-fitting is still occurring for some labels. I predict that for classes where the total number of positive cases are relatively low, there is likely to be very low recall and F-Score.

I returned the dataframe of evaluation stats below, and I can confirm my suspicions are more or less correct. The average number of positive cases for labels in which F-Score was below 70%, is ~294. The model performs with decent precision on other labels, but Recall is just not there.

I would love to continue testing other models, such as more complex neural networks using the keras library - but I've spent too much time on this part of the project already so this'll just have to do.

In [18]:
evald = evaluate(y_test, predicted_labels)
evald

=====Global Metrics=====

Accuracy: 0.3340
Hamming Loss: 0.0458

=====Label Metrics=====



  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Total Positive labels,Precision,Recall,Unweighted F-Score
related,5007,0.883646,0.931296,0.906846
weather_related,1846,0.853959,0.788732,0.820051
earthquake,606,0.882239,0.754125,0.813167
aid_related,2725,0.794486,0.814312,0.804277
food,730,0.846906,0.712329,0.77381
water,414,0.85119,0.690821,0.762667
direct_report,1263,0.8274,0.607284,0.700457
storm,622,0.728223,0.672026,0.698997
request,1122,0.825062,0.592692,0.689834
shelter,562,0.792453,0.597865,0.681542


In [21]:
evald[evald['Unweighted F-Score'] < 0.7]['Total Positive labels'].mean()

293.64285714285717

In [0]:
def predict_new_data(model, labels, docs):
    
    prediction = model.predict(docs)
    
    tagged = []

    for i, j in enumerate(prediction.tolist()):
        tags = dict(zip(labels, prediction.tolist()[i]))
        tags = [k for k in tags.keys() if tags[k] == 1]
        tagged.append(tags)
        print("\n", docs[i])
        print(tags)

    #return tagged


In [26]:
test_docs = ["Fires out of control in Australian badlands",
             "Maple Leafs crush Canadiens in amazing 7-0 lockout",
             "Rescues underway in Alicante as roads are Submerged in flood waters",
             "Hurricane Sandy Touches Down in New York",
             "Blowout Sale on All LG Smart TVs!",
             "Thousands Displaced from Fort McMurray as Forest Fires Rage On",
             "TEXT!"]

predict_new_data(tuned_pipe, y_test.columns, test_docs)



 Fires out of control in Australian badlands
['related', 'weather_related', 'fire']

 Maple Leafs crush Canadiens in amazing 7-0 lockout
['related']

 Rescues underway in Alicante as roads are Submerged in flood waters
['related', 'aid_related', 'search_and_rescue', 'transport', 'weather_related', 'floods', 'direct_report']

 Hurricane Sandy Touches Down in New York
['related', 'weather_related', 'storm', 'direct_report']

 Blowout Sale on All LG Smart TVs!
['related']

 Thousands Displaced from Fort McMurray as Forest Fires Rage On
['related', 'aid_related', 'weather_related', 'fire']

 TEXT!
[]


From what we can observe in this simple testing of some dummy headlines I created, the model does overfit for the 'related' class. This should be addressed with a stronger model. Perhaps tuning more parameters, or selecting another technique for handling class imbalances. 

### 9. Export your model as a pickle file

In [0]:
import pickle
with open(f'{main}mlp_class.pkl', 'wb') as out:
    pickle.dump(tuned_pipe, out)

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

In [0]:
import pickle
import pandas as pd
from sqlalchemy import create_engine

import re
import nltk
nltk.download(['punkt', 'stopwords', 'wordnet']
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report, hamming_loss, precision_recall_fscore_support
from sklearn.metrics import accuracy_score, make_scorer, f1_score

In [0]:
def load_data(db_path, table_name):
    """
    Func: load data from an sql database using Pandas
    Parameters:
    db_path: str, path to the sql database
    table_name: str, table name to select data from
    Returns:
    X: dataframe of text data, (n_samples, )
    y: dataframe of class data, (n_samples, n_classes)
    labels: list of class labels
    """    
    engine = create_engine(db_path)
    df = pd.read_sql(f"SELECT * FROM {table_name}", engine)
    X = df['message']
    y = df.iloc[:, 4:]
    labels = y.columns.values

    return X, y, labels

def tokenize(text):
    """
    Func: normalize, tokenize, and lemmatize inputted text using nltk
    Parameters:
    text: str
    Returns:
    tokens: list of tokens which have been processed
    """    
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    text = re.sub('[^a-zA-Z0-9\s]', '', text.lower())  # normalize  
    tokens = word_tokenize(text)  # tokenize  
    tokens = [word.strip() for word in tokens if not word in stop_words] # stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens] # lemmatize
    
    return tokens

def build_pipeline():
    # initial pipeline
    pipeline = Pipeline([
                        ('vect', TfidfVectorizer(tokenizer=tokenize)),
                        ('lsa', TruncatedSVD(n_components=100, random_state=42)),
                        ('clf', MultiOutputClassifier(MLPClassifier(random_state=42)))
    ])

    # GridSearch
    parameters = {'vect__ngram_range': [(1,1), (1,2)],
                'vect__max_df': [0.7, 1.0],
                'vect__max_features': [None, 5000],
                'lsa__n_components': [50, 100, 200],
                'clf__estimator__learning_rate_init': [0.001, 0.0001],
                'clf__estimator__max_iter': [500],
                'clf__estimator__n_iter_no_change': [5],
                'clf__estimator__warm_start': [True],
                'clf__estimator__early_stopping': [True],
                'clf__estimator__random_state': [42]}

    cv = GridSearchCV(pipeline, param_grid=parameters, 
                      scoring=make_scorer(f1_score, average='weighted'),
                      n_jobs=4, cv=5, verbose=10)
    # select best estimator
    model = cv.best_estimator_

    
    return

def save_model():
    return

def load_model():
    return

def evaluate(y_true, y_pred):
    
    result = precision_recall_fscore_support(y_true, y_pred)
    scores = []
    for i, col in enumerate(y_true.columns.values):
        scores.append((result[3][i], result[0][i], result[1][i], result[2][i]))
    
    score_df = pd.DataFrame(index=y_true.columns.values, data=scores, 
                            columns=['Total Positive labels', 'Precision', 'Recall', 'Unweighted F-Score'])
    score_df.sort_values(by='Unweighted F-Score', axis=0, ascending=False, inplace=True)

    acc = accuracy_score(y_true, y_pred)
    loss = hamming_loss(y_true, y_pred)
    print("=====Global Metrics=====\n")
    print("Accuracy: {:.4f}".format(acc))
    print("Hamming Loss: {:.4f}\n".format(loss))
    print("=====Label Metrics=====\n")
    return score_df