In [81]:
# imports
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from dotenv import load_dotenv
load_dotenv()
from helpers import get_database_url
import os
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.decomposition import TruncatedSVD
from wordcounter import WordCounter
import joblib
import pickle

<h2>Load Data</h2>

In [82]:
def load_data(database_name):
    url = get_database_url()
    engine = create_engine(url)
    df = pd.read_sql_table(database_name, engine)
    X = df["message"]
    Y = df[df.columns[3:]]
    Y = Y.astype(int)
    categories = list(Y.columns)
    return X, Y, categories

In [83]:
df = load_data("message_categories")

In [84]:
df

(0        Weather update - a cold front from Cuba that c...
 1                  Is the Hurricane over or is it not over
 2                          Looking for someone but no name
 3        UN reports Leogane 80-90 destroyed. Only Hospi...
 4        says: west side of Haiti, rest of the country ...
                                ...                        
 26172    The training demonstrated how to enhance micro...
 26173    A suitable candidate has been selected and OCH...
 26174    Proshika, operating in Cox's Bazar municipalit...
 26175    Some 2,000 women protesting against the conduc...
 26176    A radical shift in thinking came about as a re...
 Name: message, Length: 26177, dtype: object,
        related  request  offer  aid_related  medical_help  medical_products  \
 0            1        0      0            0             0                 0   
 1            1        0      0            1             0                 0   
 2            1        0      0            0          

<h2> Tokenize message data </h2>

In [85]:
def tokenize(series):
    
    stop_words = stopwords.words("english")
    
    tokenizer = RegexpTokenizer("\w+|\d+")
    
    tokens = []
    
    for row in series:
        clean = tokenizer.tokenize(row.lower())
        tokens.append(clean)
        
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        #tok = list(set(tok) - set(stop_words))
        clean_tok = lemmatizer.lemmatize(str(tok)).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

<h2>Build a simple machine learning pipeline </h2>

In [86]:
def build_model():
    pipeline = Pipeline(
        [
            ("features", FeatureUnion([
                ("text_pipeline", Pipeline([
                    ('vect', CountVectorizer(tokenizer=tokenize)),
                    ('tfidf', TfidfTransformer()),
                ])),
                ("word_count", Pipeline([
                    ('text_len', WordCounter()),
                ])),
            ])),
            ("clf", MultiOutputClassifier(RandomForestClassifier()))
        ])

    return pipeline

In [87]:
def evaluate_model(model, X_test, Y_test, categories):
    Y_pred = model.predict(X_test)
    Y_pred_df = pd.DataFrame(Y_pred, columns = categories)
    for column in Y_test.columns:
        precision, recall, fscore, support = score(Y_test[column], Y_pred_df[column], average="weighted")
        print(column, precision, recall, fscore)

In [88]:
def main():
    X, Y, categories = load_data("message_categories")
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
    model = build_model()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    evaluate_model(model, X_test, Y_test, categories)
    save_model(model, "classifier.joblib")

In [89]:
def save_model(model, filename):
    joblib.dump(model, open(filename, 'wb'))

In [90]:
main()

related 0.744751529811771 0.7683728036669213 0.7108245339556915
request 0.858002105347084 0.8598930481283422 0.8233491709830069
offer 0.9902454477172493 0.9951107715813599 0.9926721481558273
aid_related 0.6700587776337131 0.6711993888464477 0.6532412701351804
medical_help 0.9037837315073601 0.9231474407944996 0.8884327980122009
medical_products 0.9186390301085382 0.9489686783804431 0.9254482421845743
search_and_rescue 0.9460497315114671 0.9726508785332315 0.959165903917985
security 0.9657679075519366 0.9827349121466769 0.9741775379405757
military 0.9365626114325166 0.9677616501145913 0.9519065597990249
child_alone 1.0 1.0 1.0
water 0.9210037428319903 0.9361344537815126 0.9092567397460463
food 0.8699121668388238 0.8838808250572956 0.835751699034713
shelter 0.8875422720973803 0.905423987776929 0.8625286038030491
clothing 0.9702776745117103 0.9850267379679144 0.9775965793841047
money 0.9774613079283726 0.9769289533995417 0.9656788632894556
missing_people 0.9790263446166021 0.9894576012223

  _warn_prf(average, modifier, msg_start, len(result))
