In [24]:
# imports
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from dotenv import load_dotenv
load_dotenv()
from helpers import get_database_url
import os
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.decomposition import TruncatedSVD
from wordcounter.py import WordCounter

<h2>Load Data</h2>

In [25]:
def load_data():
    url = get_database_url()
    engine = create_engine(url)
    df = pd.read_sql_table("message_categories", engine)
    X = df["message"]
    Y = df[df.columns[3:]]
    Y = Y.astype(int)
    return X, Y

<h2> Tokenize message data </h2>

In [26]:
def tokenize(series):
    
    stop_words = stopwords.words("english")
    
    tokenizer = RegexpTokenizer("\w+|\d+")
    
    tokens = []
    
    for row in series:
        clean = tokenizer.tokenize(row.lower())
        tokens.append(clean)
        
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        #tok = list(set(tok) - set(stop_words))
        clean_tok = lemmatizer.lemmatize(str(tok)).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

<h2>Build a simple machine learning pipeline </h2>

In [27]:
def model_pipeline(classifier):
    pipeline = Pipeline(
        [
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer()),
            ("clf", MultiOutputClassifier(classifier))
        ])
    return pipeline

In [28]:
def main(classifier):
    X, Y = load_data()
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
    model = model_pipeline(classifier)
    model.fit(X_train.values, Y_train.values)
    Y_pred = model.predict(X_test)
    display_score(Y_test, Y_pred)

In [38]:
def display_score(Y_true, Y_pred):
    Y_pred_df = convert_to_dataframe(Y_pred)
    for column in Y_true.columns:
        precision, recall, fscore, support = score(Y_true[column], Y_pred_df[column], average="weighted")
        print(column, precision, recall, fscore, support)
    

In [33]:
def convert_to_dataframe(test_data):
    test_df = pd.DataFrame(test_data, columns = Y.columns)
    return test_df

In [39]:
main(RandomForestClassifier())

related 0.7586420271294321 0.7815074763503204 0.7260642589078131 None
request 0.8435754645616177 0.8515410436374733 0.8137251745582504 None
offer 0.9917777280719161 0.9958803783948733 0.9938248191702986 None
aid_related 0.660382400052106 0.6643271284711626 0.6467866247493009 None
medical_help 0.9033622640126706 0.9197436679890143 0.8823297808811464 None
medical_products 0.9012465248383629 0.9491913335367714 0.9245978046600066 None
search_and_rescue 0.94909334708046 0.9742142203234666 0.9614897282271172 None
security 0.9643129113981111 0.9818431492218492 0.9729990774334931 None
military 0.934872254266647 0.9667378700030516 0.9505380732675465 None
child_alone 1.0 1.0 1.0 None
water 0.9435045979281592 0.9398840402807446 0.9124851196896309 None
food 0.8738173553661184 0.892584681110772 0.8489922962026628 None
shelter 0.8940177406319862 0.9021971315227342 0.8574304447807959 None
clothing 0.9694166087997939 0.9845895636252671 0.9769441768392172 None
money 0.9538559218693385 0.976655477570949

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
counter = WordCounter()
print(counter.transform(X))