In [34]:
# imports
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from dotenv import load_dotenv
load_dotenv()
from helpers import get_database_url
import os
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.decomposition import TruncatedSVD
from debug import Debug
from wordcounter import WordCounter

<h2>Load Data</h2>

In [35]:
def load_data():
    url = get_database_url()
    engine = create_engine(url)
    df = pd.read_sql_table("message_categories", engine)
    X = df["message"]
    Y = df[df.columns[3:]]
    Y = Y.astype(int)
    return X, Y

<h2> Tokenize message data </h2>

In [36]:
def tokenize(series):
    
    stop_words = stopwords.words("english")
    
    tokenizer = RegexpTokenizer("\w+|\d+")
    
    tokens = []
    
    for row in series:
        clean = tokenizer.tokenize(row.lower())
        tokens.append(clean)
        
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        #tok = list(set(tok) - set(stop_words))
        clean_tok = lemmatizer.lemmatize(str(tok)).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

<h2>Build a simple machine learning pipeline </h2>

In [37]:
def model_pipeline():
    pipeline = Pipeline(
        [
            ("features", FeatureUnion([
                ("text_pipeline", Pipeline([
                    ('vect', CountVectorizer(tokenizer=tokenize)),
                    ('tfidf', TfidfTransformer()),
                ])),
                ("word_count", Pipeline([
                    ('text_len', WordCounter()),
                ])),
            ])),
            ("clf", MultiOutputClassifier(RandomForestClassifier()))
        ])
    return pipeline

In [45]:
def display_score(Y_true, Y_pred, Y):
    Y_pred_df = convert_to_dataframe(Y_pred, Y)
    for column in Y_true.columns:
        precision, recall, fscore = score(Y_true[column], Y_pred_df[column], average="weighted")
        print(column, precision, recall, fscore)

In [46]:
def convert_to_dataframe(test_data, Y):
    test_df = pd.DataFrame(test_data, columns = Y.columns)
    return test_df

In [47]:
def main():
    X, Y = load_data()
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
    model = model_pipeline()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    display_score(Y_test, Y_pred, Y)

In [48]:
main()

############################################
13135    25
22543    29
7991     16
15807    19
17765    44
         ..
15515    25
10042     9
3069     30
919      37
7307     12
Name: message, Length: 19632, dtype: int64 (19632,)
       message
13135       25
22543       29
7991        16
15807       19
17765       44
...        ...
15515       25
10042        9
3069        30
919         37
7307        12

[19632 rows x 1 columns]
############################################
13836    25
23864    15
9172     12
10941    16
17798     8
         ..
9344     27
12163    28
23942    19
25747     3
2870     33
Name: message, Length: 6545, dtype: int64 (6545,)
       message
13836       25
23864       15
9172        12
10941       16
17798        8
...        ...
9344        27
12163       28
23942       19
25747        3
2870        33

[6545 rows x 1 columns]
related 0.7575823008949285 0.7772345301757067 0.7224607210307905 None
request 0.8495893311920176 0.8585179526355997 0.8199963201364 N

  _warn_prf(average, modifier, msg_start, len(result))
