In [458]:
# imports
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from dotenv import load_dotenv
load_dotenv()
from helpers import get_database_url
import os
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support as score

<h2>Load Data</h2>

In [425]:
def load_data():
    url = get_database_url()
    engine = create_engine(url)
    df = pd.read_sql_table("message_categories", engine)
    X = df["message"]
    Y = df[df.columns[3:]]
    Y = Y.astype(int)
    return X, Y

<h2> Tokenize message data </h2>

In [426]:
def tokenize(series):
    
    stop_words = stopwords.words("english")
    
    tokenizer = RegexpTokenizer("\w+|\d+")
    
    tokens = []
    
    for row in series:
        clean = tokenizer.tokenize(row.lower())
        tokens.append(clean)
        
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        tok = list(set(tok) - set(stop_words))
        clean_tok = lemmatizer.lemmatize(str(tok)).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

<h2>Build a simple machine learning pipeline </h2>

In [428]:
def model_pipeline(classifier):
    pipeline = Pipeline(
        [
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer()),
            ("clf", MultiOutputClassifier(classifier))
        ])
    return pipeline

In [429]:
#X, Y = load_data()
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [430]:
model = model_pipeline(RandomForestClassifier())
model.fit(X_train.values, Y_train.values)
Y_pred = model.predict(X_test)

IndentationError: unexpected indent (<ipython-input-430-b466047b9f14>, line 2)

In [None]:
def convert_to_dataframe(test_data):
    test_df = pd.DataFrame(test_data, columns = df.columns[3:] )
    return test_df

In [435]:
Y_pred_df = convert_to_dataframe(y_pred)

In [465]:
for column in Y_test.columns:
    precision, recall, f1_score, support = 

              precision    recall  f1-score   support

           0       0.63      0.13      0.21      1881
           1       0.77      0.98      0.86      5925
           2       0.00      0.00      0.00        59

    accuracy                           0.77      7865
   macro avg       0.47      0.37      0.36      7865
weighted avg       0.73      0.77      0.70      7865

              precision    recall  f1-score   support

           0       0.84      0.99      0.91      6507
           1       0.72      0.12      0.21      1358

    accuracy                           0.84      7865
   macro avg       0.78      0.56      0.56      7865
weighted avg       0.82      0.84      0.79      7865

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7829
           1       0.00      0.00      0.00        36

    accuracy                           1.00      7865
   macro avg       0.50      0.50      0.50      7865
weighted avg       0

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      7145
           1       0.80      0.57      0.67       720

    accuracy                           0.95      7865
   macro avg       0.88      0.78      0.82      7865
weighted avg       0.94      0.95      0.94      7865

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      7700
           1       0.00      0.00      0.00       165

    accuracy                           0.98      7865
   macro avg       0.49      0.50      0.49      7865
weighted avg       0.96      0.98      0.97      7865

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      7453
           1       0.00      0.00      0.00       412

    accuracy                           0.95      7865
   macro avg       0.47      0.50      0.49      7865
weighted avg       0.90      0.95      0.92      7865

              preci

In [490]:
dictionary={}
for column in df.columns[3:]:
    dictionary[column] = (df[column].sum()/df.shape[0])

In [491]:
dictionary

{'related': 0.7736791913026893,
 'request': 0.17066564943734502,
 'offer': 0.004501239748235743,
 'aid_related': 0.41426664123593365,
 'medical_help': 0.07949647148579057,
 'medical_products': 0.05008582872401297,
 'search_and_rescue': 0.027617776082395577,
 'security': 0.017966812893381652,
 'military': 0.03280564562273507,
 'child_alone': 0.0,
 'water': 0.0637802784665268,
 'food': 0.11150104901773794,
 'shelter': 0.08827007438489415,
 'clothing': 0.015449170322334541,
 'money': 0.02304024413503719,
 'missing_people': 0.011367537669273316,
 'refugees': 0.03337783711615487,
 'death': 0.045546442876215905,
 'other_aid': 0.13145145908830821,
 'infrastructure_related': 0.06503909975205036,
 'transport': 0.04581346557314515,
 'buildings': 0.050848750715239366,
 'electricity': 0.020293724966622163,
 'tools': 0.006065229830249857,
 'hospitals': 0.010795346175853518,
 'shops': 0.004577531947358383,
 'aid_centers': 0.011787144764447836,
 'other_infrastructure': 0.04390616059507915,
 'weather_

In [494]:
list(df[df["offer"]==1]["message"])

['I am a driver, a mechanic ,. I want to help',
 'How can we help the victims at Les Cayes?',
 "I'm the vice president of the association Rafadek in Anse a galets. You can call me so that you could help people in La Gonave",
 "Hi i speak english, if it's possible i would like to know of some groups that need an interpreter. you can let me know on this number.",
 'purified water. we leave at croix-des-bouquets ( Meyer entrue 2Rigaud ) just next 2the pshychiatry center. ',
 "I have an association, and i need help for my neighbors, but i don't know how it can be done",
 'i want to give blood where do I go ',
 'WHAT CAN WE DO TO HELP THE SUVIVORS? ',
 "Let's work together to save Haiti, trust in God our father ",
 'Is possible another Haiti thanks to international helps? ',
 'Earthquake Haiti donate to a Dutch foundation that helps Haiti with different projects http bit.ly 77sEhW',
 'Various places online & charity shops where you can donate should you wish to. https www.oxfam.org.uk donat