## MultiOutput pickle Random Forest with spell check

In [14]:
# import necessary libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle
from textblob import TextBlob

from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

In [2]:
# Datasets are downloable at:
# https://www.kaggle.com/landlord/multilingual-disaster-response-messages
# Importing disaster messages data
df = pd.read_csv('./datasets/df_clean.csv')
df.drop(columns = ['Unnamed: 0', 'content_length', 'content_word_count', 'genre', 'related', 'PII'], inplace = True)

In [3]:
# Functions for preprocessing
def tokenize_correct_spelling(text):

    textBlb = TextBlob(text)            # Making our first textblob
    textCorrected = textBlb.correct()   # Correcting the text
    
    tokens = word_tokenize(str(textCorrected))
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()      
        clean_tokens.append(clean_tok)

    return clean_tokens

# Second option for preprocessing without spelling check
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [4]:
# Assigning target variable
X = df['message']
y = df.iloc[:, 1:]

In [5]:
# train test split your data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
# Pipeline simple version
multioutput_pipeline = Pipeline([
    ('cv', CountVectorizer(tokenizer = tokenize_correct_spelling)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))])

multioutput_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'cv', 'tfidf', 'clf', 'cv__analyzer', 'cv__binary', 'cv__decode_error', 'cv__dtype', 'cv__encoding', 'cv__input', 'cv__lowercase', 'cv__max_df', 'cv__max_features', 'cv__min_df', 'cv__ngram_range', 'cv__preprocessor', 'cv__stop_words', 'cv__strip_accents', 'cv__token_pattern', 'cv__tokenizer', 'cv__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__estimator__bootstrap', 'clf__estimator__ccp_alpha', 'clf__estimator__class_weight', 'clf__estimator__criterion', 'clf__estimator__max_depth', 'clf__estimator__max_features', 'clf__estimator__max_leaf_nodes', 'clf__estimator__max_samples', 'clf__estimator__min_impurity_decrease', 'clf__estimator__min_impurity_split', 'clf__estimator__min_samples_leaf', 'clf__estimator__min_samples_split', 'clf__estimator__min_weight_fraction_leaf', 'clf__estimator__n_estimators', 'clf__estimator__n_jobs', 'clf__estimator__oob_score', 'clf__estimator__random_state', 'clf__est

In [7]:
%%time
multioutput_pipeline.fit(X_train, y_train)

CPU times: user 3h 3min 30s, sys: 37.4 s, total: 3h 4min 7s
Wall time: 3h 5min 19s


Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize_cor...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                    

In [8]:
# Making preditions
y_pred = multioutput_pipeline.predict(X_test)

### Evaluating Model performance

In [9]:
print(classification_report(y_test.values, y_pred, target_names = y_test.columns.values))

                        precision    recall  f1-score   support

               request       0.86      0.40      0.55      1022
                 offer       0.00      0.00      0.00        30
           aid_related       0.81      0.61      0.69      2529
          medical_help       0.76      0.04      0.07       527
      medical_products       0.94      0.05      0.10       309
     search_and_rescue       0.62      0.03      0.06       169
              security       0.00      0.00      0.00       102
              military       0.83      0.03      0.05       189
           child_alone       0.00      0.00      0.00         0
                 water       0.87      0.20      0.32       388
                  food       0.87      0.51      0.64       639
               shelter       0.85      0.24      0.37       535
              clothing       0.80      0.05      0.09        82
                 money       0.60      0.02      0.04       160
        missing_people       0.00      

### Saving `Classification Report` to csv file

In [10]:
spellcheck_rf_report = classification_report(y_test.values, y_pred, target_names = y_test.columns.values, output_dict=True)
spellcheck_rf_report = pd.DataFrame(spellcheck_rf_report)
spellcheck_rf_report.to_csv('./reports/Random_Forest_Spellcheck_Classification_Report.csv')

### Evaluating Model performance for each category

In [11]:
category_names = list(df.columns[1:])  
for i in range(len(category_names)):
    print('------------Category {}: {} ---------------'.format(i, category_names[i]))
    print(classification_report(y_test.iloc[:, i].values, y_pred[:, i]))
    print('Accuracy {}\n\n'.format(accuracy_score(y_test.iloc[:, i].values, y_pred[:, i])))

------------Category 0: request ---------------
              precision    recall  f1-score   support

           0       0.89      0.99      0.93      4881
           1       0.86      0.40      0.55      1022

    accuracy                           0.89      5903
   macro avg       0.88      0.69      0.74      5903
weighted avg       0.88      0.89      0.87      5903

Accuracy 0.8854819583262747


------------Category 1: offer ---------------
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5873
           1       0.00      0.00      0.00        30

    accuracy                           0.99      5903
   macro avg       0.50      0.50      0.50      5903
weighted avg       0.99      0.99      0.99      5903

Accuracy 0.9949178383872607


------------Category 2: aid_related ---------------
              precision    recall  f1-score   support

           0       0.75      0.89      0.82      3374
           1       0.81      0.

In [12]:
# Transform it into a dataframe
y_pred = pd.DataFrame(y_pred, columns = y.columns)
y_pred.head()

Unnamed: 0,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
category_names = list(df.columns[1:])  
print('ROC AUC for Random Forest Classifier with SpellCheck\n')
for i in range(len(category_names)):
    if category_names[i] != 'child_alone':
        print('{} - :{}'.format(category_names[i], roc_auc_score(y_test.iloc[:, i], y_pred.iloc[:, i])))

ROC AUC for Random Forest Classifier with SpellCheck

request - :0.6944178493146677
offer - :0.5
aid_related - :0.7489449592785338
medical_help - :0.51746852975061
medical_products - :0.5258005861573831
search_and_rescue - :0.5145313019196199
security - :0.4998276159282882
military - :0.5131400088522945
water - :0.5969408641848379
food - :0.7510243196769268
shelter - :0.6166424084571779
clothing - :0.5243043480082629
money - :0.5092008749782344
missing_people - :0.5
refugees - :0.5271999020347178
death - :0.5657748155625915
other_aid - :0.5071128855146134
infrastructure_related - :0.49981903727832067
transport - :0.5459596836001329
buildings - :0.5184506112403553
electricity - :0.5248270793705689
tools - :0.5
hospitals - :0.5
shops - :0.5
aid_centers - :0.5
other_infrastructure - :0.49982332155477033
weather_related - :0.8029781876495081
floods - :0.6817061948567782
storm - :0.7366424989709884
fire - :0.5084745762711864
earthquake - :0.8939328444869911
cold - :0.5364096351377696
other_

In [16]:
# Double check pipeline steps
multioutput_pipeline.named_steps

{'cv': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                 lowercase=True, max_df=1.0, max_features=None, min_df=1,
                 ngram_range=(1, 1), preprocessor=None, stop_words=None,
                 strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=<function tokenize_correct_spelling at 0x7f9f57a77310>,
                 vocabulary=None),
 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True),
 'clf': MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                        ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                            

In [17]:
# Pickle it
with open('./models/random_forest_spellcheck_pipe.pkl', 'wb') as pickle_out:
     pickle_out = pickle.dump(multioutput_pipeline, pickle_out)

### Test my pickle predictions

In [18]:
# Pickel model in
with open('./models/random_forest_spellcheck_pipe.pkl', 'rb') as pickle_in:
    pipe = pickle.load(pickle_in)

In [19]:
# Test prediction
category_names = list(df.columns[1:])
category_predicted = pipe.predict(['Lord help its flood my house, and i think i feel a hurricane is coming too'])[0]
print(category_predicted)

[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1]


In [20]:
# Select category columns
def get_category_names(df):
    return list(df.columns[1:])

print(get_category_names(df))

['request', 'offer', 'aid_related', 'medical_help', 'medical_products', 'search_and_rescue', 'security', 'military', 'child_alone', 'water', 'food', 'shelter', 'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport', 'buildings', 'electricity', 'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure', 'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report']


In [21]:
# Get appropriate category name
def get_predicted_category_names(category_predicted):
    return [category_names[i] for i in range(len(category_predicted)) if category_predicted[i] == 1]

In [22]:
# Assigning it to result, for streamlit functionality
result = get_predicted_category_names(category_predicted)

In [23]:
# Transforming list into string, for streamlit functionality
def list_to_string(s): 
    return (" ".join(s))

In [24]:
# Format list to string for streamlit functionality
list_to_string(result)

'aid_related weather_related floods direct_report'