## MultiOutput pickle Random Forest with spell check

In [3]:
# import necessary libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle
from textblob import TextBlob

from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

In [5]:
# Datasets are downloable at:
# https://www.kaggle.com/landlord/multilingual-disaster-response-messages
# Importing disaster messages data
df = pd.read_csv('../datasets/df_clean.csv')
df.drop(columns = ['Unnamed: 0', 'content_length', 'content_word_count', 'genre', 'related', 'PII'], inplace = True)

In [6]:
# Functions for preprocessing
def tokenize_correct_spelling(text):

    textBlb = TextBlob(text)            # Making our first textblob
    textCorrected = textBlb.correct()   # Correcting the text
    
    tokens = word_tokenize(str(textCorrected))
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()      
        clean_tokens.append(clean_tok)

    return clean_tokens

# Second option for preprocessing without spelling check
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [7]:
# Assigning target variable
X = df['message']
y = df.iloc[:, 1:]

In [8]:
# train test split your data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [9]:
# Pipeline simple version
multioutput_pipeline = Pipeline([
    ('cv', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))])

multioutput_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'cv', 'tfidf', 'clf', 'cv__analyzer', 'cv__binary', 'cv__decode_error', 'cv__dtype', 'cv__encoding', 'cv__input', 'cv__lowercase', 'cv__max_df', 'cv__max_features', 'cv__min_df', 'cv__ngram_range', 'cv__preprocessor', 'cv__stop_words', 'cv__strip_accents', 'cv__token_pattern', 'cv__tokenizer', 'cv__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__estimator__bootstrap', 'clf__estimator__ccp_alpha', 'clf__estimator__class_weight', 'clf__estimator__criterion', 'clf__estimator__max_depth', 'clf__estimator__max_features', 'clf__estimator__max_leaf_nodes', 'clf__estimator__max_samples', 'clf__estimator__min_impurity_decrease', 'clf__estimator__min_impurity_split', 'clf__estimator__min_samples_leaf', 'clf__estimator__min_samples_split', 'clf__estimator__min_weight_fraction_leaf', 'clf__estimator__n_estimators', 'clf__estimator__n_jobs', 'clf__estimator__oob_score', 'clf__estimator__random_state', 'clf__est

In [10]:
%%time
multioutput_pipeline.fit(X_train, y_train)

CPU times: user 5min 32s, sys: 3.66 s, total: 5min 35s
Wall time: 5min 40s


Pipeline(steps=[('cv',
                 CountVectorizer(tokenizer=<function tokenize at 0x7f9441f184c0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier()))])

In [198]:
# Making preditions
y_pred = multioutput_pipeline.predict(X_test)

In [205]:
y_pred

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Changing your prediction threshold

In [203]:
y_pred_proba = multioutput_pipeline.predict_proba(X_test)

In [227]:
new_pred=[]
for col in y_pred_proba:
    new_pred.append([1 if i[0]>=0.2 else 0 for i in col] )
new_pred = np.array(new_pred).T
print(new_pred.shape)

(5903, 35)


In [228]:
print(new_pred)

[[1 1 1 ... 1 1 1]
 [0 1 0 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 ...
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]]


### Evaluating Model performance

In [229]:
print(classification_report(y_test.values, new_pred, target_names = y_test.columns.values))

                        precision    recall  f1-score   support

               request       0.17      0.93      0.28      1035
                 offer       0.00      1.00      0.01        21
           aid_related       0.41      0.92      0.57      2562
          medical_help       0.08      1.00      0.15       488
      medical_products       0.05      1.00      0.10       318
     search_and_rescue       0.03      1.00      0.05       163
              security       0.02      1.00      0.04       106
              military       0.03      1.00      0.06       184
           child_alone       0.00      0.00      0.00         0
                 water       0.07      0.99      0.13       404
                  food       0.12      0.98      0.21       692
               shelter       0.09      1.00      0.16       531
              clothing       0.02      1.00      0.04       106
                 money       0.02      1.00      0.05       147
        missing_people       0.01      