In [1]:
# import necessary libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle
from textblob import TextBlob
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.metrics import accuracy_score, f1_score, fbeta_score

In [2]:
# Datasets are downloable at:
# https://www.kaggle.com/landlord/multilingual-disaster-response-messages
# Importing anxiety data
df = pd.read_csv('./datasets/df_clean.csv')
# Dropping columns for genre, related, and PII are not target variables
# Dropping columns offer, shops and tools they are not relavant for the scope of this project and classification reported badly on precision.
df.drop(columns = ['Unnamed: 0', 'content_length', 'content_word_count', 'genre', 'related', 'PII', 'offer', 'shops', 'tools'], inplace = True)

### Baseline Accuracy

In [3]:
df['direct_report'].value_counts(normalize = True)

0    0.804863
1    0.195137
Name: direct_report, dtype: float64

### Testing with `MultiOutputClassifier` and `DecisionTreeClassifier`

In [4]:
X = df['message']
y = df.iloc[:, 1:]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
# Functions for preprocessing
def tokenize_correct_spelling(text):

    textBlb = TextBlob(text)            # Making our first textblob
    textCorrected = textBlb.correct()   # Correcting the text
    
    tokens = word_tokenize(str(textCorrected))
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()      
        clean_tokens.append(clean_tok)

    return clean_tokens

In [7]:
# Pipeline simple version
multioutput_pipeline = Pipeline([
    ('cv', CountVectorizer(tokenizer = tokenize_correct_spelling)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(DecisionTreeClassifier()))])

In [8]:
%%time
multioutput_pipeline.fit(X_train, y_train)

CPU times: user 3h 8min 10s, sys: 39 s, total: 3h 8min 49s
Wall time: 3h 10min 7s


Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize_cor...
                 MultiOutputClassifier(estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                    

In [9]:
y_pred = multioutput_pipeline.predict(X_test)

In [10]:
print(classification_report(y_test.values, y_pred, target_names = y_test.columns.values))

                        precision    recall  f1-score   support

               request       0.57      0.55      0.56      1051
           aid_related       0.63      0.62      0.62      2425
          medical_help       0.33      0.30      0.32       441
      medical_products       0.40      0.41      0.40       268
     search_and_rescue       0.16      0.16      0.16       141
              security       0.10      0.12      0.11        84
              military       0.36      0.38      0.37       177
           child_alone       0.00      0.00      0.00         0
                 water       0.64      0.66      0.65       374
                  food       0.71      0.73      0.72       651
               shelter       0.58      0.57      0.57       505
              clothing       0.54      0.60      0.57        73
                 money       0.35      0.27      0.31       161
        missing_people       0.31      0.21      0.25        73
              refugees       0.33      

In [11]:
decision_tree_report = classification_report(y_test.values, y_pred, target_names = y_test.columns.values, output_dict=True)
decision_tree_report = pd.DataFrame(decision_tree_report)
decision_tree_report.to_csv('./reports/Decision_Tree_Classification_Report.csv')

In [12]:
category_names = list(df.columns[1:])  
for i in range(len(category_names)):
    print('------------Category {}: {} ---------------'.format(i, category_names[i]))
    print(classification_report(y_test.iloc[:, i].values, y_pred[:, i]))
    print('Accuracy {}\n\n'.format(accuracy_score(y_test.iloc[:, i].values, y_pred[:, i])))


------------Category 0: request ---------------
              precision    recall  f1-score   support

           0       0.90      0.91      0.91      4852
           1       0.57      0.55      0.56      1051

    accuracy                           0.85      5903
   macro avg       0.74      0.73      0.73      5903
weighted avg       0.84      0.85      0.85      5903

Accuracy 0.8468575300694562


------------Category 1: aid_related ---------------
              precision    recall  f1-score   support

           0       0.74      0.74      0.74      3478
           1       0.63      0.62      0.62      2425

    accuracy                           0.69      5903
   macro avg       0.68      0.68      0.68      5903
weighted avg       0.69      0.69      0.69      5903

Accuracy 0.692359817042182


------------Category 2: medical_help ---------------
              precision    recall  f1-score   support

           0       0.94      0.95      0.95      5462
           1       0.33  

In [13]:
df.message.iloc[10]

'lets do it together need food in delma 75 in didine area'

In [14]:
y_pred = pd.DataFrame(y_pred, columns = y.columns)
y_pred.head()

Unnamed: 0,request,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
category_names = list(df.columns[1:])  
print('ROC AUC for Decision Tree Classifier\n')
for i in range(len(category_names)):
    if category_names[i] != 'child_alone':
        print('{} - :{}'.format(category_names[i], roc_auc_score(y_test.iloc[:, i], y_pred.iloc[:, i])))

ROC AUC for Decision Tree Classifier

request - :0.7309342258736821
aid_related - :0.6813123432711061
medical_help - :0.6271196749174465
medical_products - :0.6888063012356144
search_and_rescue - :0.5681217859244106
security - :0.551446820350412
military - :0.6789616596711204
water - :0.8177342510032178
food - :0.8483727360683604
shelter - :0.765789492991537
clothing - :0.798110857867901
money - :0.6295055935235846
missing_people - :0.5998237740548416
refugees - :0.638942457672783
death - :0.7502216661162063
other_aid - :0.5831355592028519
infrastructure_related - :0.5192164045105221
transport - :0.6063956802284398
buildings - :0.6780616069053894
electricity - :0.6768405032467533
hospitals - :0.5575987561599072
aid_centers - :0.5214406611808344
other_infrastructure - :0.5229653207490899
weather_related - :0.7921056099015978
floods - :0.7837812009699044
storm - :0.7897596737828733
fire - :0.7069128382428498
earthquake - :0.8735338104625229
cold - :0.6938473413116908
other_weather - :0.6

In [16]:
print("The F1 Micro Score is: {0:.2f}".format(f1_score(y_pred, y_test, average='micro')))
print("The F1 Macro Score (Unweighted average) is: {0:.2f}".format(f1_score(y_pred, y_test, average='macro')))

The F1 Micro Score is: 0.52
The F1 Macro Score (Unweighted average) is: 0.39


In [17]:
multioutput_pipeline.named_steps

{'cv': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                 lowercase=True, max_df=1.0, max_features=None, min_df=1,
                 ngram_range=(1, 1), preprocessor=None, stop_words=None,
                 strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=<function tokenize_correct_spelling at 0x7fb89f944d30>,
                 vocabulary=None),
 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True),
 'clf': MultiOutputClassifier(estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                         

In [18]:
with open('./models/multiout_decision_tree.pkl', 'wb') as pickle_out:
     pickle_out = pickle.dump(multioutput_pipeline, pickle_out)

### Testing with `AdaBoostClassifier`

In [19]:
# Pipeline simple version
multioutput_pipeline = Pipeline([
    ('cv', CountVectorizer(tokenizer = tokenize_correct_spelling)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(AdaBoostClassifier()))])

In [20]:
%%time
multioutput_pipeline.fit(X_train, y_train)

CPU times: user 2h 42min 37s, sys: 16.6 s, total: 2h 42min 53s
Wall time: 2h 43min 7s


Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize_correct_spelling at 0x7fb89f944d30>,
                                 vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultiOutputClassifier(estimator=AdaBoostClassifier(algo

In [21]:
y_pred = multioutput_pipeline.predict(X_test)

In [22]:
print(classification_report(y_test.values, y_pred, target_names = y_test.columns.values))

                        precision    recall  f1-score   support

               request       0.76      0.53      0.62      1051
           aid_related       0.75      0.60      0.67      2425
          medical_help       0.61      0.29      0.39       441
      medical_products       0.66      0.34      0.45       268
     search_and_rescue       0.52      0.17      0.26       141
              security       0.17      0.05      0.07        84
              military       0.65      0.36      0.46       177
           child_alone       0.00      0.00      0.00         0
                 water       0.71      0.66      0.68       374
                  food       0.78      0.70      0.74       651
               shelter       0.79      0.52      0.63       505
              clothing       0.70      0.51      0.59        73
                 money       0.58      0.29      0.38       161
        missing_people       0.61      0.15      0.24        73
              refugees       0.53      

In [23]:
adaboost_class_report = classification_report(y_test.values, y_pred, target_names = y_test.columns.values, output_dict=True)
adaboost_class_report = pd.DataFrame(adaboost_class_report)
adaboost_class_report.to_csv('./reports/adaboost_classification_report.csv')

In [24]:
category_names = list(df.columns[1:])  
for i in range(len(category_names)):
    print('------------Category {}: {} ---------------'.format(i, category_names[i]))
    print(classification_report(y_test.iloc[:, i].values, y_pred[:, i]))
    print('Accuracy {}\n\n'.format(accuracy_score(y_test.iloc[:, i].values, y_pred[:, i])))

------------Category 0: request ---------------
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      4852
           1       0.76      0.53      0.62      1051

    accuracy                           0.89      5903
   macro avg       0.83      0.74      0.78      5903
weighted avg       0.88      0.89      0.88      5903

Accuracy 0.8854819583262747


------------Category 1: aid_related ---------------
              precision    recall  f1-score   support

           0       0.76      0.86      0.80      3478
           1       0.75      0.60      0.67      2425

    accuracy                           0.75      5903
   macro avg       0.75      0.73      0.74      5903
weighted avg       0.75      0.75      0.75      5903

Accuracy 0.7531763510079621


------------Category 2: medical_help ---------------
              precision    recall  f1-score   support

           0       0.94      0.99      0.96      5462
           1       0.61 

In [25]:
y_pred = pd.DataFrame(y_pred, columns = y.columns)
y_pred.head()

Unnamed: 0,request,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
category_names = list(df.columns[1:])  
print('ROC AUC for Adaboost\n')
for i in range(len(category_names)):
    if category_names[i] != 'child_alone':
        print('{} - :{}'.format(category_names[i], roc_auc_score(y_test.iloc[:, i], y_pred.iloc[:, i])))

ROC AUC for Adaboost

request - :0.7447398269461112
aid_related - :0.7306751717719037
medical_help - :0.636667604915761
medical_products - :0.6638288151081327
search_and_rescue - :0.5831973236243325
security - :0.5220910154747584
military - :0.677734725733151
water - :0.8184064480623798
food - :0.8372765316233857
shelter - :0.7558923547041625
clothing - :0.7520524448412792
money - :0.639983579638752
missing_people - :0.5747421227002514
refugees - :0.6049601542780977
death - :0.7398338456848149
other_aid - :0.5631531130896166
infrastructure_related - :0.5394531829825947
transport - :0.6164843473304409
buildings - :0.6592365023332286
electricity - :0.627001488095238
hospitals - :0.5450404511555591
aid_centers - :0.5341469050662921
other_infrastructure - :0.5272499152120754
weather_related - :0.8031300134948077
floods - :0.7866990647350083
storm - :0.7576288631116305
fire - :0.6085186889878405
earthquake - :0.8816000900718558
cold - :0.6270232472832317
other_weather - :0.5636947541644277


In [28]:
print("The F1 Micro Score is: {0:.2f}".format(f1_score(y_pred, y_test, average='micro')))
print("The F1 Macro Score (Unweighted average) is: {0:.2f}".format(f1_score(y_pred, y_test, average='macro')))

The F1 Micro Score is: 0.58
The F1 Macro Score (Unweighted average) is: 0.42


In [29]:
with open('./models/multiout_adaboost.pkl', 'wb') as pickle_out:
     pickle_out = pickle.dump(multioutput_pipeline, pickle_out)