In [75]:
# import libraries
from matplotlib import pyplot as plt
%matplotlib inline 
import pandas as pd
import numpy as np 
import re 
from sqlalchemy import create_engine
import nltk
nltk.download(['punkt', 'wordnet', 'stopwords','averaged_perceptron_tagger'])
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import HashingVectorizer
import joblib

[nltk_data] Downloading package punkt to /home/ramzo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ramzo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ramzo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ramzo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# load data from database
engine = create_engine('sqlite:///../data/DisasterResponseDataBase.db')
df = pd.read_sql_query(sql='select * from DisasterResponseData',con=engine)
df.head()

Unnamed: 0,id,message,original,genre,request,offer,aid_related,medical_help,medical_products,search_and_rescue,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,15,Storm at sacred heart of jesus,Cyclone Coeur sacr de jesus,direct,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
3,16,"Please, we need tents and water. We are in Sil...",Tanpri nou bezwen tant avek dlo nou zon silo m...,direct,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,18,I am in Croix-des-Bouquets. We have health iss...,"Nou kwadebouke, nou gen pwoblem sant m yo nan ...",direct,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1


In [3]:
print(df[['help' in s for s  in df['message']]].medical_help.value_counts(),
df[['help' in s for s  in df['message']]].aid_related.value_counts())#.hist(figsize=(20,10));

0    1454
1     260
Name: medical_help, dtype: int64 1    1555
0     159
Name: aid_related, dtype: int64


In [120]:
d=df[['help' in s for s  in df['message']]].medical_help.value_counts()
d.index=('not medical help', 'medical help')
d.index.values.tolist()

['not medical help', 'medical help']

In [121]:
X = df.iloc[:,1]
Y = df.iloc[:,4:df.shape[1]]

### 2. Write a tokenization function to process your text data

In [6]:
def tokenize(text):
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tokens = [PorterStemmer().stem(w) for w in tokens]
    clean_tokens = []
    for tok in tokens:
        if tok.lower() not in nltk.corpus.stopwords.words("english"):
            clean_tok = lemmatizer.lemmatize(tok, pos='v').lower().strip()       
            clean_tokens.append(clean_tok)        
    return clean_tokens

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. 

#### 3.1 pipeline without hashing 

In [113]:
pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(KNeighborsClassifier()))
    ]) 

#### 3.2 pipeline with hashing for efficiency

In [123]:
pipeline = Pipeline([
        ('vect', HashingVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(KNeighborsClassifier()))
    ]) 

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [124]:
X_train, X_test, y_train, y_test = train_test_split(X, Y)


In [116]:
pipeline.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('vect',
                 HashingVectorizer(alternate_sign=True, analyzer='word',
                                   binary=False, decode_error='strict',
                                   dtype=<class 'numpy.float64'>,
                                   encoding='utf-8', input='content',
                                   lowercase=True, n_features=1048576,
                                   ngram_range=(1, 1), norm='l2',
                                   preprocessor=None, stop_words=None,
                                   strip_accents=None,
                                   token_pattern='(?u)\\b\\w\\w+\\b',
                                   tokenizer=<function tokenize at 0x7fd5eecd7e18>)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultiOutputClassifier(estimator=KNeighborsClassifier(a

### 5. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [109]:
 y_pred = pipeline.predict(X_test)

In [66]:
def display_results(y_test, y_pred, label_name):
    labels = np.unique(y_pred)    
    target_names =[''.join(['not ',label_name]),label_name]
    accuracy = (y_pred == y_test).mean()
    print(classification_report(y_pred=y_pred, y_true=y_test,labels=labels, target_names=target_names))
    print("")
    

In [110]:
print("average accuracy {}".format(sum([accuracy_score(y_test.iloc[:, i].to_numpy(), y_pred[:, i])
                                            for i in range(y_pred.shape[1])]) / y_pred.shape[1]))
print("average f1_score {}".format(sum([f1_score(y_test.iloc[:, i].to_numpy(), y_pred[:, i])
                                            for i in range(y_pred.shape[1])]) / y_pred.shape[1]))
for i,l in enumerate(Y.columns):
    display_results(y_test.iloc[:,i].to_numpy() , y_pred[:,i], l)

average accuracy 0.9142037302725969
average f1_score 0.3269380136444824
              precision    recall  f1-score   support

 not request       0.89      0.87      0.88      2585
     request       0.72      0.75      0.74      1105

    accuracy                           0.84      3690
   macro avg       0.81      0.81      0.81      3690
weighted avg       0.84      0.84      0.84      3690


              precision    recall  f1-score   support

   not offer       0.99      1.00      1.00      3659

   micro avg       0.99      1.00      1.00      3659
   macro avg       0.99      1.00      1.00      3659
weighted avg       0.99      1.00      1.00      3659


                 precision    recall  f1-score   support

not aid_related       0.60      0.48      0.53       986
    aid_related       0.82      0.88      0.85      2704

       accuracy                           0.77      3690
      macro avg       0.71      0.68      0.69      3690
   weighted avg       0.76      0.77   

                   precision    recall  f1-score   support

not other_weather       0.92      0.99      0.95      3351
    other_weather       0.55      0.12      0.20       339

         accuracy                           0.91      3690
        macro avg       0.74      0.56      0.58      3690
     weighted avg       0.88      0.91      0.88      3690


                   precision    recall  f1-score   support

not direct_report       0.83      0.83      0.83      2446
    direct_report       0.66      0.66      0.66      1244

         accuracy                           0.77      3690
        macro avg       0.74      0.74      0.74      3690
     weighted avg       0.77      0.77      0.77      3690




  .format(len(labels), len(target_names))


### 6. Train alternative models
Here we use Randomforest and SVC classifier to improve the classification resutlts

#### 6.1 Train RandomForestClassifier
Randomforest improves the accuracy by 4%

In [68]:
from sklearn.svm import SVC
pipeline = Pipeline([
        ('vect', HashingVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ]) 
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("average accuracy RandomForestClassifier {}".format(sum([accuracy_score(y_test.iloc[:, i].to_numpy(), y_pred[:, i])
                                            for i in range(y_pred.shape[1])]) / y_pred.shape[1]))
print("average f1_score RandomForestClassifier {}".format(sum([f1_score(y_test.iloc[:, i].to_numpy(), y_pred[:, i])
                                            for i in range(y_pred.shape[1])]) / y_pred.shape[1]))
for i,l in enumerate(Y.columns):
    display_results(y_test.iloc[:,i].to_numpy() , y_pred[:,i], l)




#### 6.2 train SVC
SVC improves the accuracy by 4%

In [69]:
from sklearn.svm import SVC
pipeline = Pipeline([
        ('vect', HashingVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(SVC()))
    ]) 
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
for i,l in enumerate(Y.columns):
    display_results(y_test.iloc[:,i].to_numpy() , y_pred[:,i], l)
print("average accuracy SVC {}".format(sum([accuracy_score(y_test.iloc[:, i].to_numpy(), y_pred[:, i])
                                            for i in range(y_pred.shape[1])]) / y_pred.shape[1]))
print("average f1_score SVC {}".format(sum([f1_score(y_test.iloc[:, i].to_numpy(), y_pred[:, i])
                                            for i in range(y_pred.shape[1])]) / y_pred.shape[1]))

              precision    recall  f1-score   support

 not request       0.90      0.95      0.92      2608
     request       0.86      0.75      0.80      1082

    accuracy                           0.89      3690
   macro avg       0.88      0.85      0.86      3690
weighted avg       0.89      0.89      0.89      3690


              precision    recall  f1-score   support

   not offer       0.99      1.00      1.00      3660

   micro avg       0.99      1.00      1.00      3660
   macro avg       0.99      1.00      1.00      3660
weighted avg       0.99      1.00      1.00      3660


                 precision    recall  f1-score   support

not aid_related       0.76      0.54      0.63      1036
    aid_related       0.84      0.94      0.88      2654

       accuracy                           0.82      3690
      macro avg       0.80      0.74      0.76      3690
   weighted avg       0.82      0.82      0.81      3690


                  precision    recall  f1-score   su

  .format(len(labels), len(target_names))


average f1_score SVC 0.39003446663791086


#### 6.3 performe gridsearch on SVM


In [72]:
clf=MultiOutputClassifier(SVC())
tune_parameters = {'estimator__gamma': [1e-1, 1e-2, 1e-3, 1e-4],
                       'estimator__C': [1, 10, 100, 1000]}
clf_grid = GridSearchCV(estimator=clf, n_jobs=7, cv=5, param_grid=tune_parameters)
pipeline = Pipeline([
        ('vect', HashingVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', clf_grid )
    ]) 
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
for i,l in enumerate(Y.columns):
    display_results(y_test.iloc[:,i].to_numpy() , y_pred[:,i], l)
print("average accuracy SVC {}".format(sum([accuracy_score(y_test.iloc[:, i].to_numpy(), y_pred[:, i])
                                            for i in range(y_pred.shape[1])]) / y_pred.shape[1]))
print("average f1_score SVC {}".format(sum([f1_score(y_test.iloc[:, i].to_numpy(), y_pred[:, i])
                                            for i in range(y_pred.shape[1])]) / y_pred.shape[1]))

              precision    recall  f1-score   support

 not request       0.91      0.94      0.92      2608
     request       0.84      0.77      0.80      1082

    accuracy                           0.89      3690
   macro avg       0.87      0.85      0.86      3690
weighted avg       0.89      0.89      0.89      3690


              precision    recall  f1-score   support

   not offer       0.99      1.00      1.00      3660

   micro avg       0.99      1.00      1.00      3660
   macro avg       0.99      1.00      1.00      3660
weighted avg       0.99      1.00      1.00      3660


                 precision    recall  f1-score   support

not aid_related       0.71      0.60      0.65      1036
    aid_related       0.85      0.90      0.88      2654

       accuracy                           0.82      3690
      macro avg       0.78      0.75      0.76      3690
   weighted avg       0.81      0.82      0.81      3690


                  precision    recall  f1-score   su

  .format(len(labels), len(target_names))


average f1_score SVC 0.4927744867765006


In [73]:
pipeline

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                                                                            degree=3,
                                                                            gamma='scale',
                                                                            kernel='rbf',
                                                                    

In [76]:
m_f = "".join(['../models/', "dr_trained_opt_model.lzma"])
joblib.dump(value=pipeline, filename=m_f, compress=("lzma", 9))

['../models/dr_trained_opt_model.lzma']