# Disaster Response ML_Pipeline

In [1]:
#import libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
#import NLP libraries
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
#import from sklearn libraries
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier

In [4]:
#importing data from data base
engine=create_engine('sqlite:///DisasterResponse.db')
df=pd.read_sql_table('DisasterResponse',engine)
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
X=df['message']
y=df.drop(['id','message','original','genre'],axis=1)
pd.DataFrame(X).head()

Unnamed: 0,message
0,Weather update - a cold front from Cuba that c...
1,Is the Hurricane over or is it not over
2,Looking for someone but no name
3,UN reports Leogane 80-90 destroyed. Only Hospi...
4,"says: west side of Haiti, rest of the country ..."


In [6]:
y.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Function to tokenize the data 

In [7]:
def generate_tokens(text):
    url_re = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    
    # Detect and replace urls
    detected_urls = re.findall(url_re, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
    
    # tokenize sentences
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    
    # save cleaned tokens
    clean_tokens = [lemmatizer.lemmatize(tok).lower().strip() for tok in tokens]
    
    # remove stopwords
    STOPWORDS = list(set(stopwords.words('english')))
    clean_tokens = [token for token in clean_tokens if token not in STOPWORDS]
    
    return clean_tokens

In [8]:
#this block is for testing the above function
#l=[]
#for i in X:
#    l.append(i)
#mystr=' '.join(l)
#mystr=mystr[:100000]
#print(clean_data(mystr))

### Building Machine Learning Pipeline

In [9]:
def ml_pipeline():
    pipeline=Pipeline([
        ('vect',CountVectorizer(tokenizer=generate_tokens)),
        ('tfidf',TfidfTransformer()),
        ('clf',MultiOutputClassifier(RandomForestClassifier(n_estimators = 100,n_jobs = 4)))])
    return pipeline


### Training the pipeline

In [10]:
X_train,X_test,y_train,y_test=train_test_split(X,y)
pipeline=ml_pipeline()
pipeline.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function generate_t...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                    

### Building fuction for testing model


Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's classification_report on each.

In [11]:
def metric_report(pipeline,X_test,y_test):
    y_pred=pipeline.predict(X_test)
    
    metric=[]
    for i in range(len(y_test.columns)):
        metric.append([f1_score(y_test.iloc[:,i].values,y_pred[:,i], average='micro'),
                      precision_score(y_test.iloc[:,i].values,y_pred[:,i], average='micro'),
                      recall_score(y_test.iloc[:,i].values,y_pred[:,i], average='micro')])
    
    metric_dataframe=pd.DataFrame(metric,index=y_test.columns,columns=['f1score','Precion_score','Recall_score'])
    
    return metric_dataframe

In [12]:
metric_report(pipeline,X_test,y_test)

Unnamed: 0,f1score,Precion_score,Recall_score
related,0.817341,0.817341,0.817341
request,0.89571,0.89571,0.89571
offer,0.994846,0.994846,0.994846
aid_related,0.780658,0.780658,0.780658
medical_help,0.926936,0.926936,0.926936
medical_products,0.953767,0.953767,0.953767
search_and_rescue,0.976353,0.976353,0.976353
security,0.983023,0.983023,0.983023
military,0.968319,0.968319,0.968319
child_alone,1.0,1.0,1.0


### Using grid search CV to tune the model

GridSearchCV can be use to get the best model for random forest, otherwise i am using AdaBoost classifier.

In [13]:
#parameter={'clf__estimator__max_features':['sqrt', 0.5],
#              'clf__estimator__n_estimators':[50, 100]}
#grid=GridSearchCV(estimator=pipeline,param_grid=parameter,cv=5,n_jobs=6)
#grid.fit(X_train, y_train)

### Another pipeline with Adaboost Classifier

In [14]:
def enhanced_ml_pipeline():
    pipeline=Pipeline([
        ('vect',CountVectorizer(tokenizer=generate_tokens)),
        ('tfidf',TfidfTransformer()),
        ('clf',MultiOutputClassifier(AdaBoostClassifier(n_estimators = 100)))])
    return pipeline

enhanced_pipeline=enhanced_ml_pipeline()
enhanced_pipeline.fit(X_train,y_train)
metric_report(enhanced_pipeline,X_test,y_test)

Unnamed: 0,f1score,Precion_score,Recall_score
related,0.769592,0.769592,0.769592
request,0.886312,0.886312,0.886312
offer,0.993937,0.993937,0.993937
aid_related,0.758072,0.758072,0.758072
medical_help,0.927998,0.927998,0.927998
medical_products,0.955283,0.955283,0.955283
search_and_rescue,0.975595,0.975595,0.975595
security,0.980294,0.980294,0.980294
military,0.972866,0.972866,0.972866
child_alone,1.0,1.0,1.0


In [15]:
#saving the pipeline into pickle file
pickle.dump(pipeline,open('pipeline1.pkl','wb'))
pickle.dump(enhanced_pipeline,open('pipeline2.pkl','wb'))

The size of both the model are:
    
    * Pipeline1: 938mb
    * Pipeline2: 4.5mb

Hence, i was not able to upload the Pipeline1