# ML Pipeline Preparation

### 1. Import libraries and load data from database.

In [27]:
# import libraries
import pandas as pd
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download(['punkt','stopwords'])
from nltk.stem.porter import PorterStemmer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,classification_report,recall_score,f1_score
from sklearn.model_selection import GridSearchCV
import pickle

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RahulGupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RahulGupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# load data from database
engine = create_engine('sqlite:///data/DisasterResponse.db')
df = pd.read_sql('disaster_response',engine)
X = df.message
Y = df.drop(['id','message','original','genre'],axis=1)

In [21]:
print(X.head())
print(Y.head())

0    Weather update - a cold front from Cuba that c...
1              Is the Hurricane over or is it not over
2                      Looking for someone but no name
3    UN reports Leogane 80-90 destroyed. Only Hospi...
4    says: west side of Haiti, rest of the country ...
Name: message, dtype: object
   related  request  offer  aid_related  medical_help  medical_products  \
0        1        0      0            0             0                 0   
1        1        0      0            1             0                 0   
2        1        0      0            0             0                 0   
3        1        1      0            1             0                 1   
4        1        0      0            0             0                 0   

   search_and_rescue  security  military  child_alone  ...  aid_centers  \
0                  0         0         0            0  ...            0   
1                  0         0         0            0  ...            0   
2                  0

### 2. Write a tokenization function to process your text data

In [10]:
# Normalize by converting to lower case
# Tokenize by converting sentence to tokens
# Remove stop words
# Convert words to root form by Stemming
def tokenize(text):
    text=text.lower()
    token=word_tokenize(text)
    final_token=[]
    stemmer=PorterStemmer()
    for tok in token:
        if tok not in stopwords.words('english'):
            stem=stemmer.stem(tok)
            final_token.append(stem)
    return final_token

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset.

In [11]:
pipeline = Pipeline([
    ('vect',CountVectorizer(tokenizer=tokenize)),
    ('tfidf',TfidfTransformer()),
    ('clf',MultiOutputClassifier(RandomForestClassifier()))
])

### 4. Train pipeline

In [13]:
# Split data into train and test set
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.2)

# Train pipeline
pipeline.fit(X_train,y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x0000019AEAF835E8>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier()))])

### 5. Measure performance of the model

In [15]:
# Predicting output on test set
y_pred=pipeline.predict(X_test)

In [34]:
def evaluate_model(model,X_test, y_test):
    y_pred=model.predict(X_test)
    y_pred_df=pd.DataFrame(y_pred)
    y_pred_df.columns=y_test.columns
    
    ## Creating an evaluation matrix of precision scores and recall scores for each column
    eval_matrix=[]
    for column in y_test.columns:    
        eval_matrix.append(str(precision_score(y_test[column], y_pred_df[column])) +','+ str(recall_score(y_test[column], y_pred_df[column])) +','+ str(f1_score(y_test[column], y_pred_df[column])))
    
    # Converting eval matrix to data frame for ease of readability
    df=pd.DataFrame(eval_matrix)
    eval_df=df[0].str.split(',',expand=True)
    eval_df.columns=['Precision','Recall','F1']
    for col in eval_df.columns:
        eval_df[col]=eval_df[col].astype(float)

    print(eval_df.shape)
    print(eval_df)
    print(eval_df.describe())

In [35]:
evaluate_model(pipeline,X_test, y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  average, "true nor predicted", 'F-score is', len(true_sum)


(36, 3)
    Precision    Recall        F1
0    0.821100  0.966986  0.888092
1    0.877895  0.461283  0.604786
2    0.000000  0.000000  0.000000
3    0.767830  0.688435  0.725968
4    0.828571  0.068235  0.126087
5    0.789474  0.062241  0.115385
6    0.875000  0.089744  0.162791
7    0.500000  0.010417  0.020408
8    0.684211  0.075145  0.135417
9    0.000000  0.000000  0.000000
10   0.898438  0.333333  0.486258
11   0.853598  0.587031  0.695652
12   0.800000  0.278261  0.412903
13   0.692308  0.116883  0.200000
14   0.833333  0.043103  0.081967
15   0.000000  0.000000  0.000000
16   0.600000  0.016216  0.031579
17   0.833333  0.116732  0.204778
18   0.571429  0.022599  0.043478
19   0.333333  0.005865  0.011527
20   0.800000  0.066667  0.123077
21   0.833333  0.091912  0.165563
22   1.000000  0.019802  0.038835
23   0.000000  0.000000  0.000000
24   0.000000  0.000000  0.000000
25   0.000000  0.000000  0.000000
26   0.000000  0.000000  0.000000
27   0.000000  0.000000  0.000000
28   0

### 6. Improve your model
Use grid search to find better parameters. 

In [20]:
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(tokenizer=<function tokenize at 0x0000019AEAF835E8>)),
  ('tfidf', TfidfTransformer()),
  ('clf', MultiOutputClassifier(estimator=RandomForestClassifier()))],
 'verbose': False,
 'vect': CountVectorizer(tokenizer=<function tokenize at 0x0000019AEAF835E8>),
 'tfidf': TfidfTransformer(),
 'clf': MultiOutputClassifier(estimator=RandomForestClassifier()),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': <function __main__.tokenize(text)>,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,


In [24]:
parameters = {'vect__analyzer': ['word']
             ,'clf__estimator__min_samples_leaf': [1,3],
             'clf__estimator__n_estimators':[10, 25], 
             'clf__estimator__min_samples_split':[2, 5]
             }
cv = GridSearchCV(pipeline,parameters,verbose=10)
tuned_model=cv.fit(X_train,y_train)
tuned_model.best_params_

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] vect__analyzer=word .............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................. vect__analyzer=word, score=0.261, total=12.6min
[CV] vect__analyzer=word .............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 12.6min remaining:    0.0s


[CV] ................. vect__analyzer=word, score=0.253, total=12.7min
[CV] vect__analyzer=word .............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 25.2min remaining:    0.0s


[CV] ................. vect__analyzer=word, score=0.260, total=12.8min
[CV] vect__analyzer=word .............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 38.1min remaining:    0.0s


[CV] ................. vect__analyzer=word, score=0.248, total=12.7min
[CV] vect__analyzer=word .............................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 50.8min remaining:    0.0s


[CV] ................. vect__analyzer=word, score=0.246, total=12.8min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 63.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 63.6min finished


{'vect__analyzer': 'word'}

### 7. Test your model
Get the Precision, Recall and F1 score of the tuned model.

In [36]:
evaluate_model(tuned_model,X_test, y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  average, "true nor predicted", 'F-score is', len(true_sum)


(36, 3)
    Precision    Recall        F1
0    0.822041  0.966230  0.888323
1    0.864372  0.472345  0.610873
2    0.000000  0.000000  0.000000
3    0.773374  0.690249  0.729451
4    0.710526  0.063529  0.116631
5    0.789474  0.062241  0.115385
6    0.826087  0.121795  0.212291
7    0.500000  0.010417  0.020408
8    0.636364  0.040462  0.076087
9    0.000000  0.000000  0.000000
10   0.921986  0.376812  0.534979
11   0.862069  0.554608  0.674974
12   0.828221  0.293478  0.433387
13   0.545455  0.077922  0.136364
14   0.800000  0.034483  0.066116
15   0.000000  0.000000  0.000000
16   0.583333  0.037838  0.071066
17   0.916667  0.128405  0.225256
18   0.656250  0.029661  0.056757
19   0.166667  0.002933  0.005764
20   0.818182  0.075000  0.137405
21   0.814815  0.080882  0.147157
22   0.666667  0.019802  0.038462
23   0.000000  0.000000  0.000000
24   0.000000  0.000000  0.000000
25   0.000000  0.000000  0.000000
26   0.000000  0.000000  0.000000
27   0.250000  0.004032  0.007937
28   0

### 9. Export your model as a pickle file

In [37]:
# Pickle best model
pickle.dump(tuned_model, open('models/disaster_model.sav', 'wb'))