# ML Pipeline Implementation
## 1. Import libraries and download nltk packages

In [9]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import re
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import multioutput
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
#ML models
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC,SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/sanket/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sanket/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sanket/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 2. Define Functions
1. Define function to load data from database
2. Defining Tokenizer function
3. Define evaluation metric to find F1-score, Precision, Recall

In [4]:
def load_data(db_name,table_name):
    '''
    Input
    db_name: take the database name that has to be loaded
    table_name: table_name in the database that needs to be loaded
    Output
    X: Feature dataframe that will be given as input to the model
    y: Target dataframe for the model
    category_names: Multi-Label names for the target
    '''
    engine = create_engine('sqlite:///' + db_name)
    df = pd.read_sql_table(table_name, engine)
    X = df['message']
    y = df.iloc[:, 4:]
    category_names = list(df.columns[4:])
    return X, y, category_names

In [7]:
def tokenize(text):
    '''
    Input
    text: take the text as input
    Output
    words_lemmed: tokenized and lemmatized text with stop words removed 
    '''
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    stop_words = stopwords.words("english")
    
    #tokenize
    words = word_tokenize (text)
    words_lemmed = [WordNetLemmatizer().lemmatize(w) for w in words if w not in stop_words]
    return words_lemmed

In [None]:
def evaulation_metric(y_true,y_pred):
    '''
    Input 
    y_true: ground truth dataframe
    y_pred: predicted dataframe
    
    Output
    report: dataframe that contains mean f1-score,precision and recall value for each class
    '''
    report = pd.DataFrame ()
    for col in y_true.columns:
        class_dict = classification_report (output_dict = True, y_true = y_true.loc [:,col], y_pred = y_pred.loc [:,col])
    
        metric_df = pd.DataFrame (pd.DataFrame.from_dict (class_dict))
        
        metric_df.drop(['macro avg', 'weighted avg'], axis =1, inplace = True)
        
        metric_df.drop(index = 'support', inplace = True)
        
        metric_df = pd.DataFrame (metric_df.transpose ().mean ())
         
        metric_df = metric_df.transpose ()
    
        report = report.append (metric_df, ignore_index = True)    
    
    report.index = y_true.columns
    
    return report
    

## 3. Load Data
1. Load the data from database
2. Split the data into train and test 

In [5]:
X, y, category_names = load_data('DisasterResponse.db','message_and_category')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 22)

## 4. Define ML pipeline
1. Define ML pipeline
2. Define search space
3. Define GridSearch
4. Train the model
5. Predict on Test set
6. Get the Evaluation metric 

In [10]:
pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                     ('tfidf', TfidfTransformer()),
                     ('scale',StandardScaler(with_mean=False)),
                     ('clf', OneVsRestClassifier(LinearSVC()))])

In [19]:
search_space = [{'clf':[OneVsRestClassifier(LinearSVC())],
                 'clf__estimator__C': [1, 10, 100]},
                
                {'clf': [OneVsRestClassifier(LogisticRegression(solver='sag'))], 
                 'clf__estimator__C': [1, 10, 100]},
                
                {'clf': [OneVsRestClassifier(MultinomialNB())],
                 'clf__estimator__alpha': [0.1, 0.5, 1]},
                {'clf':[multioutput.MultiOutputClassifier(RandomForestClassifier())]}]

In [20]:
cv = GridSearchCV(pipeline, search_space)

In [21]:
cv.fit(X_train,y_train)































GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x7f917fa180e0>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('scale',
                                        StandardScaler(with_mean=False)),
                                       ('clf',
                                        OneVsRestClassifier(estimator=LinearSVC()))]),
             param_grid=[{'clf': [OneVsRestClassifier(estimator=LinearSVC())],
                          'clf__estimator__C': [1, 10, 100]},
                         {'clf': [OneVsRestClassifier(estimator=LogisticRegression(solver='sag'))],
                          'clf__estimator__C': [1, 10, 100]},
                         {'clf': [OneVsRestClassifier(estimator=MultinomialNB())],
                          'clf__estimator__alpha': [0.1, 0.5, 1]},
                         {'clf': [MultiOutputClassifier(estimator

In [23]:
pickle.dump(cv, open('pipeline.sav', 'wb'))

In [37]:
def evaulation_metric(y_true,y_pred):
    '''
    Input 
    y_true: ground truth dataframe
    y_pred: predicted dataframe
    
    Output
    report: dataframe that contains mean f1-score,precision and recall value for each class
    '''
    report = pd.DataFrame ()
    for col in y_true.columns:
        class_dict = classification_report (output_dict = True, y_true = y_true.loc [:,col], y_pred = y_pred.loc [:,col])
    
        metric_df = pd.DataFrame (pd.DataFrame.from_dict (class_dict))
        
        metric_df.drop(['macro avg', 'weighted avg'], axis =1, inplace = True)
        
        metric_df.drop(index = 'support', inplace = True)
        
        metric_df = pd.DataFrame (metric_df.transpose ().mean ())
         
        metric_df = metric_df.transpose ()
    
        report = report.append (metric_df, ignore_index = True)    
    
    report.index = y_true.columns
    
    return report
    

In [25]:
y_predict = cv.predict(X_test)

In [34]:
y_predict = pd.DataFrame (y_predict, columns = y_test.columns)

In [38]:
report = evaulation_metric(y_test,y_predict)

  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
print(report)

Unnamed: 0,precision,recall,f1-score
related,0.787522,0.723709,0.740774
request,0.885677,0.784323,0.815438
offer,0.662973,0.66482,0.663894
aid_related,0.76988,0.763312,0.765513
medical_help,0.87404,0.664245,0.671269
medical_products,0.915217,0.687373,0.707872
search_and_rescue,0.857435,0.676151,0.687575
security,0.655379,0.660868,0.658101
military,0.901356,0.670732,0.67879
water,0.937092,0.760356,0.804336


False