In [1]:
import nltk
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sqlalchemy import create_engine
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [2]:
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to
[nltk_data]     /home/prasannaiyer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/prasannaiyer/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/prasannaiyer/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Load the table with dataset from the SQLite database ##
#### Split the dataset into train and test set ####

In [4]:
db_engine = create_engine('sqlite:////home/prasannaiyer/Projects/NLP_Project/Data/DisasterResponse.db')
msg_df = pd.read_sql_table('Message_Category', db_engine)

In [5]:
X = msg_df['message'].values


In [6]:
## Convert the category variables to int datatype ##
for column_name in msg_df.columns[~msg_df.columns.isin(['id', 'genre', 'message', 'original'])]:
    msg_df[column_name] = msg_df[column_name].astype('int')

In [7]:
y = msg_df.loc[:, ~msg_df.columns.isin(['id', 'genre', 'message', 'original'])].values

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Tokenize function to be used in the CountVectorizer ##

In [9]:
def tokenize(text):
    '''
    Create tokens and then lemmatize the tokens. Useful for including in CountVectorizer
    Parameters:
        text (str): Input text to be tokenized
    '''
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    for tok in tokens:
        clean_token = lemmatizer.lemmatize(tok).lower().strip()
        lemmatized_tokens.append(clean_token)
    return lemmatized_tokens

## Create Pipeline ##
#### Pipeline consists of CountVectorizer and TfidTransformer to process the input text ####
#### Pipeline also includes MultiOutput classifier ####

In [10]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('MOClf', MultiOutputClassifier(estimator = KNeighborsClassifier()))
])

In [11]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x7ff7eb97df80>)),
                ('tfidf', TfidfTransformer()),
                ('MOClf',
                 MultiOutputClassifier(estimator=KNeighborsClassifier()))])

In [16]:
filename = 'pipeline_model.sav'
pickle.dump(pipeline, open(filename, 'wb'))

## Setup GridSearch ##
#### Main parameter is the Estimator: Includes RandomForest and KNeighbors ####

In [17]:
parameters = {'MOClf__estimator': [RandomForestClassifier(), KNeighborsClassifier() ]}
cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x7fe9b324d560>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('MOClf',
                                        MultiOutputClassifier(estimator=KNeighborsClassifier()))]),
             param_grid={'MOClf__estimator': [RandomForestClassifier(),
                                              KNeighborsClassifier()]})

In [18]:
filename = 'cv_model.sav'
pickle.dump(cv.best_estimator_, open(filename, 'wb'))