In [1]:
import nltk
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sqlalchemy import create_engine
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

In [2]:
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to
[nltk_data]     /home/prasannaiyer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/prasannaiyer/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/prasannaiyer/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
engine = create_engine('sqlite:////home/prasannaiyer/Projects/NLP_Project/Data/DisasterResponse.db')
msg_df = pd.read_sql_table('Message_Category', engine)

In [5]:
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    for tok in tokens:
        clean_token = lemmatizer.lemmatize(tok).lower().strip()
        lemmatized_tokens.append(clean_token)
    return lemmatized_tokens

In [6]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('MOClf', MultiOutputClassifier(estimator = KNeighborsClassifier()))
])

In [7]:
X = msg_df['message'].values


In [8]:
for column_name in msg_df.columns[~msg_df.columns.isin(['id', 'genre', 'message', 'original'])]:
    msg_df[column_name] = msg_df[column_name].astype('int')

In [9]:
y = msg_df.loc[:, ~msg_df.columns.isin(['id', 'genre', 'message', 'original'])].values

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x7ffb34562ef0>)),
                ('tfidf', TfidfTransformer()),
                ('MOClf',
                 MultiOutputClassifier(estimator=KNeighborsClassifier()))])

In [12]:
pred_prob = pipeline.predict_proba(X_test)

In [13]:
pred_prob1 = pipeline.predict(X_test)

In [15]:
print(classification_report(y_test, pred_prob1))

              precision    recall  f1-score   support

           0       0.81      0.97      0.88      5084
           1       0.78      0.35      0.49      1140
           2       0.00      0.00      0.00        23
           3       0.74      0.34      0.46      2734
           4       0.37      0.08      0.13       528
           5       0.52      0.11      0.18       352
           6       0.75      0.02      0.03       188
           7       0.00      0.00      0.00       118
           8       0.67      0.02      0.04       208
           9       0.00      0.00      0.00         0
          10       0.46      0.20      0.28       457
          11       0.57      0.26      0.35       789
          12       0.42      0.18      0.25       588
          13       0.31      0.04      0.07       104
          14       0.50      0.06      0.10       142
          15       0.00      0.00      0.00        80
          16       0.21      0.06      0.09       213
          17       0.38    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
y_true = np.array([[0, 1, 1, 1],[0,0,1,0],[1,1,0,0]])
y_scores = np.array([[0, 1, 1, 0],[0, 0, 1, 1],[1,1,0,1]])

In [None]:
y_true.shape

In [None]:
type(pred_prob1[1][1])

In [None]:
y_train.shape

In [None]:
label_names = ['label A', 'label B', 'label C', 'label D']

print(classification_report(y_true, y_scores))