In [1]:
import nltk
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sqlalchemy import create_engine
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

In [2]:
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to
[nltk_data]     /home/prasannaiyer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/prasannaiyer/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/prasannaiyer/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
engine = create_engine('sqlite:////home/prasannaiyer/Projects/NLP_Project/Data/DisasterResponse.db')
msg_df = pd.read_sql_table('Message_Category', engine)

In [5]:
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    for tok in tokens:
        clean_token = lemmatizer.lemmatize(tok).lower().strip()
        lemmatized_tokens.append(clean_token)
    return lemmatized_tokens

In [6]:
X = msg_df['message'].values


In [7]:
for column_name in msg_df.columns[~msg_df.columns.isin(['id', 'genre', 'message', 'original'])]:
    msg_df[column_name] = msg_df[column_name].astype('int')

In [8]:
y = msg_df.loc[:, ~msg_df.columns.isin(['id', 'genre', 'message', 'original'])].values

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [10]:
X_train[0:5]

array(['Previously many refugees in Kabul were living in tents with nothing to insulate them from the cold damp ground, and little fuel or heating.',
       'Boro @bubbakoos is still open! Squan is slacking #sandy @point pleasant bubbakoos http://t.co/JlVA8aYr',
       'Is it possible for a person who has registered after the earthquake, the province can return to Port-au-Prince went to school? ',
       'Normally, we have special shock treatment [organophosphate pesticide, potentially dangerous to human health] for swarms, which we apply early in the morning," he said.',
       'People in Jacquet Toto got water but no food. '], dtype=object)

In [11]:
vect = CountVectorizer(tokenizer = tokenize)
tfidf = TfidfTransformer()

In [12]:
X_train_vect = vect.fit_transform(X_train)
X_train_vect

<19661x31324 sparse matrix of type '<class 'numpy.int64'>'
	with 448748 stored elements in Compressed Sparse Row format>

In [13]:
X_train_tfidf = tfidf.fit_transform(X_train_vect)
X_train_tfidf

<19661x31324 sparse matrix of type '<class 'numpy.float64'>'
	with 448748 stored elements in Compressed Sparse Row format>

In [14]:
clf = MultiOutputClassifier(estimator = KNeighborsClassifier())

In [15]:
clf.fit(X_train_tfidf, y_train)

MultiOutputClassifier(estimator=KNeighborsClassifier())

In [16]:
X_test_vect = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_vect)

In [17]:
y_pred1 = clf.predict(X_test_tfidf)

In [19]:
classification_report(y_test, y_pred1)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n           0       0.80      0.97      0.88      5050\n           1       0.76      0.35      0.48      1121\n           2       0.00      0.00      0.00        32\n           3       0.71      0.34      0.46      2675\n           4       0.21      0.07      0.11       480\n           5       0.26      0.08      0.12       331\n           6       0.50      0.02      0.03       167\n           7       0.00      0.00      0.00       129\n           8       0.75      0.04      0.08       224\n           9       0.00      0.00      0.00         0\n          10       0.42      0.19      0.26       409\n          11       0.49      0.26      0.34       728\n          12       0.38      0.16      0.23       563\n          13       0.73      0.07      0.13       111\n          14       0.55      0.04      0.07       161\n          15       1.00      0.01      0.02        83\n          16       0.22      0.05      0.09       244\n       