In [1]:
import nltk
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sqlalchemy import create_engine
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

In [2]:
nltk.download(['punkt', 'wordnet'])

[nltk_data] Downloading package punkt to
[nltk_data]     /home/prasannaiyer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/prasannaiyer/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/prasannaiyer/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
engine = create_engine('sqlite:////home/prasannaiyer/Projects/NLP_Project/Data/DisasterResponse.db')
msg_df = pd.read_sql_table('Message_Category', engine)

In [5]:
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    for tok in tokens:
        clean_token = lemmatizer.lemmatize(tok).lower().strip()
        lemmatized_tokens.append(clean_token)
    return lemmatized_tokens

In [6]:
X = msg_df['message'].values


In [7]:
for column_name in msg_df.columns[~msg_df.columns.isin(['id', 'genre', 'message', 'original'])]:
    msg_df[column_name] = msg_df[column_name].astype('int')

In [8]:
y = msg_df.loc[:, ~msg_df.columns.isin(['id', 'genre', 'message', 'original'])].values

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [10]:
X_train[0:5]

array(['I need to know whatus up in the country. ',
       'He said troops had restored electricty in several towns and were repairing the communication system as the cyclone subsided.',
       'The government, humanitarian organisations as well as well-wishers need to urgently mobilize resources and not only deliver relief quickly to save lives and end suffering but also, for the sake of the future of these populations, help communities prepare for future crises, since major droughts are becoming increasingly extreme and common with climate change.',
       'URGENT CRECHE ORPHANAGE KAY TOUT TIMOUN CROIX DES MISSIONS IMPASSE BALEV BUTTE BOYER MANQUE EAU ET NOURRITURE N ONT VU AUCUN SECOURS DEPUIS 8 JOURS HELP HELP',
       "One year after Japan's largest recorded earthquake and tsunami which damaged the Fukushima Daiichi nuclear reactor, the Japanese Red Cross is calling for greater vigilance in preparing for potential nuclear accidents around the world."],
      dtype=object)

In [11]:
vect = CountVectorizer(tokenizer = tokenize)
tfidf = TfidfTransformer()

In [12]:
X_train_vect = vect.fit_transform(X_train)
X_train_vect

<19661x31230 sparse matrix of type '<class 'numpy.int64'>'
	with 449806 stored elements in Compressed Sparse Row format>

In [13]:
X_train_tfidf = tfidf.fit_transform(X_train_vect)
X_train_tfidf

<19661x31230 sparse matrix of type '<class 'numpy.float64'>'
	with 449806 stored elements in Compressed Sparse Row format>

In [14]:
clf = MultiOutputClassifier(estimator = KNeighborsClassifier())

In [15]:
clf.fit(X_train_tfidf, y_train)

MultiOutputClassifier(estimator=KNeighborsClassifier())

In [16]:
X_test_vect = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_vect)

In [17]:
y_pred1 = clf.predict(X_test_tfidf)

In [18]:
clf.score(X_test_tfidf, y_test)

0.22612145254806226

In [19]:
print(classification_report(y_test, y_pred1))

              precision    recall  f1-score   support

           0       0.82      0.95      0.88      5033
           1       0.73      0.49      0.58      1144
           2       0.00      0.00      0.00        24
           3       0.66      0.55      0.60      2660
           4       0.30      0.17      0.22       483
           5       0.26      0.16      0.20       339
           6       0.50      0.03      0.05       188
           7       0.50      0.01      0.02       114
           8       0.54      0.08      0.13       195
           9       0.00      0.00      0.00         0
          10       0.38      0.31      0.34       394
          11       0.50      0.44      0.47       722
          12       0.42      0.27      0.33       587
          13       0.58      0.17      0.26       112
          14       0.46      0.09      0.15       144
          15       0.50      0.01      0.02        90
          16       0.20      0.08      0.12       245
          17       0.32    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
