In [3]:


# import libraries
import re
%matplotlib inline
import matplotlib.pyplot as plt
import nltk
nltk.download(['punkt', 'wordnet', 'stopwords'])
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sqlalchemy import create_engine



[nltk_data] Downloading package punkt to /home/freemo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/freemo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/freemo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load the message table from the local SQLite database.
# (The engine is created once; the original cell built it twice.)
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql("Disaster_Data", engine)

# NOTE(review): assumes the first four columns are non-label metadata and that
# every remaining column is a category label -- confirm against the table schema.
categories = df.columns[4:]

# X: 1-D array holding the 'message' column; y: 2-D array, one column per category.
X = df[['message']].values[:, 0]
y = df[categories].values

In [5]:
# Matches http/https URLs so they can be collapsed into one placeholder token.
# Written as a raw string: the pattern is unchanged, but '\(' and '\)' no longer
# rely on invalid string escapes.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# Built once at definition time instead of calling stopwords.words() for every
# token; a set also makes each membership test constant-time.
_STOP_WORDS = frozenset(stopwords.words('english'))


def tokenize(text, lemmatizer=WordNetLemmatizer()):
    """Turn a raw message into a list of normalized, lemmatized tokens.

    Steps, in order:
      1. Replace every URL matched by ``url_regex`` with 'urlplaceholder'.
      2. Lower-case the text and replace every character outside
         [a-zA-Z0-9] with a space, then word-tokenize.
      3. Drop English stop words.
      4. Lemmatize the remaining tokens.

    Args:
        text: The message string to tokenize.
        lemmatizer: Object exposing ``lemmatize(token)``. The default
            instance is created once, when the function is defined, and
            shared by every call.

    Returns:
        A list of token strings (empty if nothing survives filtering).
    """
    # Single-pass substitution. The previous findall + str.replace loop could
    # corrupt a URL that had another matched URL as its prefix.
    text = re.sub(url_regex, 'urlplaceholder', text)

    # Normalize and tokenize
    tokens = nltk.word_tokenize(re.sub(r"[^a-zA-Z0-9]", " ", text.lower()))

    # Remove stop words first, then lemmatize what is left.
    return [lemmatizer.lemmatize(tok) for tok in tokens if tok not in _STOP_WORDS]

In [8]:


# Text-classification pipeline: bag-of-words counts -> TF-IDF weighting ->
# one random forest per output column (MultiOutputClassifier wraps the forest).
# random_state is fixed so a fresh "Restart & Run All" reproduces the same model.
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(
        RandomForestClassifier(class_weight='balanced', random_state=42)
    ))
])



In [9]:
# Hold out a test split; random_state is fixed so the same rows land in each
# split on every run and downstream scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
# Fit every pipeline step on the training split only.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x7fdb48cf9200>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier(class_weight='balanced')))])

In [11]:


def multioutput_classification_report(y_true, y_pred, labels=None):
    """Print accuracy, precision, recall and F1 for each output column.

    Precision, recall and F1 are computed with ``average='weighted'``.

    Args:
        y_true: 2-D array of true labels, read column by column as
            ``y_true[:, i]``.
        y_pred: 2-D array of predicted labels with the same column layout.
        labels: Optional sequence of names, one per output column, printed
            as the heading for each column. Defaults to the notebook-level
            ``categories`` so existing two-argument calls behave as before.

    Returns:
        None. The report is written to stdout.
    """
    if labels is None:
        labels = categories

    for col, label in enumerate(labels):
        true_col = y_true[:, col]
        pred_col = y_pred[:, col]
        print(label)
        print("\tAccuracy: {:.4f}\t\t% Precision: {:.4f}\t\t% Recall: {:.4f}\t\t% F1_score: {:.4f}".format(
            accuracy_score(true_col, pred_col),
            precision_score(true_col, pred_col, average='weighted'),
            recall_score(true_col, pred_col, average='weighted'),
            f1_score(true_col, pred_col, average='weighted')
        ))



In [12]:
# NOTE(review): this predicts on the training split, so any scores computed from
# y_pred measure fit to already-seen data, not generalization. X_test is the
# held-out split -- confirm whether test-set evaluation happens elsewhere.
y_pred = pipeline.predict(X_train)

In [13]:
# Per-category metrics comparing y_train against the predictions held in y_pred.
multioutput_classification_report(y_train, y_pred)

related
	Accuracy: 0.9983		% Precision: 0.9983		% Recall: 0.9983		% F1_score: 0.9983
request
	Accuracy: 0.9986		% Precision: 0.9986		% Recall: 0.9986		% F1_score: 0.9986
offer
	Accuracy: 0.9997		% Precision: 0.9998		% Recall: 0.9997		% F1_score: 0.9997
aid_related
	Accuracy: 0.9987		% Precision: 0.9987		% Recall: 0.9987		% F1_score: 0.9987
medical_help
	Accuracy: 0.9994		% Precision: 0.9994		% Recall: 0.9994		% F1_score: 0.9994
medical_products
	Accuracy: 0.9996		% Precision: 0.9996		% Recall: 0.9996		% F1_score: 0.9996
search_and_rescue
	Accuracy: 0.9997		% Precision: 0.9997		% Recall: 0.9997		% F1_score: 0.9997
security
	Accuracy: 0.9997		% Precision: 0.9997		% Recall: 0.9997		% F1_score: 0.9997
military
	Accuracy: 0.9996		% Precision: 0.9996		% Recall: 0.9996		% F1_score: 0.9996
child_alone
	Accuracy: 1.0000		% Precision: 1.0000		% Recall: 1.0000		% F1_score: 1.0000
water
	Accuracy: 1.0000		% Precision: 1.0000		% Recall: 1.0000		% F1_score: 1.0000
food
	Accuracy: 0.9999		% Precision

In [14]:
# Forest sizes to try. The key addresses n_estimators on the estimator wrapped
# by the pipeline's 'clf' step.
parameters = {'clf__estimator__n_estimators': [20, 50]}

# 3-fold grid search over the whole pipeline, scored by weighted F1.
cv = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=3,
    scoring='f1_weighted',
    verbose=3,
)