In [1]:
import os
import sys
import re
import json

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import StemmerI, SnowballStemmer
from nltk.stem.porter import PorterStemmer

from plotly.graph_objs import Bar, Figure, Scatter, Histogram
import plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sqlalchemy import create_engine

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF

In [2]:
# Show the current working directory; the database path used below is relative to it.
os.getcwd()

'C:\\Users\\smouz\\OneDrive\\Desktop\\deploy_project\\disaster-response-project\\eda'

In [3]:
def load_data(database_filepath):
    """
    Load the messages table from a SQLite database and split it into
    features and targets.

    The table name is assumed to equal the database file name without its
    directory and extension (e.g. `../data/disaster_response.db` ->
    `disaster_response`).

    Cleaning steps applied, in order:
        1. rows whose category columns (`related` onward) sum to less
           than 1 are dropped
        2. rows where `related == 2` are dropped
        3. category columns with fewer than 2 distinct values are dropped
           from both `Y` and `df`

    Params:
    -------
        database_filepath: file path of database

    Returns:
    -------
        tuple(X, Y, df, category_names)
        X: pd.Series holding the `message` column
        Y: pd.DataFrame of the retained category columns
        df: cleaned pd.DataFrame; includes an `index` column with the
            row index the table had when it was read
        category_names: list of the column names of `Y`
    """
    engine = create_engine(f'sqlite:///{database_filepath}')

    # table name = file name without directory or extension; works for bare
    # file names too (the previous regex-based approach required a "/")
    table_name = os.path.splitext(os.path.basename(database_filepath))[0]

    df = pd.read_sql_table(table_name, engine)

    # keep the original row index as an `index` column
    df = df.reset_index(drop=False)

    # DROP ROWS
    # keep rows with at least one positive label and a valid `related` value
    labels = df.loc[:, 'related':]
    has_label = labels.sum(axis=1) >= 1
    valid_related = df['related'] != 2
    df = df[has_label & valid_related].reset_index(drop=True)

    # define features and targets
    X = df.loc[:, 'message']
    Y = df.loc[:, 'related':]

    # drop categories with less than 2 classes, from the targets as well as
    # the full frame so that Y, df and category_names stay consistent
    n_classes = Y.nunique()
    drop_catg_list = n_classes[n_classes < 2].index.tolist()
    Y = Y.drop(columns=drop_catg_list)
    df = df.drop(columns=drop_catg_list)

    # extract label names
    category_names = Y.columns.to_list()

    return X, Y, df, category_names

def tokenize(text):
    """
    Strip URLs from `text`, then normalize, tokenize and lemmatize it.

    Steps:
        1. remove every URL matched by `url_regex`
        2. lower-case, strip surrounding whitespace and replace every
           non-alphabetic character (digits included) with a space
        3. tokenize with `word_tokenize`
        4. drop English stop words plus a few extra filler words
        5. lemmatize the remaining tokens

    Params:
    -------
        text: str, raw message

    Returns:
    --------
        dtype: list, containing processed words
    """

    lemm = WordNetLemmatizer()

    # raw string so that `\(` and `\)` reach the regex engine without
    # triggering Python's invalid-escape warning
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # remove each match in place; replacing URL strings one by one could
    # leave fragments behind when one URL is a prefix of another
    text = re.sub(url_regex, "", text)

    # load stopwords into a set for constant-time membership tests
    stop_words = set(stopwords.words("english"))

    # NOTE: 'thank you' can never equal a single token; kept for parity
    remove_words = ['one', 'see', 'please', 'thank', 'thank you', 'thanks',
                    'we', 'us', 'you', 'me', 'their', 'there', 'here', 'http']
    stop_words.update(remove_words)

    # keep alphabetical characters only, convert to lower case,
    # then tokenize the resulting text
    tokens = word_tokenize(re.sub(r"[^a-zA-Z]", ' ', text.lower().strip()))

    # drop stop words and lemmatize what is left
    return [lemm.lemmatize(word) for word in tokens if word not in stop_words]


# Load the messages (X), labels (Y), cleaned table (df) and label names from the SQLite database.
X, Y, df, category_names = load_data('../data/disaster_response.db')

In [4]:
# Bag-of-words model over the messages, using the custom `tokenize` function.
count_vec = CountVectorizer(
    tokenizer=tokenize,
    ngram_range=(1, 1),   # unigrams only
    dtype=np.uint16,
    max_features=10000,
    max_df=0.99,
    min_df=2,
)

word_matrix = count_vec.fit_transform(X).toarray()

# extract words/features
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases
if hasattr(count_vec, 'get_feature_names_out'):
    features = count_vec.get_feature_names_out().tolist()
else:
    features = count_vec.get_feature_names()

# sum counts of words; axis=0 for column wise summation
word_count = np.sum(word_matrix, axis=0)

In [5]:
# COUNTVECTORIZER VOCABULARY
# One row per vocabulary term with its total count over all messages.
# Note: this rebinds `df`, replacing the frame returned by `load_data`.
df = pd.DataFrame(dict(word=features, count=word_count))

# print("\nTop 10 Words:")
# print(df.sort_values('count', ascending=False)[:10])
# print("\nBottom 10 words:")
# print(df.sort_values('count', ascending=False)[-10:])

In [6]:
# 20 most and 20 least frequent words, both ordered by descending count
top_words = df.sort_values('count', ascending=False).head(20)
bottom_words = df.sort_values('count', ascending=False).tail(20)

In [7]:
def plot_bar(x, y, title=''):
    """
    Build a horizontal bar chart as a plotly `Figure`.

    Params:
    -------
        x: values giving the length of each bar
        y: labels shown along the y-axis, one per bar
        title: chart title, empty by default

    Returns:
    --------
        Figure with a single horizontal `Bar` trace, light-blue bars with a
        dark-blue outline, a white plot background and the y-axis titled 'Word'
    """
    # x-axis appearance: grid and axis-line colours plus outside tick marks
    x_axis = {
        'gridcolor': 'rgb(225, 225, 225)',
        'gridwidth': 0.25,
        'linecolor': 'rgb(100, 100, 100)',
        'linewidth': 2,
        'showticklabels': True,
        'color': 'black',
        'ticks': 'outside',
        'tickfont': {
            'family': 'Arial',
            'color': 'rgb(82, 82, 82)',
        },
    }

    # bar appearance, applied to every trace in the figure
    bar_style = {
        'marker_color': 'rgb(158,202,225)',
        'marker_line_color': 'rgb(8,48,107)',
        'marker_line_width': 1.5,
        'opacity': 0.7,
    }

    fig = Figure([Bar(x=x, y=y, orientation='h')])
    fig.update_traces(**bar_style)
    fig.update_layout(
        title=title,
        yaxis_title='Word',
        plot_bgcolor='white',
        xaxis=x_axis,
    )
    return fig

In [8]:
# Bar chart of the most frequent words, kept in `fig` for later use.
fig = plot_bar(
    x=top_words['count'],
    y=top_words['word'],
    title='Most Frequent Words',
)

In [9]:
# fig.to_plotly_json()

In [15]:
# Display the most frequent words inline (the bare expression renders the figure).
plot_bar(
    x=top_words['count'],
    y=top_words['word'],
    title='Most Frequent Words',
)

In [11]:
# Display the least frequent words inline; the title now matches the data
# being plotted (it previously read 'Most Frequent Words').
plot_bar(
    x=bottom_words['count'],
    y=bottom_words['word'],
    title='Least Frequent Words',
)


In [12]:
# Rebuild the most-frequent-words figure so `fig` is fresh before it is exported.
fig = plot_bar(
    x=top_words['count'],
    y=top_words['word'],
    title='Most Frequent Words',
)

In [13]:
# Export the figure held in `fig` to an HTML file in the working directory.
# NOTE(review): `top_words` holds 20 rows, so the "Top 10" file name looks outdated — confirm before renaming.
fig.write_html('Top 10.html')