In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np

from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import set_credentials_file
from plotly.offline import init_notebook_mode
import os

# Read the plotly credentials from the environment so that no secret is stored
# in the notebook. The key that used to be hard-coded here has been published
# with the notebook and should be treated as compromised and rotated.
plotly_username = os.environ.get('PLOTLY_USERNAME')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if plotly_username and plotly_api_key:
    set_credentials_file(username=plotly_username, api_key=plotly_api_key)
# When the variables are unset, no credentials are written by this cell.
init_notebook_mode(connected=True)

#### Part 1

Load the data from the .arff file into a pd.DataFrame (1.2)

In [2]:
# Read the spambase ARFF file; the second value returned by loadarff is discarded.
data, _ = arff.loadarff('data/raw/spambase.arff.txt')

In [3]:
# Wrap the loaded records in a DataFrame for the cleaning steps that follow.
spambase = pd.DataFrame(data)

Drop columns (1.4)

In [4]:
# Drop the three capital-run-length columns by name. `columns=` replaces the
# positional axis argument (`, 1,`), which newer pandas versions reject, and
# rebinding avoids mutating the frame in place.
spambase = spambase.drop(
    columns=[
        'capital_run_length_average',
        'capital_run_length_longest',
        'capital_run_length_total',
    ]
)

Convert column types (1.5)

In [5]:
def numberic_to_bool(df, inplace=False):
    """
    Convert float columns to 0/1 integer indicators.

    Every column selected by ``select_dtypes(float)`` is replaced by an
    integer column holding 1 where the original value is non-zero and 0
    where it is zero. All other columns are left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to convert.
    inplace : bool, default False
        If False, work on a copy and leave ``df`` unchanged; if True, the
        columns of ``df`` itself are replaced.

    Returns
    -------
    pd.DataFrame
        The converted frame (``df`` itself when ``inplace`` is True).
    """
    if not inplace:
        df = df.copy()
    num_cols = df.select_dtypes(float).columns
    for col in num_cols:
        # Assign column by column: ``df.loc[:, cols] = ...`` writes into the
        # existing float columns on recent pandas, so the result would stay
        # float (0.0/1.0) instead of becoming int.
        # NOTE: NaN is truthy under astype(bool), so missing values map to 1.
        df[col] = df[col].astype(bool).astype(int)
    return df

def object_to_str(df, inplace=False, str_to_bool=False):
    """
    Decode object-dtype columns to ``str`` and optionally cast them to int.

    Byte-string values (such as the ones this notebook gets back from
    ``arff.loadarff``) are decoded as UTF-8; values that are not ``bytes``
    are passed through unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to convert.
    inplace : bool, default False
        If False, work on a copy and leave ``df`` unchanged; if True, the
        columns of ``df`` itself are replaced.
    str_to_bool : bool, default False
        If True, the decoded values are additionally cast with
        ``astype(int)`` (e.g. ``b'1'`` -> ``1``).

    Returns
    -------
    pd.DataFrame
        The converted frame (``df`` itself when ``inplace`` is True).

    Raises
    ------
    ValueError
        If ``str_to_bool`` is True and a decoded value is not an integer
        literal.
    """
    if not inplace:
        df = df.copy()

    def _decode(value):
        # Only bytes need decoding; text and missing values stay as they are.
        return value.decode("utf-8") if isinstance(value, bytes) else value

    # The builtin ``object`` replaces the ``np.object`` alias, which no longer
    # exists in current NumPy releases.
    obj_cols = df.select_dtypes([object]).columns
    for col in obj_cols:
        converted = df[col].map(_decode)
        if str_to_bool:
            converted = converted.astype(int)
        # Per-column assignment replaces the column (and its dtype), whereas
        # ``df.loc[:, cols] = ...`` may keep the old object dtype.
        df[col] = converted
    return df

In [6]:
# Turn every float column into a 0/1 indicator; `inplace` defaults to False,
# so the converted copy is what gets bound to `spambase`.
spambase = numberic_to_bool(spambase)

In [7]:
# Decode the object columns and cast them to int, working on a copy.
spambase = object_to_str(spambase, inplace=False, str_to_bool=True)

Save data with bag of words (1.6)

In [8]:
# Persist the converted frame as CSV; index=False leaves the row index out of the file.
spambase.to_csv('data/spambase_bag_of_words.csv', index=False)

#### Part 2 (unfinished)

Separate the labels (classes) from the features to fit Naive Bayes (the whole dataset is used here; it is not yet split into train and test sets)

In [9]:
# Select rather than pop(): pop() removes 'is_spam' from `spambase` in place,
# so running this cell a second time would raise KeyError.
Y = spambase['is_spam']
X = spambase.drop(columns='is_spam')

Fit Naive Bayes for the Bernoulli distribution (because all columns are binary now)

In [10]:
# fit_prior=False: class priors are not learned from the data
# (presumably a uniform prior is used instead - confirm against the BernoulliNB docs).
clf = BernoulliNB(fit_prior=False)
# Fitted on the complete X/Y; no held-out split is made at this point.
clf.fit(X, Y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=False)

Predict for a small sample (the rows labelled 5 and 6)

In [11]:
# .loc slicing is label-based and inclusive, so this passes the rows labelled 5 and 6.
clf.predict(X.loc[5:6])

array([0, 1])

Compare execution time on whole dataset (4601 obs.) and sample (2000 obs.)

In [12]:
%%timeit
clf.fit(X.loc[:1999], Y[:2000])

1.41 ms ± 28.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [13]:
%%timeit
# Refit the same estimator on the complete X/Y for comparison with the 2000-row timing.
clf.fit(X, Y)

2.97 ms ± 75.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### Part 3

Load the data from the .arff file into a pd.DataFrame (3.1)

In [14]:
# Read the credit ARFF file; this rebinds `data`, which held the spambase records until now.
data, _ = arff.loadarff('data/raw/credit.arff.txt')

In [15]:
# Wrap the loaded credit records in a DataFrame.
credit = pd.DataFrame(data)

Plot Age and Duration (3.2)

In [16]:
# One Scatter trace per column, each plotted against the row index and named after its column.
age_duration_plt = [
    go.Scatter(x=credit.index, y=credit[column], name=column)
    for column in ('Age', 'Duration')
]

In [17]:
# Display the two traces through the `plotly.plotly` interface (imported as `py`).
# NOTE(review): presumably this relies on the plotly credentials configured in the first cell - confirm.
py.iplot(age_duration_plt)

Keep only observations with a positive Age, i.e. drop rows where Age <= 0 (3.3)

In [18]:
# Keep only the rows whose Age is strictly greater than zero.
credit = credit[credit['Age'].gt(0)]

Split into train and test sets (3.4)

In [19]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# identical on every Restart & Run All.
train, test = train_test_split(credit, test_size=0.2, random_state=42)