In [None]:
# Notebook setup.
# autoreload mode 2: re-import modules before each cell executes, so edits to
# local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# render matplotlib figures inside the notebook output
%matplotlib inline

# global imports
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# local imports
# '../src/' is a relative path, so the import below depends on the kernel's
# working directory; dautils is presumably located in that folder.
sys.path.append('../src/') # local path
import dautils

### Read dataframe

In [None]:
# Load the dataset ('?' entries are parsed as missing values) and discard the
# two columns that are not used in this notebook.
df = (
    pd.read_csv('../data/adult_continuous.csv', na_values='?')
    .drop(columns=['fnlwgt', 'education-num'])
)
# summary of the remaining columns
df.info()

In [None]:
# per-column count of missing values (the dataset contains some)
df.isna().sum(axis=0)

In [None]:
# preview the first five rows
df.head(n=5)

In [None]:
# Predictive attributes, partitioned into nominal, ordinal and continuous.
nominal_atts = [
    'workclass', 'marital-status', 'occupation', 'relationship',
    'race', 'sex', 'native-country',
]
ordinal_atts = ['education']
continuous_atts = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']
# Class attribute (the one to be predicted).
target = 'class'
# All predictive attributes: nominal first, then ordinal, then continuous ...
pred_atts = [*nominal_atts, *ordinal_atts, *continuous_atts]
# ... and the same list with the class attribute appended.
all_atts = [*pred_atts, target]

### Encoding 

In [None]:
# Explicit code -> label mappings: ordinal codes follow the order of the
# education levels, and the class attribute is coded 0=negatives, 1=positives.
decode = {
    # education levels in increasing order; codes are assigned starting from 1
    'education': dict(enumerate([
        'Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th',
        '12th', 'HS-grad', 'Some-college', 'Assoc-voc', 'Assoc-acdm', 'Bachelors',
        'Masters', 'Prof-school', 'Doctorate',
    ], start=1)),
    'class': {0: '<=50K', 1: '>50K'},
}

In [None]:
# Fit the encoder and transform the data: nominal attributes are encoded as
# categories, ordinal attributes and the target as ints, with the ordinal and
# target encodings taken from `decode`.
df_code = dautils.Encode(
    nominal=nominal_atts,
    ordinal=[*ordinal_atts, target],
    decode=decode,
)
df_encoded = df_code.fit_transform(df)
# workclass, occupation and native-country are floats because there are missing values
df_encoded.head()

In [None]:
# summary of the encoded frame (assuming a pandas DataFrame: dtype and
# non-null count per column)
df_encoded.info()

In [None]:
# dictionary of encodings for discrete (nominal+ordinal+target) columns,
# as held by the fitted encoder
df_code.decode

In [None]:
# round trip: from the encoded frame back to the original values
df_back = df_code.inverse_transform(df_encoded)
df_back.head()

### Encoding with onehot option

In [None]:
# Same encoder arguments as the plain encoding, plus onehot=True.
# NOTE: this rebinds `df_code`, so the cells below operate on this encoder.
df_code = dautils.Encode(
    nominal=nominal_atts,
    ordinal=[*ordinal_atts, target],
    decode=decode,
    onehot=True,
)
df_encoded_onehot = df_code.fit_transform(df)
df_encoded_onehot.head()

In [None]:
# dictionary of encodings now does not include nominal attributes
# (the encoder was built with onehot=True)
df_code.decode

In [None]:
# round trip: from the one-hot encoded frame back to the original values
df_onehot_back = df_code.inverse_transform(df_encoded_onehot)
df_onehot_back.head()

In [None]:
# names of the encoded columns for the predictive attributes
# NOTE(review): presumably nominal attributes expand to several columns under
# onehot=True -- confirm in dautils
encoded_pred_atts = df_code.encoded_atts(pred_atts)
encoded_pred_atts

In [None]:
# train a decision tree
from sklearn.tree import DecisionTreeClassifier

# Depth is capped at 3. random_state is fixed so that re-running the notebook
# yields the same tree: scikit-learn permutes the features at each split, so
# ties between equally good splits are otherwise broken at random.
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
# fit on the one-hot encoded predictive columns against the encoded class
clf.fit(df_encoded_onehot[encoded_pred_atts], df_encoded_onehot[target])

In [None]:
# visualize the decision tree
from sklearn import tree
from IPython.display import Image
import pydotplus

# Export the fitted tree as a DOT description (out_file=None returns a string).
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=encoded_pred_atts, # attributes names
                                # class labels as a list aligned with clf.classes_
                                # (export_graphviz expects names in ascending class order),
                                # instead of relying on the dict keys matching class indices
                                class_names=[df_code.decode[target][c] for c in clf.classes_],
                                filled=True, rounded=True)
# render the DOT description to PNG and display it inline
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())