# Data Preparation

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('sample_data/data.csv').dropna()
del data['educational-num']
data.insert(9, 'capital', data['capital-gain'] - data['capital-loss'])
del data['capital-gain'], data['capital-loss']

In [3]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,gender,capital,hours-per-week,native-country,income_>50K
0,67,Private,366425,Doctorate,Divorced,Exec-managerial,Not-in-family,White,Male,99999,60,United-States,1
1,17,Private,244602,12th,Never-married,Other-service,Own-child,White,Male,0,15,United-States,0
2,31,Private,174201,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,40,United-States,1
3,58,State-gov,110199,7th-8th,Married-civ-spouse,Transport-moving,Husband,White,Male,0,40,United-States,0
4,25,State-gov,149248,Some-college,Never-married,Other-service,Not-in-family,Black,Male,0,40,United-States,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40727 entries, 0 to 43956
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             40727 non-null  int64 
 1   workclass       40727 non-null  object
 2   fnlwgt          40727 non-null  int64 
 3   education       40727 non-null  object
 4   marital-status  40727 non-null  object
 5   occupation      40727 non-null  object
 6   relationship    40727 non-null  object
 7   race            40727 non-null  object
 8   gender          40727 non-null  object
 9   capital         40727 non-null  int64 
 10  hours-per-week  40727 non-null  int64 
 11  native-country  40727 non-null  object
 12  income_>50K     40727 non-null  int64 
dtypes: int64(5), object(8)
memory usage: 4.4+ MB


In [5]:
data.nunique()

age                  74
workclass             7
fnlwgt            24985
education            16
marital-status        7
occupation           14
relationship          6
race                  5
gender                2
capital             214
hours-per-week       94
native-country       41
income_>50K           2
dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder

ind_cat = []    # categorical index
ind_cont = []   # continuous index

for i, v in enumerate(data.columns[:-1]):
  if data[v].dtype == 'object':
    ind_cat.append(i)
    le_str = 'le'+str(i)
    globals()[le_str] = LabelEncoder().fit(data[v])
    print("{} {}".format(le_str, v))
  else:
    ind_cont.append(i)

print()
print("ind_cont = {}".format(ind_cont))
print("ind_cat = {}".format(ind_cat))

le1 workclass
le3 education
le4 marital-status
le5 occupation
le6 relationship
le7 race
le8 gender
le11 native-country

ind_cont = [0, 2, 9, 10]
ind_cat = [1, 3, 4, 5, 6, 7, 8, 11]


# Conversion to Array

In [7]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)

In [8]:
def convertx(df):
  mat = np.zeros([len(df), len(df.columns)-1], dtype='int32')
  for i in ind_cont:
    mat[:,i] = df.iloc[:,i]
  for i in ind_cat:
    mat[:,i] = globals()['le'+str(i)].transform(df.iloc[:,i])
  return mat

def converty(df):
  return df.iloc[:,-1].to_numpy(dtype='bool')

In [9]:
X_train, y_train = convertx(train), converty(train)
X_test, y_test = convertx(test), converty(test)

In [10]:
X_train

array([[    23,      2, 164231, ...,      0,     35,     38],
       [    17,      2, 132187, ...,      0,     15,     38],
       [    23,      2, 163595, ...,      0,     40,     38],
       ...,
       [    31,      2, 150324, ...,      0,     40,     38],
       [    47,      2, 181307, ...,  99999,     60,     38],
       [    33,      2, 229716, ...,      0,     38,     38]], dtype=int32)

In [11]:
y_train

array([False, False, False, ..., False,  True, False])

# Naive Bayes for Mixed Typed Data

In [12]:
from sklearn.naive_bayes import GaussianNB, CategoricalNB

# Calculate log probabilities
def log_proba(X, p0):
  myprior = [p0,1-p0]
  gnb = GaussianNB(priors=myprior)  # continuous data
  gnb.fit(np.take(X_train, ind_cont, axis=1), y_train)
  clf = CategoricalNB(class_prior=myprior)  # categorical data
  clf.fit(np.take(X_train, ind_cat, axis=1), y_train)
  return gnb.predict_proba(np.take(X, ind_cont, axis=1)) + \
         clf.predict_proba(np.take(X, ind_cat, axis=1)) - \
         myprior

# Predict a class
def pred(logprob):
  return logprob.argmax(axis=1)

In [13]:
import timeit

df_eval = pd.DataFrame(columns = ['p0', 'accuracy', 'rtime'])
num_step = 50

for i in range(1, num_step):
  p0 = i/num_step
  start = timeit.default_timer()
  y_pred = pred(log_proba(X_test, p0))
  rtime = (timeit.default_timer()-start)*1000  # running time (ms)
  acc = (y_test == y_pred).sum()/X_test.shape[0]*100
  df_eval.loc[i-1] = np.float32([p0, acc, rtime])
  #print("p0 = {} \t Accuracy = {:.2f} \t Running time = {:.2f}".format(p0, acc, rtime))

In [14]:
df_eval.head()

Unnamed: 0,p0,accuracy,rtime
0,0.02,66.33931,35.909679
1,0.04,74.011787,30.286549
2,0.06,77.436775,29.57844
3,0.08,79.106308,29.127287
4,0.1,80.346184,28.870115


In [15]:
df_eval.to_csv('sample_data/eval_bayes.csv', index=False)

In [16]:
df_eval.iloc[df_eval.accuracy.argmax(),:] # maximum accuracy

p0           0.480000
accuracy    83.562485
rtime       29.409271
Name: 23, dtype: float32

In [17]:
df_eval.iloc[df_eval.rtime.argmin(),:]  # minimum running time

p0           0.420000
accuracy    82.887306
rtime       28.350065
Name: 20, dtype: float32