In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from naive_bayes import NaiveBayes
from naive_bayes import StratifiedShuffleSplit

#### Load Data
Split the dataset into two sets:
1. Train set: 70% of the original dataset
2. Test set: 30% of the original dataset

In [3]:
def load_data(filepath, test_size=0.3):
    """Read the income CSV and split it into train and test sets.

    The file is read without a header row and the 15 column names listed
    below are assigned to it. The first 14 columns form the feature matrix;
    the last column (``Income``) is turned into a binary label.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, handed to ``pandas.read_csv``.
    test_size : float, default 0.3
        Forwarded unchanged to ``StratifiedShuffleSplit`` as ``test_size``.

    Returns
    -------
    data : pandas.DataFrame
        The full table as read from disk, with the column names applied.
    (X_train, y_train), (X_test, y_test) : tuple of pairs
        Whatever ``StratifiedShuffleSplit.split`` returns for the feature
        array and the ``int64`` label array.
    """
    cols = ['Age', 'WorkClass', 'FinancialWeight', 'Education',
            'Education-num', 'MaritalStatus', 'Occupation', 'Relationship',
            'Race', 'Sex', 'CapitalGain', 'CapitalLoss', 'HourPerWeek',
            'NativeCountry', 'Income']
    data = pd.read_csv(filepath, header=None, names=cols)

    # Every column except the last is a feature; mixed dtypes are kept as-is.
    X = data.iloc[:, :-1].to_numpy()

    # Label is 0 for "<=50K" and 1 for everything else. Whitespace around the
    # raw value is ignored so the encoding does not hinge on how the CSV
    # happens to pad its fields (e.g. " <=50K" vs "<=50K").
    income = data.iloc[:, -1].astype(str).str.strip()
    y = np.where(income == '<=50K', 0, 1).astype('int64')

    split = StratifiedShuffleSplit(test_size=test_size, shuffle=True)
    (X_train, y_train), (X_test, y_test) = split.split(X, y)
    return data, (X_train, y_train), (X_test, y_test)

# Read the CSV once, then unpack the train and test pairs returned by load_data.
data, train_pair, test_pair = load_data('Data.csv')
X_train, y_train = train_pair
X_test, y_test = test_pair

# Report the array shapes as a quick sanity check on the split.
print(f'X_train.shape: {X_train.shape} -- y_train.shape: {y_train.shape}')
print(f'X_test.shape: {X_test.shape} -- y_test.shape: {y_test.shape}')

X_train.shape: (22794, 14) -- y_train.shape: (22794,)
X_test.shape: (9767, 14) -- y_test.shape: (9767,)


#### Assume all features come from independent multinomial distributions.

In [4]:
# Single distribution name: the same 'multinomial' setting is passed for the whole model.
model = NaiveBayes('multinomial')
model.fit(X_train, y_train)

# Score on the training split first, then on the held-out split.
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)
print(f'Train Acc:{train_acc} -- Test Acc:{test_acc}')

Train Acc:0.8180661577608143 -- Test Acc:0.8200061431350466


#### Assume all numeric features come from independent Gaussian distributions and the remaining features come from independent multinomial distributions.

In [5]:
# One distribution name per feature column, in column order. Positions
# 0, 2, 4, 10, 11 and 12 (Age, FinancialWeight, Education-num, CapitalGain,
# CapitalLoss, HourPerWeek) are 'gaussian'; the other eight are 'multinomial'.
dists1 = [
    'gaussian' if position in (0, 2, 4, 10, 11, 12) else 'multinomial'
    for position in range(14)
]

In [6]:
# Per-feature distributions: dists1 supplies one distribution name per column.
model = NaiveBayes(dists1)
model.fit(X_train, y_train)

# Score on the training split first, then on the held-out split.
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)
print(f'Train Acc:{train_acc} -- Test Acc:{test_acc}')

Train Acc:0.7951653944020356 -- Test Acc:0.7966622299580219
