## Read & Visualize Dataset



In [None]:
import pandas as pd

## Dataset configuration: point the two file names at the train/test CSV
## files and set class_label to the name of the target column.
class_label = 'class'
train, test = (
    pd.read_csv(csv_path) for csv_path in ('bna_train.csv', 'bna_test.csv')
)

# Preview the first rows of the training data.
train.head()

Unnamed: 0,variance,skewness,curtosis,entropy,class
0,-2.6406,-4.4159,5.983,-0.13924,1
1,0.96441,5.8395,2.3235,0.066365,0
2,-2.1241,-6.8969,5.5992,-0.47156,1
3,2.3136,10.6651,-3.5288,-4.7672,0
4,-0.539,-5.167,3.4399,0.052141,1


In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Draw one histogram per feature column on a two-column grid of subplots.
#
# Grid positions are derived from the position within the feature columns
# only (the class column is excluded up front), so each trace lands under the
# title generated for it regardless of where the class column sits in the
# DataFrame.
feature_cols = [col for col in train.columns if col != class_label]

fig = make_subplots(
    rows=(len(feature_cols)+1)//2,  # enough rows for an odd number of features
    cols=2,
    subplot_titles=tuple(f'Distribution of {item.title()}' for item in feature_cols),
)

for idx,col in enumerate(feature_cols):
  grid_row, grid_col = idx//2+1, idx%2+1  # plotly subplot coordinates are 1-based
  fig.add_trace(
      go.Histogram(x=train[col],name=col),
      row=grid_row, col=grid_col,
  )
  fig.update_xaxes(title_text=col, row=grid_row, col=grid_col)
  fig.update_yaxes(title_text='frequency', row=grid_row, col=grid_col)

fig.show()

In [None]:
# Class value whose rows are plotted and tested for normality below.
c = 0.0

from plotly.subplots import make_subplots
import plotly.graph_objects as go
# shapiro is the test actually called below; importing it here lets the cell
# run on a fresh kernel instead of relying on an import from a later cell.
from scipy.stats import anderson, shapiro

# For the rows of class `c`: draw one histogram per feature column and print
# the Shapiro-Wilk test result for that feature.
feature_cols = [col for col in train.columns if col != class_label]
class_df = train[train[class_label] == c]  # filter once instead of per feature

fig = make_subplots(
    rows=(len(feature_cols)+1)//2,  # enough rows for an odd number of features
    cols=2,
    subplot_titles=tuple(f'Distribution of {item.title()} for Class = {c}' for item in feature_cols),
)

# Positions come from the index within the feature columns only, so titles and
# traces stay aligned wherever the class column sits in the DataFrame.
for idx,col in enumerate(feature_cols):
  grid_row, grid_col = idx//2+1, idx%2+1  # plotly subplot coordinates are 1-based
  fig.add_trace(
      go.Histogram(x=class_df[col],name=col),
      row=grid_row, col=grid_col,
  )
  print(col)
  print(shapiro(class_df[col]))
  fig.update_xaxes(title_text=col, row=grid_row, col=grid_col)
  fig.update_yaxes(title_text='frequency', row=grid_row, col=grid_col)

fig.show()

variance
(0.9755987524986267, 1.540202987371231e-08)
skewness
(0.9408822655677795, 7.790176415457904e-15)
curtosis
(0.9645270705223083, 5.8151362197778056e-11)
entropy
(0.9211082458496094, 2.593098227482067e-17)


## Gaussian Naive Bayes
$P(x) = \frac 1 {\sigma \sqrt {2\pi}} e^{\frac {-(x-\mu)^2} {2\sigma^2}}$

In [None]:
from math import exp, sqrt, pi
def gauss_probability(x, mean, std):
  """Return the normal density with the given mean and std evaluated at x.

  Implements 1/(std*sqrt(2*pi)) * exp(-(x-mean)^2 / (2*std^2)).
  """
  normaliser = 1/(std*sqrt(2*pi))
  squared_deviation = (x-mean)**2
  return normaliser * exp(-squared_deviation/(2*std**2))

In [None]:
# Gaussian Naive Bayes: for every test row, multiply the per-feature Gaussian
# densities under each class and store the class with the largest product in
# test['predicted'].
class_list = set(train[class_label])
probs = {}
test['predicted'] = None

feature_cols = [col for col in train.columns if col != class_label]

# Fit once: the per-class mean and standard deviation depend only on the
# training data, so they are computed here rather than being recomputed for
# every (test row, class, feature) combination inside the prediction loop.
class_stats = {}
for c in class_list:
  current_df = train[train[class_label] == c]
  class_stats[c] = (current_df.mean(), current_df.std())

# `pos` drives the progress bar; `idx` is the index label used with .loc, so
# the loop does not depend on test having a 0..n-1 integer index.
for pos, idx in enumerate(test.index):
  if not (pos%100): print('|', end='')
  test_instance = test.loc[idx]
  for c in class_list:
    means, stds = class_stats[c]
    current_prob = 1
    for col in feature_cols:
      current_prob *= gauss_probability(test_instance[col], means[col], stds[col])
    probs[c] = current_prob
  test.loc[idx,'predicted'] = max(probs, key=probs.get)

print()
# Fraction of test rows whose predicted class equals the true class.
correct = (test[class_label] == test['predicted']).sum()
print(f'accuracy: {correct/len(test)}')

from sklearn.metrics import f1_score
# Use class_label (not a hard-coded column name) so the cell follows the
# configuration set when the dataset was loaded.
print(f"f1 score: {f1_score(list(test[class_label]), list(test['predicted']))}")

|||
accuracy: 0.8436363636363636
f1 score: 0.818565400843882


## Beta Distribution
$P(x) = \frac {(x-a)^{\alpha-1}(b-x)^{\beta-1}} {B(\alpha,\beta)(b-a)^{\alpha+\beta-1}}$ for $a \le x \le b$,
where
$B(\alpha,\beta) = \int_0^1 {t^{\alpha-1}(1-t)^{\beta-1}dt}$

In [None]:
from scipy.special import beta
def beta_probability(x, data, m1, m2):
  """Evaluate at x a four-parameter beta density fitted to data.

  The support [lower, upper] is taken from the data range, widened by 0.1 and
  snapped outward to integers. The two shape parameters are then obtained
  from the supplied mean and standard deviation by the method of moments.

  Args:
    x: point at which the density is evaluated.
    data: iterable of observed values; only its minimum and maximum are used.
    m1: mean used for the moment fit.
    m2: standard deviation used for the moment fit.

  Returns:
    The density value, or 0.0 when x lies outside [lower, upper].
  """
  from math import ceil, floor

  # floor/ceil always move the bounds outward. round() could move a bound
  # *inside* the data range (e.g. a minimum of -7.04 became -7), which made
  # (x-lower) negative for observed values and the density NaN or complex.
  lower = floor(min(data)-0.1)
  upper = ceil(max(data)+0.1)

  # A density is zero outside its support; written so that a NaN x still
  # falls through and propagates as NaN.
  if x < lower or x > upper:
    return 0.0

  width = upper-lower
  # Factor shared by both method-of-moments estimates; it equals
  # m2^2 - (m1-lower)*(upper-m1).
  shared = lower*upper-lower*m1-upper*m1+(m1**2)+(m2**2)
  alpha = (lower-m1)*shared/((m2**2)*width)
  b = -(upper-m1)*shared/((m2**2)*width)
  #print(f'alpha: {alpha} beta: {b}')
  return (((x-lower)**(alpha-1))*(upper-x)**(b-1))/((width**(alpha+b-1))*beta(alpha,b))

In [None]:
from math import isnan
from scipy.stats import shapiro
import warnings

# Suppress all warnings from here on (affects the whole session).
warnings.filterwarnings("ignore")

# Naive Bayes with a per-(class, feature) choice of density: a Gaussian when
# the Shapiro-Wilk test does not reject normality (p > 0.05), otherwise the
# fitted beta density.
class_list = set(train[class_label])
probs = {}
test['predicted'] = None

candidate_cols = [col for col in train.columns if col != class_label]
# Columns whose name contains 'score' are excluded from the likelihood.
feature_cols = [col for col in candidate_cols if 'score' not in col]

# Fit once per class: the class subset, its mean/std and the normality flags
# depend only on the training data, so they are not recomputed for every
# (test row, class, feature) combination inside the prediction loop.
norm = {}         # normality dictionary: norm[class][column] -> bool
class_stats = {}  # class -> (training rows of that class, means, stds)
for c in class_list:
  current_df = train[train[class_label] == c]
  class_stats[c] = (current_df, current_df.mean(), current_df.std())
  norm[c] = {col: shapiro(current_df[col])[1] > 0.05 for col in candidate_cols}

# Counts how many density evaluations used each distribution.
STATS = {'gauss': 0, 'beta': 0}
# `pos` drives the progress bar; `idx` is the index label used with .loc, so
# the loop does not depend on test having a 0..n-1 integer index.
for pos, idx in enumerate(test.index):
  test_instance = test.loc[idx]
  if not (pos%100): print('|',end='')
  for c in class_list:
    current_df, means, stds = class_stats[c]
    current_prob = 1
    for col in feature_cols:
      if norm[c][col]:
        current_prob *= gauss_probability(test_instance[col], means[col], stds[col])
        STATS['gauss'] += 1
      else:
        current_prob *= beta_probability(test_instance[col], current_df[col], means[col], stds[col])
        STATS['beta'] += 1
    # A NaN likelihood cannot be ranked by max(); treat it as zero.
    probs[c] = 0 if isnan(current_prob) else current_prob

  test.loc[idx,'predicted'] = max(probs, key=probs.get)

print()
# Fraction of test rows whose predicted class equals the true class.
correct = (test[class_label] == test['predicted']).sum()
print("ACCURACY")
print(correct/len(test))

print("STATS")
print(STATS)

# Diagnostic output: the last test row processed and the share of rows whose
# class equals 0.5.
print(test.loc[idx])
print(len(test[test[class_label] == 0.5])/len(test[:300]))

# NOTE: f1_score is imported in the Gaussian Naive Bayes cell above.
# Use class_label (not a hard-coded column name) so the cell follows the
# configuration set when the dataset was loaded.
print(f"f1 score: {f1_score(list(test[class_label]), list(test['predicted']))}")

|||
ACCURACY
0.9090909090909091
STATS
{'gauss': 0, 'beta': 2200}
variance     -2.6685
skewness    -10.4519
curtosis      9.1139
entropy      -1.7323
class              1
predicted          1
Name: 274, dtype: object
0.0
f1 score: 0.8979591836734693
