<a href="https://colab.research.google.com/github/reindri/Tupro3_PAI/blob/main/Tubes_Gita_PAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library

In [None]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, Normalizer, StandardScaler, RobustScaler
from sklearn.metrics import classification_report, confusion_matrix

# Dataset Preprocessing

## Train

In [None]:
# Load the 'train' sheet of the project workbook directly from GitHub,
# using the 'id' column as the row index.
train = pd.read_excel('https://github.com/reindri/Tupro3_PAI/blob/main/traintest.xlsx?raw=true', index_col='id', sheet_name='train')
# Display the loaded frame (long frames are truncated in the rendered output).
train

Unnamed: 0_level_0,x1,x2,x3,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,60,64,0,1
2,54,60,11,0
3,65,62,22,0
4,34,60,0,1
5,38,69,21,0
...,...,...,...,...
292,59,64,1,1
293,65,67,0,1
294,53,65,12,0
295,57,64,1,0


In [None]:
# Summarise the training frame: column dtypes and non-null counts.
train.info()

In [None]:
# Descriptive statistics for the numeric columns of the training frame.
train.describe()

In [None]:
# Split the training frame into the feature columns (x1, x2, x3) ...
data_train = train[['x1','x2','x3']]
# ... and the target column y.
label_train = train['y']

In [None]:
# Inspect the training feature frame.
data_train

In [None]:
# Inspect the training label series.
label_train

## Scaling data

In [None]:
# Rescale the training features with RobustScaler, re-wrapping the result in a
# DataFrame that keeps the original column names and index.
# NOTE(review): the scaler is fitted on all training rows, so when this frame
# is later cross-validated the held-out fold has already influenced the
# scaling statistics; the fitted scaler object is also discarded here.
data_train_scaled_robust = pd.DataFrame(RobustScaler().fit_transform(data_train), columns=data_train.columns, index=data_train.index)
data_train_scaled_robust

In [None]:
# Rescale the training features with StandardScaler, re-wrapping the result in
# a DataFrame that keeps the original column names and index.
# NOTE(review): the scaler is fitted on all training rows and then discarded,
# so it cannot be reused to transform other data with the same statistics.
data_train_scaled_standard = pd.DataFrame(StandardScaler().fit_transform(data_train), columns=data_train.columns, index=data_train.index)
data_train_scaled_standard

In [None]:
# Apply Normalizer to the training features, re-wrapping the result in a
# DataFrame that keeps the original column names and index.
# Unlike the column-wise scalers in the neighbouring cells, scikit-learn's
# Normalizer rescales each row (sample) to unit norm.
data_train_scaled_normalizer = pd.DataFrame(Normalizer().fit_transform(data_train), columns=data_train.columns, index=data_train.index)
data_train_scaled_normalizer

In [None]:
# Rescale the training features with MaxAbsScaler, re-wrapping the result in a
# DataFrame that keeps the original column names and index.
# NOTE(review): the scaler is fitted on all training rows and then discarded,
# so it cannot be reused to transform other data with the same statistics.
data_train_scaled_maxabs = pd.DataFrame(MaxAbsScaler().fit_transform(data_train), columns=data_train.columns, index=data_train.index)
data_train_scaled_maxabs

In [None]:
# Rescale the training features with MinMaxScaler, re-wrapping the result in a
# DataFrame that keeps the original column names and index.
# NOTE(review): the scaler is fitted on all training rows and then discarded,
# so it cannot be reused to transform other data with the same statistics.
data_train_scaled_minmax = pd.DataFrame(MinMaxScaler().fit_transform(data_train), columns=data_train.columns, index=data_train.index)
data_train_scaled_minmax

# Method

In [None]:
class NaiveBayes:
  """Gaussian Naive Bayes classifier.

  Each feature is modelled, per class, as an independent normal distribution.
  Prediction picks the class with the largest log-posterior
  ``log P(class) + sum_j log N(x_j | mean_j, variance_j)``.

  Attributes set by ``fit``:
    n_samples, n_features: shape of the training matrix.
    classes: sorted array of the distinct labels seen in ``y``.
    n_classes: number of distinct labels.
    mean: per-class feature means, shape (n_classes, n_features).
    variance: per-class feature variances (with a small smoothing term added
      so they are never zero), shape (n_classes, n_features).
    priors: per-class relative frequencies, shape (n_classes,).
  """

  def fit(self, X, y):
    """Estimate per-class means, variances and priors.

    Args:
      X: 2-D array-like (DataFrame or ndarray) of numeric features,
        one row per sample.
      y: 1-D array-like of class labels, one per row of X. Labels may be any
        values comparable with ``==`` (they need not be 0..n-1).

    Raises:
      ValueError: if X is not 2-D, is empty, or its row count differs from y.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    if X.ndim != 2:
      raise ValueError('X must be 2-dimensional (n_samples, n_features)')
    if X.size == 0:
      raise ValueError('cannot fit NaiveBayes on an empty dataset')
    if y.shape[0] != X.shape[0]:
      raise ValueError('X and y must contain the same number of samples')

    # get number of samples (rows) and features (columns)
    self.n_samples, self.n_features = X.shape
    # learn the actual label values instead of assuming they are 0..n-1
    self.classes = np.unique(y)
    self.n_classes = len(self.classes)

    # create three zero-matrices to store summary stats & prior
    self.mean = np.zeros((self.n_classes, self.n_features))
    self.variance = np.zeros((self.n_classes, self.n_features))
    self.priors = np.zeros(self.n_classes)

    # Variance smoothing: a feature that is constant within a class would
    # otherwise have zero variance and cause a division by zero at predict
    # time. The term is a tiny fraction of the largest overall feature variance.
    epsilon = 1e-9 * np.var(X, axis=0).max()
    if not epsilon > 0:
      # every feature is constant over the whole dataset
      epsilon = 1e-9

    for i, c in enumerate(self.classes):
      # create a subset of data for the specific class 'c'
      X_c = X[y == c]

      # calculate statistics and update zero-matrices, rows=classes, cols=features
      self.mean[i, :] = X_c.mean(axis=0)
      self.variance[i, :] = X_c.var(axis=0) + epsilon
      self.priors[i] = X_c.shape[0] / self.n_samples

  def predict(self, X):
    """Predict a class label for every row of X.

    Args:
      X: 2-D array-like (DataFrame or ndarray) with the same feature columns,
        in the same order, as the data passed to ``fit``.

    Returns:
      np.ndarray with one predicted label per row.
    """
    X = np.asarray(X, dtype=float)
    # for each sample x in the dataset X
    y_hat = [self.get_class_probability(x) for x in X]
    return np.array(y_hat)

  def get_class_probability(self, x):
    """Return the most probable class label for a single sample.

    Args:
      x: 1-D array of feature values for one sample.

    Returns:
      The label (an element of ``self.classes``) with the highest
      log-posterior. For labels 0..n-1 this equals the class index.
    """
    # Log-likelihood of x under every class at once: `x` broadcasts against
    # the (n_classes, n_features) statistics, then features are summed.
    log_likelihood = np.sum(
      self._log_gaussian_density(x, self.mean, self.variance), axis=1)
    posteriors = np.log(self.priors) + log_likelihood

    # return the label with the highest class probability
    return self.classes[np.argmax(posteriors)]

  def gaussian_density(self, x, mean, var):
    """Normal probability density of x for the given mean and variance.

    Not used by ``predict`` (which works in log space to avoid underflow);
    kept so existing callers of this method continue to work.
    """
    const = 1 / np.sqrt(var * 2 * np.pi)
    proba = np.exp(-0.5 * ((x - mean) ** 2 / var))
    return const * proba

  def _log_gaussian_density(self, x, mean, var):
    """Log of the normal density, computed directly.

    Taking ``np.log`` of ``gaussian_density`` yields ``-inf`` once the density
    underflows to 0, which makes distant samples indistinguishable between
    classes; the closed form below stays finite.
    """
    return -0.5 * (np.log(2 * np.pi * var) + (x - mean) ** 2 / var)

In [None]:
def start_classifying(data,label, n_splits=5):
  """Run K-fold cross-validation of NaiveBayes and report per-fold results.

  For every fold this prints a classification report, shows a confusion-matrix
  heatmap and records the fold accuracy; the mean accuracy over all folds is
  printed at the end.

  Args:
    data: DataFrame of features, one row per sample.
    label: Series of class labels positionally matching the rows of `data`.
    n_splits: number of folds passed to KFold (consecutive, unshuffled splits).

  Returns:
    None. All results are printed / plotted.
  """
  # Fix the label set once so every fold's confusion matrix has the same
  # shape, even when a fold happens to contain only one class.
  class_labels = np.unique(label)
  tick_labels = [str(c) for c in class_labels]

  kf = KFold(n_splits = n_splits)
  rata_rata_akurasi = []
  for f, (train_index, test_index) in enumerate(kf.split(data), start=1):
    print('Fold :', f)
    x_train, x_test = data.iloc[train_index], data.iloc[test_index]
    y_train, y_test = label.iloc[train_index], label.iloc[test_index]

    nb = NaiveBayes()
    nb.fit(x_train, y_train)
    predict = nb.predict(x_test)

    # scikit-learn metrics take (y_true, y_pred); passing them the other way
    # round swaps precision and recall in the report.
    print(classification_report(y_test, predict))
    print()

    # Rows are true labels and columns are predicted labels, matching the
    # axis labels set on the heatmap below.
    cnf_matrix = confusion_matrix(y_test, predict, labels=class_labels)
    s = sn.heatmap(cnf_matrix,
              annot=True,
              fmt='d',  # show raw integer counts
              xticklabels = tick_labels,
              yticklabels = tick_labels,
              )
    s.set_xlabel('Predicted Label')
    s.set_ylabel('True Label')
    plt.show()

    # accuracy (%) = correctly classified samples (diagonal) / all samples
    rata_rata_akurasi.append(np.trace(cnf_matrix) / cnf_matrix.sum() * 100)
    print('-'*100)

  print(f'Rata-Rata Akurasi dari \nK-Fold Cross Validation adalah : {np.mean(rata_rata_akurasi):.3f} %' )

In [None]:
# Baseline: 5-fold cross-validation on the unscaled features.
start_classifying(data_train, label_train, n_splits= 5)

In [None]:
# 5-fold cross-validation on the RobustScaler-scaled features.
start_classifying(data_train_scaled_robust, label_train, n_splits= 5)

In [None]:
# 5-fold cross-validation on the StandardScaler-scaled features.
start_classifying(data_train_scaled_standard, label_train, n_splits= 5)

In [None]:
# 5-fold cross-validation on the MinMaxScaler-scaled features.
start_classifying(data_train_scaled_minmax, label_train, n_splits= 5)

In [None]:
# 5-fold cross-validation on the MaxAbsScaler-scaled features.
start_classifying(data_train_scaled_maxabs, label_train, n_splits= 5)

In [None]:
# 5-fold cross-validation on the Normalizer-scaled features.
start_classifying(data_train_scaled_normalizer, label_train, n_splits= 5)

# Test

In [None]:
# Load the 'test' sheet of the same workbook from GitHub, using the 'id'
# column as the row index.
test = pd.read_excel('https://github.com/reindri/Tupro3_PAI/blob/main/traintest.xlsx?raw=true', index_col='id', sheet_name='test')
# Display the loaded frame.
test

In [None]:
# Summarise the test frame: column dtypes and non-null counts.
test.info()

In [None]:
# Descriptive statistics for the numeric columns of the test frame.
test.describe()

In [None]:
# Split the test frame into the feature columns (x1, x2, x3) ...
data_test = test[['x1','x2','x3']]
# ... and the y column.
# NOTE(review): in the cells below label_test is only used for its index when
# building the prediction Series; predictions are never compared against it.
label_test = test['y']

In [None]:
# Inspect the test feature frame.
data_test

In [None]:
# Inspect the test y column.
label_test

In [None]:
# Apply Normalizer to the test features, keeping the original column names and
# index. Normalizer rescales each row independently, so the fit() call on
# data_train learns no statistics from the training data.
data_test_scaled_normalizer = pd.DataFrame(Normalizer().fit(data_train).transform(data_test), columns=data_test.columns, index=data_test.index)
data_test_scaled_normalizer

In [None]:
# Train on the full unscaled training set and predict the unscaled test features.
nb = NaiveBayes()
nb.fit(data_train, label_train)
predict = nb.predict(data_test)

In [None]:
# Wrap the current predictions in a Series named 'y' that shares the test
# set's index, then display it.
label_predict = pd.Series(predict, name='y', index=label_test.index)
label_predict

In [None]:
# Retrain on the Normalizer-scaled training features and predict the
# Normalizer-scaled test features.
# NOTE(review): this rebinds `nb` and `predict`, so the model and predictions
# from the unscaled run are no longer reachable after this cell.
nb = NaiveBayes()
nb.fit(data_train_scaled_normalizer, label_train)
predict = nb.predict(data_test_scaled_normalizer)

In [None]:
# Wrap the current predictions in a Series named 'y' that shares the test
# set's index, then display it.
# NOTE(review): this rebinds `label_predict`; which model it reflects depends
# on which fit/predict cell ran last.
label_predict = pd.Series(predict, name='y', index=label_test.index)
label_predict