# Naive Bayes

## Student Information

|Name|NetId|
|-|-|
|Xing Gao|xgao058|

## Introduction

Some libraries should be imported first:

In [None]:
# shared code
import math
import numpy as np
import pandas as pd

Load our dataset into a dataframe:

In [None]:
# shared code
# Read the obesity dataset CSV (path is relative to the working directory)
# into the module-level dataframe `df`.
df = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')

Get `X` (a matrix of the feature values) and `y` (a vector of the labels) from the dataframe:

In [None]:
# shared code
# Binary target: 1 when the 'NObeyesdad' class name starts with 'Obesity',
# otherwise 0.
y: pd.Series = df['NObeyesdad'].str.startswith('Obesity').astype(int)
# Remove the label column, plus Height and Weight, from the dataframe in place.
del df['NObeyesdad']
del df['Height']
del df['Weight']
# X is the same object as df (not a copy), so later in-place changes to df
# are visible through X as well.
X: pd.DataFrame = df

Accuracy, precision, recall and f1 are our metrics:

In [None]:
# shared code
def evaluate(y_actual, y_pred):
    """Print accuracy, precision, recall, F1 and the confusion matrix.

    Both arguments are handed unchanged to the scikit-learn metric functions,
    true labels first and predicted labels second.  Nothing is returned.
    """
    from sklearn import metrics

    scorers = (
        ('accuracy:', metrics.accuracy_score),
        ('precision:', metrics.precision_score),
        ('recall:', metrics.recall_score),
        ('f1:', metrics.f1_score),
    )
    for label, scorer in scorers:
        print(label, scorer(y_actual, y_pred))

    print('confusion matrix:')
    print(metrics.confusion_matrix(y_actual, y_pred))

## Preprocessing

Since Gaussian naive Bayes only accepts numeric values, I need to map categorical values to numeric ones.
For the `MTRANS` feature, I assume the encoded number reflects the amount of exercise: motorised transport maps to 0, while walking and cycling map to 1.

Then normalize all features using min-max.

In [None]:
def preprocessing() -> pd.DataFrame:
    """Encode the categorical columns of the global ``df`` as numbers, in place.

    Binary columns become 0/1, the frequency columns (``CAEC``, ``CALC``)
    become 0-3, and ``MTRANS`` becomes 0 for motorised transport and 1 for
    walking or cycling.  This function only encodes; it does not rescale the
    features.

    The function is safe to call more than once: a column that is already
    numeric is left untouched.  ``Series.map`` replaces every value that is
    missing from the mapping with NaN, so re-mapping an already encoded
    column would otherwise erase it.

    Returns:
        The same (mutated) global dataframe ``df``.
    """
    freq_map = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
    yes_no_map = {'yes': 1, 'no': 0}
    column_maps = {
        'Gender': {'Male': 1, 'Female': 0},
        'family_history_with_overweight': yes_no_map,
        'FAVC': yes_no_map,
        'CAEC': freq_map,
        'SMOKE': yes_no_map,
        'SCC': yes_no_map,
        'CALC': freq_map,
        'MTRANS': {'Automobile': 0, 'Motorbike': 0, 'Public_Transportation': 0,
                   'Walking': 1, 'Bike': 1},
    }

    for column, mapping in column_maps.items():
        # Already encoded by an earlier call (e.g. the cell was re-run).
        if pd.api.types.is_numeric_dtype(df[column]):
            continue
        df[column] = df[column].map(mapping)

    return df

## My Implementation

We want to compare the probability of the data belonging to each class. Using Bayes' theorem, this posterior probability can be decomposed as follows ($Y$ is the class, $X$ is the attribute vector):

$$P(Y|X) = \frac{P(X|Y)P(Y)}{P(X)} $$

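$c$ is the class and $x = \{ a_1, \dots, a_n \}$ is the set of attribute values. Naive Bayes assumes the attributes are conditionally independent given the class, so the likelihood factorises into a product:

$$P(x|c) = \prod_{j=1}^n P(a_j|c)$$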

We assume that every attribute follows a Gaussian (normal) distribution within each class, and use the Gaussian density function to compute the likelihoods.
$a_j$ is the value of attribute $j$, and $\mu_{c,j}$ and $\sigma_{c,j}$ are the mean and standard deviation of attribute $j$ over the samples of class $c$, respectively:


$$P(a_j|c) = \frac{1}{\sqrt{2 \pi \sigma^2_{c,j}}} \exp\left(-\frac{(a_j-\mu_{c,j})^2}{2\sigma^2_{c,j}}\right)$$



In [None]:
class NaiveBayes(object):
    """Two-class Gaussian naive Bayes classifier.

    Label ``0`` is the "normal" class; every other label is treated as the
    "obesity" class.  Each feature is modelled per class by a Gaussian whose
    mean and standard deviation are estimated in :meth:`fit`.

    Attributes set by :meth:`fit`:
        normalData, obesityData: training rows of each class.
        norMean, norStd, obeMean, obeStd: per-feature mean / standard deviation.
        norPY, obePY: class priors (fraction of training rows in each class).
    """

    # Variance substituted for a feature that is constant within a class, so
    # that the Gaussian density stays finite.
    _MIN_VARIANCE = 0.001

    def __init__(self):
        return

    @classmethod
    def _variance(cls, std):
        """Return ``std**2`` with exact zeros replaced by ``_MIN_VARIANCE``."""
        variance = np.square(np.asarray(std, dtype=float))
        return np.where(variance == 0, cls._MIN_VARIANCE, variance)

    def gauss(self, test, mean, std):
        """Return the Gaussian density of ``test`` for the given mean and std.

        Accepts scalars or arrays that broadcast against each other.  A zero
        standard deviation is replaced by a small positive variance.
        """
        variance = self._variance(std)
        squared_error = (test - mean) * (test - mean)
        return np.exp(-squared_error / (variance * 2)) / np.sqrt(2 * variance * np.pi)

    def _log_gauss(self, test, mean, std):
        """Return the logarithm of :meth:`gauss`, computed without underflow."""
        variance = self._variance(std)
        return -0.5 * np.log(2 * np.pi * variance) - (test - mean) ** 2 / (2 * variance)

    def fit(self, x_train: np.ndarray, y_train: np.ndarray):
        """Estimate the per-class feature statistics and the class priors.

        Args:
            x_train: 2-D array, one row per sample and one column per feature.
            y_train: labels, one per row of ``x_train``; ``0`` means normal,
                anything else means obesity.

        Raises:
            ValueError: if either class has no training samples, because its
                mean and standard deviation would be undefined.
        """
        x_train = np.asarray(x_train)
        y_train = np.asarray(y_train).ravel()

        is_normal = y_train == 0
        self.normalData = x_train[is_normal]
        self.obesityData = x_train[~is_normal]
        if self.normalData.shape[0] == 0 or self.obesityData.shape[0] == 0:
            raise ValueError('fit() needs at least one training sample of each class')

        self.norMean = self.normalData.mean(axis=0)
        self.norStd = self.normalData.std(axis=0)
        self.obeMean = self.obesityData.mean(axis=0)
        self.obeStd = self.obesityData.std(axis=0)

        # Priors follow the same split as the data above, so they stay
        # consistent even if the positive label is not exactly 1.
        n_samples = y_train.shape[0]
        self.norPY = self.normalData.shape[0] / n_samples
        self.obePY = self.obesityData.shape[0] / n_samples

        return

    def predict(self, x_test: np.ndarray):
        """Predict a 0/1 label for every row of ``x_test``.

        The class posteriors are compared in log space: multiplying many
        small densities underflows to zero, after which both classes look
        equally (im)probable.  Ties are resolved in favour of class 0.

        Args:
            x_test: 2-D array with the same columns that were used in ``fit``.

        Returns:
            1-D integer array; 1 where the obesity class is more probable.
        """
        x_test = np.asarray(x_test, dtype=float)

        log_nor = np.log(self.norPY) + self._log_gauss(
            x_test, self.norMean, self.norStd).sum(axis=1)
        log_obe = np.log(self.obePY) + self._log_gauss(
            x_test, self.obeMean, self.obeStd).sum(axis=1)

        return (log_obe > log_nor).astype(int)


## Comparison

To reduce the randomness of train_test_split, I split the dataset with 50 different random_states. For each split I train the off-the-shelf implementation and mine using the training set and predict the labels for the test set. I concatenate the $y_{actual}$ and $y_{pred}$ of each split and use them to compute the metrics.

My implementation performs better than off-the-shelf on f1 score.


In [None]:
def run_naive_bayes(X, y, n_splits=50, test_size=0.3):
    """Compare the hand-written NaiveBayes with scikit-learn's GaussianNB.

    The data is split ``n_splits`` times with a stratified train/test split,
    using the split index as ``random_state``.  Both classifiers are trained
    on each training part and predict the matching test part.  The true and
    predicted labels of all splits are pooled and passed to ``evaluate``,
    once per implementation.

    Args:
        X: feature dataframe; every column must be convertible to float.
        y: label series aligned with ``X``.
        n_splits: number of random splits to pool.
        test_size: fraction of the rows held out for testing in each split.
    """
    # Imported locally, like the metrics in evaluate(): the notebook's shared
    # import cell does not bring these names into scope.
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import GaussianNB

    # The original called X.astype(np.int32) / y.astype(np.int32) and threw
    # the results away; applying them would have truncated the real-valued
    # features.  Convert to float once instead, which also fails early if a
    # categorical column has not been encoded yet.
    X = X.to_numpy(dtype=float)
    y = y.to_numpy()

    # Collect the per-split arrays and join them once at the end rather than
    # re-copying the accumulated arrays on every iteration.
    y_test_parts = []
    my_pred_parts = []
    skl_pred_parts = []

    for seed in range(n_splits):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed, stratify=y)
        y_test_parts.append(y_test)

        nb = NaiveBayes()
        nb.fit(X_train, y_train)
        my_pred_parts.append(nb.predict(X_test))

        clf = GaussianNB().fit(X_train, y_train)
        skl_pred_parts.append(clf.predict(X_test))

    y_test_all = np.concatenate(y_test_parts)
    myPredAll = np.concatenate(my_pred_parts)
    sklPredAll = np.concatenate(skl_pred_parts)

    print('[+] off-the-shelf implementation:')
    evaluate(y_test_all, sklPredAll)
    print()

    print('[+] my implementation:')
    evaluate(y_test_all, myPredAll)
    print()

In [None]:
# Run the comparison on copies of X and y.
# NOTE(review): no visible cell calls preprocessing() before this point, yet the
# categorical columns must be numeric for both classifiers — confirm it was run.
run_naive_bayes(X.copy(),y.copy())