# Code

## Group

|Name|NetId|Approach|
|-|-|-|
|Zhengchuan Liang|zlian064|Perceptron|
|Diana Men|lmen004|Logistic Regression|
|Xing Gao|xgao058|Naive Bayes|
|Yuwei Zhang|yzhan995|ID3|
|Shixun Wu|swu264|Cart|

## Introduction

Some libraries should be imported first:

In [3]:
# shared code
import math
import numpy as np
import pandas as pd

ModuleNotFoundError: No module named 'numpy'

Load our dataset into a dataframe:

In [2]:
# shared code
# Read the raw dataset into `df`; the file name has no directory component,
# so it is resolved against the current working directory.
df = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')

Get `X` (a matrix of the features values) and `y` (a vector of the labels) from the dataframe:

In [3]:
# shared code
# Binary target: 1 when the class name starts with 'Obesity', 0 otherwise.
# `pop` hands back the label column and removes it from `df` in one step.
y: pd.Series = df.pop('NObeyesdad').str.startswith('Obesity').astype(int)
# Height and Weight are removed from the feature table as well.
df.drop(columns=['Height', 'Weight'], inplace=True)
# X is the very same object as df (no copy is made here).
X: pd.DataFrame = df

Accuracy, precision, recall and f1 are our metrics:

In [4]:
# shared code
def evaluate(y_actual, y_pred):
    """Print accuracy, precision, recall, F1 and the confusion matrix.

    Both arguments are forwarded unchanged to the scikit-learn metric
    functions. Nothing is returned; all results go to standard output.
    """
    from sklearn import metrics

    # (label, scorer) pairs, printed in this order.
    scorers = (
        ('accuracy:', metrics.accuracy_score),
        ('precision:', metrics.precision_score),
        ('recall:', metrics.recall_score),
        ('f1:', metrics.f1_score),
    )
    for label, scorer in scorers:
        print(label, scorer(y_actual, y_pred))

    print('confusion matrix:')
    print(metrics.confusion_matrix(y_actual, y_pred))

## Perceptron

### Preprocessing

Since perceptron only accepts numeric values, I need to map categorical values to numeric ones.
For the `MTRANS` feature, I map each transport mode to a number that I believe reflects the amount of exercise involved (0 for motorized transport, 1 for walking or cycling).

Then normalize all features using min-max.

In [5]:
def preprocessing_perceptron(X: pd.DataFrame) -> pd.DataFrame:
    """Encode the categorical features as numbers and min-max scale every column.

    Args:
        X: Feature table containing the columns ``Gender``,
            ``family_history_with_overweight``, ``FAVC``, ``CAEC``, ``SMOKE``,
            ``SCC``, ``CALC`` and ``MTRANS`` as strings; every other column
            must already be numeric.

    Returns:
        A new DataFrame with the same columns, each scaled to ``[0, 1]``.
        The input is not modified. A category that is missing from the
        mapping tables below becomes NaN (``Series.map`` behaviour).
    """
    freq_map = {'no': 0, 'Sometimes': 1,
                'Frequently': 2, 'Always': 3}

    yes_no_map = {'yes': 1, 'no': 0}

    X = X.copy()
    X['Gender'] = X['Gender'].map({'Male': 1, 'Female': 0})
    X['family_history_with_overweight'] = X['family_history_with_overweight'].map(
        yes_no_map)
    X['FAVC'] = X['FAVC'].map(yes_no_map)
    X['CAEC'] = X['CAEC'].map(freq_map)
    X['SMOKE'] = X['SMOKE'].map(yes_no_map)
    X['SCC'] = X['SCC'].map(yes_no_map)
    X['CALC'] = X['CALC'].map(freq_map)
    # Motorized transport -> 0, walking or cycling -> 1.
    X['MTRANS'] = X['MTRANS'].map(
        {'Automobile': 0, 'Motorbike': 0, 'Public_Transportation': 0, 'Walking': 1, 'Bike': 1})

    lowest = X.min()
    span = X.max() - lowest
    # A constant column has a zero range; dividing by it would turn the whole
    # column into NaN (0 / 0). Dividing by 1 instead maps such a column to 0.
    span = span.where(span != 0, 1)
    return (X - lowest) / span

### My implementation

The activation function of my perceptron is a step function:
$$
F(net)=1\text{ if } net>0,0\text{ otherwise}\\
net=\sum_{i=0}^nw_ix_i=\textbf{w}^T\textbf{x}
$$

For a single sample $(\textbf{x},y)$ where $\textbf{x}\in\mathbb{R}^n$ and $y\in\mathbb{R}$, the loss function is:
$$
L(\textbf{w})=(F(net)-y)\textbf{w}^T\textbf{x}
$$

And the gradient is:
$$
\frac{\partial}{\partial w_i}L(\textbf{w})=(F(net)-y)x_i
$$

$$
\nabla L(\textbf{w})=(F(net)-y)\textbf{x}
$$

In my implementation, I use **batch gradient** for all samples instead.

Given features matrix $\textbf{X}\in\mathbb{R}^{m\times n}$ and label vector $\textbf{y}\in\mathbb{R}^{m}$, the steps to train a perceptron are as follows:

Initialize weights $\textbf{w}$ with a random vector.

Predict the labels $\textbf{y}_{pred}$ using the perceptron.

The loss function is:
$$
L(\textbf{w})=(\textbf{y}_{pred}-\textbf{y})^T\textbf{X}\textbf{w}
$$

The batch gradient is:
$$
\nabla L(\textbf{w})^T=\frac{1}{m}(\textbf{y}_{pred}-\textbf{y})^T\textbf{X}
$$

Update the weights:
$$
\textbf{w}\leftarrow \textbf{w}-\eta\nabla L(\textbf{w})
$$

Repeat the prediction and update steps 200 times.

In [6]:
def add_ones_column(X: np.ndarray) -> np.ndarray:
    """Return a copy of the 2-D matrix ``X`` with a leading column of ones.

    The ones column is floating point, so the result is promoted accordingly.
    """
    bias_column = np.ones((len(X), 1))
    return np.concatenate((bias_column, X), axis=1)


class MyPerceptron:
    """Perceptron with a step activation, trained by full-batch gradient descent.

    A sample is predicted as 1 when ``w . [1, x] > 0`` and as 0 otherwise.
    After ``fit`` the learned weights (bias first) are available as ``self.W``.
    """

    def __init__(self, lr: float, epochs: int = 200) -> None:
        """
        Args:
            lr: Learning rate applied to every weight update.
            epochs: Maximum number of full-batch updates performed by ``fit``.
        """
        self.lr: float = lr  # learning rate
        self.epochs: int = epochs  # upper bound on the number of updates

    def fit(self, X: np.ndarray, y: np.ndarray, seed: int = 0) -> None:
        """Learn the weights from a feature matrix and a label vector.

        Args:
            X: Feature matrix with one row per sample.
            y: Label vector with one entry per row of ``X``.
            seed: Seed of the random generator that draws the initial weights.

        Raises:
            ValueError: If ``X`` is not 2-D, ``y`` is not 1-D, their lengths
                differ, or there are no samples.
        """
        if X.ndim != 2:
            raise ValueError('X must be a 2-D array')
        if y.ndim != 1:
            raise ValueError('y must be a 1-D array')
        if X.shape[0] != y.shape[0]:
            raise ValueError('X and y must contain the same number of samples')
        if X.shape[0] == 0:
            # The averaged gradient divides by the sample count.
            raise ValueError('X must contain at least one sample')

        X = add_ones_column(X)  # the extra column carries the bias weight
        m = X.shape[0]  # number of samples
        n = X.shape[1]  # number of weights (features + bias)

        # initial weights are a random unit vector
        rng = np.random.default_rng(seed=seed)
        init_W = rng.random(n)
        init_W /= np.linalg.norm(init_W)
        self.W: np.ndarray = init_W

        for _ in range(self.epochs):
            # predictions for all samples. vector of size m
            y_pred = np.where(np.dot(X, self.W) > 0, 1, 0)

            # (average) gradient for all samples. vector of size n
            gradient = (y_pred - y) @ X / m

            # A zero gradient leaves the weights unchanged, so every further
            # epoch would repeat exactly the same computation.
            if not gradient.any():
                break

            # update weights
            self.W -= self.lr * gradient

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the 0/1 prediction for every row of ``X`` using ``self.W``."""
        X = add_ones_column(X)
        return np.where(np.dot(X, self.W) > 0, 1, 0)

### Comparison

To reduce the randomness of train_test_split, I split the dataset with 50 different random_states. For each split I train the off-the-shelf implementation and mine using the training set and predict the labels for the test set. I concatenate the $y_{actual}$ and $y_{pred}$ of each split and use them to compute the metrics.

My implementation performs better than off-the-shelf on f1 score. This may result from batch gradient descent, whereas the off-the-shelf uses stochastic gradient descent.

I also compute the cosine similarity between the weights of the off-the-shelf perceptron and mine. The result shows that the weights of my implementation are very similar to those of the off-the-shelf one: the average similarity is much higher than the average similarity of two random 15-dimensional vectors (around 0.756). This supports the correctness of my implementation.

In [7]:
def cosine_sim(x, y):
    """Return the cosine of the angle between the vectors ``x`` and ``y``.

    This is their dot product divided by the product of their Euclidean norms.
    """
    dot_product = x @ y
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    return dot_product / norm_product


def run_perceptron(X: pd.DataFrame, y: pd.Series):
    """Compare scikit-learn's Perceptron with MyPerceptron over 50 splits.

    The features are preprocessed once. For each random state 0..49 a
    stratified 90/10 train/test split is drawn, both models are trained on the
    training part and predict the test part. The pooled predictions of all
    splits are then scored with ``evaluate`` and the mean cosine similarity
    between the two weight vectors is printed.
    """
    from sklearn.linear_model import Perceptron
    from sklearn.model_selection import train_test_split

    features = preprocessing_perceptron(X).to_numpy()
    labels = y.to_numpy()

    actual_parts = []
    shelf_parts = []
    mine_parts = []
    similarities = []
    for split_seed in range(50):
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.1, random_state=split_seed,
            stratify=labels)
        actual_parts.append(y_test)

        # off-the-shelf library: train on the training set, predict the test set
        shelf = Perceptron(tol=1e-3, random_state=split_seed)
        shelf.fit(X_train, y_train)
        shelf_parts.append(shelf.predict(X_test))
        # bias first, then the feature weights, to line up with MyPerceptron.W
        shelf_weights = np.append(shelf.intercept_, shelf.coef_.flatten())

        # my implementation: same training and test data
        mine = MyPerceptron(1)
        mine.fit(X_train, y_train, seed=split_seed)
        mine_parts.append(mine.predict(X_test))

        similarities.append(cosine_sim(shelf_weights, mine.W))

    # Pool the per-split results into flat float vectors.
    y_test_all = np.concatenate(actual_parts).astype(float)
    y_pred_shelf_all = np.concatenate(shelf_parts).astype(float)
    y_pred_my_all = np.concatenate(mine_parts).astype(float)
    cosine_all = np.array(similarities, dtype=float)

    for title, pooled_pred in (
        ('[+] off-the-shelf implementation:', y_pred_shelf_all),
        ('[+] my implementation:', y_pred_my_all),
    ):
        print(title)
        evaluate(y_test_all, pooled_pred)
        print()

    # average cosine similarity
    # between the weights of library and my implementation
    print("[+] average cosine similarity:", cosine_all.mean())

In [8]:
# Run the perceptron comparison; copies of the shared X and y are passed in.
run_perceptron(X.copy(), y.copy())

[+] off-the-shelf implementation:
accuracy: 0.6788679245283019
precision: 0.6497797356828194
recall: 0.6622448979591836
f1: 0.6559531028906408
confusion matrix:
[[3951 1749]
 [1655 3245]]

[+] my implementation:
accuracy: 0.6800943396226415
precision: 0.5956880152187698
recall: 0.9585714285714285
f1: 0.7347673054360578
confusion matrix:
[[2512 3188]
 [ 203 4697]]

[+] average cosine similarity: 0.9453590450484168


## Logistic Regression

In [9]:
def run_logistic_regression(X: pd.DataFrame, y: pd.Series):
    """Placeholder for the logistic regression experiment (not implemented yet)."""
    # TODO
    return None

In [10]:
# Invoke the logistic regression experiment on copies of the shared X and y.
run_logistic_regression(X.copy(), y.copy())

## Naive Bayes
### Preprocessing

In [2]:
def preprocessing(
    path: str = r'D:\OneDrive - email.ucr.edu\cs235\data\ObesityDataSet_raw_and_data_sinthetic.csv',
) -> pd.DataFrame:
    """Load the obesity dataset and encode it numerically for naive Bayes.

    Args:
        path: Location of the CSV file. The default is written as a raw
            string so that its backslashes are taken literally rather than
            parsed as (invalid) escape sequences.

    Returns:
        A DataFrame without the ``NObeyesdad``, ``Height`` and ``Weight``
        columns, with the categorical columns mapped to integers and a final
        ``Label`` column that is 1 when ``NObeyesdad`` starts with
        ``'Obesity'`` and 0 otherwise. A category that is missing from the
        mapping tables becomes NaN (``Series.map`` behaviour).
    """
    df = pd.read_csv(path)

    df['Label'] = df['NObeyesdad'].str.startswith('Obesity').astype(int)
    df = df.drop(columns=['NObeyesdad', 'Height', 'Weight'])

    freq_map = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
    yes_no_map = {'yes': 1, 'no': 0}

    df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
    for column in ('family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC'):
        df[column] = df[column].map(yes_no_map)
    for column in ('CAEC', 'CALC'):
        df[column] = df[column].map(freq_map)
    # Motorized transport -> 0, walking or cycling -> 1.
    df['MTRANS'] = df['MTRANS'].map(
        {'Automobile': 0, 'Motorbike': 0, 'Public_Transportation': 0, 'Walking': 1, 'Bike': 1})

    return df

NameError: name 'pd' is not defined

### My implementation

In [11]:
class NaiveBayes(object):
    """Gaussian naive Bayes classifier for two classes.

    Label 0 is the "normal" class; any other label is the "obesity" class.
    Every feature is modelled per class by an independent normal distribution
    whose mean and standard deviation are estimated in ``fit2``.
    """

    # Lower bound for a per-feature variance. A feature that is constant
    # within a class has zero variance, which would make the density undefined.
    _MIN_VARIANCE = 1e-12

    def __init__(self):
        self.classes = ['1', '0']

    def gauss(self, test, mean, std):
        """Return the normal density of ``test`` for the given mean and std.

        The variance is floored at ``_MIN_VARIANCE`` so that ``std == 0``
        yields a finite value instead of NaN.
        """
        var = np.maximum(std * std, self._MIN_VARIANCE)
        return np.exp(-(test - mean) ** 2 / (2 * var)) / np.sqrt(2 * np.pi * var)

    def _log_likelihood(self, x, mean, std):
        """Return, per row of ``x``, the sum of the per-feature log densities."""
        var = np.maximum(std * std, self._MIN_VARIANCE)
        per_feature = -0.5 * np.log(2 * np.pi * var) - (x - mean) ** 2 / (2 * var)
        return per_feature.sum(axis=1)

    def fit2(self, x_train: np.ndarray, y_train: np.ndarray):
        """Estimate the class priors and per-feature Gaussian parameters.

        Args:
            x_train: 2-D numeric feature matrix, one row per sample.
            y_train: 1-D label vector; 0 marks the normal class.

        Raises:
            ValueError: If one of the two classes has no training sample.
        """
        x_train = np.asarray(x_train, dtype=float)
        y_train = np.asarray(y_train)

        is_normal = y_train == 0
        self.normalData = x_train[is_normal]
        self.obesityData = x_train[~is_normal]
        if len(self.normalData) == 0 or len(self.obesityData) == 0:
            raise ValueError('both classes need at least one training sample')

        self.norMean = self.normalData.mean(axis=0)
        self.norStd = self.normalData.std(axis=0)
        self.obeMean = self.obesityData.mean(axis=0)
        self.obeStd = self.obesityData.std(axis=0)

        # Priors are the class frequencies in the training set.
        total = y_train.shape[0]
        self.norPY = len(self.normalData) / total
        self.obePY = len(self.obesityData) / total

        return

    def predict(self, x_test: np.ndarray):
        """Return 1 for rows where the obesity class scores higher, else 0.

        Scores are compared as log-probabilities: a product of many small
        densities underflows to 0.0 for both classes, which would make the
        comparison meaningless.
        """
        x_test = np.asarray(x_test, dtype=float)

        log_nor = np.log(self.norPY) + self._log_likelihood(
            x_test, self.norMean, self.norStd)
        log_obe = np.log(self.obePY) + self._log_likelihood(
            x_test, self.obeMean, self.obeStd)

        return (log_obe > log_nor).astype(int)

### Comparison

In [None]:
if __name__ == '__main__':
    # Compare NaiveBayes with scikit-learn's GaussianNB over 50 stratified
    # 70/30 train/test splits and score the pooled predictions.

    # Imported here because this cell is the only user of these two names.
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import GaussianNB

    # NOTE(review): this rebinds the notebook-level df, X and y; X and y are
    # NumPy arrays from here on.
    df = preprocessing()
    X = df.drop(columns=['Label']).to_numpy()
    y = df['Label'].to_numpy()

    myPredAll = np.empty(shape=(0,))
    sklPredAll = np.empty(shape=(0,))
    y_test_all = np.empty(shape=(0,))

    for i in range(50):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=i, stratify=y)
        y_test_all = np.append(y_test_all, y_test)

        # my implementation
        nb = NaiveBayes()
        nb.fit2(X_train, y_train)
        myPred = nb.predict(X_test)
        myPredAll = np.append(myPredAll, myPred)

        # off-the-shelf library
        clf = GaussianNB()
        clf = clf.fit(X_train, y_train)
        sklPred = clf.predict(X_test)
        sklPredAll = np.append(sklPredAll, sklPred)

    print('[+] off-the-shelf implementation:')
    evaluate(y_test_all, sklPredAll)
    print()

    print('[+] my implementation:')
    evaluate(y_test_all, myPredAll)
    print()

In [12]:
# NOTE(review): no definition of run_naive_bayes is visible in this file;
# confirm where it comes from before relying on this call.
run_naive_bayes(X.copy(), y.copy())

[+] off-the-shelf implementation:
accuracy: 0.7536908517350158
precision: 0.6609783845278726
recall: 0.955068493150685
f1: 0.7812640071716719
confusion matrix:
[[ 9948  7152]
 [  656 13944]]

[+] my implementation:
accuracy: 0.7350788643533123
precision: 0.6640567135752831
recall: 0.8597260273972602
f1: 0.7493283983045789
confusion matrix:
[[10750  6350]
 [ 2048 12552]]


## ID3

In [13]:
def run_id3(X: pd.DataFrame, y: pd.Series):
    """Placeholder for the ID3 experiment (not implemented yet)."""
    # TODO
    return None

In [14]:
# Invoke the ID3 experiment on copies of the shared X and y.
run_id3(X.copy(), y.copy())

## CART

In [15]:
def run_cart(X: pd.DataFrame, y: pd.Series):
    """Placeholder for the CART experiment (not implemented yet)."""
    # TODO
    return None

In [16]:
# Invoke the CART experiment on copies of the shared X and y.
run_cart(X.copy(), y.copy())