In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Diabetes Mellitus Prediction**


> *   Step 1: Load the input Data
> *   Step 2: Implement Gaussian Naive Bayesian classifier
> *   Step 3: Build the classifier and check the correctness of Table building
> *   Step 4 Improve the classifier for Ranking
> *   Step 5: Make prediction and perform evaluation
> *   Step 6: Generate results

## *Import Packages*

> Note: You **cannot** import any other package

In [None]:
import numpy as np
import pandas as pd
import csv
import math
import random
import pickle

you can take a look at the input feature & ground truth format:

Please split the dataset into training set and validation set

> Note: The purpose of ***random_state*** is to ensure that you can reproduce the results each time you split your dataset. This is often helpful for debugging.


# **1. Advanced Part**
In advanced part, you need to implement the Gaussian Bayesian classifier:
- input features: ***3 physiological features***
- output prediction: ***diabetes_mellitus***

## Global attributes
> You can add your own global attributes here


## Step 1: Load the input Data
Load the input file **advanced_training.csv**
> Note: please don't change the input var name ***training_df, testing_df, X, and y***.

In [None]:
# TODO: modify your file path
training_df = pd.read_csv('advanced_training.csv')
testing_df = pd.read_csv('advanced_testing.csv')
y = training_df['diabetes_mellitus']
X = training_df.drop('diabetes_mellitus', axis=1)

you can check whether the standardization works

## Step 2: Implement Gaussian Naive Bayesian classifier
In this part, you need to implement the Gaussian Naive Bayesian classifier.

The main difference between Naive Bayesian and Gaussian Naive Bayesian is the likelihood part. For Gaussian NB, we can use the probability density function (PDF) of the ***Gaussian distribution*** (also known as the Normal distribution):

$$f(x) = \frac{1}{\sqrt{2\pi\sigma^2}} exp({-\frac{(x - \mu)^2}{2\sigma^2}})$$

The reason we need to use Gaussian is that when the data type is continuous numbers instead of discrete numbers, we can't build a table by just counting all the possible cases. However, we can assume the data distribution follows a Gaussian (or Normal) distribution by calculating its mean and variance. Then, we can approximate the values, even if some records don't appear in the training set.


In [None]:
class GaussianNaiveBayesian:
    def build_table(self, X, y):
        # classes for ground truth: there are only negative(0) and positive(1) for y (hospital_death)
        self.classes = np.unique(y)
        # record prior for two classes
        self.class_priors = {}
        # **feature_mean_var_table** is a 3D dictionary table:
        # structure: [class]    [feature]           ['mean'] = mean
        # structure: [class]    [feature]           ['var']  = var
        # example:   [0]        ['gcs_eyes_apache'] ['mean'] = mean for feature='gcs_eyes_apache' when hospital_death=0
        # example:   [0]        ['gcs_eyes_apache'] ['var']  = var for feature='gcs_eyes_apache' when hospital_death=0
        self.feature_mean_var_table = {}
        for c in self.classes:
            X_c = X[y == c]
            self.class_priors[c] = len(X_c) / len(X) # TODO: calculate prior
            self.feature_mean_var_table[c] = {}
            for feature in X.columns:
                self.feature_mean_var_table[c][feature] = {}
                # Calculate mean and var for each feature
                self.feature_mean_var_table[c][feature]['mean'] = X_c[feature].mean() # TDOO: calculate the mean
                # *** 10/19 note: make sure that if you call numpy.var, the ddof should set as 1 ***
                self.feature_mean_var_table[c][feature]['var'] = X_c[feature].var(ddof=1)  # TODO: calculate the var

    def _calculate_likelihood(self, x, mean, var):
        likelihood = (1 / np.sqrt(2 * np.pi * var)) * np.exp(-(x - mean) ** 2 / (2 * var))
        return likelihood # TODO: calculate the Gaussian (normal) distribution pdf function as likelihoo

    def predict(self, X):
        predictions = [self._predict(x) for x in X.to_dict(orient='records')]
        return predictions

    def _predict(self, x):
        log_posteriors = []
        # this for loop implement: log(posteior) = log(prior) + log(likelihood)
        for c in self.classes:
            log_prior = np.log(self.class_priors[c])
            log_likelihood = 0
            for feature, value in x.items():
              #Here
                mean = self.feature_mean_var_table[c][feature]['mean']
                var = self.feature_mean_var_table[c][feature]['var']
                log_likelihood += np.log(self._calculate_likelihood(value, mean, var)) # TODO: calculate the log likelihood
            log_posterior = log_prior + log_likelihood
            log_posteriors.append((c, log_posterior))
        return max(log_posteriors, key=lambda x: x[1])[0] # TODO: Return the class with the highest logarithm of posterior probability as the predicted class


## Step 3: Build the classifier and check the correctness of Table building
You can easily build an instance of your classifier and then create the table.

To check whether you have correctly built the table of the ***Gaussian Naive Bayesian classifier***, there is an example for you to ensure that your implementation is correct.


In [None]:
# Initialize and build_table the model
gnb_classifier = GaussianNaiveBayesian()
gnb_classifier.build_table(X, y)

And you also need to output the dictionary variable ***feature_mean_var_table*** as a pickle file, which will be examined for correctness.
> Note: Since this is for checking the implementation of the build_table method, please ensure that the input for your table building, ***X and y,*** is taken from the provided advanced_training.csv file ***BEFORE*** splitting the dataset into training and validation sets.

> Hint: Two values for you to check the implementation correctness:


> `gnb_classifier.feature_mean_var_table[0]['bmi']['mean']` is
28.61544

> `gnb_classifier.feature_mean_var_table[0]['bmi']['var']` is
63.57263

In [None]:
# *** 10/18 update: the value of mean and var***
if round(gnb_classifier.feature_mean_var_table[0]['bmi']['mean'], 5) == 28.61544 and \
   round(gnb_classifier.feature_mean_var_table[0]['bmi']['var'], 5) == 63.57263:
    print('pass')
else:
    print('fail')

pass


In [None]:
# TODO: change your path to save the pickle file
pickle_file_path = 'advanced_table'
with open(pickle_file_path, 'wb') as table_file:
    pickle.dump(gnb_classifier.feature_mean_var_table, table_file)
    table_file.close()

## Step 4 Improve the classifier for Ranking 15%:

To make your model have better performance, you can try different ways to modify your model.

> hints (**you don't need to follow the hints**):

1. You can deal with the **outliers**
2. You can try first **converting real numbers into discrete values** and then using Naive Bayesian for classification.
3. You can try **def a new class for giving the prior a different weight** for decision-making.
4. Anything you want to try based on Bayesian.

> Note: You need to consider what kind of operations should also be performed on the testing_df.

### 4.1 Deal with outliers

In [None]:
#Replace (ver1)
def handle_outliers(df, features):
  for feature in features:
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)

    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[feature] = df[feature].apply(lambda x: lower_bound if x < lower_bound else (upper_bound if x > upper_bound else x))

  return df


outlier_features = ['bmi', 'glucose_apache']
training_df_no_outliers = handle_outliers(training_df, outlier_features)
y = training_df_no_outliers['diabetes_mellitus']
X = training_df_no_outliers.drop('diabetes_mellitus', axis=1)


In [None]:
len(X)

65000

In [None]:
y[:10]

0    1
1    0
2    0
3    0
4    1
5    1
6    1
7    1
8    0
9    1
Name: diabetes_mellitus, dtype: int64

### 4.2 def a new class

In [None]:
class CustomGaussianNaiveBayesian(GaussianNaiveBayesian):
  def __init__(self, class_priors):
    super().__init__()
    self.custom_class_priors = class_priors

  def build_table(self, X, y):
    super().build_table(X, y)
    self.class_priors = self.custom_class_priors

### 4.3 Build Table

In [218]:
custom_priors = {0: 0.45, 1: 0.55}
custom_classifier = CustomGaussianNaiveBayesian(custom_priors)
custom_classifier.build_table(X, y)

### 4.4 Split Data

In [219]:
def adv_train_val_split(X, y, val_size, random_state):
    # TODO: implement your own train_val_split
    np.random.seed(random_state)
    num_val_samples = int(val_size * len(X))
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    X_train = X.iloc[indices[num_val_samples:]]
    X_val = X.iloc[indices[:num_val_samples]]
    y_train = y.iloc[indices[num_val_samples:]]
    y_val = y.iloc[indices[:num_val_samples]]

    return X_train, X_val, y_train, y_val
# TODO: Split the data into training and validation sets
# Note: please follow template for the format of return variables
val_size = 0.2
random_state = 42
X_train_adv, X_val_adv, y_train_adv, y_val_adv = adv_train_val_split(X, y, val_size, random_state)# TODO

## Step 5: Make predictions and perform evaluation
You should test your model by evaluating the training set and validation set using the ***cal_f1_score*** function you implemented.


In [220]:
... # TODO: build table on the training set
... # TODO: Make predictions on training set and calculate the f1-score
... # TODO: Make predictions on validation set and calculate the f1-score

In [221]:
def cal_f1_score(y_true, y_pred):
    # Calculate True Positives, False Positives, False Negatives
    tp = sum(1 for true, pred in zip(y_true, y_pred) if true == 1 and pred == 1)
    fp = sum(1 for true, pred in zip(y_true, y_pred) if true == 0 and pred == 1) # TODO: calculate the false positive
    fn = sum(1 for true, pred in zip(y_true, y_pred) if true == 1 and pred == 0) # TODO: calculate the false negative

    # Calculate precision and recall
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0  # TDOO: calculate the precision
    recall =  tp / (tp + fn) if (tp + fn) > 0 else 0 # TODO: calculate the recall

    # Calculate F1-score
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 # TODO: calculate the f1-score

    return f1_score

In [224]:
# TODO: build table on the training set
# TODO: Make predictions on training set and calculate the f1-score
train_predictions_adv = custom_classifier.predict(X_train_adv)
train_f1_score_adv = cal_f1_score(y_train_adv, train_predictions_adv)
print("Training F1-score adv:", train_f1_score_adv)

# TODO: Make predictions on validation set and calculate the f1-score
val_predictions_adv = custom_classifier.predict(X_val_adv)
val_f1_score_adv = cal_f1_score(y_val_adv, val_predictions_adv)
print("Validation F1-score adv:", val_f1_score_adv)

Training F1-score adv: 0.5511537865881989
Validation F1-score adv: 0.5553304634084578


## Step 6: Generate result
> Note: Please follow the format mentioned in the slides. You can only change the path for saving your code down below.


In [225]:
predictions = custom_classifier.predict(testing_df) # TODO: predict on the testing_df

# TODO: Specify the CSV file path
csv_file_path = 'advanced_prediction.csv'

# Write the predictions to the CSV file
with open(csv_file_path, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    # *** 10/21 update: header name ***
    writer.writerow(['diabetes_mellitus'])
    for prediction in predictions:
        writer.writerow([prediction])