In [46]:
""" naiveBayes.py """
import numpy as np

def naiveBayes(classes, learner, parameterised_function, train_data):
    f = {}
    parameters = {}
    g = {}
    for class_value in classes:
        # Initialize parameters and functions for each class
        parameters[class_value] = {}
        # f is a dictionary that maps feature indices to their parameterized functions
        f[class_value] = {}
        # parameters is a dictionary that maps feature indices to their learned parameters
        # parameters[class_value][feature] contains the learned parameters for the feature
        # train_x is the training data for the current class
        train_x = train_data[train_data[:, -1] == class_value][:, :-1] #Takes the features associated with datapoints in a class
        for feature in range(train_x.shape[1]): 
            parameters[class_value][feature] = learner(train_x[:,feature])
            # 
            f[class_value][feature] = parameterised_function(parameters[class_value][feature])
        def create_g(class_value):     
            def g(test_data):
                unscaled_feature_likelihoods = np.array([
                    [f[class_value][feature](test_data[point, feature]) for feature in range(test_data.shape[1])]
                    for point in range(test_data.shape[0])
                ])
                unscaled_point_likelihood = np.prod(unscaled_feature_likelihoods, axis=1).reshape(-1, 1)
                return unscaled_point_likelihood
            return g
        g[class_value] = create_g(class_value)
    return g

classes = [0,1]
def learner(train):
    mu = np.mean(train)
    sig = np.std(train)
    return [mu,sig]
def parameterised_function(parameters):
    mu = parameters[0]
    sig = parameters[1]
    return lambda x: np.exp(-0.5*(x - mu)**2/(sig**2))
train_data = np.array([[2.0, 4.0, 0.0], [1.0, 5.0, 0.0], [4.0, 2.0, 1.0], [6.0, 0.0, 1.0]])
g = naiveBayes(classes, learner, parameterised_function, train_data)
test_data = np.array([[2.0, 5.0], [3.0,3.0]])

# Evaluate the model on the test data
for class_value in classes:
    print(g[class_value](test_data)) 



[[3.67879441e-01]
 [1.23409804e-04]]
[[3.72665317e-06]
 [1.83156389e-02]]


## Question 1.1
### What does the naive Bayes classifier actually return?
It returns a dictionary g where g is a pair for each class and function
### What do the functions defined inside the main function do?
There are two functions defined in the main function:
1. `create_g(class_value)`
This function returns a specific g function for one class (class_value), where g is later used to calculate the test value for that class.
2. `g(test_data)`
This is where the test data is tested by calculating the likelihood of the new point for each feature using the naive bayes formula.

### What is the role of the inputs to the naiveBayes function, in particular the learner and the parameterized function? 
- `learner` returns the values [mu, sig] for each class and feature.
- `parametetized_function` converts the values mu, sig into a Gaussian function.

## Question 1.2
## Where does the independence assumption made by the Naive Bayes approach come into the calculation?
First, when training features, each feature is learned independently, without regard to other features.
Second, during prediction, where the point likelihood is the product of all feature likelihoods (independent).

## Question 1.3
## What class of functions does the parameterized function in the example represent?
It represents a Gaussian function.

## Question 1.4
See below.

In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load iris.csv
iris = pd.read_csv('iris.csv')

# Encode species to integers
iris['Species'] = iris['Species'].astype('category').cat.codes
print(iris['Species'].value_counts())

# Split features and labels
# only use features 1-4 (omit numbering and species)
X = iris.iloc[:, 1:5].values  # Sepal.Length, Sepal.Width, Petal.Length, Petal.Width
# Use species as y 
y = iris['Species'].values

# Split into train and test set (randomly)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Prepare train_data in the required format (features + label as last column)
# hstack used to stack arrays in sequence horizontally (column wise)
# y_train.reshape(-1, 1) is used to change the shape of y_train to be a column
train_data = np.hstack([X_train, y_train.reshape(-1, 1)])
test_data = X_test

# Define classes
classes = [0, 1, 2]

# Train naive Bayes
g = naiveBayes(classes, learner, parameterised_function, train_data)

# Predict for test set
preds = []
for i in range(test_data.shape[0]):
    # For each test instance, we will compute the likelihood for each class
    # likelihood is calculated by calling the model for each class and taking the likelihood value
    # we will use test_data[i:i+1] as input
    # g[class_value](test_data[i:i+1]) will return a 2D array
    # likelihoods will contain the likelihood values for each class
    # we will use np.argmax to automatically get the class with the highest likelihood
    # np.argmax will return the index of the maximum value
    # we will use that index to get the corresponding class
    # classes[np.argmax(likelihoods)] will give the class with the highest likelihood
    # we will append that class to preds
    likelihoods = [g[class_value](test_data[i:i+1])[0, 0] for class_value in classes]
    preds.append(classes[np.argmax(likelihoods)])

# Evaluate accuracy
accuracy = np.mean(preds == y_test)
print(f'Predictions: {preds}')
print(f'Accuracy: {accuracy:.2f}')

Species
0    50
1    50
2    50
Name: count, dtype: int64
Predictions: [2, 1, 1, 2, 2, 2, 1, 1, 0, 2, 0, 0, 2, 2, 0, 2, 1, 0, 0, 0, 1, 0, 1, 2, 2, 1, 1, 1, 1, 0, 2, 2, 1, 0, 2, 0, 0, 0, 0, 2, 1, 0, 1, 2, 1]
Accuracy: 0.91


# Question 2
## Question 2.1
See code below. it shows that almost similar accuracy, but slightly better than using Naive Bayes. and even better when using sklearn. 
## Question 2.2.1 and 2.2.2
I cannot produce the modified code

In [56]:
""" BasicTree.py """
import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.model_selection import train_test_split

# def makedata():
#   n_points = 500 # points
 
#   X, y =  make_friedman1(n_samples=n_points, n_features=5, 
#                          noise=1.0, random_state=100)
         
#   return train_test_split(X, y, test_size=0.5, random_state=3)
 
def main(X_train, X_test, y_train, y_test):
  # X_train, X_test, y_train, y_test = makedata()    
  maxdepth = 10 # maximum tree depth             
  # Create tree root at depth 0                       
  treeRoot = TNode(0, X_train,y_train) 
       
  # Build the regression tree with maximal depth equal to max_depth
  Construct_Subtree(treeRoot, maxdepth) 
    
  # Predict
  y_hat = np.zeros(len(X_test))
  for i in range(len(X_test)):
     y_hat[i] = Predict(X_test[i],treeRoot)          
    
  MSE = np.mean(np.power(y_hat - y_test,2))    
  print("Basic tree: tree loss = ",  MSE)
  print("Basic tree: accuracy = ",  np.mean(y_hat == y_test))

# tree node
class TNode:
   def __init__(self, depth, X, y): 
      self.depth = depth
      self.X = X   # matrix of explanatory variables
      self.y = y   # vector of response variables
      # initialize optimal split parameters
      self.j = None
      self.xi = None
      # initialize children to be None      
      self.left = None
      self.right = None
      # initialize the regional predictor
      self.g = None
      
   def CalculateLoss(self):
       if(len(self.y)==0):
           return 0
       
       return np.sum(np.power(self.y- self.y.mean(),2))
                    
  
def Construct_Subtree(node, max_depth):  
    if(node.depth == max_depth or len(node.y) == 1):
        node.g  = node.y.mean()
    else:
        j, xi = CalculateOptimalSplit(node)               
        node.j = j
        node.xi = xi
        Xt, yt, Xf, yf = DataSplit(node.X, node.y, j, xi)
              
        if(len(yt)>0):
            node.left = TNode(node.depth+1,Xt,yt)
            Construct_Subtree(node.left, max_depth)
        
        if(len(yf)>0):        
            node.right = TNode(node.depth+1, Xf,yf)
            Construct_Subtree(node.right, max_depth)      
     
    return node

# split the data-set
def DataSplit(X,y,j,xi):
    ids = X[:,j]<=xi      
    Xt  = X[ids == True,:]
    Xf  = X[ids == False,:]
    yt  = y[ids == True]
    yf  = y[ids == False]
    return Xt, yt, Xf, yf             

def CalculateOptimalSplit(node):
    X = node.X
    y = node.y
    best_var = 0
    best_xi = X[0,best_var]          
    best_split_val = node.CalculateLoss()
    
    m, n  = X.shape
    
    for j in range(0,n):
        for i in range(0,m):
            xi = X[i,j]
            Xt, yt, Xf, yf = DataSplit(X,y,j,xi)
            tmpt = TNode(0, Xt, yt) 
            tmpf = TNode(0, Xf, yf) 
            loss_t = tmpt.CalculateLoss()
            loss_f = tmpf.CalculateLoss()    
            curr_val =  loss_t + loss_f
            if (curr_val < best_split_val):
                best_split_val = curr_val
                best_var = j
                best_xi = xi
    return best_var,  best_xi


def Predict(X,node):
    if(node.right == None and node.left != None):
        return Predict(X,node.left)
    
    if(node.right != None and node.left == None):
        return Predict(X,node.right)
    
    if(node.right == None and node.left == None):
        return node.g
    else:
        if(X[node.j] <= node.xi):
            return Predict(X,node.left)
        else:
            return Predict(X,node.right)
    
main(X_train, X_test, y_train, y_test)  # run the main program

# compare with sklearn
from sklearn.tree import DecisionTreeRegressor
# X_train, X_test, y_train, y_test = makedata()    
regTree = DecisionTreeRegressor(max_depth = 10, random_state=0)
regTree.fit(X_train,y_train)
y_hat = regTree.predict(X_test)
MSE2 = np.mean(np.power(y_hat - y_test,2))    
print("DecisionTreeRegressor: tree loss = ",  MSE2)
# accuracy
accuracy = regTree.score(X_test, y_test)
print("DecisionTreeRegressor: accuracy =", accuracy)

# Show the confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_hat))

Basic tree: tree loss =  0.06666666666666667
Basic tree: accuracy =  0.9333333333333333
DecisionTreeRegressor: tree loss =  0.022222222222222223
DecisionTreeRegressor: accuracy = 0.9666666666666667
[[15  0  0]
 [ 0 14  1]
 [ 0  0 15]]


# Question 3
## Question 3.1
Obviously, apart from datasplit from iris dataset, each of classifiers need different hyperparameter that we can tweak and based on the theories we learned in the lecture. Those are:
1. In KNN:
    - `n_neighbors`: this parameter sets on how many neighbours will be use to compare (vote) on which class the new point will be. too small number of neightbours will make the decision boundary too noisy (rough and hard to interprete), but too many will make the boundary over smoother (default is 5).
    - `p`: controls the distance metric, in theory we use Euclidian distance which also the default for sklearn.
2. Logistic Regression:
    - `penalty`: this determines which type of regulation we are going to apply (prevent overfitting by pinalizing model coefficient). Default is `l2`, and have other options such as `l1` and `elasticnet` (combination).
3. SVM:
    - `kernel`: this parameter implements the kernel trick, for example in theory we learn about linear, poly, or gaussian (rbf). the default is gaussian.

## Question 3.2
See implementation below

In [50]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

iris = pd.read_csv('iris.csv')
iris['Species'] = iris['Species'].astype('category').cat.codes
X = iris.iloc[:, 1:5].values
y = iris['Species'].values
# Use the same data split as your previous examples
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# 2. Initialize the classifiers with reasonable parameters
knn = KNeighborsClassifier(n_neighbors=5)
log_reg = LogisticRegression(random_state=42, max_iter=200)
svm = SVC(kernel='rbf', C=1.0, random_state=42)

classifiers = {
    "K-Nearest Neighbors": knn,
    "Logistic Regression": log_reg,
    "Support Vector Machine": svm
}

print("\n--- Accuracy Comparison ---")

# 3. Fit models and calculate accuracy
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}: {accuracy:.4f} ({accuracy:.2%})")


--- Accuracy Comparison ---
K-Nearest Neighbors: 0.9778 (97.78%)
Logistic Regression: 0.9333 (93.33%)
Support Vector Machine: 0.9556 (95.56%)
