# Assignment 2 Workbook


Please read and sign the **Academic Integrity Declaration** in myUni Assignment 2 by putting you name and ID. Your assignment may receive 0 mark if this declaration is not signed.



## Question 1a: SVM Primal Form

Please implement the training and testing algorithms of soft-margin Linear Support Vector Machine in its primal form, that is,

$$\min_{\mathbf{w},b,\{\xi_i\}} \frac{1}{2} \|\mathbf{w}\|_2^2 + C\sum_{i=1}^N \xi_i \nonumber \\ s.t.~~ y_i (\mathbf{w}^\top \mathbf{x}_i + b) \ge 1 - \xi_i, ~~\forall i \nonumber \\ \xi_i \ge 0 \nonumber$$
Use CVXPY in your implementation strictly following the format given in this Notebook, and supplying missing code **in the indicated space. Do not add or change any other code**


In [1]:
# import required libraries
# !pip install cvxpy --upgrade # if needed
import cvxpy as cp
import pandas as pd
import numpy as np
print("cp", cp.__version__)
print("np",np.__version__)

cp 1.5.2
np 1.26.4


In [2]:
# get training dataset
train = "train1.csv"
df = pd.read_csv(train, header=None)
X_train = df[:1500].iloc[:, 1:].to_numpy()
Y_train = df[:1500].iloc[:, 0].replace(0, -1).to_numpy()

In [3]:
# get test dataset
test = "test1.csv"
df = pd.read_csv(test, header=None)
X_test = df.iloc[:500, 1:].to_numpy()
Y_test = df.iloc[:500, 0].replace(0, -1).to_numpy()

In [4]:
# train linear svm in primal form
def svm_train_primal(data_train, label_train, C):
    X, Y = data_train, label_train
    n_samples, m_features = np.shape(X)
    
    W_value = 0
    b_value = 0
    slack_var_value = 0

# ====================== YOUR CODE HERE ======================  
# DO NOT use any other import statements for this question
    w = cp.Variable(m_features)
    b = cp.Variable()
    slack_var = cp.Variable(n_samples)
    
    objective = cp.Minimize(0.5 * cp.sum_squares(W_value) + C * cp.sum(slack_var_value))
    constraints = [cp.multiply(Y, X @ w + b) >= 1 - slack_var, slack_var >= 0]
    prob = cp.Problem(objective, constraints)
    prob.solve()

    W_value = w.value
    b_value = b.value
    slack_var_value = slack_var.value
# ================================================================

    return [W_value, b_value, slack_var_value]

# train primal model
C = 1
model_primal = svm_train_primal(X_train, Y_train, C)

# output svm primal form solutions
W = model_primal[0]
b = model_primal[1]
slack = model_primal[2]

print('Sum of W, b and slack:')
print(np.round(np.sum(W)+np.sum(slack)+b,2))


Sum of W, b and slack:
5861.26


## Question 1b: SVM Primal Accuracy

Please complete and run this code and copy the result into Assignment 2 Question 1b

In [5]:
# predict accuracy of svm model on test dataset
def svm_predict(data_test, label_test, svm_model):
    
    acc = 0.0
    
# ====================== YOUR CODE HERE ======================  
# DO NOT use any other import statements for this question
    W, b = svm_model[0], svm_model[1]
    
    predictions = np.sign(data_test @ W + b)
    acc = np.mean(predictions == label_test)
# ==========================================================

    return acc

# output primal accuracy as real number, not percentage
accuracy = svm_predict(X_test, Y_test, model_primal)
print('Accuracy:')
print(np.round(accuracy,2))

Accuracy:
0.83


## Question 2a: SVM Dual Form

Please implement the training and testing algorithms of soft-margin Linear Support Vector Machine in its **dual** form, that is,

$$\max_{\alpha_i}\sum_i \alpha_i - \frac{1}{2}\sum_i \sum_j \alpha_i \alpha_j y_i y_j <\mathbf{x}_i, \mathbf{x}_j> \nonumber \\ s.t. ~~~ 0 \le \alpha_i \le C\nonumber \\ ~~~ \sum_i \alpha_i y_i = 0 \nonumber$$

Use CVXPY in your implementation strictly following the format given in this Notebook, and supplying missing code in the indicated space. **Do not modify any other code**.


In [6]:
# import required libraries
import cvxpy as cp
import pandas as pd
import numpy as np

print(cp.__version__)
print(np.__version__)

1.5.2
1.26.4


In [7]:
# get training dataset
train = "train1.csv"
df = pd.read_csv(train, header=None)
X_train = df[:1500].iloc[:, 1:].to_numpy()
Y_train = df[:1500].iloc[:, 0].replace(0, -1).to_numpy()

In [8]:
# get test dataset
test = "test1.csv"
df = pd.read_csv(test, header=None)
X_test = df.iloc[:500, 1:].to_numpy()
Y_test = df.iloc[:500, 0].replace(0, -1).to_numpy()

In [9]:
# train linear svm in dual form
def svm_train_dual(data_train, label_train, C):
    
    alphas = 0
    x, y = data_train, label_train
    n_samples, n_features = np.shape(x)
    

# ====================== YOUR CODE HERE ======================  
# DO NOT use any other import statements for this question
    alpha = cp.Variable(n_samples)
    K = x @ x.T
    # PSD wrap to ensure positive semi-definiteness
    Q = cp.psd_wrap((y[:, None] * y[None, :]) * K)

    objective = cp.Maximize(cp.sum(alpha) - 0.5 * cp.quad_form(alpha, Q))
    constraints = [alpha >= 0, alpha <= C, cp.sum(cp.multiply(alpha, y)) == 0]
    
    prob = cp.Problem(objective, constraints)
    prob.solve(solver=cp.SCS)
    alphas = alpha.value
# ===========================================================

    # return svm dual model alphas
    return alphas

# train dual model
c = 1
alphas = svm_train_dual(X_train, Y_train, c)

# output svm dual form solutions
print('Sum of alphas:')
print(np.round(np.sum(alphas),2))

Sum of alphas:
443.7


## Question 2b: Dual model parameters

Complete the code where indicated, run it and copy the result into Assignment 2 Question 2b.

In [None]:
# obtain primal w*, b* from dual solution
def find_model_params_from_dual(data, label, alphas, C):
    
    # this value is used to compare values generated by CVXPYY to zero.
    zero_threshold = 0.0001
    n_samples, n_features = np.shape(data)
    a = alphas
   
    a[np.isclose(a, 0, atol=zero_threshold)] = 0  # zero out nearly zeros
    a[np.isclose(a, C, atol=zero_threshold)] = C  # round the ones that are nearly C


# ====================== YOUR CODE HERE ======================  
# DO NOT use any other import statements for this question
# Note: b_dual is a scalar, but maybe different for each a parameter, therefore calculate the mean
    w_dual = np.zeros(n_features)
    for i in range(n_samples):
        w_dual += a[i] * label[i] * data[i]
        
    flag = (a > 0) & (a < C)
    b = label[flag] - data[flag] @ w_dual
    b_dual = np.mean(b)

# =========================================================================
    return w_dual, b_dual

# output reconstructed w* and b* from svm dual problem
C = 1
model_dual = find_model_params_from_dual(X_train, Y_train, alphas, C)

print('Sum of W and b:')
print(np.round(np.sum(model_dual[0])+model_dual[1],2))

Sum of W and b:
0.36


## Question 3: Choosing SVM kernel and soft margin parameter C

In this question, you will apply training/testing methodology to choose two parameters for `sklearn.SVC`, which is Python library implementing classification SVM. This trainint/testing methodology is typical for Machine Learning. Since this could be the first time for many students applying training/testing methodology, all steps are specified.

Read SVC documentation and examples on how to train and test SVC models. (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)

Please use the following steps **strictly** in order to apply correct methodology and obtain correct final result.

1. Read the training and test data (already coded in the below template, do not change)
2. Split training data into train and validation (dev) sets (already coded in the below template, do not change)

Your code will start from this point.

3. Create four new svc models and train using x_train with the following parameters: `kernel=k, random_state=42`, where k in ('linear', 'poly', rbf', 'sigmoid'). **Leave the rest of options in the model unchanged**. Test using x_dev, get F1 score for each model. 
4. Choose the kernel which has highest F1 score, call it best_kernel.
5. Create five new svc models and train using x_train with the following parameters: `kernel=best_kernel, C=c, random_state=42`, where c in (0.1, 0.25, 0.5, 1.0, 2.0). **Leave the rest of options in the model unchanged**. Test using x_dev, get F1 score for each model.
6. Choose C from point 5 with highest F1. Call it best_c
7. Create new svc model and train using **x_train_full** with the following parameters: `kernel=best_kernel, C=best_c, random_state=42`. **leave the rest of options in the model unchanged**. Test using **x_test**, get F1 score. 
8. Enter output of this code into Question 3 in myUni.
    
 

In [11]:
'''
    This is template for your implementation of Question 3. 
    Do not add any imports
    Do not change any provided code
    Write your code in indicated space only
'''

# imports
import pandas as pd
import numpy as np
from sklearn.svm import SVC as svc
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# 1. Read the training and test data (already coded in the below template, do not change)
train = "train1.csv"
df = pd.read_csv(train, header=None)
x_train_full = df[:1500].iloc[:, 1:].to_numpy()
y_train_full = df[:1500].iloc[:, 0].replace(0, -1).to_numpy()

test = "test1.csv"
df = pd.read_csv(test, header=None)
x_test = df.iloc[:500, 1:].to_numpy()
y_test = df.iloc[:500, 0].replace(0, -1).to_numpy()

# 2. Split training data into train and validation (devset) sets (already coded in the below template, do not change)
x_train, x_dev, y_train, y_dev = train_test_split(x_train_full, y_train_full, test_size=500, random_state=42)

best_f1 = 0 
best_c = 0

#===================== your code is in this space ====================
# Find best kernel
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
f1_scores_k = []

for k in kernels:
    model = svc(kernel=k, random_state=42)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_dev)
    f1_scores_k.append(f1_score(y_dev, y_pred))
    
best_kernel = kernels[np.argmax(f1_scores_k)]

# Find best C
C_list = [0.1, 0.25, 0.5, 1.0, 2.0]
f1_scores_c = []

for c in C_list:
    model = svc(kernel=best_kernel, C=c, random_state=42)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_dev)
    f1_scores_c.append(f1_score(y_dev, y_pred))
    
best_c = C_list[np.argmax(f1_scores_c)]

# Train final model with best kernel and C
final_model = svc(kernel=best_kernel, C=best_c, random_state=42)
final_model.fit(x_train_full, y_train_full)

y_pred_test = final_model.predict(x_test)
best_f1 = f1_score(y_test, y_pred_test)

#=====================================================================

print("Best F1 and best C")
print(np.round(best_f1+best_c,3))


Best F1 and best C
1.894


## Question 4a: PCA implementation

1. Perform PCA on the dataset to reduce each sample into a 10-dimensional feature vector.
2. Print the required result and enter into Assignment 2 Question 3a.

=========================================================================
- Implementing PCA algorithm.
    - Start
        - Input: $m$ number of samples as matrix $X$ of $m$ rows and $n$ columns.
        - Calculate the mean vector for each column. $$mean = \frac {1}{m} \sum \limits _{i=1} ^{n}X_{ij}$$
        - Calculate the centralised matrix $X_C$ and covariance matrix $C$. $$X_C=X-mean$$ $$C = \frac {1}{m}(X_C)^TX_C$$
        - Calculate the eigenvalues and eigenvectors using convariance matrix.
        - Select top $k$ principal components - as eigen vector corresponding to top $k$ eigen values. Construct matrix $P$.
    - End
    
- Transforming the the data using the principal components (matrix $P$) obtained using the PCA algorithm. $$Transformed \: Data = X_C P$$
- Calculating the covariance matrix of the transformed data by first centralising it (mean subtracted) and then obtaining the covariance matrix.

In [12]:
# import required libraries
import pandas as pd
import numpy as np

In [13]:
# get training dataset
train = "train1.csv"
df = pd.read_csv(train, header=None)
X = df[:1500].iloc[:, 1:].to_numpy()

In [14]:
# Selecting top 10 Principal components
no_of_components = 10

covariance_matrix_X = 0
covariance_matrix_X_transformed = 0

# ====================== YOUR CODE HERE ======================  
# DO NOT use any other import statements for this question
X_mean = np.mean(X, axis=0)
X_c = X - X_mean

covariance_matrix_X = (1/X.shape[0]) * (X_c.T @ X_c)
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix_X)
indices = np.argsort(eigenvalues)[::-1][:no_of_components]
eigenvectors_selected = eigenvectors[:, indices]
X_transformed = X_c @ eigenvectors_selected


X_transformed_C = X_transformed - np.mean(X_transformed, axis=0)
covariance_matrix_X_transformed = (1 / X_transformed.shape[0]) * X_transformed_C.T @ X_transformed_C

# ==========================================================================

sum_cov_X = np.sum(covariance_matrix_X)
sum_cov_X_transformed = np.sum(covariance_matrix_X_transformed)

print("sum_cov_X_transformed:")
print(np.round(sum_cov_X + sum_cov_X_transformed,2))

sum_cov_X_transformed:
325.08


## Question 4b: PCA data reconstructon

For this question:
1. Reconstruct the X dataset from your results in code Question 3a as X_back.
1. Centre  X_back and calculate covariance matrix for X_back and enter its sum into Assignment 2 Question 3b.

For this part, use the libraries imported in Question 4a, and do not import any moore libraries.

In [15]:
# Do not import any additional libraries for this section
# Do not change any code outside of this area marked with =============================

# ====================== YOUR CODE HERE ======================  
# DO NOT use any other import statements for this question
X_C_back = X_transformed @ eigenvectors_selected.T
X_back = X_C_back + X_mean

X_back_C = X_back - np.mean(X_back, axis=0)
covariance_matrix_X_back = (1 / X_back.shape[0]) * (X_back_C.T @ X_back_C)
# ============================================================
print("Sum of covariance_matrix_X_back:")
print(np.round(np.sum(covariance_matrix_X_back),2))

Sum of covariance_matrix_X_back:
68.06


## Questions 5a: Kernel k-Means derivation

In this question you will derive a new mathematical formula for Kernel k=Means and then implement it.

The task is specified as follows:
1. We have a dataset with M instances and N features in each instance. We can express this dataset as a set of n-dimensional vectors $\mathbf{X}=\{\mathbf{x}_i \}\in \mathbb{R}^{M\times N}$. Each of these vectors is $\mathbf{x}_i \in \mathbb{R}^N$. 

2. We have a vector of averages $\mathbf{\mu} = \frac{1}{M}\sum_i \mathbf{x}_i$, where i-th value of that vector is an average of i-th feature over the entire $X$ dataset. Therefore $\mu$ is an "average instance" for that dataset and $\mathbf{\mu} \in \mathbb{R}^N$.

3. Let $m_1 = \frac{1}{M^2}\sum_i\sum_j \|\mathbf{x}_i - \mathbf{x}_j\|_2^2$ which is average of squared pairwise distance between instances. 

4. Let $m_2 = \frac{1}{M} \sum_i\|\mathbf{x}_i - \mathbf{\mu}\|_2^2$ which is variance of distances between each $\mathbf{x}_i$ and mean $\mu$.

Note that $m_1$ and $m_2$ are scalars.

5. k-Means with RBF-kernel is given as follows:

$$k(\mathbf{x}_i,\mathbf{x}_j) = \exp(\frac{-\|\mathbf{x}_i - \mathbf{x}_j\|_2^2}{2\sigma^2})$$

As you can see $2\sigma^2$ can be expressed in terms of $m_1$, which gives you

$$k(\mathbf{x}_i,\mathbf{x}_j) = \exp(\frac{-\|\mathbf{x}_i - \mathbf{x}_j\|_2^2}{2m_2})$$

Your tasks are as follows:
1. Derive the relationship between $m_1$ and $m_2$. Let $m_2=f(m_1)$ be the dependency between $m_1$ and $m_2$.

Important: **Include all steps and transformations that you use and comment on them by answering the "why" question (why this equals that).** Points may be deducted if a transformation is not clearly explained.

2. Replace $m_2$ with your derived $f(m_1)$ in the formula of k-Means with RBF-kernel


$$k(\mathbf{x}_i,\mathbf{x}_j) = \exp(\frac{-\|\mathbf{x}_i - \mathbf{x}_j\|_2^2}{f(m_1)})$$


You can use derivation space below or use derivation space in myUni

===============================================================================
###  Derivation of the relationship between $m_1$ and $m_2$
To derive $m_2=f(m_1)$, we need to expand $m_1$ and $m_2$ to find the exact form of the function $f$.

#### Expand $m_1$:
Given that 
$$
m_1 = \frac{1}{M^2}\sum_i\sum_j \|\mathbf{x}_i - \mathbf{x}_j\|_2^2
$$
Substituting the definition of the squared Euclidean distance, we have:
\begin{align*}
m_1 
&= \frac{1}{M^2}\sum_i\sum_j \left( \|\mathbf{x}_i\|_2^2 - 2{\mathbf{x}_i}^\top \mathbf{x}_j + \|\mathbf{x}_j\|_2^2 \right) \\
&= \frac{1}{M^2}\left( \sum_i\sum_j \|\mathbf{x}_i\|_2^2 - 2\sum_i\sum_j {\mathbf{x}_i}^\top \mathbf{x}_j + \sum_i\sum_j \|\mathbf{x}_j\|_2^2 \right) \\
&= \frac{1}{M^2}\left( M\sum_i \|\mathbf{x}_i\|_2^2 + M\sum_j \|\mathbf{x}_j\|_2^2 - 2\sum_i\sum_j {\mathbf{x}_i}^\top \mathbf{x}_j \right) \\
&= \frac{2}{M}\sum_i \|\mathbf{x}_i\|_2^2 - \frac{2}{M^2}\sum_i\sum_j {\mathbf{x}_i}^\top \mathbf{x}_j
\end{align*}

#### Expand $m_2$:
Now, we expand $m_2$ in terms of dot products:
\begin{align*}
    m_2 
    &= \frac{1}{M} \sum_i\|\mathbf{x}_i - \mathbf{\mu}\|_2^2 \\
    &= \frac{1}{M} \sum_i \left( \|\mathbf{x}_i\|_2^2 - 2{\mathbf{x}_i}^\top \mathbf{\mu} + \|\mathbf{\mu}\|_2^2 \right) \\
    &= \frac{1}{M} \left( \sum_i \|\mathbf{x}_i\|_2^2 - 2\sum_i {\mathbf{x}_i}^\top \mathbf{\mu} + M\|\mathbf{\mu}\|_2^2 \right) \\
\end{align*}
Note that $\mathbf{\mu} = \frac{1}{M}\sum_i \mathbf{x}_i$, so we can have
$$
\sum_i \mathbf{x}_i^\top \mu = \sum_i \mathbf{x}_i^\top \left( \frac{1}{M} \sum_j \mathbf{x}_j \right) = \frac{1}{M} \sum_i \sum_j \mathbf{x}_i^\top \mathbf{x}_j
$$
Substituting this into the expression for $m_2$, we get:
$$
m_2 = \frac{1}{M} \sum_i \|\mathbf{x}_i\|_2^2 - \frac{2}{M^2} \sum_i\sum_j {\mathbf{x}_i}^\top \mathbf{x}_j + \|\mathbf{\mu}\|_2^2
$$
Then we can express $\|\mathbf{\mu}\|_2^2$ as:
$$ 
\|\mathbf{\mu}\|_2^2 = \left( \frac{1}{M} \sum_i \mathbf{x}_i \right)^\top \left( \frac{1}{M} \sum_j \mathbf{x}_j \right) = \frac{1}{M^2} \sum_i\sum_j {\mathbf{x}_i}^\top \mathbf{x}_j
$$
Substituting this into the expression for $m_2$, we have:
\begin{align*}
m_2 &= \frac{1}{M} \sum_i \|\mathbf{x}_i\|_2^2 - \frac{2}{M^2} \sum_i\sum_j {\mathbf{x}_i}^\top \mathbf{x}_j + \frac{1}{M^2} \sum_i\sum_j {\mathbf{x}_i}^\top \mathbf{x}_j \\
&= \frac{1}{M} \sum_i \|\mathbf{x}_i\|_2^2 - \frac{1}{M^2} \sum_i\sum_j {\mathbf{x}_i}^\top \mathbf{x}_j
\end{align*}
Thus, comparing the two expressions for $m_1$ and $m_2$, we can see that:
$$m_2 = f(m_1) = \frac{1}{2} m_1$$
Which means that the derivation of the relationship between $m_1$ and $m_2$ is $\frac{1}{2}$.

### Replace $m_2$ with your derived $f(m_1)$
Given that 
$$
k(\mathbf{x}_i,\mathbf{x}_j) = \exp \left( \frac{-\|\mathbf{x}_i - \mathbf{x}_j\|_2^2}{2\sigma^2} \right)
= \exp \left( \frac{-\|\mathbf{x}_i - \mathbf{x}_j\|_2^2}{2m_2} \right)
$$
Substituting $m_2 = f(m_1) = \frac{1}{2} m_1$, we have:
$$
k(\mathbf{x}_i,\mathbf{x}_j) = \exp \left( \frac{-\|\mathbf{x}_i - \mathbf{x}_j\|_2^2}{2 \cdot \frac{1}{2} m_1} \right) = \exp \left( \frac{-\|\mathbf{x}_i - \mathbf{x}_j\|_2^2}{m_1} \right)
$$



## Question 5b: Kernel k-Means implemention


In Question 5a you have proved the relationship between $m_1$ and $m_2$ as $m_2=f(m_1)$

In this question you will implement kernel k-means with modified RBF-kernel, using $m_2=f(m_1)$, and using SpectralClustering from sklearn to reduce the programming efford of the full implementation. 
RBF-kernel to implement is as follows:

$$k(\mathbf{x}_i,\mathbf{x}_j) = \exp(\frac{-\|\mathbf{x}_i - \mathbf{x}_j\|_2^2}{f(m_1)})$$


In order to calculate paiwise distance matrix, please use `scipy.spatial` distance function `cdist`, which is alread pre-loaded in the code. <br>
Please check the documentation of the SpectralClustering function to use precomputed affinity matrix, which you are going to supply with the above specification.


**NOTE:** to calculate distance, use `distance.cdist` with `"sqeuclidean"`


For reference, please write **below** the full formula of your RBF kernel replaceing $f(m_1)$ with your derivation 


In [16]:
import pandas as pd
import numpy as np
from sklearn.cluster import SpectralClustering
from scipy.spatial import distance

df = pd.read_csv('train1.csv', header = None)
X = df.iloc[:1000,1:].to_numpy(copy=True)
Y = df.iloc[:1000,:1].to_numpy(copy=True)

In [None]:
n_clusters = 10
centroids_rbf = np.array(n_clusters)

centroids_rbf = []

# ====================== YOUR CODE HERE ======================  
# DO NOT use any other import statements for this question
"""
# use random_state=0 in the SpectralClustering function to make the results reproducible.
"""
dist_sq = distance.cdist(X, X, metric="sqeuclidean")
M = X.shape[0]

m1 = np.sum(dist_sq) / (M * M)
K = np.exp(-dist_sq / m1).astype(float)

model = SpectralClustering(
    n_clusters=n_clusters,
    affinity='precomputed',
    random_state=0
)
labels = model.fit_predict(K)

centroids_rbf = np.vstack([X[labels==c].mean(axis=0) for c in range(n_clusters)])
# =================================================================
    
print('Sum of centroids:')
print(np.round(np.sum(centroids_rbf),2))


Sum of centroids:
23.95


## Question 6a: Adaboost questions

Answer questions in myUni assignment 2

## Question 6b: Adaboost code correction
The following code shows an incomplete implementation of Adaboost algorithm. The code does not implement the full logic specific to Adaboost. Please modify it to make a correct implementation. After completing the implementation, run the code and copy the result into space provided in myUni Question 6.

1. **Do not delete anything, just add what is required to eslack_varsting code. Adding code is sufficient to make correct implementation** <br>
2. **You can add code only in sections as indicated.** <br>
3. **Please clearly comment the purpose of the code that you added.**
4. **Do not change or add any imports**



In [18]:
# Do not change this section

# import required libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from math import exp, log as ln

# DO NOT use any other import statements for this question

In [19]:
# Do not change this section

# get training dataset
train = "train1.csv"
df = pd.read_csv(train, header=None)
X_train = df.iloc[:, 1:].to_numpy()
Y_train = df.iloc[:, 0].replace(0, -1).to_numpy()

In [20]:
# Do not change this section

# get test dataset
test = "test1.csv"
df = pd.read_csv(test, header=None)
X_test = df.iloc[:, 1:].to_numpy()
Y_test = df.iloc[:, 0].replace(0, -1).to_numpy()

In [21]:
# sample_weights for training data
def weak_classifier_train(train_data, train_label, sample_weights=None):
    # Create a decision stump
    stump = DecisionTreeClassifier(max_depth=1)
    
    # Train the stump using the weighted samples
    stump.fit(train_data, train_label, sample_weight=sample_weights) # Added: support for sample_weight
    
    return stump

def weak_classifier_predict(test_data, model):
    
    return model.predict([test_data]) 


def Adaboost_train(train_data, train_label, T):

# train_data: N x d matrix
# train_label: N x 1 vector
# T: the number of weak classifiers in the ensemble
    # Added: initialize sample weights
    N = train_data.shape[0]
    sample_weights = np.ones(N) / N  
    
    ensemble_models = []
    alphas = []  # store alpha values
    for t in range(0,T):
        # model_param_t returns the model parameters of the learned weak classifier
        model = weak_classifier_train(train_data, train_label, sample_weights) # Added: pass sample weights
        # Added: predict using the weak classifier
        pred = model.predict(train_data)
        # Added: calculate error
        err = np.sum(sample_weights * (pred != train_label)) / np.sum(sample_weights)
        # Added: avoid division by zero
        err = max(err, 1e-10)
        # Added: calculate alpha
        alpha = 0.5 * np.log((1 - err) / err)
        alphas.append(alpha)
        # Added: update sample weights
        sample_weights *= np.exp(-alpha * train_label * pred)
        sample_weights /= np.sum(sample_weights)  # normalize
        # Added: store model and alpha
        model = (model, alpha)
        ensemble_models.append(model)
    
    return ensemble_models


def Adaboost_test(test_data, ensemble_models):
# test_data: n x d
    predictions = []

    for i in range(len(test_data)):
        decision_ensemble = 0
        
        for k in range(len(ensemble_models)):

            prediction = weak_classifier_predict(test_data[i], ensemble_models[k][0])   # Added: tuple index for model
            decision_ensemble = decision_ensemble + prediction * ensemble_models[k][1]  # Added: tuple index for alpha
            
            if decision_ensemble > 0:
                prediction = 1
            else:
                prediction = -1
        predictions.append(prediction)
            
    return predictions

# predict and output accuracy

ensemble_models = Adaboost_train(X_train, Y_train, 3)
predicted_labels = Adaboost_test(X_test, ensemble_models)

In [22]:
######## do not change this code ############

# Calculate accuracy
accuracy = accuracy_score(Y_test, predicted_labels)

print('Accuracy:')
print(np.round(accuracy,3))

Accuracy:
0.876
