In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
torch.manual_seed(2025)

### P2_P1 (5 points, non-coding task)

The high level idea of affine transformation in math is that for each column vector $x \in \mathbb{R}^N$, an affine transformation maps it to another column vector $y \in \mathbb{R}^M$ via

$$y = Wx + b$$

where $W \in \mathbb{R}^{M \times N}$ and $b \in \mathbb{R}^M$.

Let $W = \begin{bmatrix} 2 & -3 & 1 & 3 & -2 \\ 0 & 1 & 2 & 5 & -1 \\ 7 & -1 & -3 & 7 & 0 \end{bmatrix}$, $b = \begin{bmatrix} 1 \\ 0 \\ -1 \end{bmatrix}$, and $x = \begin{bmatrix} 1 \\ 2 \\ -3 \\ 1 \\ -2 \end{bmatrix}$.

#### Answers:

1. **What is the value of $N$?**
   $W$ has 5 columns, so $N = 5$.

2. **What is the value of $M$?**
   $W$ has 3 rows, so $M = 3$.

3. **What is the value of $y$?**
   **Reasoning:**
   First, compute $Wx$:
   $$Wx = \begin{bmatrix} 2 & -3 & 1 & 3 & -2 \\ 0 & 1 & 2 & 5 & -1 \\ 7 & -1 & -3 & 7 & 0 \end{bmatrix} \begin{bmatrix} 1 \\ 2 \\ -3 \\ 1 \\ -2 \end{bmatrix} = \begin{bmatrix} (2 \cdot 1) + (-3 \cdot 2) + (1 \cdot -3) + (3 \cdot 1) + (-2 \cdot -2) \\ (0 \cdot 1) + (1 \cdot 2) + (2 \cdot -3) + (5 \cdot 1) + (-1 \cdot -2) \\ (7 \cdot 1) + (-1 \cdot 2) + (-3 \cdot -3) + (7 \cdot 1) + (0 \cdot -2) \end{bmatrix}$$
   $$Wx = \begin{bmatrix} 2 - 6 - 3 + 3 + 4 \\ 0 + 2 - 6 + 5 + 2 \\ 7 - 2 + 9 + 7 + 0 \end{bmatrix} = \begin{bmatrix} 0 \\ 3 \\ 21 \end{bmatrix}$$
   Then, add $b$:
   $$y = Wx + b = \begin{bmatrix} 0 \\ 3 \\ 21 \end{bmatrix} + \begin{bmatrix} 1 \\ 0 \\ -1 \end{bmatrix} = \begin{bmatrix} 1 \\ 3 \\ 20 \end{bmatrix}$$
   So, $y = \begin{bmatrix} 1 \\ 3 \\ 20 \end{bmatrix}$.

In [4]:
# Numerical check of the hand-computed P2_P1 answer, y = Wx + b.
# Column vectors are stored as 2-D integer tensors of shape (rows, 1).
W = torch.tensor([
    [2, -3, 1, 3, -2],
    [0, 1, 2, 5, -1],
    [7, -1, -3, 7, 0],
])
b = torch.tensor([1, 0, -1]).reshape(-1, 1)
x = torch.tensor([1, 2, -3, 1, -2]).reshape(-1, 1)

# (3, 5) @ (5, 1) -> (3, 1), then add the (3, 1) bias.
y = W @ x + b
print(f"y = \n{y}")

y = 
tensor([[ 1],
        [ 3],
        [20]])


### P2_P3 (10 points, coding task)

In this part, you are asked to build an affine transformation module from scratch by using **NumPy**, NOT PyTorch or TensorFlow.

Define such a class as `My_Linear_NumPy`.

- **Attributes**
    - `in_features`: Number of input features
    - `out_features`: Number of output features
    - `weight`: This refers to matrix $W$ in Part 1. The shape is `(out_features, in_features)`.
    - `bias`: This refers to vector $b$ in Part 1. The shape is `(out_features,)`.
    - `random_seed`: The NumPy random seed number used to generate initial values of weight and bias.
- **Method `__init__`**:
    - To initialize an object in this class, you need to specify `in_features` and `out_features`.
    - You may initialize the object by specifying a value for `random_seed`. If it is not specified, then its default value is 42.
    - The initial values of weight and bias are drawn from the standard normal distribution, with NumPy seeded by the attribute `random_seed`.
- **Method `forward`**:
    - Input `x`: numpy array with shape $(n_0, n_1, \dots, n_{d-1}, \text{in\_features})$ with an arbitrary number $d = 0, 1, 2, \dots$ of leading axes (for $d = 0$ the shape is simply `(in_features,)`).
    - Output `y`: numpy array with shape $(n_0, n_1, \dots, n_{d-1}, \text{out\_features})$.
    - The affine transformation works in a way that given the first $d$ indices in `x` and `y`, it does affine transformation along the last axis of `x` and `y`.
    - **Do not use any loop in your code.**

In [2]:
class My_Linear_NumPy:
    """Affine transformation ``y = x @ W.T + b`` along the last axis, in NumPy.

    Attributes:
        in_features: Size of the last axis of the input.
        out_features: Size of the last axis of the output.
        random_seed: Value passed to ``np.random.seed`` before the initial
            parameters are drawn.
        weight: Array of shape ``(out_features, in_features)``.
        bias: Array of shape ``(out_features,)``.
    """

    def __init__(self, in_features, out_features, random_seed=42):
        """Store the layer sizes and draw standard-normal weight and bias."""
        self.in_features = in_features
        self.out_features = out_features
        self.random_seed = random_seed

        # NOTE: this reseeds NumPy's *global* generator, so constructing a
        # layer also affects any np.random call made afterwards.
        np.random.seed(self.random_seed)
        self.weight = np.random.randn(self.out_features, self.in_features)
        self.bias = np.random.randn(self.out_features)

    def forward(self, x):
        """Apply the affine map to the last axis of ``x`` without any loop.

        Args:
            x: Array of shape ``(..., in_features)``; any number of leading
                axes (including none) is allowed.

        Returns:
            Array of shape ``(..., out_features)``.

        Raises:
            ValueError: If the last axis of ``x`` does not have size
                ``in_features`` (raised by ``np.matmul``).
        """
        # matmul contracts the last axis of x with the first axis of W.T and
        # broadcasts over all leading axes. Unlike an explicit
        # multiply-then-sum, it never builds a (..., in, out) intermediate and
        # it rejects a mismatched feature axis instead of broadcasting it.
        return np.matmul(x, self.weight.T) + self.bias

# Sanity check against Part 1: overwrite the random parameters with the
# W and b from the worked example and confirm the layer reproduces y.
model = My_Linear_NumPy(5, 3, random_seed=None)
model.weight = np.array([
    [2, -3, 1, 3, -2],
    [0, 1, 2, 5, -1],
    [7, -1, -3, 7, 0],
])
model.bias = np.array([1, 0, -1])

# A single sample, i.e. shape (in_features,) with no leading axes.
x_np = np.array([1, 2, -3, 1, -2])
y_np = model.forward(x_np)
print(f"y (NumPy) = {y_np}")

y (NumPy) = [ 1  3 20]


### P2_P4 (5 points, coding task)

Do the following tasks in this part.

1.  Construct an object in the class `My_Linear_NumPy` called `linear_model_np`.
2.  Set `in_features = 3` and `out_features = 5`.
3.  Create multiple $X$ with the following different shapes, but common numpy random seed number 2025 and the same standard normal distribution.
    *   `(in_features,)`
    *   `(10, in_features)`
    *   `(10, 20, in_features)`
    *   `(10, 20, 30, in_features)`
    After generating $X$, reset the numpy random seed number to its default value.
4.  We call our constructed function with each of the above $X$ as the input. Print the shape of each output.

In [3]:
in_features, out_features = 3, 5

linear_model_np = My_Linear_NumPy(in_features, out_features)

# Seed once, then draw the four inputs in order of increasing rank; each
# entry of the list below is the tuple of leading axes for one input.
np.random.seed(2025)
X_list = [
    np.random.randn(*leading_axes, in_features)
    for leading_axes in [(), (10,), (10, 20), (10, 20, 30)]
]
np.random.seed() # Reset to default value

# Only the last axis should change: in_features -> out_features.
for X in X_list:
    print(linear_model_np.forward(X).shape)

(5,)
(10, 5)
(10, 20, 5)
(10, 20, 30, 5)


### P2_P5 (10 points, coding task)

In this part, you are asked to program with **PyTorch, NOT NumPy**.

**Define a deep neural network module (class) named `Linear_Model`.**

It has the following architecture:

*   2 layers: 1 hidden layer and 1 output layer.
*   No activation function. That is, the connection between two consecutive layers is only an affine transformation.

In [None]:
class Linear_Model(nn.Module):
    """Two stacked affine layers (hidden, then output) with no activation."""

    def __init__(self, in_features, hidden_features, out_features):
        """Record the three layer widths and create both ``nn.Linear`` layers."""
        super().__init__()
        self.in_features, self.hidden_features, self.out_features = (
            in_features,
            hidden_features,
            out_features,
        )
        # linear0 is created before linear1, so a fixed torch seed yields the
        # same initial parameters as before.
        self.linear0 = nn.Linear(in_features, hidden_features)
        self.linear1 = nn.Linear(hidden_features, out_features)

    def forward(self, x):
        """Map ``(..., in_features)`` to ``(..., out_features)``."""
        hidden = self.linear0(x)
        return self.linear1(hidden)

### P2_P6 (5 points, non-coding task)

We make the following modifications on the previous part.

*   We consider a special symmetric neural network in which `out_features = in_features`.
*   No bias in all affine transformations.
*   The transformation matrix from the hidden layer to the output layer is tied to be the transpose of the transformation matrix from the input layer to the hidden layer.

**What is the total number of learnable parameters in this model?**

*   Reasoning is not required.

The total number of learnable parameters is:
**`in_features * hidden_features`**

### P2_P9 (5 points, coding task)

This question follows Part 6.

In this part, you are asked to program with **PyTorch, not NumPy**.

**Build a deep neural network class named as `Symmetric_Linear_Model` that meets the modifications imposed in Part 6.**

In [None]:
class Symmetric_Linear_Model(nn.Module):
    """Bias-free two-layer linear model whose output layer reuses ``W``.

    With ``W = linear0.weight`` of shape ``(hidden_features, in_features)``,
    the hidden layer computes ``h = x @ W.T`` and the output layer computes
    ``y = h @ W``. The only learnable parameter is ``W``.
    """

    def __init__(self, in_features, hidden_features):
        """Create the single shared weight matrix (no bias)."""
        super().__init__()
        self.in_features = in_features
        self.hidden_features = hidden_features
        self.linear0 = nn.Linear(in_features, hidden_features, bias=False)

    def forward(self, x):
        """Map ``(..., in_features)`` to ``(..., in_features)``.

        Args:
            x: Tensor whose last axis has size ``in_features``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        hidden = self.linear0(x)  # (..., hidden_features)
        # Multiplying by W on the right is the affine map whose matrix is the
        # transpose of the first layer's. A single matmul avoids building the
        # (..., hidden_features, in_features) product that an explicit
        # reshape / multiply / sum would allocate.
        return torch.matmul(hidden, self.linear0.weight)

### P2_P10 (5 points, coding task)

Rectified Linear Unit, or "ReLU", is one of the most commonly used functions in deep learning. It is defined as

$$\text{ReLU}(x) = \max \{0, x\}.$$

In this part, you are asked to use **PyTorch** to build a ReLU class named `My_ReLU` that subclasses `nn.Module`.

A successful class works in the following ways:

*   The initialization of an object in `My_ReLU` does not take any input.
*   Suppose we have a `My_ReLU` object called `activation0`. When we call `activation0(x)` on a tensor `x` of arbitrary dimension and shape, we get an output `y` obtained by applying ReLU element-wise to `x`.

In [None]:
class My_ReLU(nn.Module):
    """Parameter-free module computing ``max(0, x)`` element-wise."""

    def __init__(self):
        """Takes no arguments; there is nothing to configure."""
        super().__init__()

    def forward(self, x):
        """Return a tensor shaped like ``x`` with negative entries set to 0."""
        # The zero tensor stays the first operand, as in the two-tensor
        # torch.max call this replaces.
        floor = torch.zeros_like(x)
        return torch.maximum(floor, x)

### P2_P11 (10 points, coding task)

It is known from math that a composition of several linear layers is still equivalent to a single linear layer, so we insert non-linear activation functions, such as ReLU, in between to make the network more expressive.

A Multi-Layer Perceptron (MLP) is such a neural network: it is composed of multiple fully connected layers with non-linear activations and is commonly used in deep learning.

**Please define a class called `My_MLP_Model` that subclasses `nn.Module` and works in the following ways:**

*   The architecture consists of two hidden layers and one output layer.
*   Each hidden layer consists of an affine transformation module and a ReLU activation module.
*   Each affine transformation module shall be initialized with the built-in class `nn.Linear`.
*   Each ReLU activation module shall be initialized with your self-defined class `My_ReLU`.

In [None]:
class My_MLP_Model(nn.Module):
    """MLP with two ``nn.Linear`` + ``My_ReLU`` hidden layers and a linear output."""

    def __init__(self, in_features, hidden_features1, hidden_features2, out_features):
        """Record the four layer widths and assemble the layer stack."""
        super().__init__()
        self.in_features = in_features
        self.hidden_features1 = hidden_features1
        self.hidden_features2 = hidden_features2
        self.out_features = out_features

        # Order matters twice: it is the forward order, and it fixes the
        # Sequential indices (0..4) used as parameter names.
        layers = [
            nn.Linear(in_features, hidden_features1),
            My_ReLU(),
            nn.Linear(hidden_features1, hidden_features2),
            My_ReLU(),
            nn.Linear(hidden_features2, out_features),
        ]
        self.seq = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` of shape ``(..., in_features)`` through the whole stack."""
        return self.seq(x)

### P2_P12 (5 points, coding task)

After building our deep neural network architecture in Part 11 and before using it to train our model, we need to prepare our training dataset.

Let us look at a simple application of deep neural network in studying harmonic motion in physics.

**Write code to construct the following training dataset:**

*   Use `sample_size` to store the number of samples. Set the value as 1000.
*   Define `x_train` as a tensor whose shape is `(sample_size,)` and the value on each entry is uniformly drawn between 0 and 1.
*   Define `y_train` as a tensor whose values are obtained from the following element-wise mapping from `x_train`:

    $$y = \sin(2\pi x) + 0.1 \cdot \mathcal{N}(0, 1),$$

    where $\mathcal{N}(0, 1)$ is a standard normal random variable.
*   Print the dimensions of `x_train` and `y_train`.
*   Print the shapes of `x_train` and `y_train`.

In [None]:
sample_size = 1000

# Inputs are uniform on [0, 1); targets are one period of a sine wave plus
# Gaussian noise with standard deviation 0.1.
x_train = torch.rand(size=(sample_size,))
y_train = (2 * np.pi * x_train).sin() + torch.randn_like(x_train) * 0.1

# Number of dimensions first, then the full shapes (x before y each time).
print(x_train.ndim, y_train.ndim, sep="\n")

print(x_train.shape, y_train.shape, sep="\n")

### P2_P13 (15 points, coding task)

In this part, we use the training dataset constructed in Part 12 to train a model defined in Part 11.

- Use mean-squared error (MSE) as the loss function.
- Use Adam as the optimization algorithm.
- Do whole-batch training in each epoch.
- After every 10 epochs, print the following sentence:
  `Epoch: XXX. Loss: XXX.`
  The loss value should be with 4 decimal places.
- Generate an epoch-MSE loss plot after completing the training. Set the x-label as `epoch` and the y-label as `MSE loss`.

In [None]:
# HYPERPARAMETERS
''' DO NOT CHANGE ANYTHING IN THIS CODE CELL '''

hidden_features1 = 32
hidden_features2 = 16

num_epochs = 500
learning_rate = 1e-3

### WRITE YOUR SOLUTION HERE ###

# Scalar regression: one input feature (x) and one output (y).
my_mlp_model = My_MLP_Model(1, hidden_features1, hidden_features2, 1)
optimizer = torch.optim.Adam(my_mlp_model.parameters(), lr=learning_rate)
loss_fn = torch.nn.MSELoss()

# The model consumes (batch, features) tensors, so the 1-D data is reshaped to
# a single column. This is loop-invariant, so it is done once up front.
x_batch = x_train.reshape(-1, 1)
y_batch = y_train.reshape(-1, 1)

loss_list_plot = []

# Epochs are counted from 1 so that "after every 10 epochs" means after
# epochs 10, 20, ..., num_epochs.
for epoch in range(1, num_epochs + 1):
    optimizer.zero_grad()

    # Forward pass: whole-batch training
    y_pred = my_mlp_model(x_batch)
    loss = loss_fn(y_pred, y_batch)

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    loss_value = loss.item()
    loss_list_plot.append(loss_value)

    # Required format: "Epoch: XXX. Loss: XXX." (trailing period included)
    if epoch % 10 == 0:
        print(f"Epoch: {epoch}. Loss: {loss_value:.4f}.")

# Plotting the loss against the 1-based epoch number
plt.plot(range(1, num_epochs + 1), loss_list_plot)
plt.xlabel("epoch")
plt.ylabel("MSE loss")
plt.show()

### P3_P1 (5 points, coding task)

In [None]:
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
np.random.seed(2025)

We study the dataset `USAAIO_2025_round1_prob3_train.csv` provided in this contest.

The dataset can be found here:

url = "https://drive.google.com/file/d/125YsFPS2nCNRvYyy1tgnD8RhYIUglLX9/view?usp=sharing"

Do the following tasks in this part.

1. Load `USAAIO_2025_round1_prob3_train.csv` into a pandas DataFrame object called `df_1`.

2. Print the first 10 rows.

3. Define a function called `data_summary` that

    - Takes a DataFrame object as an input.

    - Prints the shape of the DataFrame.

    - Prints the data type for each column.

    - Prints the count of missing values for each column.

    - Delivers no output.

4. After defining the above function, call it by feeding `df_1` to it.

In [5]:
### WRITE YOUR SOLUTION HERE ###

# Load the raw training table and preview its first ten rows.
df_1 = pd.read_csv(filepath_or_buffer='USAAIO_2025_round1_prob3_train.csv')
print(df_1.head(n=10))

def data_summary(df):
    """Print the shape, per-column dtypes and per-column null counts of ``df``.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        None. The summary is only written to standard output.
    """
    column_types = df.dtypes
    null_counts = df.isnull().sum()

    print(f"Shape: {df.shape}")
    print(f"Data Types: {column_types}")
    print(f"Missing Values per Column: {null_counts}")

# Print the shape, column dtypes and per-column missing-value counts of df_1.
data_summary(df_1)

""" END OF THIS PART """

### P3_P2 (5 points, coding task)

Do the following tasks in this part.

1. Create a DataFrame object called `df_2` that keeps the following columns in `df_1` (all other columns in `df_1` shall not appear in `df_2`):
    - `Survived`
    - `Sex`
    - `Age`
    - `SibSp`
    - `Parch`
    - `Fare`
    - `Embarked`
2. Print the first 5 rows of `df_2`.
3. Print the shape of `df_2`.

In [None]:
# Keep only the label and the six feature columns used downstream; every other
# column of df_1 is left out.
df_2 = df_1.loc[:, ['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

print(df_2.head(5))
print(df_2.shape)

### P3_P3 (5 points, coding task)

Do the following tasks in this part.

1. In `df_2`, remove all rows that contain null (missing) values.
2. Save the updated DataFrame object as `df_3` (that is, the change of `df_2` should not be inplace).
3. For `df_3`, print the count of missing values per column.
4. Print the shape of `df_3`.

In [None]:
# Drop every row with at least one missing value; dropna returns a new frame,
# so df_2 itself is left unchanged.
df_3 = df_2.dropna(axis=0, how='any')

# Every count printed here should be zero.
print(df_3.isna().sum())
print(df_3.shape)

### P3_P4 (5 points, coding task)

Do the following tasks in this part.

1. Create a deep copy of `df_3`, named `df_4`.
2. In `df_4`, create a new column called `GroupSize`. Its value is equal to `SibSp + Parch + 1`.
3. Print the first five rows of `df_3` and `df_4`.
4. Print the shapes of `df_3` and `df_4`.

In [None]:
# Work on an independent deep copy so that adding a column leaves df_3 as is.
df_4 = df_3.copy(deep=True)
# Group size = siblings/spouses + parents/children + the passenger.
df_4['GroupSize'] = df_4['SibSp'].add(df_4['Parch']).add(1)

print(df_3.head(), df_4.head(), sep="\n")

print(df_3.shape, df_4.shape, sep="\n")

### P3_P5 (5 points, coding task)

Do the following tasks in this part.

1. Remove columns `SibSp` and `Parch` in `df_4` , and save this new DataFrame object as `df_5` (changes on `df_4` should not be inplace).

2. Print the first five rows of `df_4` and `df_5`.

3. Print the shapes of `df_4` and `df_5`.

In [None]:
### WRITE YOUR SOLUTION HERE ###
# SibSp and Parch are dropped now that GroupSize exists; drop returns a new
# frame, so df_4 keeps both columns.
df_5 = df_4.drop(['SibSp', 'Parch'], axis=1)

print(df_4.head(), df_5.head(), sep="\n")

print(df_4.shape, df_5.shape, sep="\n")

""" END OF THIS PART """

### P3_P6 (5 points, coding and conceptual reasoning task)

In `df_5`, columns `Sex` and `Embarked` are categorical data.

**Do the following tasks to process these categorical data.**

1. To do logistic regression on this dataset, we need to do one hot encoding on these two columns. Explain why?

2. Do one hot encoding on these two columns. Set `drop_first = True` and `dtype = np.int8`. Save the new dataframe object as `df_6`.

3. Explain what `drop_first = True` means and why we do so.

4. Print the first five rows of `df_5` and `df_6`.

5. Print the shapes of `df_5` and `df_6`.

In [None]:
### WRITE YOUR SOLUTION HERE ###

# Question 1
"""
Answer:

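Logistic regression computes a weighted sum of its inputs, so every feature must be numerical.
Simply replacing the categories with integer codes (e.g. S = 0, C = 1, Q = 2) would impose an
artificial order and spacing on them. One hot encoding instead gives each category its own
0/1 indicator column, so the model learns a separate coefficient per category.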
"""

# Question 2
# Answer: (put your code here)

# Replace Sex and Embarked by 0/1 indicator columns stored as int8; the first
# category of each column is dropped.
df_6 = pd.get_dummies(
    df_5,
    columns=['Sex', 'Embarked'],
    dtype=np.int8,
    drop_first=True,
)

# Question 3
"""
Answer:

Suppose a categorical variable takes value k chosen from K categories, indexed as 0, 1, ..., K-1.
By setting drop_first = True, it is replaced by a vector with shape K-1.
If k = 0, then in this vector, all entries are 0.
If k is not 0, then in this vector, the (k-1)th entry (entry indices start from 0) is 1 and all other entries are 0.

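Setting drop_first = True avoids perfect multicollinearity (the dummy variable trap):
with all K indicator columns kept, they always sum to 1 and therefore duplicate the
intercept (all-ones) column, so the coefficients are not uniquely determined.
Dropping one column makes that category the baseline against which the others are measured.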
"""

# Question 4
# Answer: (put your code here)

# Before / after view of the one-hot encoding.
print(df_5.head(), df_6.head(), sep="\n")

# Question 5
# Answer: (put your code here)

print(df_5.shape, df_6.shape, sep="\n")

""" END OF THIS PART """

### P3_P7 (5 points, coding task)

Do the following tasks in this part.

1. Define `X` that keeps all features in `df_6` and drops the label column `Survived`.

2. Define `y` that keeps the label column `Survived` in `df_6` only.

3. Print the types of objects `X` and `y`.

4. Print the first five rows of `X` and the first five elements in `y`.

In [None]:
### WRITE YOUR SOLUTION HERE ###
# Features: every column except the label. Label: the Survived column alone.
X = df_6.drop('Survived', axis=1)
y = df_6.loc[:, 'Survived']

print(type(X), type(y), sep="\n")

print(X.head(), y.head(), sep="\n")

""" END OF THIS PART """

### P3_P8 (5 points, coding task)

Do the following tasks in this part.

1.  Define a function called `my_train_test_split` that splits the whole dataset into the training component and the test/validation component.
    *   The split is random
    *   **Inputs**
        *   `X`: A DataFrame object of features of all sample data.
        *   `y`: A Series object of labels of all sample data.
        *   `test_size`: It takes a value between 0 and 1 that denotes the fraction of samples used for testing. That is, the number of samples used for testing is `int(total number of samples * test_size)`.
    *   **Outputs**
        *   `X_train`: It keeps samples in `X` for training.
        *   `X_test`: It keeps samples in `X` for testing.
        *   `y_train`: It keeps samples in `y` for training.
        *   `y_test`: It keeps samples in `y` for testing.

2.  Call this function with inputs
    *   `X = X`
    *   `y = y`
    *   `test_size = 0.2`

3.  Print object types and shapes of `X_train`, `X_test`, `y_train`, `y_test`.

In [None]:
### WRITE YOUR SOLUTION HERE ###
def my_train_test_split(X, y, test_size, random_state=None):
    """Randomly split features and labels into training and test parts.

    Args:
        X: DataFrame of features, one row per sample.
        y: Series of labels with the same number of entries as ``X`` has rows.
        test_size: Fraction in ``[0, 1]`` of samples used for testing; the
            test set holds ``int(number of samples * test_size)`` samples.
        random_state: Optional seed for a private generator, making the split
            reproducible without touching NumPy's global random state. With
            the default ``None`` the global ``np.random`` generator is used.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, or ``test_size`` is
            outside ``[0, 1]``.
    """
    num_samples = X.shape[0]
    if len(y) != num_samples:
        raise ValueError(
            f"X and y must have the same number of samples, got {num_samples} and {len(y)}"
        )
    # An out-of-range fraction would otherwise slice silently into a
    # meaningless split (e.g. a negative count indexes from the end).
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    num_test_samples = int(num_samples * test_size)

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    shuffled = rng.permutation(num_samples)
    test_indices = shuffled[:num_test_samples]
    train_indices = shuffled[num_test_samples:]

    # Positional indexing keeps X and y aligned whatever their index labels.
    return (
        X.iloc[train_indices],
        X.iloc[test_indices],
        y.iloc[train_indices],
        y.iloc[test_indices],
    )

# 80 / 20 random split of the encoded dataset.
X_train, X_test, y_train, y_test = my_train_test_split(X, y, test_size=0.2)

print(type(X_train), type(X_test), type(y_train), type(y_test), sep="\n")

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

### P3_P9 (5 points, coding task)

Use `StandardScaler` that has been imported from `sklearn.preprocessing` (DO NOT IMPORT IT AGAIN) to do the following tasks.

1.  Create an object called `scaler`.

2.  Use `scaler.fit_transform` to scale each column in `X_train` to standard normal. Save the scaled training dataset as `X_train_scaled`.

3.  Use `scaler.transform` to scale `X_test`. Save the scaled test dataset as `X_test_scaled`.

4.  Add a column to `X_train_scaled` with all 1s. Do the same thing for `X_test_scaled`.

5.  Print the types of objects `X_train_scaled` and `X_test_scaled`.

6.  Print the shapes of objects `X_train_scaled` and `X_test_scaled`.

7.  Print the first five rows of `X_train_scaled` and `X_test_scaled`.

In [None]:
### WRITE YOUR SOLUTION HERE ###
scaler = StandardScaler()

# The scaler is fitted on the training split only and then reused, unchanged,
# on the test split.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Append a column of ones as the last column of each array.
# np.ones needs the shape as a single tuple, and the stacking must run along
# the column axis: (n, d) next to (n, 1) -> (n, d + 1).
X_train_scaled = np.hstack([X_train_scaled, np.ones((X_train_scaled.shape[0], 1))])
X_test_scaled = np.hstack([X_test_scaled, np.ones((X_test_scaled.shape[0], 1))])

print(type(X_train_scaled))
print(type(X_test_scaled))

print(X_train_scaled.shape)
print(X_test_scaled.shape)

print(X_train_scaled[:5])
print(X_test_scaled[:5])

### P3_P16 (5 points, coding task)

Define a function called `my_sigmoid`:

*   Input: A numpy array with any shape.

*   Output: Elementwise sigmoid functional values.

*   No loop in the body of your function.

In [None]:
### WRITE YOUR SOLUTION HERE ###
def my_sigmoid(z):
    """Element-wise logistic sigmoid ``1 / (1 + exp(-z))``, computed without loops.

    The function is evaluated in a numerically stable way: the exponential is
    only ever taken of non-positive numbers, so large negative inputs do not
    trigger an overflow as the direct formula does.

    Args:
        z: NumPy array (or array-like) of any shape.

    Returns:
        Array of the same shape as ``z`` holding the sigmoid of each entry.
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # exp(-|z|) lies in (0, 1]
    # z >= 0:  1 / (1 + exp(-z))          with exp(-z) == decay
    # z <  0:  exp(z) / (1 + exp(z))      with exp(z)  == decay
    numerator = np.where(z >= 0, 1.0, decay)
    return numerator / (1.0 + decay)

""" END OF THIS PART """