In [73]:
import numpy as np

In [30]:
def generate_fake_data(num_data):
    """Draw a labelled two-class sample from two 2-D Gaussians.

    Every one of the ``num_data`` points is assigned to a class by a fair
    0/1 draw.  Points of class +1 are sampled from N([2, 3], 0.6 * I) and
    points of class -1 from N([0, 4], 0.4 * I).  All +1 rows come first in
    the output, followed by all -1 rows.

    Parameters
    ----------
    num_data : int
        Total number of points to generate.

    Returns
    -------
    fake_points : ndarray, shape (num_data, 3)
        A leading column of ones followed by the two sampled coordinates.
    labels : ndarray, shape (num_data,)
        +1.0 for the first class, -1.0 for the second.
    """
    # Class sizes: count the ones among num_data uniform draws from {0, 1}.
    pos_count = np.random.randint(2, size=num_data).sum()
    neg_count = num_data - pos_count

    pos_mean, pos_cov = [2, 3], [[0.6, 0], [0, 0.6]]
    neg_mean, neg_cov = [0, 4], [[0.4, 0], [0, 0.4]]

    # The +1 class is sampled before the -1 class; the order matters for
    # reproducing a seeded run.
    pos_xy = np.random.multivariate_normal(pos_mean, pos_cov, size=pos_count)
    neg_xy = np.random.multivariate_normal(neg_mean, neg_cov, size=neg_count)
    xy = np.vstack([pos_xy, neg_xy])

    # Prepend the constant column of ones.
    bias = np.ones((xy.shape[0], 1))
    fake_points = np.hstack([bias, xy])

    labels = np.concatenate([np.ones(pos_count), -np.ones(neg_count)])
    return fake_points, labels

In [41]:
# Draw one train/test pair to sanity-check the generator's output.
# Seeded so that a fresh "Restart Kernel -> Run All" reproduces the same sample.
np.random.seed(0)
data_train, label_train = generate_fake_data(200)
data_test, label_test = generate_fake_data(5000)

In [42]:
# Sanity check: print the shape of each generated array, train pair first.
for arr in (data_train, label_train, data_test, label_test):
    print(arr.shape)

(200, 3)
(200,)
(5000, 3)
(5000,)


# Problem 13

In [48]:
def solve_linear_regression(data, label):
    """Fit ``label`` on ``data`` by ordinary least squares.

    Parameters
    ----------
    data : array_like, shape (n, d)
        Design matrix.
    label : array_like, shape (n,)
        Regression targets.

    Returns
    -------
    w : ndarray, shape (d, 1)
        Least-squares solution from ``np.linalg.lstsq``, as a column vector.
    y_hat : ndarray, shape (n,)
        Fitted values ``data . w``, flattened to one dimension.
    """
    # lstsq returns (solution, residuals, rank, singular values); only the
    # solution is needed here.
    solution, *_ = np.linalg.lstsq(data, label, rcond=None)
    w = solution.reshape(-1, 1)
    y_hat = np.dot(data, w).ravel()
    return w, y_hat

In [68]:
# Problem 13: in-sample squared error of linear regression, averaged over
# num_experiment independently seeded runs.
num_experiment = 100
ttl_loss = 0
for seed in range(num_experiment):
    np.random.seed(seed)  # one fixed seed per run keeps every run reproducible
    data_train, label_train = generate_fake_data(200)
    # The test draw is not used in this cell.
    data_test, label_test = generate_fake_data(5000)
    y_hat = solve_linear_regression(data_train, label_train)[1]
    residual = y_hat - label_train
    ttl_loss += np.mean(np.square(residual))
print(f"E_in^sqr is {ttl_loss / num_experiment}")

E_in^sqr is 0.2865835469477231


# Problem 14

In [72]:
# Problem 14: linear regression used as a classifier via sign(); report the
# average |E_in - E_out| of the 0/1 error over num_experiment seeded runs.
num_experiment = 100
ttl_loss = 0
ttl_train = 0
ttl_test = 0
for seed in range(num_experiment):
    np.random.seed(seed)
    data_train, label_train = generate_fake_data(200)
    data_test, label_test = generate_fake_data(5000)

    w, scores_train = solve_linear_regression(data_train, label_train)

    # In-sample 0/1 error: fraction of training points whose sign disagrees
    # with the label.
    error_train = np.mean(np.sign(scores_train) != label_train)
    ttl_train += error_train

    # Out-of-sample 0/1 error with the same weights.
    scores_test = np.dot(data_test, w).ravel()
    error_test = np.mean(np.sign(scores_test) != label_test)
    ttl_test += error_test

    # Gap between the two errors for this run.
    ttl_loss += np.abs(error_train - error_test)

print(f"Average error difference is {ttl_loss / num_experiment}")

Average error difference is 0.013424


# Problem 15

In [189]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

    The textbook form calls ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative ``x``.  Here only ``exp(-|x|)`` is
    ever computed, and the algebraically equivalent expression is chosen
    per element depending on the sign of ``x``.

    Parameters
    ----------
    x : scalar or array_like
        Real input value(s).

    Returns
    -------
    NumPy scalar or ndarray
        Element-wise sigmoid with the same shape as ``x``; a scalar input
        yields a scalar result.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.inexact):
        # Integer and boolean inputs are promoted so negation and abs behave.
        x = x.astype(float)

    # exp of a non-positive number lies in (0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(x))
    out = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

    # Indexing with () unwraps a 0-d result into a scalar and leaves arrays
    # with one or more dimensions unchanged.
    return out[()]

def solve_logistic_regression(data_train, label_train, num_iter=500, eta=0.1):
    """Fit logistic regression with fixed-step batch gradient descent.

    Starting from ``w = 0``, each iteration moves against the average
    gradient of the logistic loss,
    ``mean_n( sigmoid(-y_n * w.x_n) * (-y_n) * x_n )``.

    Parameters
    ----------
    data_train : array_like, shape (n, d)
        Design matrix.
    label_train : array_like, shape (n,)
        Labels; the update rule is written for values in {-1, +1}.
    num_iter : int, optional
        Number of gradient steps (default 500).
    eta : float, optional
        Step size (default 0.1).

    Returns
    -------
    w : ndarray, shape (d,)
        Learned weight vector.
    y_hat : ndarray, shape (n,)
        Raw linear scores ``data_train . w`` on the training set, mirroring
        what ``solve_linear_regression`` returns as its second value.

    Raises
    ------
    ValueError
        If ``data_train`` contains no samples.
    """
    # Work in floating point so integer inputs do not break the update.
    data_train = np.asarray(data_train, dtype=float)
    label_train = np.asarray(label_train, dtype=float)
    num_data = len(data_train)
    if num_data == 0:
        raise ValueError("data_train must contain at least one sample")

    w = np.zeros(data_train.shape[1])
    for _ in range(num_iter):
        # sigmoid(-y_n * w.x_n) for every sample at once.
        sample_weight = sigmoid(-label_train * np.dot(data_train, w))
        # Sum over samples of sample_weight_n * (-y_n) * x_n, then average.
        grad = -np.dot(sample_weight * label_train, data_train) / num_data
        w = w - eta * grad

    # Previously the function returned a name it never defined; compute the
    # training scores explicitly from the final weights.
    y_hat = np.dot(data_train, w)
    return w, y_hat

In [193]:
# Problem 15: out-of-sample 0/1 error of linear regression (model A) versus
# logistic regression (model B), averaged over num_experiment seeded runs.
num_experiment = 100
ttl_loss = 0  # initialised here but not updated in this cell
ttl_A = 0
ttl_B = 0
for seed in range(num_experiment):
    np.random.seed(seed)
    data_train, label_train = generate_fake_data(200)
    data_test, label_test = generate_fake_data(5000)

    # Model A: least-squares weights, classified by sign.
    w_lin, y_hat_train = solve_linear_regression(data_train, label_train)
    pred_A = np.sign(np.dot(data_test, w_lin).ravel())
    error_test_A = np.mean(pred_A != label_test)
    ttl_A += error_test_A

    # Model B: logistic-regression weights, classified by sign.
    w, y_hat_train = solve_logistic_regression(data_train, label_train)
    pred_B = np.sign(np.dot(data_test, w).ravel())
    error_test_B = np.mean(pred_B != label_test)
    ttl_B += error_test_B

# NOTE: the printed label says "error difference", but ttl_A / ttl_B only
# accumulate each model's test error, so these are average test errors.
print(f"Average error difference in model A is {ttl_A / num_experiment}")
print(f"Average error difference in model B is {ttl_B / num_experiment}")

Average error difference in model A is 0.058165999999999975
Average error difference in model B is 0.05943399999999998


# Problem 16

In [198]:
def generate_outlier(num_data):
    """Draw ``num_data`` points, all labelled +1, from a single 2-D Gaussian.

    The points are sampled from a normal distribution with mean [6, 0] and
    diagonal covariance diag(0.3, 0.1).

    Parameters
    ----------
    num_data : int
        Number of points to generate.

    Returns
    -------
    fake_point : ndarray, shape (num_data, 3)
        A leading column of ones followed by the two sampled coordinates.
    label : ndarray, shape (num_data,)
        All entries are 1.0.
    """
    center = [6, 0]
    spread = [[0.3, 0], [0, 0.1]]
    xy = np.random.multivariate_normal(center, spread, size=num_data)

    # Prepend the constant column of ones.
    fake_point = np.column_stack([np.ones(len(xy)), xy])
    return fake_point, np.ones(num_data)

In [199]:
# Problem 16: same comparison as Problem 15, but 20 extra +1 points from
# generate_outlier are appended to every training set (the test set is not
# modified).
num_experiment = 100
ttl_loss = 0  # initialised here but not updated in this cell
ttl_A = 0
ttl_B = 0

for seed in range(num_experiment):
    np.random.seed(seed)

    # Draw order (train, outliers, test) is kept so seeded runs are unchanged.
    data_train, label_train = generate_fake_data(200)
    data_train_out, label_train_out = generate_outlier(20)
    data_train = np.vstack([data_train, data_train_out])
    label_train = np.concatenate([label_train, label_train_out])

    data_test, label_test = generate_fake_data(5000)

    # Model A: least-squares weights, classified by sign.
    w_lin, y_hat_train = solve_linear_regression(data_train, label_train)
    pred_A = np.sign(np.dot(data_test, w_lin).ravel())
    error_test_A = np.mean(pred_A != label_test)
    ttl_A += error_test_A

    # Model B: logistic-regression weights, classified by sign.
    w, y_hat_train = solve_logistic_regression(data_train, label_train)
    pred_B = np.sign(np.dot(data_test, w).ravel())
    error_test_B = np.mean(pred_B != label_test)
    ttl_B += error_test_B

# NOTE: the printed label says "error difference", but ttl_A / ttl_B only
# accumulate each model's test error, so these are average test errors.
print(f"Average error difference in model A is {ttl_A / num_experiment}")
print(f"Average error difference in model B is {ttl_B / num_experiment}")

Average error difference in model A is 0.09350199999999996
Average error difference in model B is 0.059572
