In [1]:
import numpy as np 
from helpers import * 
from implementations import *
from utilities import *

In [13]:
def clean_X(x_data):
    """Prepare a raw feature matrix: drop the id column, zero-fill NaNs, normalize.

    Args:
        x_data: 2-D array whose first column is removed (the ids, per the
            original comment); the remaining columns are the features.

    Returns:
        Whatever the project's ``normalize`` helper returns for the cleaned
        features. The caller's ``x_data`` is left unmodified.
    """
    # Basic slicing yields a view onto x_data, so copy before the in-place
    # NaN fill; otherwise the zeros would be written into the caller's array.
    features = x_data[:, 1:].copy()
    features[np.isnan(features)] = 0
    # normalize comes from the project helpers; per the original comment it
    # is expected to compute (x - mean) / std -- TODO confirm.
    return normalize(features)

def clean_Y(y_data):
    """Extract the label column and recode -1 labels as 0.

    Args:
        y_data: 2-D array; column 0 is discarded (the ids, per the original
            comment) and column 1 is taken as the labels.

    Returns:
        A new 1-D array of labels in which every -1 has been replaced by 0.
        The caller's ``y_data`` is left unmodified.
    """
    # y_data[:, 1] is a view onto y_data; copy it so the in-place recoding
    # below does not silently rewrite the caller's array.
    labels = y_data[:, 1].copy()
    labels[labels == -1] = 0
    return labels

In [8]:
# Upper bound on the number of data rows read from each CSV file below.
MAX_ROWS = 10000

# Read at most MAX_ROWS comma-separated rows per file, skipping the header line.
x_data = np.genfromtxt('data/dataset/x_train.csv', delimiter=",", skip_header=1,  max_rows=MAX_ROWS)
y_data = np.genfromtxt('data/dataset/y_train.csv', delimiter=",", skip_header=1,  max_rows=MAX_ROWS)

# Drop the id columns, zero-fill NaNs and normalize the features, recode -1 labels as 0.
x_data = clean_X(x_data)
y_data = clean_Y(y_data)

# NOTE(review): 0.8 is presumably the training fraction -- confirm against split_data.
# NOTE(review): clean_X normalizes before this split, so the normalization
# presumably also sees the rows that end up in x_test -- confirm this is intended.
x_train, x_test, y_train, y_test = split_data(x_data, y_data, 0.8)


In [5]:


# Baseline: logistic regression on the training split as returned by split_data.
N, D = x_train.shape
# Draw the starting weights from a seeded generator so the scores reported by
# this cell are reproducible after a kernel restart (np.random.rand was unseeded).
initial_w = np.random.default_rng(42).random(D)

# Labels and weights are passed as column vectors.
# NOTE(review): 10000 and 0.01 are presumably max_iters and the step size -- confirm
# against logistic_regression's signature.
w, loss = logistic_regression(y_train.reshape(-1, 1), x_train, initial_w.reshape(-1, 1), 10000, 0.01)

pred = predict_logistic(x_test, w)
# Cell output: (fraction of matching predictions on the test split, F1 score).
np.sum(pred == y_test) / len(y_test), compute_f1(y_test, pred)

(321,)


(0.6485, 0.2774922918807811)

In [16]:
# Logistic regression trained on a class-balanced version of the training split.

x_train_balanced, y_train_balanced = balance_dataset(x_train, y_train)

N, D = x_train_balanced.shape
# Draw the starting weights from a seeded generator so the scores reported by
# this cell are reproducible after a kernel restart (np.random.rand was unseeded).
initial_w = np.random.default_rng(42).random(D)

# Labels and weights are passed as column vectors.
# NOTE(review): 10000 and 0.01 are presumably max_iters and the step size -- confirm
# against logistic_regression's signature.
w, loss = logistic_regression(y_train_balanced.reshape(-1, 1), x_train_balanced, initial_w.reshape(-1, 1), 10000, 0.01)

# Evaluation uses the original x_test / y_test, not a balanced set.
pred = predict_logistic(x_test, w)
# Cell output: (fraction of matching predictions on the test split, F1 score).
np.sum(pred == y_test) / len(y_test), compute_f1(y_test, pred)

(0.605, 0.26716141001855287)

In [9]:
# Regularized logistic regression trained on a class-balanced training split.

x_train_balanced, y_train_balanced = balance_dataset(x_train, y_train)

N, D = x_train_balanced.shape
# Draw the starting weights from a seeded generator so the scores reported by
# this cell are reproducible after a kernel restart (np.random.rand was unseeded).
initial_w = np.random.default_rng(42).random(D)

# Labels and weights are passed as column vectors.
# NOTE(review): 0.0001 is presumably the regularization strength (lambda) and
# 10000 / 0.01 the iteration count / step size -- confirm against
# reg_logistic_regression's signature.
w, loss = reg_logistic_regression(y_train_balanced.reshape(-1, 1), x_train_balanced, 0.0001, initial_w.reshape(-1, 1), 10000, 0.01)

# Evaluation uses the original x_test / y_test, not a balanced set.
pred = predict_logistic(x_test, w)
# Cell output: (fraction of matching predictions on the test split, F1 score).
np.sum(pred == y_test) / len(y_test), compute_f1(y_test, pred)

  loss = -np.mean(y * np.log(sigmoids) + (1 - y) * np.log(1 - sigmoids))
  loss = -np.mean(y * np.log(sigmoids) + (1 - y) * np.log(1 - sigmoids))


(0.637, 0.2546201232032854)

In [4]:
# Hyperparameter tuning: grid search over lambda and gamma for regularized
# logistic regression on a class-balanced copy of the loaded data.
# NOTE(review): the saved output of this cell lists lambda=0.0005, which is not
# in the grid below -- the output appears to predate the current code.
x_balanced, y_balanced = balance_dataset(x_data, y_data)

N, D = x_balanced.shape
# Draw the starting weights from a seeded generator so the search is
# reproducible after a kernel restart (np.random.rand was unseeded).
initial_w = np.random.default_rng(42).random(D)

# Last expression of the cell: its return value is the cell output.
hyperparameter_tuning(
    x_balanced,
    y_balanced,
    reg_logistic_regression,
    lambdas=[0, 0.0001, 0.001, 0.01],
    gammas=[0.01, 0.03, 0.07, 0.1],
    model_params={'initial_w': initial_w, 'max_iters': 50000},
)


  loss = -np.mean(y * np.log(sigmoids) + (1 - y) * np.log(1 - sigmoids))
  loss = -np.mean(y * np.log(sigmoids) + (1 - y) * np.log(1 - sigmoids))


 lambda= 0 gamma= 0.01 , CV accuracy = 0.7864, f1_score = 0.8184
 lambda= 0.0001 gamma= 0.01 , CV accuracy = 0.7888, f1_score = 0.8204
 lambda= 0.0005 gamma= 0.01 , CV accuracy = 0.7922, f1_score = 0.8226
 lambda= 0.001 gamma= 0.01 , CV accuracy = 0.7818, f1_score = 0.8160
 lambda= 0 gamma= 0.03 , CV accuracy = 0.7986, f1_score = 0.8276
 lambda= 0.0001 gamma= 0.03 , CV accuracy = 0.7908, f1_score = 0.8207
 lambda= 0.0005 gamma= 0.03 , CV accuracy = 0.7920, f1_score = 0.8224
 lambda= 0.001 gamma= 0.03 , CV accuracy = 0.7924, f1_score = 0.8231
 lambda= 0 gamma= 0.07 , CV accuracy = 0.8076, f1_score = 0.8324
 lambda= 0.0001 gamma= 0.07 , CV accuracy = 0.8048, f1_score = 0.8307
 lambda= 0.0005 gamma= 0.07 , CV accuracy = 0.7914, f1_score = 0.8202
 lambda= 0.001 gamma= 0.07 , CV accuracy = 0.7966, f1_score = 0.8261
 lambda= 0 gamma= 0.1 , CV accuracy = 0.8134, f1_score = 0.8348
 lambda= 0.0001 gamma= 0.1 , CV accuracy = 0.8052, f1_score = 0.8305
 lambda= 0.0005 gamma= 0.1 , CV accuracy = 0.

(0, 0.1)

In [31]:

# Load the complete test and training CSVs (no max_rows cap), skipping the header line.
# This rebinds x_test, x_data and y_data, replacing any earlier values held in the kernel.
x_test = np.genfromtxt('data/dataset/x_test.csv', delimiter=",", skip_header=1)
x_data = np.genfromtxt('data/dataset/x_train.csv', delimiter=",", skip_header=1)
y_data = np.genfromtxt('data/dataset/y_train.csv', delimiter=",", skip_header=1)

In [34]:
# Apply the same reduced_data transform to the training and test features.
# NOTE(review): reduced_data is not defined in this notebook -- presumably it
# comes from one of the star-imports; confirm what it removes.
# NOTE(review): both names are rebound to their own transformed value, so
# re-running this cell applies the transform a second time.
x_data = reduced_data(x_data)
x_test = reduced_data(x_test)
# Display the resulting training shape as the cell output.
x_data.shape

(328135, 193)

In [36]:
def dimensionality_reduction(x_train, number_of_features):
    """Project the data onto its leading principal components (PCA).

    The data is passed through the project's ``normalize`` helper, the
    covariance matrix of the normalized columns is eigen-decomposed, and the
    normalized data is projected onto the eigenvectors with the largest
    eigenvalues.

    Args:
        x_train: 2-D array with one row per sample and one column per feature.
        number_of_features: number of principal components to keep. Asking
            for more than the number of columns keeps all of them.

    Returns:
        Real-valued array of shape (n_samples, k), where k is
        ``number_of_features`` capped at the number of columns. Columns are
        ordered by decreasing explained variance.
    """
    normalized_x = normalize(x_train)
    # np.cov returns a 0-d array for a single feature; promote it so the
    # decomposition below also works in that case.
    cov_matrix = np.atleast_2d(np.cov(normalized_x, rowvar=False))
    # A covariance matrix is symmetric, so use eigh: unlike eig it always
    # yields real eigenvalues/eigenvectors and is numerically more stable.
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
    # Component indices ordered from largest to smallest eigenvalue.
    top_indices = np.argsort(eigenvalues)[::-1][:number_of_features]
    return normalized_x @ eigenvectors[:, top_indices]

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [21]:
# Train regularized logistic regression on the full training set after
# dimensionality reduction.
x_test = np.genfromtxt('data/dataset/x_test.csv', delimiter=",", skip_header=1)
x_data = np.genfromtxt('data/dataset/x_train.csv', delimiter=",", skip_header=1)
y_data = np.genfromtxt('data/dataset/y_train.csv', delimiter=",", skip_header=1)
print('data loaded')

# NOTE(review): clean_X_0 is not defined in this notebook -- presumably it comes
# from one of the star-imports; confirm how it differs from clean_X.
y_data = clean_Y(y_data)
x_data = clean_X_0(x_data)
x_test = clean_X_0(x_test)
print('data cleaned')

# Despite the *_balanced names, no balancing is applied here: the full data is used as-is.
x_train_balanced, y_train_balanced = (x_data, y_data)
# NOTE(review): the saved output shows a NameError here -- the cell defining
# dimensionality_reduction must have been run before this one.
x_train_balanced = dimensionality_reduction(x_train_balanced, 200)
print('not balanced')

N, D = x_train_balanced.shape
# Draw the starting weights from a seeded generator so training is
# reproducible after a kernel restart (np.random.rand was unseeded).
initial_w = np.random.default_rng(42).random(D)

# Labels and weights are passed as column vectors.
# NOTE(review): 1e-5 is presumably the regularization strength and 10000 / 0.1
# the iteration count / step size -- confirm against reg_logistic_regression.
w, loss = reg_logistic_regression(y_train_balanced.reshape(-1, 1), x_train_balanced, 1e-5, initial_w.reshape(-1, 1), 10000, 0.1)
print('trained')

data loaded
data cleaned


NameError: name 'dimensionality_reduction' is not defined

In [19]:
# TODO: next time, apply the dimensionality reduction to the test set as well.
# NOTE(review): if `w` was fit on features reduced by dimensionality_reduction,
# x_test_clean below (cleaned but not reduced) will not have matching columns.
# Reload the raw test set so the id column (x_test[:, 0]) is available for the submission.
x_test = np.genfromtxt('data/dataset/x_test.csv', delimiter=",", skip_header=1)

x_test_clean = clean_X_0(x_test)
# Flag a sample as positive when the sigmoid score reaches 0.2; flatten to 1-D.
y_pred = (sigmoid(x_test_clean @ w) >= 0.2).flatten()
# Encode the boolean flags as +1 / -1 labels.
y_pred = np.where(y_pred, 1, -1)

# Write the (id, prediction) pairs to prediction.csv via the project helper.
create_csv_submission(x_test[:, 0], y_pred, 'prediction.csv')

In [20]:
# Percentage of test samples predicted as the positive class (+1).
100 * np.mean(y_pred == 1)

13.556532789657979