In [1]:
import numpy as np 
from helpers import * 
from implementations import *
from utilities import *
import matplotlib.pyplot as plt

In [64]:
# hyperparameter tuning with only a part of the dataset
MAX_ROWS = 10000

x_data = np.genfromtxt('data/dataset/x_train.csv', delimiter=",", skip_header=1, max_rows=MAX_ROWS)
y_data = np.genfromtxt('data/dataset/y_train.csv', delimiter=",", skip_header=1, max_rows=MAX_ROWS)

In [43]:
# choose which type of cleaning is the best

x_train = clean_X_0(x_data)  # replace nan with zero add bias column
y_train = clean_Y(y_data)

accuracy, f1_score = k_fold_cross_validation(x_train, y_train, logistic_regression, k= 5,
                                             model_params={'initial_w': np.random.rand(x_train.shape[1]),'max_iters': 5000, 'gamma':0.25})

print(f'nan -> 0 f1-score : {f1_score}, accuracy {accuracy}')

x_train = clean_X_median(x_data)  # replace nan with median add bias column

accuracy, f1_score = k_fold_cross_validation(x_train, y_train, logistic_regression, k= 5,
                                                model_params={'initial_w': np.random.rand(x_train.shape[1]),'max_iters': 5000, 'gamma':0.25})

print(f'nan -> median f1-score :{f1_score}, accuracy {accuracy}')


x_train = clean_X_mean(x_data)  # replace nan with mean add bias column

accuracy, f1_score = k_fold_cross_validation(x_train, y_train, logistic_regression, k= 5,
                                                model_params={'initial_w': np.random.rand(x_train.shape[1]), 'max_iters': 5000, 'gamma':0.25})

print(f'nan -> mean f1-score :{f1_score}, accuracy {accuracy}')

nan -> 0 f1-score : 0.28935100837719596, accuracy 0.9131
nan -> median f1-score :0.3909191143465812, accuracy 0.8744
nan -> mean f1-score :0.26755627093500856, accuracy 0.9118


In [45]:

# feature selection

x_train = reduced_data(x_data, 0.9)
x_train = clean_X_median(x_train)  # replace nan with zero add bias column
y_train = clean_Y(y_data)

accuracy, f1_score = k_fold_cross_validation(x_train, y_train, logistic_regression, k= 5,
                                                model_params={'initial_w': np.random.rand(x_train.shape[1]), 'max_iters': 5000, 'gamma':0.25})

print(f'without sparse column f1-score :{f1_score}, accuracy {accuracy}')


x_train = drop_highly_correlated_features(x_data)
x_train = clean_X_median(x_train)  # replace nan with zero add bias column
y_train = clean_Y(y_data)

accuracy, f1_score = k_fold_cross_validation(x_train, y_train, logistic_regression, k= 5,
                                                model_params={'initial_w': np.random.rand(x_train.shape[1]), 'max_iters': 5000, 'gamma':0.25})

print(f'without correlated column f1-score :{f1_score}, accuracy {accuracy}')


x_train = clean_X_median(x_data)  # replace nan with zero add bias column
y_train = clean_Y(y_data)

accuracy, f1_score = k_fold_cross_validation(x_train, y_train, logistic_regression, k= 5,
                                                model_params={'initial_w': np.random.rand(x_train.shape[1]), 'max_iters': 5000, 'gamma':0.25})

print(f'with sparse columns f1-score :{f1_score}, accuracy {accuracy}')

without sparse column f1-score :0.3776743101772099, accuracy 0.8688
without correlated column f1-score :0.37720079127937944, accuracy 0.8744
with sparse columns f1-score :0.38214614915496914, accuracy 0.8720000000000001


In [50]:
x_train = clean_X_median(normalize(x_data))  # replace nan with zero add bias column
y_train = clean_Y(y_data)

accuracy, f1_score = k_fold_cross_validation(x_train, y_train, logistic_regression, k= 5,
                                                model_params={'initial_w': np.random.rand(x_train.shape[1]), 'max_iters': 5000, 'gamma':0.25})

print(f'with sparse columns f1-score :{f1_score}, accuracy {accuracy}')


accuracy, f1_score = k_fold_cross_validation(x_train, y_train, reg_logistic_regression_batch, k= 5,
                                                model_params={'initial_w': np.random.rand(x_train.shape[1]),
                                                            'max_iters': 20000, 'gamma':0.25, 'lambda_': 0, 'batch_size' : 500}, threshold=0.5)


print(f'with sparse columns f1-score :{f1_score}, accuracy {accuracy}')

with sparse columns f1-score :0.3776482711082799, accuracy 0.8695
with sparse columns f1-score :0.29756588932865186, accuracy 0.8036999999999999


In [58]:
x_train = clean_X_mean(normalize(x_data))  
y_train = clean_Y(y_data)

initial_w = np.random.rand(x_train.shape[1])



best_param_lambda, best_param_gamma, best_param_threshold, best_batch_size = hyperparameter_tuning(x_train, y_train , 
                                            reg_logistic_regression, lambdas= [0, 1e-4, 1e-5], gammas=[0.15, 0.2, 0.25, 0.3, 0.35], 
                                            model_params={'initial_w': initial_w ,'max_iters': 5000})


best_param_lambda, best_param_gamma


  loss = -np.mean(y * np.log(sigmoids) + (1 - y) * np.log(1 - sigmoids))
  loss = -np.mean(y * np.log(sigmoids) + (1 - y) * np.log(1 - sigmoids))


 lambda= 0, gamma= 0.15, CV accuracy = 0.9110, f1_score = 0.2743
 lambda= 0.0001, gamma= 0.15, CV accuracy = 0.9130, f1_score = 0.2833
 lambda= 1e-05, gamma= 0.15, CV accuracy = 0.9134, f1_score = 0.2802


KeyboardInterrupt: 

In [135]:
# finding best threshold make it more likely to predict 

accuracies = []
f1_scores = []

x_train, x_test, y_train, y_test = split_data(x_data, y_data, 0.8)
x_train = (clean_X_mean(x_train))
y_train = clean_Y(y_train)
x_test = (clean_X_mean(x_test))
y_test = clean_Y(y_test)

initial_w  = np.random.rand(x_train.shape[1])

w, loss = reg_logistic_regression(y_train.reshape(-1, 1), x_train, initial_w=initial_w.reshape(-1, 1), lambda_=0, max_iters=5000, gamma=0.25) 


y_pred = predict_logistic(x_test, w, 0.1)

print (compute_f1(y_test, y_pred), np.mean(y_pred == y_test))
# accuracy = np.mean(y_pred == y_test)
# f1 = compute_f1(y_test, y_pred)
# accuracies.append(accuracy)
# f1_scores.append(f1)

# for t in np.arange(0.1, 0.6, 0.01):
#     y_pred = predict_logistic(x_test, w, t)
    
#     accuracy = np.mean(y_pred == y_test)
#     f1 = compute_f1(y_test, y_pred)
#     accuracies.append(accuracy)
#     f1_scores.append(f1)


0 0.917


In [123]:

for t in np.arange(0.5, 0.51, 0.01):
    y_pred = predict_logistic(x_train, w, t)
    
    accuracy = np.mean(y_pred == y_train)
    print(y_pred.sum(), y_train.sum(), len(y_train), accuracy)
    f1 = compute_f1(y_train, y_pred)
    accuracies.append(accuracy)
    f1_scores.append(f1)

3374 706.0 8000 0.65175
3160 706.0 8000 0.676


In [125]:
for t in np.arange(0.1, 0.6, 0.01):
    y_pred = predict_logistic(x_test, w, t)
    
    accuracy = np.mean(y_pred == y_test)
    f1 = compute_f1(y_test, y_pred)
    accuracies.append(accuracy)
    f1_scores.append(f1)

In [129]:
f1_score, accuracy

(0.29756588932865186, 0.917)

In [13]:
# load dataset for final training and prediction on test set

x_data = np.genfromtxt('data/dataset/x_trabin.csv', delimiter=",", skip_header=1)
y_data = np.genfromtxt('data/dataset/y_train.csv', delimiter=",", skip_header=1)
x_test = np.genfromtxt('data/dataset/x_test.csv', delimiter=",", skip_header=1)


x_test_clean = clean_X_median(x_test)
y_data = clean_Y(y_data)
x_data = clean_X_median(x_data)


initial_w = np.random.rand(x_data.shape[1])

# train with optimal hyperparameters
w, loss = reg_logistic_regression_batch(y_data.reshape(-1, 1), x_data,  initial_w.reshape(-1, 1), best_param_lambda , 10000, best_param_gamma, best_batch_size) 


# predict with optimal threshold
y_pred = (sigmoid(x_test_clean @ w) >= best_param_threshold).flatten() 
y_pred = np.where(y_pred, 1, -1)

create_csv_submission(x_test[:, 0], y_pred, 'prediction.csv')

  loss = -np.mean(y * np.log(sigmoids) + (1 - y) * np.log(1 - sigmoids))
  loss = -np.mean(y * np.log(sigmoids) + (1 - y) * np.log(1 - sigmoids))
  if len(losses) > 2 and np.abs(losses[-2] - losses[-1]) < 1e-3 and not half:
  elif len(losses) > 2 and np.abs(losses[-2] - losses[-1]) < 1e-5:


In [None]:
# other things to try 
# removing features/outliers and retraining with them

In [10]:
x_data = np.genfromtxt('data/dataset/x_train.csv', delimiter=",", skip_header=1)

In [20]:
def remove_rows_by_indices(matrix, indices_to_remove):
    # Create a mask to select rows that are not in the list of indices to remove
    mask = np.ones(matrix.shape[0], dtype=bool)
    mask[indices_to_remove] = False

    # Apply the mask to the matrix to remove specified rows
    cleaned_matrix = matrix[mask]

    return cleaned_matrix

def remove_outliers_from_matrix(x_data, max_deviations):
    # Calculate the mean and standard deviation for the entire matrix
    mean = np.mean(x_data, axis=0)
    #print("mean = ", mean)
    std = np.std(x_data, axis=0)
    #print("std = ",std)
    
    # Calculate the absolute deviation from the mean for the entire matrix
    distance_from_mean = []
    for i in range(x_data.shape[0]):
        distance_from_mean.append( np.abs(x_data[i] - mean))
    #print("distance = ", distance_from_mean)

    distance_from_mean = np.array(distance_from_mean)

    #print("max_deviations * std = ", max_deviations * std)

    # Create a mask to identify outliers based on the absolute deviation
    outlier_sample = distance_from_mean > max_deviations * std

    #print("outlier samples = \n", outlier_sample)
    count = np.zeros(x_data.shape[0])
    samples_to_drop = []
    for i in range(x_data.shape[0]):
        count[i] = np.count_nonzero(outlier_sample[i])
        if count[i] > 0.5 * x_data.shape[1]:
            samples_to_drop.append(i)
    #print("count = ", count)
    #print("to drop = ", samples_to_drop)

    x_data_cleaned = remove_rows_by_indices(x_data, samples_to_drop)
    

    return x_data_cleaned


# Example usage:
# Create a sample data matrix with outliers
data_matrix = np.array([[1, 1, 1],
                        [4, 5, 6],
                        [7, 80000, 900000],
                        [20000, 10000, 10000000],
                        [-20000, -10000, -10000000]])  # Outlier in each column 



# Remove outliers from the matrix using the function
cleaned_data_x = remove_outliers_from_matrix(x_data,0.05)

print("Original Data:")
print(x_data.shape)
print("\nCleaned Data:")
print(cleaned_data_x.shape)

Original Data:
(328135, 322)

Cleaned Data:
(328135, 322)
