**installing and importing useful libraries**

In [1]:
# Standard libraries
import os
import sys
import math
import random
import datetime

# Numerical computing
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("seaborn-v0_8")
sns.set_theme()

#importing optimization techniques
from implementations import *
from cross_validation import *
from helpers import *

**importing data**

In [2]:
#IF WE USE OUTLIERS DETECTION WE SHOULD ACHANGE Y TRAIN


# Paths to the CSV files ---- > change with yours
datapath_1 = "/Users/neilabenlamri/PycharmProjects/project-1-girl_power/data/dataset/pca_train_projection.csv"
datapath_2 = "/Users/neilabenlamri/PycharmProjects/project-1-girl_power/data/dataset/y_train.csv"


# Load the data
x_train = np.loadtxt(datapath_1, delimiter=',', dtype=float, skiprows=1)
y_train = np.loadtxt(datapath_2, delimiter=',' , dtype=float, skiprows=1, usecols=1)

# Convert y_train from {-1, 1} to {0, 1}
y_train = np.where(y_train == -1, 0, y_train)


print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)




x_train shape: (328135, 60)
y_train shape: (328135,)


### Splitting the data between 20 % validation set and 80 % training set

In [3]:
def train_val_split(X, y, val_ratio=0.20, seed=42):
    X = np.asarray(X); y = np.asarray(y)
    n = X.shape[0]
    rng = np.random.default_rng(seed)
    idx = np.arange(n)
    rng.shuffle(idx)

    n_val = int(np.round(n * val_ratio))
    val_idx = idx[:n_val]
    train_idx = idx[n_val:]

    return X[train_idx], X[val_idx], y[train_idx], y[val_idx], train_idx, val_idx



x_train_split, x_val_split, y_train_split, y_val_split, tr_idx, va_idx = train_val_split(x_train, y_train, val_ratio=0.20, seed=42)
print(x_train_split.shape)
print(x_val_split.shape)


(262508, 60)
(65627, 60)


In [4]:
# OVERSAMPLING AND BALANCING DATA
def make_balanced_subset(x_train_filtered, y_train, majority_class=-1, minority_class=1,
                         seed_major=0, seed_minor=42, seed_shuffle=7):
    # Boolean masks
    maj_mask = (y_train == majority_class)
    min_mask = (y_train == minority_class)

    # Indices per class
    maj_idx = np.nonzero(maj_mask)[0]
    min_idx = np.nonzero(min_mask)[0]

    # Target size = size of minority (undersample majority)
    n = len(min_idx)

    # Sample without replacement
    rs_maj = np.random.RandomState(seed_major)
    rs_min = np.random.RandomState(seed_minor)
    sampled_maj = rs_maj.choice(maj_idx, size=n, replace=False)
    sampled_min = rs_min.choice(min_idx, size=n, replace=False)

    # Combine and shuffle
    balanced_idx = np.concatenate([sampled_maj, sampled_min])
    rs_shuf = np.random.RandomState(seed_shuffle)
    rs_shuf.shuffle(balanced_idx)

    # Slice arrays
    x_bal = x_train_filtered[balanced_idx]
    y_bal = y_train[balanced_idx]
    return x_bal, y_bal, balanced_idx


In [None]:
# NORMALIZE DATA --- > not needed with pca

### Hyperparameters definition and metrics

In [5]:
#lambdas = [ 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1]  # regularization parameters list
#gammas = [1e-4, 1e-3, 1e-2, 1e-1, 1] # step-size parameters list
#max_iters = [100, 1000, 10000] # max iters list

lambdas = [ 1e-4, 1e-3, 1e-2, 1e-1]  # regularization parameters list
gammas = [1e-4, 1e-3] # step-size parameters list
max_iters = [100] # max iters list


def compute_auc(y_true, y_scores):
    """
    AUC calculation using Mann-Whitney statistics
    Inputs : 
            - y_true : numpy array containing the real {0, 1} values of the dataset
            - y_scores : numpy array containing our predictions
    Output : 
            AUC Area under the ROC curve 
    """
    order = np.argsort(y_scores)
    y_true_sorted = y_true[order]

    n_pos = np.sum(y_true)
    n_neg = len(y_true) - n_pos

    # rank positions 
    rank_positions = np.arange(1, len(y_true_sorted) + 1)
    rank_sum = np.sum(rank_positions[y_true_sorted == 1])

    # AUC using Mann–Whitney
    auc = (rank_sum - n_pos*(n_pos+1)/2) / (n_pos * n_neg)
    return auc

def compute_accuracy(y_true, y_scores) : 
    """
    Accuracy computation
    Inputs : 
            - y_true : numpy array containing the real {0, 1} values of the dataset
            - y_scores : numpy array containing our predictions
    Output : 
            Accuracy = correct predictions / total predictions %

    """
    correct_pred = (y_true == y_scores)
    accuracy = np.mean(correct_pred) * 100
    return accuracy


### K-fold cross validation functions

In [6]:

def build_k_indices(N, k_fold, seed=21): 
    """build k indices for k-fold.

    Args:
        N:      num of samples
        k_fold: K in K-fold, i.e. the fold num
        seed:   the random seed

    Returns:
        A 2D array of shape=(k_fold, N/k_fold) that indicates the data indices for each fold

    """
    num_row = N
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval : (k + 1) * interval] for k in range(k_fold)]
    return np.array(k_indices)
     

def k_fold_cross_validation(y_train, x_train, lambdas, gammas, max_iters, k_fold, methods, seed, oversampling = True) :
    # dictionary to contain the best method with the best parameters and its metrics
    best_overall = {"method": "", "lambda_": 0, "gamma": 0, "max_iter": 0, "train loss": 0, "test loss": 0, "AUC": 0, "accuracy": 0, "y_pred" : None, "w_opt": None}
    results =[] # to keep the best results per method
    #creating k folders on the train set 
    k_indices = build_k_indices(len(y_train), k_fold, seed)

    for method in methods : #scrolling methods
        best_per_method =  {"method": "", "lambda_": 0, "gamma": 0, "max_iter": 0, "train loss": 0, "test loss": 0, "AUC": 0, "accuracy": 0, "y_pred" : None, "w_opt": None} 
        for lam in lambdas : # scrolling lambdas
            for gam in gammas : #scrolling gammas
                for max_it in max_iters : #scrolling iters ----> model defined at this point
                    logistic_loss_tr = []
                    logistic_loss_te =[]
                    AUC=  []
                    accuracies = []
                    
                    for k in range(k_fold) : 
                        #k-th subgroup in test, others in train
                        test_mask = np.isin(np.arange(len(y_train)), k_indices[k, :])
                        y_test_k = y_train[test_mask]
                        x_test_k = x_train[test_mask]
    
                        y_train_k=y_train[~test_mask]
                        x_train_k=x_train[~test_mask]
                        
                        if oversampling :
                            #oversampling dataset
                            x_train_k , y_train_k, _ = make_balanced_subset(x_train_k, y_train_k, 0, 1, seed_major=0, seed_minor=42, seed_shuffle=7)




                        #train the model
                        if method == "reg_logistic_regression" :
                            w_opt, loss = reg_logistic_regression(y_train_k, x_train_k,lam, np.zeros(x_train_k.shape[1]), max_it, gam)
                        elif method == "least_squares" :
                            w_opt, loss = least_squares(y_train_k, x_train_k)
                        elif method == "adam_reg_logistic_regression":
                            w_opt, loss = reg_logistic_regression_adam(y_train_k, x_train_k, lam, np.zeros(x_train_k.shape[1]), max_it, 0.9, 0.999, gam, 700 )
                        elif method == "ridge_regression" :
                            w_opt, loss = ridge_regression(y_train_k, x_train_k, lam)

                        #computing metrics 
                        logistic_loss_tr.append(compute_logistic_loss(y_train_k, x_train_k, w_opt)) #without penalizing term
                        logistic_loss_te.append(compute_logistic_loss(y_test_k, x_test_k, w_opt)) #without penalizing term CAPIRE CHE SENSO HA COMPARARE STE LOSS

                        if method in ["adam_reg_logistic_regression", "reg_logistic_regression"]:
                            pred = sigmoid(x_test_k @ w_opt) 
                        else:
                            pred = x_test_k @ w_opt

                        AUC.append(compute_auc(y_test_k, pred))
                        accuracies.append(compute_accuracy(y_test_k, (pred >= 0.5).astype(int)))
                    # updating    
                    if np.mean(AUC) > best_per_method["AUC"]:
                        best_per_method.update({"method": method, "gamma": gam, "lambda_": lam, "max_iter": max_it, "train loss": np.mean(logistic_loss_tr), "test loss": np.mean(logistic_loss_te), "AUC": np.mean(AUC), "accuracy": np.mean(accuracies), "y_pred" : (pred >= 0.5).astype(int) , "w_opt": w_opt})

                    if np.mean(AUC) > best_overall["AUC"]:
                        best_overall.update({"method": method, "gamma": gam, "lambda_": lam, "max_iter": max_it, "train loss": np.mean(logistic_loss_tr), "test loss": np.mean(logistic_loss_te), "AUC": np.mean(AUC), "accuracy": np.mean(accuracies), "y_pred" : (pred >= 0.5).astype(int) , "w_opt": w_opt})

        results.append(best_per_method)
        print(f"For method {method}, best λ={best_per_method['lambda_']}, γ={best_per_method['gamma']}, max_iter={best_per_method['max_iter']}")


    
    print(f"The best method is {best_overall['method']}, with best λ={best_overall['lambda_']}, γ={best_overall['gamma']}, max_iter={best_overall['max_iter']}")



    return best_overall, results 

        
methods=["reg_logistic_regression", "least_squares", "adam_reg_logistic_regression", "ridge_regression"]
best_method, results = k_fold_cross_validation(y_train_split, x_train_split, lambdas, gammas, max_iters, 5, methods, seed = 21, oversampling=False)






For method reg_logistic_regression, best λ=0.0001, γ=0.001, max_iter=100
For method least_squares, best λ=0.0001, γ=0.0001, max_iter=100
For method adam_reg_logistic_regression, best λ=0.1, γ=0.001, max_iter=100
For method ridge_regression, best λ=0.1, γ=0.0001, max_iter=100
The best method is ridge_regression, with best λ=0.1, γ=0.0001, max_iter=100


In [7]:
print(best_method) 
print(results)

{'method': 'ridge_regression', 'lambda_': 0.1, 'gamma': 0.0001, 'max_iter': 100, 'train loss': 0.6822609623977103, 'test loss': 0.6822688720494428, 'AUC': 0.8495749109319746, 'accuracy': 91.17197767661568, 'y_pred': array([0, 0, 0, ..., 0, 0, 0]), 'w_opt': array([ 3.41701858e-02, -8.57237163e-03,  1.56383468e-03, -5.50011864e-04,
        4.29371982e-03,  1.08214384e-02, -9.40474552e-03, -1.09135964e-02,
       -1.87978022e-02, -3.49819476e-04,  8.92197078e-03, -5.72526585e-03,
        2.38404292e-03,  1.74682936e-02, -5.14353827e-03, -1.46119664e-02,
       -5.69444551e-03,  6.45979725e-03,  6.39013233e-03,  6.24638904e-03,
        1.32962532e-02, -8.34851992e-04, -4.50626188e-04, -4.90655481e-03,
        3.90526624e-03, -7.52640965e-03,  5.38449923e-03, -8.85165204e-03,
       -5.06578449e-03,  3.56742992e-03, -1.88480608e-03, -6.85890782e-04,
        5.66216435e-03,  1.86427124e-03, -1.12577010e-03, -4.83110887e-03,
        4.23838711e-03,  1.13252708e-03, -1.02152826e-03,  8.6668711

### Validation on our validation sample

In [8]:
def validation(w_opt, x_val, y_val): 
    w_opt = best_method["w_opt"]
    if best_method["method"] in ["reg_logistic_regression", "adam_reg_logistic_regression"] :
        predictions = sigmoid(x_val @ w_opt) 
    else:
        predictions = x_val @ w_opt
    AUC = compute_auc(y_val, predictions)
    accuracy = compute_accuracy(y_val, (predictions>=0.5).astype(int))

    return AUC, accuracy


AUC, accuracy = validation(best_method["w_opt"], x_val_split, y_val_split) 

print(f"the best method {best_method['method']} has an accuracy = {accuracy} and an AUC = {AUC} on our validation set")

the best method ridge_regression has an accuracy = 91.27950386273942 and an AUC = 0.847318401354341 on our validation set


In [9]:


cross_validation_visualization(param_grid, logistic_loss_tr, logistic_loss_te)

num_par = len(param_grid)
w = 0.3 # bar width
pos = np.arange(num_par)
plt.bar(pos - w, AUC, width = w, label='AUC' )
plt.bar(pos, accuracies, width=w, label= 'Accuracy')
plt.bar(pos + w, logistic_loss_tr, width = w, label = 'train logistic loss' )
plt.bar(pos + w, logistic_loss_te, width=w, label= "test logistic loss" )

plt.xticks(pos, param_grid)
plt.xlabel('Different regularization hyperparameter values')
plt.title('Finding the best regularization hyperparamter - ADAM case')
plt.legend()

plt.show()

NameError: name 'param_grid' is not defined

### Test the model and generate the predictions

In [15]:
import csv
import numpy as np
from helpers import load_csv_data, create_csv_submission
from implementations import (
    reg_logistic_regression,
    reg_logistic_regression_adam,
    least_squares,
    ridge_regression,
)
# If sigmoid not imported already:
def sigmoid(z): return 1 / (1 + np.exp(-z))

# ====== CONFIG: paths used in dataprocessing.ipynb ======
DATA_DIR = "/Users/neilabenlamri/PycharmProjects/project-1-girl_power/data"
IMPUTE_NPZ = f"{DATA_DIR}/dataset/impute_vals.npz"
SCALER_NPZ = f"{DATA_DIR}/dataset/scaler_zscore.npz"
PCA_NPZ    = f"{DATA_DIR}/dataset/pca_model.npz"

XTEST_CSV  = f"{DATA_DIR}/dataset/x_test.csv"
XTRAIN_PCA = f"{DATA_DIR}/dataset/pca_train_projection.csv"   # already used in run.ipynb
YTRAIN_CSV = f"{DATA_DIR}/dataset/y_train.csv"

# ====== 1) Utilities to load artifacts & apply same preprocessing to TEST ======

def load_feature_names_from_imputer(npz_path):
    d = np.load(npz_path, allow_pickle=True)
    # saved by `clean_then_impute(..., save_path)`: contains 'fillers' and 'feature_names'
    feature_names = [str(x) for x in d["feature_names"]]
    fillers = d["fillers"].astype(float)
    return feature_names, fillers

def load_headers(csv_path):
    with open(csv_path, "r") as f:
        reader = csv.reader(f)
        header = next(reader)
    return header  # full header (incl. Id)

def load_matrix(csv_path):
    # loads all numeric cols except header
    return np.genfromtxt(csv_path, delimiter=",", skip_header=1)

def extract_by_names(X_full, full_header, wanted_names):
    # Map name -> index, then take columns in that exact order
    idx_map = {name: i for i, name in enumerate(full_header)}
    col_idx = [idx_map[name] for name in wanted_names]
    return X_full[:, col_idx]

def apply_imputation(X, fillers):
    X = X.copy()
    # assume NaN are present where needed; use learned fillers (mode/median) from TRAIN
    for j in range(X.shape[1]):
        m = np.isnan(X[:, j])
        if np.any(m):
            X[m, j] = fillers[j]
    return X

def load_scaler(npz_path):
    d = np.load(npz_path, allow_pickle=True)
    return {"mean": d["mean"].astype(float), "std": d["std"].astype(float)}

def transform_standardizer(matrix, scaler):
    mean, std = scaler["mean"], scaler["std"]
    return (matrix - mean) / std

def pca_transform_npz(standardized_data, pca_npz_path):
    X = np.asarray(standardized_data, dtype=float)
    d = np.load(pca_npz_path, allow_pickle=True)
    components = d["components"]    # (p x k)
    n_features = int(d["n_features"][0])
    assert X.shape[1] == n_features, f"Expected {n_features} features, got {X.shape[1]}"
    return X @ components

# ====== 2) Build Z_test by EXACTLY mirroring train preprocessing ======

def build_Z_test_from_raw():
    # a) load full test (with Id col in col 0)
    X_full = load_matrix(XTEST_CSV)           # shape (n_test, 1 + p_full)
    ids_test = X_full[:, 0].astype(int)
    X_full = X_full[:, 1:]                    # drop Id

    # b) select the same feature subset + order as TRAIN (from imputer npz)
    full_header = load_headers(XTEST_CSV)[1:] # drop 'Id' to align with X_full
    feat_names, fillers = load_feature_names_from_imputer(IMPUTE_NPZ)
    X_sel = extract_by_names(X_full, full_header, feat_names)

    # c) impute with TRAIN fillers
    X_imp = apply_imputation(X_sel, fillers)

    # d) standardize with TRAIN scaler
    scaler = load_scaler(SCALER_NPZ)
    X_std = transform_standardizer(X_imp, scaler)

    # e) PCA project with TRAIN PCA
    Z_test = pca_transform_npz(X_std, PCA_NPZ)
    return ids_test, Z_test

# ====== 3) Refit BEST model on FULL TRAIN (your CV said ridge_regression best) ======

def refit_best_on_full(best_method):
    method = best_method["method"]
    lam    = best_method["lambda_"]
    gam    = best_method["gamma"]
    max_it = best_method["max_iter"]

    # We refit on the PCA train we already loaded/used in run.ipynb:
    Z_train = np.loadtxt(XTRAIN_PCA, delimiter=",", dtype=float, skiprows=1)
    y_train = np.loadtxt(YTRAIN_CSV, delimiter=",", dtype=float, skiprows=1, usecols=1)
    # Our run.ipynb converts {-1,1} -> {0,1} for CV; we keep same target encoding if needed
    # If we kept y in {0,1} during CV, keep it consistent here:
    y_tr = np.where(y_train == -1, 0, y_train)  # match CV

    if method == "reg_logistic_regression":
        w_opt, _ = reg_logistic_regression(y_tr, Z_train, lam, np.zeros(Z_train.shape[1]), max_it, gam)
    elif method == "adam_reg_logistic_regression":
        w_opt, _ = reg_logistic_regression_adam(y_tr, Z_train, lam, np.zeros(Z_train.shape[1]), max_it, 0.9, 0.999, gam, 700)
    elif method == "least_squares":
        w_opt, _ = least_squares(y_tr, Z_train)
    elif method == "ridge_regression":
        w_opt, _ = ridge_regression(y_tr, Z_train, lam)
    else:
        raise ValueError(f"Unknown method: {method}")

    return w_opt, method

# ====== 4) Predict on TEST and write AIcrowd CSV ======

def predict_labels(X, w, method):
    # Match your CV decision rules:
    if method in ["reg_logistic_regression", "adam_reg_logistic_regression"]:
        scores = sigmoid(X @ w)
        y01 = (scores >= 0.5).astype(int)
    else:  # least_squares, ridge_regression
        scores = X @ w
        y01 = (scores >= 0.5).astype(int)
    # AIcrowd expects {-1, 1}
    y = y01.copy()
    y[y == 0] = -1
    return y

def make_submission(best_method, out_path="submission.csv"):
    # Build Z_test using the saved artifacts from dataprocessing.ipynb
    test_ids, Z_test = build_Z_test_from_raw()

    # Refit best model on FULL TRAIN (Z_train)
    w_best, method = refit_best_on_full(best_method)

    # Predict and write CSV
    y_pred = predict_labels(Z_test, w_best, method)
    create_csv_submission(test_ids, y_pred, out_path)
    print(f"[OK] Saved {out_path} with {len(y_pred)} rows — method={method}, λ={best_method['lambda_']}")

# === Run it ===
make_submission(best_method, out_path="submission.csv")


FileNotFoundError: [Errno 2] No such file or directory: '/Users/neilabenlamri/PycharmProjects/project-1-girl_power/data/dataset/scaler_zscore.npz'