In [None]:
import os
import csv
import numpy as np
import matplotlib.pyplot as plt
from helpers import create_csv_submission
from helpers import load_csv_data
from implementations import mean_squared_error_gd, mean_squared_error_sgd, least_squares, ridge_regression, logistic_regression, reg_logistic_regression, sigmoid

In [None]:
# Load the train/test feature matrices, the training labels and the row ids
# from the dataset folder, using the full data (no sub-sampling).
x_train, x_test, y_train, train_ids, test_ids = load_csv_data(
    "data/dataset",
    sub_sample=False,
)


<h3>Performing manual analysis</h3>

We start by looking into the data and removing data corresponding to identifiers and administrative codes and those corresponding to dates and times.
Moreover, we want to drop the features when the percentage of missing values is higher than an optimal percentage (check the references in the report and explanations).

Next, we look at the data and check how many features remain for a given maximum percentage of missing values. To do so, we plot the number of remaining features as a function of the chosen threshold.

<h5>x-axis = thresholds (% of missingness allowed).</h5>
<h5>y-axis = number of features with at most that much missing data.</h5>
The bar plot shows how strict or lenient we can be with missingness.
<h5>Left side (low threshold): shows how many features are almost complete.</h5>
<h5>Right side (higher threshold): shows how many features remain if you allow more missing data.</h5>
<h5>We can use it to decide: “If I drop all features with more than 20% missing values, how many features will I keep?”</h5>

In [None]:
CSV_PATH = "data/dataset/x_train.csv"

# Answer codes counted as missing in every column.
# NOTE(review): presumably the survey's "don't know" / "refused" / "none" codes —
# confirm against the codebook, since a column may use one of them as a real value.
MISSING_CODES = (7, 77, 777, 9, 99, 999, 88)

# Features with a larger share of missing values than this are dropped in step 4.
MAX_MISSING_PCT = 40

# 1) read the header row (and only the header row)
#    - feature_names: every column name, aligned one-to-one with the columns of X
#    - headers: the same names without the first column (presumably the `Id` column)
with open(CSV_PATH, 'r', newline='') as file:
    reader = csv.reader(file)
    feature_names = next(reader)
headers = feature_names[1:]

# 2) load numeric data (skip header row); empty cells are filled with NaN
X = np.genfromtxt(
    CSV_PATH,
    delimiter=",",
    skip_header=1,
    dtype=float,
    filling_values=np.nan
)
assert X.shape[1] == len(feature_names), "header row and data have different column counts"

# The sentinel codes are masked here rather than through genfromtxt's
# `missing_values`: a tuple passed there is matched one entry per column,
# in order, instead of being applied to every column.
X[np.isin(X, MISSING_CODES)] = np.nan

# 3) compute % missing per column and count the features that survive each threshold
missing_pct = np.isnan(X).mean(axis=0) * 100
thresholds = np.arange(0, 101, 5)
feature_counts = [(missing_pct <= threshold).sum() for threshold in thresholds]

plt.figure(figsize=(10, 6))
plt.bar(thresholds, feature_counts, color='b', width=4)
plt.xlabel('Threshold % of missing data')
plt.ylabel('Number of features with at most x% missing data')
plt.title('Number of features with at most x% missing data')
plt.xticks(thresholds)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.savefig("plot.png", dpi=300)
plt.show()

# 4) keep features with ≤40% missing
mask40 = missing_pct <= MAX_MISSING_PCT
X = X[:, mask40]
feature_names = [name for name, keep in zip(feature_names, mask40) if keep]

# 5) remove specific administrative/date/ID columns
remove_cols = ['Id', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR',
               'DISPCODE', 'SEQNO', '_PSU', 'QSTVER', 'MSCODE', '_STSTR']
remove_idx = [i for i, name in enumerate(feature_names) if name in remove_cols]
removed_positions = set(remove_idx)  # O(1) lookups while filtering the names

X_reduced = np.delete(X, remove_idx, axis=1)
feature_names_reduced = [name for i, name in enumerate(feature_names)
                         if i not in removed_positions]

# 6) check results
print("Final shape:", X_reduced.shape)
print("Remaining features:", len(feature_names_reduced))
print("kept features:", feature_names_reduced)


### We have decided to keep features with missing data up to 40% after plot analysis and some article readings.

#### 1. Demographics (baseline predictors)
- `SEX`
- `AGEG5YR`, `_AGE65YR` (senior flag), `_AGE80` (80+)
- `INCOME2`, `_INCOMG`
- `MARITAL`
- `EMPLOY1`
- `RENTHOM1`
- `CHILDREN`, `_CHLDCNT`
- `_RACE`, `_PRACE1`, `_HISPANC` (broad + detailed race/ethnicity)

#### 2. General health & healthcare access
- `GENHLTH`
- `PHYSHLTH`
- `MENTHLTH`
- `HLTHPLN1` (health coverage)
- `PERSDOC2` (personal doctor)
- `MEDCOST` (cost barrier)
- `CHECKUP1` (routine checkup)

#### 3. Chronic conditions
- `BPHIGH4` (high blood pressure)
- `BLOODCHO`, `CHOLCHK`, `TOLDHI2` (cholesterol)
- `CVDSTRK3` (stroke)
- `ASTHMA3`
- `CHCSCNCR`, `CHCOCNCR` (cancers)
- `CHCCOPD1` (COPD)
- `HAVARTH3` (arthritis)
- `ADDEPEV2` (depression)
- `CHCKIDNY` (kidney disease)
- `DIABETE3` (diabetes)

#### 4. Behaviors – smoking, alcohol, diet, exercise, prevention
- **Smoking:** `SMOKE100`, `USENOW3`, `_SMOKER3`, `_RFSMOK3`
- **Alcohol:** `ALCDAY5`, `DRNKANY5`, `_RFBING5`, `_RFDRHV5`
- **Diet:** `FRUIT1`, `FVGREEN`, `FVORANG`, `VEGETAB1`, `FRUITJU1`, `FVBEANS`, `_FRUTSUM`, `_VEGESUM`, `_FRTLT1`, `_VEGLT1`
- **Physical activity:** `EXERANY2`, `_TOTINDA`, `STRENGTH`, `_PA150R2`, `_PA300R2`, `_PASTRNG`
- **Safety & prevention:** `SEATBELT`, `FLUSHOT6`, `PNEUVAC3`, `HIVTST6`

#### 5. Anthropometrics
- `WEIGHT2`, `HEIGHT3`, `WTKG3`
- `_BMI5`, `_BMI5CAT`, `_RFBMI5`

#### 6. Psychosocial / functional health
- `QLACTLM2` (activity limitation)
- `USEEQUIP` (equipment use for disability)
- `BLIND`
- `DECIDE` (cognitive difficulties)
- `DIFFWALK`, `DIFFDRES`, `DIFFALON`


In [None]:
# Hand-picked feature names, grouped by theme. The per-theme lists are
# concatenated in order, so the flat list keeps one fixed ordering.
kept_features = (
    # 1. Demographics
    [
        "SEX", "AGEG5YR", "_AGE65YR", "_AGE80",
        "INCOME2", "_INCOMG",
        "MARITAL", "EMPLOY1", "RENTHOM1",
        "CHILDREN", "_CHLDCNT",
        "_RACE", "_PRACE1", "_HISPANC",
    ]
    # 2. General health & healthcare access
    + [
        "GENHLTH", "PHYSHLTH", "MENTHLTH",
        "HLTHPLN1", "PERSDOC2", "MEDCOST", "CHECKUP1",
    ]
    # 3. Chronic conditions
    + [
        "BPHIGH4",
        "BLOODCHO", "CHOLCHK", "TOLDHI2",
        "CVDSTRK3", "ASTHMA3",
        "CHCSCNCR", "CHCOCNCR",
        "CHCCOPD1", "HAVARTH3", "ADDEPEV2",
        "CHCKIDNY", "DIABETE3",
    ]
    # 4. Behaviors – smoking, alcohol, diet, exercise, prevention
    + ["SMOKE100", "USENOW3", "_SMOKER3", "_RFSMOK3"]
    + ["ALCDAY5", "DRNKANY5", "_RFBING5", "_RFDRHV5"]
    + [
        "FRUIT1", "FVGREEN", "FVORANG", "VEGETAB1", "FRUITJU1", "FVBEANS",
        "_FRUTSUM", "_VEGESUM", "_FRTLT1", "_VEGLT1",
    ]
    + ["EXERANY2", "_TOTINDA", "STRENGTH", "_PA150R2", "_PA300R2", "_PASTRNG"]
    + ["SEATBELT", "FLUSHOT6", "PNEUVAC3", "HIVTST6"]
    # 5. Anthropometrics
    + ["WEIGHT2", "HEIGHT3", "WTKG3", "_BMI5", "_BMI5CAT", "_RFBMI5"]
    # 6. Psychosocial / functional health
    + ["QLACTLM2", "USEEQUIP", "BLIND", "DECIDE", "DIFFWALK", "DIFFDRES", "DIFFALON"]
)

# Size of the selection, reported as a quick sanity check.
count = len(kept_features)
print("Number of features kept:", count)


<h3>Now we perform the first data preprocessing step</h3>

In [None]:
# Set copy of the selection so each membership test is O(1) instead of a list scan.
kept_lookup = set(kept_features)

# Boolean mask over `headers`: True where the column is one of the kept features.
x = [name in kept_lookup for name in headers] #headers filtering

# Names of the kept columns, in the order in which they appear in `headers`.
new_headers = [name for name in headers if name in kept_lookup]

# Feature name -> column index in the reduced dataset, used by the preprocessing cells.
mapping = {name: position for position, name in enumerate(new_headers)}

# A name that is listed in `kept_features` but absent from `headers` (typo, different
# spelling in the CSV) would otherwise be dropped without any notice.
available_headers = set(headers)
missing_features = [name for name in kept_features if name not in available_headers]
if missing_features:
    print("WARNING: kept_features not found in headers:", missing_features)

In [None]:
# Sanity check before filtering: the mask `x` has one entry per header and is
# used to index the columns of x_train / x_test, while `new_headers` holds only
# the names for which the mask is True.
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("mask length (x):", len(x))
print("selected columns:", len(new_headers))

In [None]:
#reduction of number of columns from 321 to 75
# The same boolean column mask is applied to both splits so that train and test
# end up with identical columns in identical order.
column_mask = np.asarray(x, dtype=bool)
x_train_filtered, x_test_filtered = (
    split[:, column_mask] for split in (x_train, x_test)
)

# Both filtered sets side by side, for the purpose of preprocessing them in the same way.
x_list = [x_train_filtered, x_test_filtered]

DATA CLEANING:
- separate numerical features from categorical features
- univariate outlier detection for a rough cleaning
- multivariate outlier detection for the final cleaning


In [None]:
#checking which variables, if any, are numerical
# A feature is considered continuous/numerical when it takes more than this many
# different values; otherwise it is treated as categorical.
MAX_CATEGORICAL_LEVELS = 20

numerical_headers = []
for name in new_headers:
    column = x_train_filtered[:, mapping[name]]
    # Missing entries are not values: drop NaNs before counting. np.unique would
    # otherwise count NaN as a level (and, depending on the NumPy version, may
    # count every NaN separately), pushing sparse categorical columns over the limit.
    n_distinct = np.unique(column[~np.isnan(column)]).size
    if n_distinct > MAX_CATEGORICAL_LEVELS:
        numerical_headers.append(name)

print(len(numerical_headers))
print(numerical_headers)


For each numerical variable we check if there are some elements in the sample that are too distant from the others

In [None]:
#visualization --- > one scatter plot per numerical feature, written to "<feature>.png"
# in the current working directory (plots were already saved, don't run again)
num_numerical = len(numerical_headers)

# Row positions used as the x-axis; identical for every feature, so built once.
sample_index = np.arange(x_train_filtered.shape[0])

for header in numerical_headers:
    fig = plt.figure(figsize=(10, 6))
    plt.scatter(
        sample_index,
        x_train_filtered[:, mapping[header]],
        color="blue",
        marker="o",
        label=header
    )
    plt.title(header)
    plt.legend()
    plt.tight_layout()

    plt.savefig(f"{header}.png")

    #plt.show()

    # Release the figure once it is on disk so that figures do not pile up in
    # memory over the loop. To inspect a plot inline, re-enable plt.show() above.
    plt.close(fig)



In [None]:
#replacing nans with the median value of the same feature
# Works column by column, directly on x_train_filtered (in place): the median is
# computed while ignoring NaNs, then written into the NaN positions of that column.
for name in new_headers:
    j = mapping[name]
    missing_rows = np.isnan(x_train_filtered[:, j])
    x_train_filtered[missing_rows, j] = np.nanmedian(x_train_filtered[:, j])

We apply the IQR algorithm for univariate outlier detection, as a rough first cleaning of the data.

In [None]:
import outliers

# Run the univariate detector from the local `outliers` module on every numerical
# feature. The two numeric arguments are passed through unchanged.
# NOTE(review): 0.01 / 0.99 look like lower/upper quantile bounds — confirm in outliers.iqr.
per_feature_outliers = [
    outliers.iqr(x_train_filtered[:, mapping[name]], 0.01, 0.99)
    for name in numerical_headers
]

# Merge the per-feature results into one sorted array without duplicates.
# When nothing was flagged, np.unique([]) would return a float array, which cannot
# be used as an index later on, so an empty integer array is built explicitly.
flagged = [idx for rows in per_feature_outliers for idx in rows]
outliers_indexes = np.unique(flagged) if flagged else np.empty(0, dtype=int) #removing doubles

print(outliers_indexes)
print(outliers_indexes.shape)
print(x_train.shape[0])

We then apply the FAST-MCD algorithm for multivariate outliers analysis

In [None]:
import sys, importlib

# `outliers` is imported by name only: no machine-specific absolute path is pushed
# onto sys.path, so the notebook runs unchanged wherever the module is importable.
import outliers
importlib.reload(outliers)  # re-read outliers.py in case it was edited after the first import

# Sub-matrix holding only the numerical features.
indices = [mapping[h] for h in numerical_headers]
X_num = x_train_filtered[:, indices]

# 75% of the rows actually passed to the estimator. Taken from X_num itself so the
# value stays consistent with the data even if rows are removed beforehand.
h_val = int(0.75 * X_num.shape[0])

# NOTE(review): the meaning of the trailing positional arguments (10, 1000, 1e-6, 1)
# is defined in outliers.fast_mcd and is not visible here — confirm there.
best_mu, best_Sigma, best_d = outliers.fast_mcd(X_num, h_val, 10, 1000, 1e-6, 1)

print(best_mu)
print(best_Sigma)
print(best_d)