In [19]:
import pandas as pd
import numpy as np
import joblib
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.model_selection import cross_val_predict

# another way of ignoring warnings
import warnings
warnings.filterwarnings('ignore')

In [31]:
### Importing data
Xnew = pd.read_csv("case1Data_Xnew.csv")
# splitting to continuous and categorical variables
Xcont = Xnew.iloc[:, :95]
Xcat = Xnew.iloc[:, 95:]
Xnew.head(10)

Unnamed: 0,x_01,x_02,x_03,x_04,x_05,x_06,x_07,x_08,x_09,x_10,...,x_91,x_92,x_93,x_94,x_95,C_01,C_02,C_03,C_04,C_05
0,-0.843969,-9.104918,-5.076919,-4.222152,3.606609,-4.505494,-11.481997,16.201722,15.93947,,...,-13.884702,7.465161,-4.667464,3.949705,-10.715577,73.0,72.0,73.0,75.0,73.0
1,0.802093,-10.196678,-4.50037,-7.827837,5.199002,,-15.928708,20.151309,13.707194,-8.517576,...,-14.937164,5.229448,-6.92797,3.271193,-12.420893,73.0,72.0,73.0,71.0,75.0
2,4.234883,-10.798261,-0.465914,-6.05485,,,-16.182312,16.419564,12.152861,-6.418069,...,-11.058964,4.692879,-0.929818,,-14.551448,,72.0,73.0,71.0,73.0
3,7.041336,-5.169413,-4.158334,-4.270638,14.939894,0.008338,-10.556799,,14.18083,,...,,7.460901,-2.484389,8.149697,-11.598544,72.0,72.0,74.0,75.0,75.0
4,1.135564,-12.048088,-4.828939,-6.565217,7.4931,-2.789944,-15.859234,21.560086,14.147759,-4.848519,...,-9.607803,5.654679,-3.020357,3.030958,-13.320599,72.0,72.0,71.0,74.0,73.0
5,2.70094,-10.946685,-6.465114,-4.881989,4.638099,-6.987986,,21.61587,19.024904,-2.53236,...,-8.42064,9.082611,-6.250724,6.314635,-9.204663,74.0,72.0,,71.0,75.0
6,5.390682,,-4.149893,-2.791737,8.812586,-3.873233,-17.031808,,11.510675,-7.52057,...,-14.226802,9.110132,-4.711318,3.122558,-10.525525,74.0,72.0,,71.0,71.0
7,2.455633,-6.247203,-1.761414,,12.467152,-1.965168,-15.475678,,,4.462509,...,-14.501091,9.869636,-7.113231,2.301097,-9.705442,71.0,72.0,72.0,,74.0
8,-4.55122,-10.161342,-5.896081,-8.75356,7.2466,-3.280027,-11.823492,22.51582,,-8.37778,...,-14.616431,5.733823,-4.996681,2.051505,-9.497427,75.0,72.0,74.0,72.0,71.0
9,,-12.226706,-5.68976,-7.563683,8.756352,-6.947685,-13.352728,18.347187,12.665331,-7.265669,...,-14.291779,4.274889,-4.248666,2.483933,-13.065352,74.0,,74.0,74.0,75.0


In [32]:
Xtrain = pd.read_csv("Xytrain_regression.csv").iloc[:, 1:]
ytrain = pd.read_csv("Xytrain_regression.csv").iloc[:, 0]
Xtest = pd.read_csv("Xytest_regression.csv").iloc[:, 1:]
ytest = pd.read_csv("Xytest_regression.csv").iloc[:, 0]

XtrainALL = pd.concat([Xtrain, Xtest], axis=0).reset_index(drop=True)
ytrainALL = pd.concat([ytrain, ytest], axis=0).reset_index(drop=True)

In [26]:
def center_dataframe(data):
    mu = data.mean()
    data = data - mu
    return data, mu

In [33]:
# --- Scaling Continuous Variables ---
standard_scaler = StandardScaler()
Xcont_scaled = standard_scaler.fit_transform(Xcont)

# --- KNN Imputation for Continuous Variables ---
knn_imput = KNNImputer(n_neighbors=5)
Xcont_imputed = knn_imput.fit_transform(Xcont_scaled)

# Convert to DataFrame 
Xcont_imputed = pd.DataFrame(Xcont_imputed, columns=Xcont.columns)

# --- Imputation for Categorical Variables ---
simple_imput = SimpleImputer(strategy="most_frequent")
Xcat_imputed = simple_imput.fit_transform(Xcat)

# Convert to DataFrame 
Xcat_imputed = pd.DataFrame(Xcat_imputed, columns=Xcat.columns)

# --- 1 HOT encoding ---
Xcat_imputed1HOT = pd.get_dummies(Xcat_imputed, columns=Xcat_imputed.columns, drop_first=False).astype(int)

# --- Align the target variable with the feature DataFrames ---
# --- for Regression models ---
# --- Concatenate Continuous and Categorical Data ---
Xnew = pd.concat([Xcont_imputed, Xcat_imputed1HOT], axis=1)

In [34]:
import pandas as pd

# Load models
LAsso_trained1 = joblib.load("LAssoTrained_model.pkl")
LAsso_trained2 = joblib.load("LAssoTrained2_model.pkl")

# Make predictions
y_pred1 = LAsso_trained1.predict(Xnew)
y_pred2 = LAsso_trained2.predict(Xnew)

# Convert predictions into a DataFrame
y_pred_df = pd.DataFrame({
    "y_pred1": y_pred1,
    "y_pred2": y_pred2
})

# Display the DataFrame
y_pred_df

Unnamed: 0,y_pred1,y_pred2
0,-53.317019,-34.633572
1,59.849713,26.467369
2,65.849474,49.882100
3,-5.271491,4.621427
4,-64.256286,-40.235698
...,...,...
995,6.936261,-8.498795
996,-34.119224,-40.178692
997,-76.565194,-53.657865
998,51.035747,56.443851


In [35]:
cv_predictions = cross_val_predict(LAsso_trained1, XtrainALL, ytrainALL, cv=10)
bias = np.mean(cv_predictions) - np.mean(ytrainALL)  # Bias is the difference between predicted and true mean
bias_squared = bias ** 2

# Step 4: Estimate the Variance using bootstrap resampling
n_iterations = 100  # Number of bootstrap iterations
predictions = []

for _ in range(n_iterations):
    # Resample data (bootstrap)
    X_resample, y_resample = resample(XtrainALL, ytrainALL, n_samples=len(XtrainALL), random_state=0)
    
    # Train model on resampled data and predict on X_new
    LAsso_trained1.fit(X_resample, y_resample)
    predictions.append(LAsso_trained1.predict(Xnew))  # Collect predictions for X_new

# Calculate variance of predictions
variance = np.var(predictions, axis=0).mean()

# Step 5: Estimate the Irreducible Error (σ²ₑ)
# This can be approximated as the variance in y_train
irreducible_error = np.var(ytrainALL)

# Step 6: Calculate the Total EPE
EPE = irreducible_error + bias_squared + variance

# Print the result
print(f"Total EPE for the new dataset = {EPE}")

Total EPE for the new dataset = 5200.682998620417
