In [1]:
import pandas as pd

# Read the CSV file specifying the delimiter as a semicolon
df = pd.read_csv("Projet 2.csv", sep=";", decimal=",")


# Display the first few rows to confirm it loaded correctly
df


Unnamed: 0,Extract Concentration g/L,KI Concentration M,Immersion Time H,Temperature Deg C,IE%
0,0.50,0.000,2,30,81.63
1,0.50,0.000,2,40,71.24
2,0.50,0.000,2,50,63.21
3,0.50,0.000,5,30,83.96
4,0.50,0.000,5,40,72.91
...,...,...,...,...,...
74,1.25,0.055,5,30,90.59
75,1.25,0.055,5,50,88.83
76,1.25,0.055,2,40,90.90
77,1.25,0.055,8,40,87.32


In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# 1. Read the CSV file:
#    - The file is delimited by semicolons.
#    - The 'decimal' parameter tells pandas to interpret commas as decimal points.
df = pd.read_csv("Projet 2.csv", sep=";", decimal=",")

# 2. Check for missing values:
print("Missing values before imputation:")
print(df.isnull().sum())

# 3. Impute missing values using the median strategy.
#    This is especially helpful when the number of data points is small.
imputer = SimpleImputer(strategy='median')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# 4. Handle outliers using the IQR method:
def cap_outliers(col):
    # Calculate the 25th (Q1) and 75th (Q3) percentiles.
    Q1 = col.quantile(0.25)
    Q3 = col.quantile(0.75)
    IQR = Q3 - Q1
    # Define lower and upper bounds (1.5 times the IQR)
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Cap the values outside the bounds
    return col.clip(lower=lower_bound, upper=upper_bound)

# Apply the outlier capping function to all numerical columns.
# (If some columns are non-numeric, you may need to select only the numeric ones.)
df_imputed = df_imputed.apply(cap_outliers)

# 5. Feature scaling: Standardize the features.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_imputed), columns=df_imputed.columns)

# Optional: Display the first few rows of the preprocessed data.
print("Preprocessed (scaled) data:")
df_scaled

Missing values before imputation:
Extract Concentration g/L    0
KI Concentration M           0
Immersion Time H             0
Temperature Deg C            0
IE%                          0
dtype: int64
Preprocessed (scaled) data:


Unnamed: 0,Extract Concentration g/L,KI Concentration M,Immersion Time H,Temperature Deg C,IE%
0,-0.414019,-0.732985,-1.20953,-1.20953,-0.205788
1,-0.414019,-0.732985,-1.20953,0.00000,-1.544562
2,-0.414019,-0.732985,-1.20953,1.20953,-2.378236
3,-0.414019,-0.732985,0.00000,-1.20953,0.094437
4,-0.414019,-0.732985,0.00000,0.00000,-1.329379
...,...,...,...,...,...
74,0.547966,2.693395,0.00000,-1.20953,0.948727
75,0.547966,2.693395,0.00000,1.20953,0.721947
76,0.547966,2.693395,-1.20953,0.00000,0.988671
77,0.547966,2.693395,1.20953,0.00000,0.527380


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, r2_score

# Assume df_scaled is your preprocessed DataFrame.
# For example, it might look like:
#    feature1   feature2   feature3   ...   IE%
# 0  -0.414019  -0.732985  -1.20953   ...   0.205788
# 1  -0.414019  -0.732985   0.00000   ...  -1.544562
# ...

# 1. Separate the predictors (X) from the target variable (y), which is in the column "IE%"
X = df_scaled.drop('IE%', axis=1)
y = df_scaled['IE%']

# 2. Split the data into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [13]:

# ------------------------------------------
# 3. Regularized Linear Model Without Cross-Validation
#    Using Ridge regression with a fixed alpha value.
# ------------------------------------------
alpha_value = 1 # 1.0  # Choose an alpha value
ridge_model = Ridge(alpha=alpha_value)
ridge_model.fit(X_train, y_train)

# Predictions on train and test sets
y_train_pred = ridge_model.predict(X_train)
y_test_pred = ridge_model.predict(X_test)

# Evaluate the model
print("Ridge Regression (fixed alpha = 1.0):")
print("Train MSE:", mean_squared_error(y_train, y_train_pred))
print("Test MSE:", mean_squared_error(y_test, y_test_pred))
print("Test R²:", r2_score(y_test, y_test_pred))

# ------------------------------------------
# 4. Regularized Model with Cross-Validation for Hyperparameter Tuning
#    Using RidgeCV to select the best alpha from a candidate set.
# ------------------------------------------
# Define a candidate range for alpha (regularization strength)
alphas = np.logspace(-3, 3, 100)  # From 0.001 to 1000
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X_train, y_train)

print("\nRidge Regression with Cross-Validation:")
print("Best alpha selected:", ridge_cv.alpha_)

# Evaluate the tuned RidgeCV model on the test set.
y_test_pred_cv = ridge_cv.predict(X_test)
print("Test MSE (RidgeCV):", mean_squared_error(y_test, y_test_pred_cv))
print("Test R² (RidgeCV):", r2_score(y_test, y_test_pred_cv))

# ------------------------------------------
# 5. Using LassoCV for Lasso Regression (another regularization method)
# ------------------------------------------
lasso_cv = LassoCV(alphas=None, cv=5, random_state=42)  # Let LassoCV determine the best alpha
lasso_cv.fit(X_train, y_train)

print("\nLasso Regression with Cross-Validation:")
print("Best alpha selected:", lasso_cv.alpha_)

# Evaluate the Lasso model on the test set.
y_test_pred_lasso = lasso_cv.predict(X_test)
print("Test MSE (LassoCV):", mean_squared_error(y_test, y_test_pred_lasso))
print("Test R² (LassoCV):", r2_score(y_test, y_test_pred_lasso))


Ridge Regression (fixed alpha = 1.0):
Train MSE: 0.6423566080748396
Test MSE: 0.3203918899431294
Test R²: 0.20951697741065622

Ridge Regression with Cross-Validation:
Best alpha selected: 20.09233002565046
Test MSE (RidgeCV): 0.2998344708863525
Test R² (RidgeCV): 0.26023702140278493

Lasso Regression with Cross-Validation:
Best alpha selected: 0.057820399419265346
Test MSE (LassoCV): 0.27411224835167847
Test R² (LassoCV): 0.3236998644246044


Simple Linear Models with Regularization  i ned to try with the Outlier Treatment:

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Assume df_scaled is your preprocessed DataFrame with the target column "IE%"
# For example, your data may look like:
#    feature1   feature2   feature3   ...   IE%
# 0  -0.414019  -0.732985  -1.20953   ...   0.205788
# 1  -0.414019  -0.732985   0.00000   ...  -1.544562
# ...

# Separate predictors (X) and target (y)
X = df_scaled.drop('IE%', axis=1)
y = df_scaled['IE%']

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ------------------------------------------
# Create a pipeline for Ridge Regression with polynomial features
# ------------------------------------------
ridge_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),  # Generate polynomial features (squared & interactions)
    ('scaler', StandardScaler()),                                # Scale the features
    ('ridgecv', RidgeCV(alphas=np.logspace(-3, 3, 100), cv=5))     # Tune alpha via 5-fold CV
])

ridge_pipeline.fit(X_train, y_train)
y_pred_ridge = ridge_pipeline.predict(X_test)

print("RidgeCV with Polynomial Features:")
print("Best alpha selected:", ridge_pipeline.named_steps['ridgecv'].alpha_)
print("Test MSE:", mean_squared_error(y_test, y_pred_ridge))
print("Test R²:", r2_score(y_test, y_pred_ridge))

# ------------------------------------------
# Create a pipeline for Lasso Regression with polynomial features
# ------------------------------------------
lasso_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('lassocv', LassoCV(alphas=None, cv=5, random_state=42))       # LassoCV will choose its own alpha range
])

lasso_pipeline.fit(X_train, y_train)
y_pred_lasso = lasso_pipeline.predict(X_test)

print("\nLassoCV with Polynomial Features:")
print("Best alpha selected:", lasso_pipeline.named_steps['lassocv'].alpha_)
print("Test MSE:", mean_squared_error(y_test, y_pred_lasso))
print("Test R²:", r2_score(y_test, y_pred_lasso))


RidgeCV with Polynomial Features:
Best alpha selected: 0.6135907273413176
Test MSE: 0.2126807080396137
Test R²: 0.4752660906384444

LassoCV with Polynomial Features:
Best alpha selected: 0.01778902890119987
Test MSE: 0.19373047874210853
Test R²: 0.522020815099516


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge, RidgeCV, LassoCV, ElasticNetCV
from sklearn.svm import SVR
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

# ------------------------------
# PREPROCESSING: Polynomial Feature Generation & Scaling
# ------------------------------
# Assume df_scaled is your preprocessed DataFrame with target column "IE%"
X = df_scaled.drop('IE%', axis=1)
y = df_scaled['IE%']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Generate degree-2 polynomial features.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Scale the polynomial features.
scaler = StandardScaler()
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_test_poly_scaled = scaler.transform(X_test_poly)

# Total number of polynomial features generated:
n_poly_features = X_train_poly_scaled.shape[1]
print("Total polynomial features:", n_poly_features)

# Dictionary to hold the performance results of each model option
results = []

# ------------------------------
# OPTION A: LassoCV for Feature Selection, then Ridge
# ------------------------------
lasso_cv = LassoCV(alphas=None, cv=5, random_state=42)
lasso_cv.fit(X_train_poly_scaled, y_train)

# Identify non-zero coefficient indices.
nonzero_indices = np.where(lasso_cv.coef_ != 0)[0]
n_nonzero_A = len(nonzero_indices)
print("Option A: LassoCV selected {} out of {} features.".format(n_nonzero_A, n_poly_features))

# Reduce datasets to selected features.
X_train_A = X_train_poly_scaled[:, nonzero_indices]
X_test_A  = X_test_poly_scaled[:, nonzero_indices]

# Retrain Ridge on the LassoCV-selected features.
ridge_A = Ridge(alpha=20.09)
ridge_A.fit(X_train_A, y_train)
y_pred_A = ridge_A.predict(X_test_A)

mse_A = mean_squared_error(y_test, y_pred_A)
r2_A = r2_score(y_test, y_pred_A)
print("\nOption A (Ridge on LassoCV-selected features):")
print("Test MSE:", mse_A)
print("Test R²:", r2_A)

results.append({
    'Option': 'A (Ridge on LassoCV-selected)',
    'Selected Features': n_nonzero_A,
    'Best Alpha': np.nan,  # Not applicable since Ridge was fixed here.
    'Test MSE': mse_A,
    'Test R²': r2_A
})

# ------------------------------
# OPTION B: RFE with Ridge estimator, then Ridge
# ------------------------------
ridge_estimator = Ridge(alpha=20.09)
n_features_to_select_B = n_poly_features // 2  # Example: select half the features
print("\nOption B: Full polynomial feature set has {} features. Selecting {} features via RFE."
      .format(n_poly_features, n_features_to_select_B))

rfe = RFE(estimator=ridge_estimator, n_features_to_select=n_features_to_select_B, step=1)
rfe.fit(X_train_poly_scaled, y_train)
selected_features_B = rfe.support_
n_selected_B = np.sum(selected_features_B)
print("RFE selected {} features.".format(n_selected_B))

X_train_B = X_train_poly_scaled[:, selected_features_B]
X_test_B  = X_test_poly_scaled[:, selected_features_B]

ridge_B = Ridge(alpha=20.09)
ridge_B.fit(X_train_B, y_train)
y_pred_B = ridge_B.predict(X_test_B)

mse_B = mean_squared_error(y_test, y_pred_B)
r2_B = r2_score(y_test, y_pred_B)
print("\nOption B (Ridge on RFE-selected features):")
print("Test MSE:", mse_B)
print("Test R²:", r2_B)

results.append({
    'Option': 'B (Ridge on RFE-selected)',
    'Selected Features': n_selected_B,
    'Best Alpha': np.nan,
    'Test MSE': mse_B,
    'Test R²': r2_B
})

# ------------------------------
# OPTION C: Train LassoCV Directly on Full Polynomial Features
# ------------------------------
lasso_model = LassoCV(alphas=None, cv=5, random_state=42)
lasso_model.fit(X_train_poly_scaled, y_train)
y_pred_C = lasso_model.predict(X_test_poly_scaled)

mse_C = mean_squared_error(y_test, y_pred_C)
r2_C = r2_score(y_test, y_pred_C)
print("\nOption C (LassoCV directly on full features):")
print("Best alpha selected:", lasso_model.alpha_)
print("Test MSE:", mse_C)
print("Test R²:", r2_C)

results.append({
    'Option': 'C (LassoCV directly)',
    'Selected Features': n_poly_features,  # All features used.
    'Best Alpha': lasso_model.alpha_,
    'Test MSE': mse_C,
    'Test R²': r2_C
})

# ------------------------------
# OPTION D: RFE with LassoCV as Base Estimator, then Ridge
# ------------------------------
lasso_cv_for_rfe = LassoCV(alphas=None, cv=5, random_state=42)
n_features_to_select_D = n_poly_features // 2  # For example, select half of the features
rfe_D = RFE(estimator=lasso_cv_for_rfe, n_features_to_select=n_features_to_select_D, step=1)
rfe_D.fit(X_train_poly_scaled, y_train)
selected_features_D = rfe_D.support_
n_selected_D = np.sum(selected_features_D)
print("\nOption D: RFE with LassoCV as base estimator selected {} features.".format(n_selected_D))

X_train_D = X_train_poly_scaled[:, selected_features_D]
X_test_D  = X_test_poly_scaled[:, selected_features_D]

ridge_D = Ridge(alpha=20.09)
ridge_D.fit(X_train_D, y_train)
y_pred_D = ridge_D.predict(X_test_D)

mse_D = mean_squared_error(y_test, y_pred_D)
r2_D = r2_score(y_test, y_pred_D)
print("\nOption D (Ridge on RFE-selected features with LassoCV base):")
print("Test MSE:", mse_D)
print("Test R²:", r2_D)

results.append({
    'Option': 'D (Ridge on RFE(LassoCV)-selected)',
    'Selected Features': n_selected_D,
    'Best Alpha': np.nan,
    'Test MSE': mse_D,
    'Test R²': r2_D
})

# ------------------------------
# ENSEMBLE / STACKING: Averaging Predictions from Options A, B, and C
# ------------------------------
ensemble_pred = (y_pred_A + y_pred_B + y_pred_C) / 3.0
mse_ensemble = mean_squared_error(y_test, ensemble_pred)
r2_ensemble = r2_score(y_test, ensemble_pred)
print("\nEnsemble (average of Options A, B, and C):")
print("Test MSE:", mse_ensemble)
print("Test R²:", r2_ensemble)

results.append({
    'Option': 'Ensemble (A+B+C average)',
    'Selected Features': '-',  # Not applicable for ensemble.
    'Best Alpha': '-',        # Not applicable.
    'Test MSE': mse_ensemble,
    'Test R²': r2_ensemble
})

# ------------------------------
# Additional Model: ElasticNetCV on full polynomial features
# ------------------------------
elastic_net_cv = ElasticNetCV(cv=5, random_state=42, alphas=np.logspace(-3, 3, 100))
elastic_net_cv.fit(X_train_poly_scaled, y_train)
y_pred_en = elastic_net_cv.predict(X_test_poly_scaled)
mse_en = mean_squared_error(y_test, y_pred_en)
r2_en = r2_score(y_test, y_pred_en)
print("\nElasticNetCV on full polynomial features:")
print("Best alpha:", elastic_net_cv.alpha_)
print("Best l1_ratio:", elastic_net_cv.l1_ratio_)
print("Test MSE:", mse_en)
print("Test R²:", r2_en)

results.append({
    'Option': 'ElasticNetCV',
    'Selected Features': n_poly_features,
    'Best Alpha': elastic_net_cv.alpha_,
    'Test MSE': mse_en,
    'Test R²': r2_en
})

# ------------------------------
# Additional Model: Support Vector Regression (SVR) with RBF kernel
# ------------------------------
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
}
svr = SVR(kernel='rbf')
grid_search = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_poly_scaled, y_train)
best_svr = grid_search.best_estimator_
y_pred_svr = best_svr.predict(X_test_poly_scaled)
mse_svr = mean_squared_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)
print("\nSVR (RBF kernel) with GridSearchCV:")
print("Best parameters:", grid_search.best_params_)
print("Test MSE:", mse_svr)
print("Test R²:", r2_svr)

results.append({
    'Option': 'SVR (RBF kernel)',
    'Selected Features': n_poly_features,
    'Best Alpha': '-',  # Not applicable for SVR.
    'Test MSE': mse_svr,
    'Test R²': r2_svr
})

# ------------------------------
# Nested Cross-Validation Example with Ridge Regression Pipeline
# ------------------------------
pipeline_ridge = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('ridgecv', RidgeCV(alphas=np.logspace(-3, 3, 100), cv=5))
])
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
nested_scores = cross_val_score(pipeline_ridge, X, y, cv=outer_cv, scoring='neg_mean_squared_error')
avg_nested_mse = -np.mean(nested_scores)
print("\nNested CV for Ridge Regression Pipeline:")
print("Average Nested CV Test MSE:", avg_nested_mse)

results.append({
    'Option': 'Nested CV (Ridge Pipeline)',
    'Selected Features': 'Pipeline',
    'Best Alpha': '-',  # Not directly applicable.
    'Test MSE': avg_nested_mse,
    'Test R²': '-'      # R² not computed in nested CV summary.
})

# ------------------------------
# Display all results in a summary table.
# ------------------------------
results_df = pd.DataFrame(results)
print("\nSummary of Model Performance:")
print(results_df)


Total polynomial features: 14
Option A: LassoCV selected 14 out of 14 features.

Option A (Ridge on LassoCV-selected features):
Test MSE: 0.24941847184827753
Test R²: 0.38462528639149296

Option B: Full polynomial feature set has 14 features. Selecting 7 features via RFE.
RFE selected 7 features.

Option B (Ridge on RFE-selected features):
Test MSE: 0.23253784995743076
Test R²: 0.4262737969634458

Option C (LassoCV directly on full features):
Best alpha selected: 0.01778902890119987
Test MSE: 0.19373047874210853
Test R²: 0.522020815099516

Option D: RFE with LassoCV as base estimator selected 7 features.

Option D (Ridge on RFE-selected features with LassoCV base):
Test MSE: 0.23232688203398888
Test R²: 0.42679430502568594

Ensemble (average of Options A, B, and C):
Test MSE: 0.20036763349300135
Test R²: 0.5056453751662198

ElasticNetCV on full polynomial features:
Best alpha: 0.01873817422860384
Best l1_ratio: 0.5
Test MSE: 0.20122528577691487
Test R²: 0.5035293429225854

SVR (RBF ker

In [28]:
results_df

Unnamed: 0,Option,Selected Features,Best Alpha,Test MSE,Test R²
0,A (Ridge on LassoCV-selected),14,,0.249418,0.384625
1,B (Ridge on RFE-selected),7,,0.232538,0.426274
2,C (LassoCV directly),14,0.017789,0.19373,0.522021
3,D (Ridge on RFE(LassoCV)-selected),7,,0.232327,0.426794
4,Ensemble (A+B+C average),-,-,0.200368,0.505645
5,ElasticNetCV,14,0.018738,0.201225,0.503529
6,SVR (RBF kernel),14,-,0.262174,0.353155
7,Nested CV (Ridge Pipeline),Pipeline,-,0.490283,-


In [30]:
# Convert the 'Test MSE' and 'Test R²' columns to numeric, coercing errors (i.e. '-' becomes NaN)
results_df['Test MSE'] = pd.to_numeric(results_df['Test MSE'], errors='coerce')
results_df['Test R²'] = pd.to_numeric(results_df['Test R²'], errors='coerce')

# Option 1: Best by Test MSE (lowest error)
best_mse_row = results_df.loc[results_df['Test MSE'].idxmin()]

# Option 2: Best by Test R² (highest variance explained)
best_r2_row = results_df.loc[results_df['Test R²'].idxmax()]

# Create a new DataFrame that contains both "best" rows for clarity.
best_results = pd.DataFrame([best_mse_row, best_r2_row])
print("\nSummary of Best Model Results:")
print(best_results)



Summary of Best Model Results:
                 Option  Selected Features  Best Alpha  Test MSE   Test R²
2  C (LassoCV directly)                 14    0.017789   0.19373  0.522021
2  C (LassoCV directly)                 14    0.017789   0.19373  0.522021


In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge, RidgeCV, LassoCV, ElasticNetCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

# ------------------------------
# PREPROCESSING: Polynomial Feature Generation & Scaling
# ------------------------------
# Assume df_scaled is your preprocessed DataFrame with target column "IE%"
X = df_scaled.drop('IE%', axis=1)
y = df_scaled['IE%']

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Generate degree-2 polynomial features.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Scale the polynomial features.
scaler = StandardScaler()
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_test_poly_scaled = scaler.transform(X_test_poly)

# Total number of polynomial features generated:
n_poly_features = X_train_poly_scaled.shape[1]
print("Total polynomial features:", n_poly_features)

# ------------------------------
# Option C (from your previous code): LassoCV Directly on Full Polynomial Features
# ------------------------------
lasso_model = LassoCV(alphas=None, cv=5, random_state=42)
lasso_model.fit(X_train_poly_scaled, y_train)
y_pred_C = lasso_model.predict(X_test_poly_scaled)
mse_C = mean_squared_error(y_test, y_pred_C)
r2_C = r2_score(y_test, y_pred_C)
print("\nOption C (LassoCV directly on full features):")
print("Best alpha selected:", lasso_model.alpha_)
print("Test MSE:", mse_C)
print("Test R²:", r2_C)

# ------------------------------
# 1. ElasticNetCV: Exploring l1_ratio parameter space
# ------------------------------
l1_ratios = [0.1, 0.3, 0.5, 0.7, 0.9]  # candidate values for the mixing parameter
elastic_net_cv = ElasticNetCV(l1_ratio=l1_ratios, cv=5, random_state=42,
                              alphas=np.logspace(-3, 3, 100))
elastic_net_cv.fit(X_train_poly_scaled, y_train)
y_pred_en = elastic_net_cv.predict(X_test_poly_scaled)
mse_en = mean_squared_error(y_test, y_pred_en)
r2_en = r2_score(y_test, y_pred_en)
print("\nElasticNetCV on full polynomial features:")
print("Best alpha:", elastic_net_cv.alpha_)
print("Best l1_ratio:", elastic_net_cv.l1_ratio_)
print("Test MSE:", mse_en)
print("Test R²:", r2_en)

# ------------------------------
# 2. Kernel Ridge Regression (KRR) with RBF kernel
# ------------------------------
param_grid_krr = {
    'alpha': np.logspace(-3, 3, 50),
    'gamma': [0.001, 0.01, 0.1, 1, 10, 'scale', 'auto']
}
grid_search_krr = GridSearchCV(KernelRidge(kernel='rbf'), param_grid_krr, cv=5,
                               scoring='neg_mean_squared_error')
grid_search_krr.fit(X_train_poly_scaled, y_train)
best_krr = grid_search_krr.best_estimator_
y_pred_krr = best_krr.predict(X_test_poly_scaled)
mse_krr = mean_squared_error(y_test, y_pred_krr)
r2_krr = r2_score(y_test, y_pred_krr)
print("\nKernel Ridge Regression (RBF) with GridSearchCV:")
print("Best parameters:", grid_search_krr.best_params_)
print("Test MSE:", mse_krr)
print("Test R²:", r2_krr)

# ------------------------------
# 3. Stacking with a Meta‑Model
# ------------------------------
# Define base models:
base_estimators = [
    ('lasso', LassoCV(alphas=None, cv=5, random_state=42)),
    ('elasticnet', ElasticNetCV(l1_ratio=l1_ratios, cv=5, random_state=42, alphas=np.logspace(-3, 3, 100))),
    ('svr', SVR(kernel='rbf', C=10, gamma='scale'))  # using a preset parameter; you can refine via grid search
]
# Define a meta-model (using Ridge here)
meta_model = Ridge(alpha=20.09)

# Create the stacking regressor.
stack_reg = StackingRegressor(estimators=base_estimators, final_estimator=meta_model, cv=5)
stack_reg.fit(X_train_poly_scaled, y_train)
y_pred_stack = stack_reg.predict(X_test_poly_scaled)
mse_stack = mean_squared_error(y_test, y_pred_stack)
r2_stack = r2_score(y_test, y_pred_stack)
print("\nStacking Regressor (meta-model = Ridge):")
print("Test MSE:", mse_stack)
print("Test R²:", r2_stack)

# ------------------------------
# 4. Weighted Averaging of Predictions
# ------------------------------
# We'll weight each model's prediction inversely proportional to its training MSE.
# For demonstration, we use the predictions from Option C (LassoCV), ElasticNetCV, and SVR.
# (For SVR, we use the best SVR from a previous GridSearchCV if available. Here, we'll use the SVR defined in stacking.)

# If needed, run a separate grid search for SVR; here we reuse the SVR parameters from stacking.
svr_model = SVR(kernel='rbf', C=10, gamma='scale')
svr_model.fit(X_train_poly_scaled, y_train)
y_pred_svr = svr_model.predict(X_test_poly_scaled)
mse_svr = mean_squared_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)
print("\nSVR (RBF kernel) standalone:")
print("Test MSE:", mse_svr)
print("Test R²:", r2_svr)

# Collect the MSE values from three models.
mse_values = np.array([mse_C, mse_en, mse_svr])
# Compute weights as the inverse of MSE (lower MSE gets higher weight).
weights = 1.0 / mse_values
weights /= weights.sum()  # Normalize to sum to 1
print("\nWeighted Averaging:")
print("Weights:", weights)

# Weighted ensemble prediction:
y_pred_weighted = weights[0]*y_pred_C + weights[1]*y_pred_en + weights[2]*y_pred_svr
mse_weighted = mean_squared_error(y_test, y_pred_weighted)
r2_weighted = r2_score(y_test, y_pred_weighted)
print("Test MSE:", mse_weighted)
print("Test R²:", r2_weighted)

# ------------------------------
# Summary: Collect results in a table.
# ------------------------------
results = [
    {'Option': 'C (LassoCV directly)', 'Test MSE': mse_C, 'Test R²': r2_C},
    {'Option': 'ElasticNetCV', 'Test MSE': mse_en, 'Test R²': r2_en},
    {'Option': 'Kernel Ridge Regression (RBF)', 'Test MSE': mse_krr, 'Test R²': r2_krr},
    {'Option': 'Stacking Regressor', 'Test MSE': mse_stack, 'Test R²': r2_stack},
    {'Option': 'SVR (RBF standalone)', 'Test MSE': mse_svr, 'Test R²': r2_svr},
    {'Option': 'Weighted Averaging', 'Test MSE': mse_weighted, 'Test R²': r2_weighted}
]

results_df = pd.DataFrame(results)
print("\nSummary of Model Performance (Best Results):")
print(results_df.sort_values(by='Test MSE'))  # Sorting by Test MSE (lowest is best)


Total polynomial features: 14

Option C (LassoCV directly on full features):
Best alpha selected: 0.01778902890119987
Test MSE: 0.19373047874210853
Test R²: 0.522020815099516

ElasticNetCV on full polynomial features:
Best alpha: 0.016297508346206444
Best l1_ratio: 0.9
Test MSE: 0.1967981473574057
Test R²: 0.5144521467422958


500 fits failed out of a total of 1750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
250 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\souha\anaconda3\envs\lab\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\souha\anaconda3\envs\lab\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\souha\anaconda3\envs\lab\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\souha\anaconda3\envs\lab\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_paramete


Kernel Ridge Regression (RBF) with GridSearchCV:
Best parameters: {'alpha': 0.0030888435964774815, 'gamma': 0.001}
Test MSE: 0.20189700469113672
Test R²: 0.501872052540676

Stacking Regressor (meta-model = Ridge):
Test MSE: 0.16011783853193742
Test R²: 0.6049511958755727

SVR (RBF kernel) standalone:
Test MSE: 0.20975117345627092
Test R²: 0.4824939494729231

Weighted Averaging:
Weights: [0.3438751  0.33851481 0.31761009]
Test MSE: 0.1767275549555829
Test R²: 0.5639710735471153

Summary of Model Performance (Best Results):
                          Option  Test MSE   Test R²
3             Stacking Regressor  0.160118  0.604951
5             Weighted Averaging  0.176728  0.563971
0           C (LassoCV directly)  0.193730  0.522021
1                   ElasticNetCV  0.196798  0.514452
2  Kernel Ridge Regression (RBF)  0.201897  0.501872
4           SVR (RBF standalone)  0.209751  0.482494


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge, LassoCV, ElasticNetCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

# ================================
# Helper function to evaluate a model.
# It accepts an extra_info parameter.
# ================================
def evaluate_model(model, X_train, y_train, X_test, y_test, option_name, extra_info=None):
    if extra_info is None:
        extra_info = {}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    result = {'Option': option_name, 'Test MSE': mse, 'Test R²': r2}
    result.update(extra_info)
    return result, y_pred

# ================================
# PREPROCESSING: Polynomial Feature Generation & Scaling
# ================================
# Assume df_scaled is your preprocessed DataFrame with target column "IE%"
X = df_scaled.drop('IE%', axis=1)
y = df_scaled['IE%']

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Generate degree-2 polynomial features.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Scale the polynomial features.
scaler = StandardScaler()
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_test_poly_scaled = scaler.transform(X_test_poly)

n_poly_features = X_train_poly_scaled.shape[1]
print("Total polynomial features:", n_poly_features)

# ================================
# Container for results and predictions
# ================================
results_list = {}
predictions = {}

# ------------------------------
# Option A: LassoCV for feature selection, then Ridge on selected features
# ------------------------------
lasso_cv = LassoCV(alphas=None, cv=5, random_state=42, max_iter=10000)
lasso_cv.fit(X_train_poly_scaled, y_train)
nonzero_indices = np.where(lasso_cv.coef_ != 0)[0]
n_nonzero_A = len(nonzero_indices)
print(f"Option A: LassoCV selected {n_nonzero_A} out of {n_poly_features} features.")

X_train_A = X_train_poly_scaled[:, nonzero_indices]
X_test_A  = X_test_poly_scaled[:, nonzero_indices]

ridge_A = Ridge(alpha=20.09)
result_A, y_pred_A = evaluate_model(ridge_A, X_train_A, y_train, X_test_A, y_test,
                                    'A (Ridge on LassoCV-selected)')
results_list['A (Ridge on LassoCV-selected)'] = result_A
predictions['A'] = y_pred_A

# ------------------------------
# Option B: RFE with Ridge estimator, then Ridge on RFE-selected features
# ------------------------------
ridge_estimator = Ridge(alpha=20.09)
n_features_to_select_B = n_poly_features // 2
print(f"Option B: Full polynomial feature set has {n_poly_features} features. Selecting {n_features_to_select_B} features via RFE.")

rfe = RFE(estimator=ridge_estimator, n_features_to_select=n_features_to_select_B, step=1)
rfe.fit(X_train_poly_scaled, y_train)
selected_features_B = rfe.support_
n_selected_B = np.sum(selected_features_B)
print(f"RFE selected {n_selected_B} features.")

X_train_B = X_train_poly_scaled[:, selected_features_B]
X_test_B  = X_test_poly_scaled[:, selected_features_B]

ridge_B = Ridge(alpha=20.09)
result_B, y_pred_B = evaluate_model(ridge_B, X_train_B, y_train, X_test_B, y_test,
                                    'B (Ridge on RFE-selected)')
results_list['B (Ridge on RFE-selected)'] = result_B
predictions['B'] = y_pred_B

# ------------------------------
# Option C: Train LassoCV directly on full polynomial features
# ------------------------------
lasso_model = LassoCV(alphas=None, cv=5, random_state=42, max_iter=10000)
result_C, y_pred_C = evaluate_model(lasso_model, X_train_poly_scaled, y_train, 
                                    X_test_poly_scaled, y_test,
                                    'C (LassoCV directly)')
result_C['Best Alpha'] = lasso_model.alpha_
results_list['C (LassoCV directly)'] = result_C
predictions['C'] = y_pred_C

# ------------------------------
# Option D: RFE with LassoCV as base estimator, then Ridge on selected features
# ------------------------------
lasso_cv_for_rfe = LassoCV(alphas=None, cv=5, random_state=42, max_iter=10000)
n_features_to_select_D = n_poly_features // 2
rfe_D = RFE(estimator=lasso_cv_for_rfe, n_features_to_select=n_features_to_select_D, step=1)
rfe_D.fit(X_train_poly_scaled, y_train)
selected_features_D = rfe_D.support_
n_selected_D = np.sum(selected_features_D)
print(f"Option D: RFE with LassoCV as base estimator selected {n_selected_D} features.")

X_train_D = X_train_poly_scaled[:, selected_features_D]
X_test_D  = X_test_poly_scaled[:, selected_features_D]

ridge_D = Ridge(alpha=20.09)
result_D, y_pred_D = evaluate_model(ridge_D, X_train_D, y_train, X_test_D, y_test,
                                    'D (Ridge on RFE(LassoCV)-selected)')
results_list['D (Ridge on RFE(LassoCV)-selected)'] = result_D
predictions['D'] = y_pred_D

# ------------------------------
# Option E: ElasticNetCV on full polynomial features
# ------------------------------
l1_ratios = [0.1, 0.3, 0.5, 0.7, 0.9]
elastic_net_cv = ElasticNetCV(l1_ratio=l1_ratios, cv=5, random_state=42,
                              alphas=np.logspace(-3, 3, 100), max_iter=10000)
result_en, y_pred_en = evaluate_model(elastic_net_cv, X_train_poly_scaled, y_train,
                                      X_test_poly_scaled, y_test, 'ElasticNetCV')
results_list['ElasticNetCV'] = result_en
predictions['ElasticNetCV'] = y_pred_en

# ------------------------------
# Option F: Kernel Ridge Regression (KRR) with RBF kernel using GridSearchCV
# ------------------------------
param_grid_krr = {
    'alpha': np.logspace(-3, 3, 50),
    'gamma': [0.001, 0.01, 0.1, 1, 10]
}
grid_search_krr = GridSearchCV(KernelRidge(kernel='rbf'), param_grid_krr, cv=5,
                               scoring='neg_mean_squared_error')
grid_search_krr.fit(X_train_poly_scaled, y_train)
best_krr = grid_search_krr.best_estimator_
y_pred_krr = best_krr.predict(X_test_poly_scaled)
mse_krr = mean_squared_error(y_test, y_pred_krr)
r2_krr = r2_score(y_test, y_pred_krr)
result_krr = {
    'Option': 'Kernel Ridge Regression (RBF)',
    'Test MSE': mse_krr,
    'Test R²': r2_krr,
    'Best Params': grid_search_krr.best_params_
}
results_list['Kernel Ridge Regression (RBF)'] = result_krr

# ------------------------------
# Option G: SVR (RBF standalone) using GridSearchCV
# ------------------------------
param_grid_svr = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10]
}
grid_search_svr = GridSearchCV(SVR(kernel='rbf'), param_grid_svr, cv=5, scoring='neg_mean_squared_error')
grid_search_svr.fit(X_train_poly_scaled, y_train)
best_svr = grid_search_svr.best_estimator_
result_svr, y_pred_svr = evaluate_model(best_svr, X_train_poly_scaled, y_train,
                                        X_test_poly_scaled, y_test,
                                        'SVR (RBF standalone)',
                                        extra_info={'Best Params': grid_search_svr.best_params_})
results_list['SVR (RBF standalone)'] = result_svr
predictions['SVR'] = y_pred_svr

# ------------------------------
# Option H: Stacking Regressor with Meta-Model (Ridge as meta-model)
# ------------------------------
base_estimators = [
    ('lasso', LassoCV(alphas=None, cv=5, random_state=42, max_iter=10000)),
    ('elasticnet', ElasticNetCV(l1_ratio=l1_ratios, cv=5, random_state=42,
                                 alphas=np.logspace(-3, 3, 100), max_iter=10000)),
    ('svr', SVR(kernel='rbf', C=10, gamma=0.01))
]
meta_model_ridge = Ridge(alpha=20.09)
stack_reg_ridge = StackingRegressor(estimators=base_estimators, final_estimator=meta_model_ridge, cv=5)
result_stack_ridge, y_pred_stack_ridge = evaluate_model(stack_reg_ridge, X_train_poly_scaled, y_train,
                                                        X_test_poly_scaled, y_test,
                                                        'Stacking Regressor (meta = Ridge)')
results_list['Stacking Regressor (meta = Ridge)'] = result_stack_ridge
predictions['Stacking_Ridge'] = y_pred_stack_ridge

# ------------------------------
# Option I: Stacking Regressor with Meta-Model (LassoCV as meta-model)
# ------------------------------
base_estimators = [
    ('lasso', LassoCV(alphas=None, cv=5, random_state=42, max_iter=10000)),
    ('elasticnet', ElasticNetCV(l1_ratio=l1_ratios, cv=5, random_state=42,
                                 alphas=np.logspace(-3, 3, 100), max_iter=10000)),
    ('svr', SVR(kernel='rbf', C=10, gamma=0.01))
]
meta_model_lasso = LassoCV(alphas=None, cv=5, random_state=42, max_iter=10000)
stack_reg_lasso = StackingRegressor(estimators=base_estimators, final_estimator=meta_model_lasso, cv=5)
result_stack_lasso, y_pred_stack_lasso = evaluate_model(stack_reg_lasso, X_train_poly_scaled, y_train,
                                                          X_test_poly_scaled, y_test,
                                                          'Stacking Regressor (meta = LassoCV)')
# Access the fitted meta-model via final_estimator_
result_stack_lasso['Meta Best Alpha'] = stack_reg_lasso.final_estimator_.alpha_
results_list['Stacking Regressor (meta = LassoCV)'] = result_stack_lasso
predictions['Stacking_LassoMeta'] = y_pred_stack_lasso

print("\nStacking Regressor (meta-model = LassoCV):")
print("Test MSE:", result_stack_lasso['Test MSE'])
print("Test R²:", result_stack_lasso['Test R²'])
print("Meta Best Alpha:", stack_reg_lasso.final_estimator_.alpha_)

# ------------------------------
# Option J: Weighted Averaging of Predictions from Option C, ElasticNetCV, and SVR
# ------------------------------
mse_vals = np.array([
    results_list['C (LassoCV directly)']['Test MSE'],
    results_list['ElasticNetCV']['Test MSE'],
    results_list['SVR (RBF standalone)']['Test MSE']
])
weights = 1.0 / mse_vals
weights /= weights.sum()
print("\nWeighted Averaging Weights:", weights)
y_pred_weighted = weights[0]*y_pred_C + weights[1]*y_pred_en + weights[2]*y_pred_svr
mse_weighted = mean_squared_error(y_test, y_pred_weighted)
r2_weighted = r2_score(y_test, y_pred_weighted)
result_weighted = {
    'Option': 'Weighted Averaging',
    'Test MSE': mse_weighted,
    'Test R²': r2_weighted
}
results_list['Weighted Averaging'] = result_weighted

# ------------------------------
# Option K: Nested Cross-Validation Example with Ridge Regression Pipeline
# ------------------------------
pipeline_ridge = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('ridgecv', Ridge(alpha=20.09))  # Here we use Ridge (or RidgeCV) in the pipeline
])
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
nested_scores = cross_val_score(pipeline_ridge, X, y, cv=outer_cv, scoring='neg_mean_squared_error')
avg_nested_mse = -np.mean(nested_scores)
result_nested = {
    'Option': 'Nested CV (Ridge Pipeline)',
    'Test MSE': avg_nested_mse,
    'Test R²': np.nan
}
results_list['Nested CV (Ridge Pipeline)'] = result_nested

# ------------------------------
# Create a summary DataFrame from all results.
# ------------------------------
results_df = pd.DataFrame(list(results_list.values()))
results_df['Test MSE'] = pd.to_numeric(results_df['Test MSE'], errors='coerce')
results_df['Test R²'] = pd.to_numeric(results_df['Test R²'], errors='coerce')
results_df = results_df.sort_values(by='Test MSE')

print("\nSummary of Model Performance (All Options):")
results_df


Total polynomial features: 14
Option A: LassoCV selected 14 out of 14 features.
Option B: Full polynomial feature set has 14 features. Selecting 7 features via RFE.
RFE selected 7 features.
Option D: RFE with LassoCV as base estimator selected 7 features.

Stacking Regressor (meta-model = LassoCV):
Test MSE: 0.16652075638504785
Test R²: 0.5891536741005478
Meta Best Alpha: 0.03252765968319807

Weighted Averaging Weights: [0.3671947  0.36147091 0.27133439]

Summary of Model Performance (All Options):


Unnamed: 0,Option,Test MSE,Test R²,Best Alpha,Best Params,Meta Best Alpha
7,Stacking Regressor (meta = Ridge),0.160729,0.603444,,,
8,Stacking Regressor (meta = LassoCV),0.166521,0.589154,,,0.032528
2,C (LassoCV directly),0.19373,0.522021,0.017789,,
4,ElasticNetCV,0.196798,0.514452,,,
5,Kernel Ridge Regression (RBF),0.201897,0.501872,,"{'alpha': 0.0030888435964774815, 'gamma': 0.001}",
9,Weighted Averaging,0.207678,0.487609,,,
3,D (Ridge on RFE(LassoCV)-selected),0.232327,0.426794,,,
1,B (Ridge on RFE-selected),0.232538,0.426274,,,
0,A (Ridge on LassoCV-selected),0.249418,0.384625,,,
6,SVR (RBF standalone),0.262174,0.353155,,"{'C': 100, 'gamma': 0.001}",
