In [15]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold
from itertools import combinations



In [16]:
Hitters = pd.read_csv("data/Hitters.csv")
from core.helpers import transform_df_for_model
terms = Hitters.columns.drop('Salary')
Y = Hitters['Salary']
X = Hitters[terms].copy()
designX = transform_df_for_model(X,terms)

In [17]:
from core.helpers import transform_df_for_model
terms = Hitters.columns.drop('Salary')
Y = Hitters['Salary']
X = Hitters[terms].copy()


In [18]:
designX = transform_df_for_model(X,terms)

In [5]:
model = sm.OLS(Y,designX)
model_result = model.fit()
sigma2 = model_result.scale.item()
sigma2


99591.35617968219

In [7]:
def nCp(sigma2 , X, Y, cv = 5):
    "Negative Cp statistic"
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
        # train model
        model_cv = sm.OLS(y_train, X_train).fit()
        n, p = X.shape
        y_pred = model_cv.predict(X_test)
        RSS = np.sum((y_test - y_pred)**2)
        scores.append(-(RSS + 2 * p * sigma2) / n )
    return np.mean(scores)

In [6]:
# 2️⃣ Définir un custom scoring (ici R² ajusté basé sur K-Fold CV)
def r2_cv(X, y, cv=5):
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    scores = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model_cv = sm.OLS(y_train, X_train).fit()
        # Predict on the test set
        y_pred = model_cv.predict(X_test)
        
        # Compute R² on test data
        r2 = 1 - np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2)
        
        scores.append(r2)
    
    return np.mean(scores)

r2_cv(designX,Y)

np.float64(0.313797040766172)

In [7]:
from core.helpers import sklearn_sm
from sklearn.model_selection import \
     (cross_validate,
      KFold)

cv = KFold(n_splits=5,
           shuffle=True,
           random_state=42) # use same splits for each degree
M = sklearn_sm(sm.OLS)
M_CV = cross_validate(M,
                          designX,
                          Y,scoring='r2',
                          cv=cv)
np.mean(M_CV['test_score'])

np.float64(0.313797040766172)

In [None]:
# 2️⃣ Définir un custom scoring (ici R² ajusté basé sur K-Fold CV)
def r2_cv(X, y, cv=5):
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    scores = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model_cv = sm.OLS(y_train, X_train).fit()
        # Predict on the test set
        y_pred = model_cv.predict(X_test)
        
        # Compute R² on test data
        r2 = 1 - np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2)
        
        scores.append(r2)
    
    return np.mean(scores)

r2_cv(designX,Y)

In [10]:
def nCp(sigma2 , X, Y, cv = 5):
    "Negative Cp statistic"
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
        # train model
        model_cv = sm.OLS(y_train, X_train).fit()
        n, p = X.shape
        y_pred = model_cv.predict(X_test)
        RSS = np.sum((y_test - y_pred)**2)
        scores.append(-(RSS + 2 * p * sigma2) / n )
    return np.mean(scores)

In [11]:
from functools import partial
new_nCp = partial(nCp, sigma2 )

In [None]:
from core.helpers import sklearn_sm
from sklearn.model_selection import \
     (cross_validate,
      KFold)

cv = KFold(n_splits=5,
           shuffle=True,
           random_state=42) # use same splits for each degree
M = sklearn_sm(sm.OLS)
M_CV = cross_validate(M,
                          designX,
                          Y,scoring='r2',
                          cv=cv)
np.mean(M_CV['test_score'])

In [82]:
new_nCp(designX,Y,cv=5)

np.float64(-39258.465294604364)

In [69]:
from core.helpers import sklearn_sm
from sklearn.model_selection import \
     (cross_validate,
      KFold,
      ShuffleSplit)
cv_error = np.zeros(5)
cv = KFold(n_splits=5,
           shuffle=True,
           random_state=42) # use same splits for each degree
M = sklearn_sm(sm.OLS)
M_CV = cross_validate(M,
                          designX,
                          Y,
                          cv=cv)
np.mean(M_CV['test_score'])

np.float64(120488.11603985951)

In [70]:
import numpy as np
from sklearn.model_selection import KFold

def nCp_scorer(estimator, X, y, sigma2=sigma2, cv=5):
    """
    Fonction compatible avec `cross_validate` pour calculer le Cp-statistic négatif.
    
    Paramètres:
        estimator : Modèle de régression (doit avoir .fit et .predict)
        X : Variables explicatives
        y : Variable cible
        sigma2 : Variance des résidus (si None, elle est estimée)
        cv : Nombre de folds (optionnel, non utilisé dans cross_validate)

    Retourne:
        Score négatif de Cp (car sklearn maximise les scores)
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    RSS_list = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Entraîner le modèle
        estimator.fit(X_train, y_train)
        
        # Prédictions sur le test set
        y_pred = estimator.predict(X_test)
        
        # Calcul du Residual Sum of Squares (RSS)
        RSS_list.append(np.sum((y_test - y_pred) ** 2))
    
    # Moyenne des RSS sur les folds
    RSS_cv = np.mean(RSS_list)
    
    # Estimation de sigma2 si non fourni
    if sigma2 is None:
        sigma2 = RSS_cv / len(y)

    # Nombre d'observations et de variables
    n, p = X.shape
    
    # Calcul de Cp statistic (négatif pour maximisation dans sklearn)
    Cp = -(RSS_cv + 2 * p * sigma2) / n
    
    return Cp

In [71]:
from core.helpers import sklearn_sm
from sklearn.model_selection import \
     (cross_validate,
      KFold,
      ShuffleSplit)
cv_error = np.zeros(5)
cv = KFold(n_splits=5,
           shuffle=True,
           random_state=42) # use same splits for each degree
M = sklearn_sm(sm.OLS)
M_CV = cross_validate(M,
                          designX,
                          Y,scoring=nCp_scorer,
                          cv=cv)
np.mean(M_CV['test_score'])

np.float64(-125512.88159716557)

In [None]:
cv_error = np.zeros(5)
cv = KFold(n_splits=5,
           shuffle=True,
           random_state=42) # use same splits for each degree
M = sklearn_sm(sm.OLS)
for i, d in enumerate(range(1,6)):
    X = np.power.outer(H, np.arange(d+1))
    M_CV = cross_validate(M,
                          X,
                          Y,
                          cv=cv)
    cv_error[i] = np.mean(M_CV['test_score'])
cv_error

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(designX):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

y_train

In [49]:
from functools import partial
new_nCp = partial(nCp, sigma2 =sigma2)

In [46]:
neg_Cp = partial(nCp, sigma2)

In [50]:
neg_Cp(model,designX,Y,cv=5)

ValueError: shapes (263,20) and (263,20) not aligned: 20 (dim 1) != 263 (dim 0)

In [6]:
# 3️⃣ Implémentation de la sélection forward avec K-Fold CV
def forward_selection(df, target, scoring, cv=5):
    available_features = list(df.columns)
    
    selected_features = []
    best_score = -np.inf
    best_model = None

    while available_features:
        scores = []
        models = []
        
        for feature in available_features:
            X = transform_df_for_model(df,selected_features + [feature])
            y = target
            score = scoring(X, y, cv=cv)
            scores.append((feature, score, model))
        
        # Sélectionner la meilleure variable à ajouter
        scores.sort(key=lambda x: x[1], reverse=True)
        best_new_feature, best_new_score, best_new_model = scores[0]
        
        if best_new_score > best_score:  # Vérifier l'amélioration
            selected_features.append(best_new_feature)
            available_features.remove(best_new_feature)
            best_score = best_new_score
            best_model = best_new_model
        else:
            break  # Arrêter si plus d'amélioration

    return best_model, selected_features

In [8]:

# 4️⃣ Lancer la sélection
best_model, selected_features = forward_selection(X, Y, scoring=adjusted_r2_cv, cv=5)
print("Meilleures variables sélectionnées :", selected_features)
print(best_model.summary())

Meilleures variables sélectionnées : ['CRBI', 'Hits', 'PutOuts', 'Division', 'AtBat', 'Walks', 'CWalks', 'CRuns', 'CAtBat', 'Assists', 'League']
                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.543
Model:                            OLS   Adj. R-squared:                  0.523
Method:                 Least Squares   F-statistic:                     27.07
Date:                Wed, 19 Mar 2025   Prob (F-statistic):           8.93e-37
Time:                        14:37:41   Log-Likelihood:                -1877.2
No. Observations:                 263   AIC:                             3778.
Df Residuals:                     251   BIC:                             3821.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.0

In [9]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import make_scorer, r2_score
import statsmodels.api as sm

# Custom R² scoring function for cross-validation
def r2_scorer(estimator, X, y):
    """
    Custom scoring function for cross_validate that computes R² on the test set.
    
    Parameters:
        estimator : The trained model (must have .fit and .predict)
        X (DataFrame) : Feature matrix
        y (Series) : Target variable
    
    Returns:
        float : R² score
    """
    # Add a constant term (intercept) if using Statsmodels
    if isinstance(estimator, sm.OLS):
        X = sm.add_constant(X)

    # Train the model
    estimator.fit(X, y)

    # Predict on the same set (cross_validate will use test set automatically)
    y_pred = estimator.predict(X)

    # Compute R² on test set
    return r2_score(y, y_pred)

# Convert function into a scorer
custom_r2_scorer = make_scorer(r2_scorer, greater_is_better=True)

In [10]:
# Generate synthetic regression data
X, y = make_regression(n_samples=100, n_features=5, noise=10, random_state=42)
X = pd.DataFrame(X, columns=[f"Var{i}" for i in range(5)])
y = pd.Series(y)

In [13]:




# Define model (Linear Regression)
model = sklearn_sm(sm.OLS)

# Perform cross-validation with the custom R² scorer
cv_results = cross_validate(model, designX, Y, scoring=custom_r2_scorer, cv=5, return_train_score=True)

# Display results
print("Custom R² scores on test sets:", cv_results["test_score"])
print("Mean R² across folds:", np.mean(cv_results["test_score"]))

Custom R² scores on test sets: [nan nan nan nan nan]
Mean R² across folds: nan


Traceback (most recent call last):
  File "/Users/passion/Documents/2025/ML/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/passion/Documents/2025/ML/venv/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/passion/Documents/2025/ML/venv/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 388, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
TypeError: r2_scorer() missing 1 required positional argument: 'y'



In [27]:
# Exemple de données factices
designX = np.random.rand(100, 3)  # 100 échantillons, 3 features
Y = np.random.rand(100)  # 100 cibles
designX
Y

array([0.34112994, 0.64682351, 0.43169047, 0.82737198, 0.88378697,
       0.35515211, 0.73216518, 0.30412311, 0.76791636, 0.16453645,
       0.39200141, 0.95304113, 0.86852918, 0.85652826, 0.6260148 ,
       0.2274643 , 0.33066311, 0.83738994, 0.44022479, 0.2116009 ,
       0.99788981, 0.93606177, 0.82250484, 0.7735262 , 0.69204918,
       0.72742574, 0.55586749, 0.29979135, 0.66350105, 0.76958443,
       0.75311119, 0.59698818, 0.26730429, 0.32890829, 0.06808448,
       0.56752747, 0.02149576, 0.65424999, 0.94210265, 0.90627553,
       0.01356619, 0.83436575, 0.13800968, 0.68432994, 0.79159903,
       0.64546835, 0.19206323, 0.12886985, 0.13344615, 0.85609896,
       0.31108035, 0.80535887, 0.97386201, 0.69923782, 0.79003244,
       0.19521081, 0.85865958, 0.96688519, 0.63877666, 0.31548163,
       0.81511011, 0.98863237, 0.22193891, 0.34632878, 0.72982026,
       0.20185936, 0.58740934, 0.22144936, 0.06658957, 0.53279664,
       0.10590963, 0.46889029, 0.34554016, 0.88715356, 0.42046

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import make_scorer

In [2]:
Hitters = pd.read_csv("data/Hitters.csv")
from core.helpers import transform_df_for_model
terms = Hitters.columns.drop(['Salary', 'League','Division','NewLeague'])
Y = Hitters['Salary']
X = Hitters.drop(columns=['Salary', 'League','Division','NewLeague'])
X['intercept'] = np.ones(Hitters.shape[0])

In [3]:
X

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,intercept
0,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,1.0
1,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,1.0
2,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,1.0
3,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,1.0
4,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258,497,127,7,65,48,37,5,2703,806,32,379,311,138,325,9,3,1.0
259,492,136,5,76,50,94,12,5511,1511,39,897,451,875,313,381,20,1.0
260,475,126,3,61,43,52,6,1700,433,7,217,93,146,37,113,7,1.0
261,573,144,9,85,60,78,8,3198,857,97,470,420,332,1314,131,12,1.0
