In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, RidgeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingRegressor, StackingRegressor, StackingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import make_scorer
from sklearn.base import RegressorMixin, ClassifierMixin, BaseEstimator
from sklearn.ensemble import VotingClassifier, VotingRegressor

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor


### Data Collection

In [None]:
import requests

# URLs of the files
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/exercise/module6_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/exercise/module6_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and store the response body in ``file_name``.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the downloaded bytes are written to
            (overwritten if it already exists).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would hang the whole notebook run.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
download_file(train_data_url, 'module6_exercise_train.csv')
download_file(test_data_url, 'module6_exercise_test.csv')

In [None]:
data_train = pd.read_csv('module6_exercise_train.csv', index_col='index')
data_test = pd.read_csv('module6_exercise_test.csv', index_col='index')

### Data Analysis

In [None]:
data_train

In [None]:
data_test

In [None]:
data_train.describe()

In [None]:
data_train.isnull().sum()

In [None]:
data_test.isnull().sum()

In [None]:
# Plot the distribution using seaborn
plt.figure(figsize=(10, 6))
sns.histplot(data_train['end_of_day_return'], bins=50, kde=True)
plt.title('Distribution of End of Day Return')
plt.xlabel('End of Day Return')
plt.ylabel('Frequency')
plt.show()

### Model Building and Evaluation

In [None]:
# Split target and features without mutating data_train. DataFrame.pop removes
# the column in place, so running this cell a second time raised a KeyError.
y = data_train['end_of_day_return'].copy()
X = data_train.drop(columns='end_of_day_return')

In [None]:
def weighted_accuracy(y_true, y_pred):
    """Share of correctly predicted signs, weighted by ``|y_true|``.

    A prediction counts as correct when ``sign(y_pred) == sign(y_true)``;
    each sample contributes proportionally to the magnitude of its true value.

    Args:
        y_true: Array-like of true values.
        y_pred: Array-like of predicted values, in the same order as ``y_true``.

    Returns:
        float: Weighted accuracy in ``[0, 1]``. Returns ``0.0`` when the total
        weight is zero (empty input or all-zero ``y_true``) instead of
        dividing by zero.
    """
    # Compare plain, flattened arrays so matching is positional: two pandas
    # Series carrying different indexes cannot be compared with ``==``.
    true_values = np.asarray(y_true, dtype=float).ravel()
    pred_values = np.asarray(y_pred, dtype=float).ravel()

    weights = np.abs(true_values)
    total_weight = np.sum(weights)
    if total_weight == 0:
        return 0.0

    # Correct predictions where the sign of the true and predicted values match
    correct_predictions = np.sign(true_values) == np.sign(pred_values)

    return np.sum(weights * correct_predictions) / total_weight

In [None]:
# Function to plot the evaluation results
def plot_results(mse_train, mse_test, w_acc_train, w_acc_test):
    """Plot per-fold train/test scores for a single model.

    Two side-by-side panels are drawn: MSE on the left and weighted_accuracy
    on the right. Each panel shows one line per split plus a shaded band
    spanning the minimum and maximum score of that split.

    Args:
        mse_train: Per-fold training MSE values.
        mse_test: Per-fold test MSE values.
        w_acc_train: Per-fold training weighted accuracy values.
        w_acc_test: Per-fold test weighted accuracy values.
    """
    panels = (
        ("MSE", mse_train, mse_test),
        ("weighted_accuracy", w_acc_train, w_acc_test),
    )

    plt.figure(figsize=(12, 6))

    for position, (metric, train_scores, test_scores) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)

        # One line per split
        plt.plot(train_scores, label=f"Train {metric}", marker='o')
        plt.plot(test_scores, label=f"Test {metric}", marker='o')

        # Shaded min-max band for each split
        for fold_scores, band_color in ((train_scores, 'blue'), (test_scores, 'orange')):
            plt.fill_between(range(len(fold_scores)), np.min(fold_scores), np.max(fold_scores),
                             color=band_color, alpha=0.1)

        plt.title(f"{metric} over Folds")
        plt.xlabel("Fold")
        plt.ylabel(metric)
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()

def plot_multi_model_results(results):
    """Compare models with grouped bar charts of their cross-validation scores.

    Two stacked panels are drawn (MSE on top, weighted_accuracy below). Every
    model gets a train bar and a test bar whose height is the mean over the
    folds, plus a whisker reaching from the fold minimum to the fold maximum.

    Args:
        results: Mapping of model name -> dict holding the per-fold lists
            'mse_train', 'mse_test', 'w_acc_train' and 'w_acc_test'.
    """
    bar_width = 0.35
    palette = {'Train': 'skyblue', 'Test': 'lightgreen'}

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 20))

    def decorate(ax, title, ylabel):
        # Titles, axis labels and grid shared by both panels
        ax.set_title(title, fontsize=16)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_xlabel('Models', fontsize=12)
        ax.grid(True, linestyle='--', alpha=0.7)

    def draw_model(ax, center, train_scores, test_scores, with_legend):
        # Train bar left of the tick, test bar right of it; only the first
        # model carries legend labels so each entry appears once.
        bars = (
            ('Train', center - bar_width / 2, train_scores),
            ('Test', center + bar_width / 2, test_scores),
        )
        for split, position, fold_scores in bars:
            ax.bar(position, np.mean(fold_scores), bar_width,
                   label=split if with_legend else "",
                   color=palette[split], alpha=0.7)
        for _, position, fold_scores in bars:
            average = np.mean(fold_scores)
            ax.errorbar(position, average,
                        yerr=[[average - np.min(fold_scores)], [np.max(fold_scores) - average]],
                        fmt='none', ecolor='black', capsize=5)

    decorate(ax1, 'Mean Squared Error (MSE) Comparison', 'MSE')
    decorate(ax2, 'weighted_accuracy Comparison', 'weighted_accuracy')

    centers = np.arange(len(results))
    for i, scores in enumerate(results.values()):
        draw_model(ax1, centers[i], scores['mse_train'], scores['mse_test'], i == 0)
        draw_model(ax2, centers[i], scores['w_acc_train'], scores['w_acc_test'], i == 0)

    # One tick per model, labelled with the model name
    for ax in (ax1, ax2):
        ax.set_xticks(centers)
        ax.set_xticklabels(results.keys(), rotation=45, ha='right')

    for ax in (ax1, ax2):
        ax.legend(loc='upper left')

    plt.tight_layout()
    plt.show()

#### Simple Baseline

In [None]:
# Function to handle train-test evaluation in a fold
def train_and_evaluate(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on one fold and score it on both the train and test split.

    Args:
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Held-out data of the fold.
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns:
        tuple: ``(mse_train, mse_test, w_acc_train, w_acc_test)``.
    """
    model.fit(X_train, y_train)

    # (truth, prediction) pairs: train split first, then the held-out split
    scored_splits = (
        (y_train, model.predict(X_train)),
        (y_test, model.predict(X_test)),
    )

    mse_train, mse_test = (mean_squared_error(truth, guess) for truth, guess in scored_splits)
    w_acc_train, w_acc_test = (weighted_accuracy(truth, guess) for truth, guess in scored_splits)

    return mse_train, mse_test, w_acc_train, w_acc_test


def run_multi_model_cv(X, y, models, n_splits=5):
    """Cross-validate several models on the same folds and report the best one.

    Args:
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target aligned with ``X``; also selected with ``.iloc``.
        models: Mapping of display name -> estimator exposing ``fit`` and
            ``predict``. Each estimator is re-fitted in place on every fold.
        n_splits: Number of ``KFold`` splits (created without shuffling).

    Returns:
        dict: ``{name: {'mse_train': [...], 'mse_test': [...],
        'w_acc_train': [...], 'w_acc_test': [...]}}`` with one value per fold.
        The best mean test weighted accuracy is printed as a side effect.
    """
    metric_names = ('mse_train', 'mse_test', 'w_acc_train', 'w_acc_test')
    fold = KFold(n_splits=n_splits)
    results = {name: {metric: [] for metric in metric_names} for name in models}

    for train_index, test_index in fold.split(X, y):
        X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_test = y.iloc[train_index].copy(), y.iloc[test_index].copy()

        for name, model in models.items():
            fold_scores = train_and_evaluate(X_train, X_test, y_train, y_test, model)
            # train_and_evaluate returns its scores in the order of metric_names
            for metric, value in zip(metric_names, fold_scores):
                results[name][metric].append(value)

    # Find the model with the best mean w_acc test score
    best_mean_w_acc = -1
    best_model = None
    best_min_w_acc = None
    best_max_w_acc = None

    for name, result in results.items():
        w_acc_test_scores = result['w_acc_test']
        if not w_acc_test_scores:
            continue  # nothing was scored for this model
        mean_w_acc_test = sum(w_acc_test_scores) / len(w_acc_test_scores)

        # A NaN mean never compares greater, so such a model is never selected
        if mean_w_acc_test > best_mean_w_acc:
            best_mean_w_acc = mean_w_acc_test
            best_min_w_acc = min(w_acc_test_scores)
            best_max_w_acc = max(w_acc_test_scores)
            best_model = name

    if best_model is None:
        # No model given, or no model has a comparable score: formatting the
        # None placeholders below would raise a TypeError.
        print("No model produced a valid w_acc test score.")
        return results

    # Print the best mean w_acc test score, min, max, and the associated model
    print(f"Best mean w_acc test score: {best_mean_w_acc:.4f} by model: {best_model}")
    print(f"Min w_acc test score: {best_min_w_acc:.4f}, Max w_acc test score: {best_max_w_acc:.4f}")
    return results


In [None]:
# Step 1: Run cross-validation
results = run_multi_model_cv(X, y, {"RandomForestRegressor": RandomForestRegressor(n_jobs=-1)})

In [None]:
# Step 2: Plot the results
plot_results(results["RandomForestRegressor"]["mse_train"],
             results["RandomForestRegressor"]["mse_test"],
             results["RandomForestRegressor"]["w_acc_train"],
             results["RandomForestRegressor"]["w_acc_test"])

In [None]:
# Baseline comparison: a linear model against a random forest.
# Each key is used as the model's label in the printed summary and on the
# plots, so it has to name the estimator that is actually fitted (the forest
# was previously labelled 'Decision Tree Regressor').
models = {
    'Ridge': Ridge(),
    'Random Forest Regressor': RandomForestRegressor(n_jobs=-1)
    }

In [None]:
# Run cross-validation for regression models
results = run_multi_model_cv(X, y, models)

In [None]:
# Plot MSE results for regression models
plot_multi_model_results(results)

#### Properly manage the weighted_accuracy objective
Should we create different classes (e.g. up/down), or use a custom loss?

Create, compare, and optimize different models.

In [None]:
# 
# Idea: convert y into a binary variable (positive return or not):
# Y2 = y > 0

In [None]:

#models = {
    #'Ridge': Ridge(),
    #'Decision Tree Regressor': DecisionTreeRegressor(),
    #'Random Forest Regressor': RandomForestRegressor(),
    #'SVR': SVR(),
    #'Lasso': Lasso(max_iter=5000),
    #'KNN Regressor': KNeighborsRegressor(),
    #'Logistic Regression': LogisticRegression(),
    #'Decision Tree Classifier': DecisionTreeClassifier(),
    #'Random Forest Classifier': RandomForestClassifier(),
    #'SVC': SVC(),
    #'KNN Classifier': KNeighborsClassifier(),
    #'XGBRegressor': XGBRegressor(),
    #'LGBMRegressor': LGBMRegressor(verbose=-1)
#}

# Regression
models = {
    'Ridge': Ridge(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'SVR': SVR(),
    'Lasso': Lasso(max_iter=5000),
    'KNN Regressor': KNeighborsRegressor(),
    'XGBRegressor': XGBRegressor(),
    'LGBMRegressor': LGBMRegressor(verbose=-1)
}


#classification
modeles_bin = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Random Forest Classifier': RandomForestClassifier(),
    'SVC': SVC(),
    'KNN Classifier': KNeighborsClassifier(),
    'XGBClassifier': XGBClassifier(),
    'LGBMClassifier': LGBMClassifier()
}

# Regression

In [None]:
# Run cross-validation for regression models
results = run_multi_model_cv(X, y, models)

In [None]:
# Plot MSE results for regression models
plot_multi_model_results(results)

# Classification

In [None]:
# Run cross-validation for classification models.
#
# weighted_accuracy weights every sample by |y_true| and compares signs, so it
# must be evaluated against the real returns. Scoring against 0/1 labels gives
# every non-positive sample a weight of zero, which makes the metric blind to
# mistakes on those samples. The adapter below trains each classifier on the
# binary direction and reports its predictions as -1 / +1, so that
# run_multi_model_cv can score them against the raw returns.
class SignClassifierAdapter(RegressorMixin, BaseEstimator):
    """Expose a binary up/down classifier as a predictor of the return's sign."""

    def __init__(self, classifier):
        self.classifier = classifier

    def fit(self, X, y):
        """Fit the wrapped classifier on the direction of ``y`` (1 if y > 0, else 0)."""
        self.classifier.fit(X, (y > 0).astype(int))
        return self

    def predict(self, X):
        """Return +1.0 where the classifier predicts class 1, otherwise -1.0."""
        return np.where(self.classifier.predict(X) == 1, 1.0, -1.0)


Y2 = (y > 0).astype(int)  # binary direction of the return, kept for inspection
sign_models = {name: SignClassifierAdapter(clf) for name, clf in modeles_bin.items()}

# NOTE: the MSE reported for these models compares returns with -1 / +1 and is
# not meaningful; only the weighted accuracy should be read for classifiers.
results = run_multi_model_cv(X, y, sign_models)

In [None]:
# Plot MSE results for regression models binaire
plot_multi_model_results(results)

In [None]:
# Optimize models

from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from skopt.callbacks import DeltaYStopper

# Define the search spaces for each model
spaces =  {
    'RandomForestRegressor': {
        'n_estimators': Integer(10, 300),
        'max_depth': Integer(1, 50),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 20)
    },
    'XGBRegressor': {
        'n_estimators': Integer(10, 200),
        'max_depth': Integer(1, 50),
        'learning_rate': Real(0.01, 2.0, 'log-uniform'),
        'subsample': Real(0.5, 1.0, 'uniform'),
        'colsample_bytree': Real(0.5, 1.0, 'uniform')
    },
    'LGBMRegressor': {
        'n_estimators': Integer(10, 200),
        'max_depth': Integer(1, 50),
        'learning_rate': Real(0.01, 1.0, 'log-uniform'),
        'num_leaves': Integer(20, 300),
        'min_child_samples': Integer(1, 100)
    },
    'SVR': {
        'C': Real(0.1, 10.0, 'log-uniform'),
        'epsilon': Real(0.001, 1.0, 'log-uniform'),
        'kernel': Categorical(['linear', 'rbf', 'poly'])
    },
    'KNNRegressor': {
        'n_neighbors': Integer(1, 50),
        'weights': Categorical(['uniform', 'distance']),
        'p': Integer(1, 2)
    },
    'Lasso': {
        'alpha': Real(0.0001, 10.0, 'log-uniform')
    },
    'Ridge': {
        'alpha': Real(0.01, 10.0, 'log-uniform'),
    }
}

def optimizer_callback(res):
    """Progress callback: report the best score after every 5th evaluation.

    Args:
        res: Optimization result exposing ``func_vals`` (the objective values
            evaluated so far) and ``fun`` (the best objective value). The
            objective is printed negated.
    """
    evaluations_done = len(res.func_vals)
    if evaluations_done % 5 != 0:
        return
    print(f"Iteration {evaluations_done}: Best score = {-res.fun:.4f}")

delta_stopper = DeltaYStopper(delta=0.001, n_best=10)

# Function to optimize models
def optimize_model(X, y, model, space, n_iter=100):
    """Tune ``model`` over ``space`` with ``BayesSearchCV``, scored by weighted accuracy.

    Args:
        X: Feature table.
        y: Target values.
        model: Estimator to tune.
        space: Search space for this estimator (parameter name -> dimension).
        n_iter: Number of parameter settings passed to ``BayesSearchCV``.

    Returns:
        The fitted ``BayesSearchCV`` object.
    """
    search_settings = dict(
        n_iter=n_iter,
        n_points=5,
        cv=KFold(n_splits=5),
        n_jobs=-1,
        scoring=make_scorer(weighted_accuracy, greater_is_better=True),
        random_state=42,
    )
    search = BayesSearchCV(model, space, **search_settings)

    # Callbacks: progress printing plus the module-level DeltaYStopper
    search.fit(X, y, callback=[optimizer_callback, delta_stopper])

    return search

# Optimize models
models = {
    'RandomForestRegressor': RandomForestRegressor(),
    'XGBRegressor': XGBRegressor(),
    'LGBMRegressor': LGBMRegressor(verbose=-1),
    'SVR': SVR(),
    'KNNRegressor': KNeighborsRegressor(),
    'Lasso': Lasso(max_iter=5000),
    'Ridge': Ridge(),
}


In [None]:

# Tune every candidate model and keep the outcome in the plain-dict layout
# ('estimator', 'best_params_', 'best_score_') that the later cells index into.
# Storing the BayesSearchCV object itself would make opt["estimator"] fail there.
models_opt = {}
for name, model in models.items():
    print(f"Optimizing {name}...")
    opt = optimize_model(X, y, model, spaces[name])
    models_opt[name] = {
        'estimator': opt.best_estimator_,
        'best_params_': dict(opt.best_params_),
        'best_score_': opt.best_score_,
    }
    print(f"Best parameters: {opt.best_params_}")
    print(f"Best score: {opt.best_score_:.4f}")
    print()

In [None]:
# Hard-coded tuning results, one row per model:
# (name, estimator class, best parameters, best score).
# Presumably copied from an earlier run of the optimization loop - TODO confirm.
_tuned_models = (
    ('RandomForestRegressor', RandomForestRegressor,
     {'max_depth': 46, 'min_samples_leaf': 7, 'min_samples_split': 4, 'n_estimators': 37},
     0.5381),
    ('XGBRegressor', XGBRegressor,
     {'colsample_bytree': 0.9227515368399839, 'learning_rate': 0.04029469909025679,
      'max_depth': 31, 'n_estimators': 28, 'subsample': 0.7983208166471775},
     0.5364),
    ('LGBMRegressor', LGBMRegressor,
     {'learning_rate': 0.0634974604459163, 'max_depth': 8, 'min_child_samples': 100,
      'n_estimators': 195, 'num_leaves': 185},
     0.5311),
    ('SVR', SVR,
     {'C': 0.10270990995387987, 'epsilon': 0.0010205465165897824, 'kernel': 'linear'},
     0.5390),
    ('KNNRegressor', KNeighborsRegressor,
     {'n_neighbors': 10, 'p': 1, 'weights': 'distance'},
     0.5272),
    ('Lasso', Lasso,
     {'alpha': 0.015848691168076898},
     0.5297),
    ('Ridge', Ridge,
     {'alpha': 6.2893244081003825},
     0.5256),
)

# Each parameter set is written once and used both to build the estimator and
# as the stored 'best_params_'.
models_opt = {
    model_name: {
        'estimator': estimator_class(**params),
        'best_params_': params,
        'best_score_': score,
    }
    for model_name, estimator_class, params, score in _tuned_models
}


In [None]:
# Build a fresh, unfitted estimator for every tuned model: same class as the
# stored estimator, constructed from its best parameters only.
models = {
    f"{name} opt": type(opt["estimator"])(**opt["best_params_"])
    for name, opt in models_opt.items()
}

# Add a baseline model manually (if needed)
#models['Random Forest Baseline'] = RandomForestClassifier()

In [None]:
# Run cross-validation for regression 
results = run_multi_model_cv(X, y, models)

In [None]:
# Plot MSE results for regression models
plot_multi_model_results(results)

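Si, sur le graphique de MSE, les barres « train » sont significativement plus courtes que les barres « test », cela signifie que le modèle surajuste (overfitting) : il fonctionne bien sur les données d'entraînement, mais moins bien sur les données de test. Le sous-ajustement (underfitting) correspond au cas inverse : le modèle est déjà mauvais sur les données d'entraînement.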

Si les barres « train » et « test » ont des longueurs similaires, cela signifie que le modèle généralise bien : ses performances sont comparables sur les données d'entraînement et sur les données de test.



# Bagging

In [None]:
# Rank the tuned models from best to worst stored score
top_regressors = sorted(
    models_opt.items(),
    key=lambda entry: entry[1]["best_score_"],
    reverse=True,
)

# Keep the two best-ranked entries whose estimator is a regressor
top_2_regressors = [
    (name, opt["estimator"])
    for name, opt in top_regressors
    if isinstance(opt["estimator"], RegressorMixin)
][:2]

# Print the top 2 regressors for verification
print("Top 2 regressors used in VotingRegressor:")
for name, estimator in top_2_regressors:
    print(f"{name}: {type(estimator).__name__}")

# Voting ensemble built from those two regressors
voting_regressor_ensemble = VotingRegressor(estimators=top_2_regressors)

# Bagging ensemble of 10 random forests, each given 80% of the samples
# (bootstrap) and 80% of the features
bagging_ensemble = BaggingRegressor(
    estimator=RandomForestRegressor(),
    n_estimators=10,
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    bootstrap_features=False,
    n_jobs=-1,
)

# Both ensembles are compared against a plain random forest
models = {
    'bagging_ensemble': bagging_ensemble,
    'voting_regressor_ensemble': voting_regressor_ensemble,
    'Random Forest Baseline': RandomForestRegressor(),
}

In [None]:
# Run cross-validation for regression models
results = run_multi_model_cv(X, y, models)

In [None]:
# Plot MSE results for regression models
plot_multi_model_results(results)

# Stacking

In [None]:
# Base learners: every entry of models_opt whose estimator is a regressor
regressor_estimators = [
    (name, opt["estimator"])
    for name, opt in models_opt.items()
    if isinstance(opt["estimator"], RegressorMixin)
]

# Candidate meta-models (final estimators)
ridge_regressor = Ridge()
random_forest_regressor = RandomForestRegressor()
xgb_regressor = XGBRegressor()
lasso_regressor = Lasso()
svr_regressor = SVR()
lgbm_regressor = LGBMRegressor()
knn_regressor = KNeighborsRegressor()


def _stack_on(meta_model):
    """Stack all base regressors under ``meta_model`` as the final estimator."""
    return StackingRegressor(
        estimators=regressor_estimators,
        final_estimator=meta_model,
    )


# One stacking regressor per meta-model
stacking_regressor_ridge = _stack_on(ridge_regressor)
stacking_regressor_rf = _stack_on(random_forest_regressor)
stacking_regressor_xgb = _stack_on(xgb_regressor)
stacking_regressor_lasso = _stack_on(lasso_regressor)
stacking_regressor_svr = _stack_on(svr_regressor)
stacking_regressor_lgbm = _stack_on(lgbm_regressor)
stacking_regressor_knn = _stack_on(knn_regressor)

In [None]:
models = {
    'Stacking Regressor (Ridge Meta)': stacking_regressor_ridge,
    'Stacking Regressor (RandomForest Meta)': stacking_regressor_rf,
    'Stacking Regressor (XGBRegressor Meta)': stacking_regressor_xgb,
    'Stacking Regressor (Lasso Meta)': stacking_regressor_lasso,
    'Stacking Regressor (SVR Meta)': stacking_regressor_svr,
    'Stacking Regressor (LGBMRegressor Meta)': stacking_regressor_lgbm,
    'Stacking Regressor (KNeighborsRegressor Meta)': stacking_regressor_knn
}

results = run_multi_model_cv(X, y, models)

plot_multi_model_results(results)

### Submission:

In [None]:
data_train = pd.read_csv('module6_exercise_train.csv', index_col='index')
X_test = pd.read_csv('module6_exercise_test.csv', index_col='index')
y_train = data_train.pop('end_of_day_return')
X_train = data_train.copy()
data_train

In [None]:
# Fit the selected model on the complete training data (X_train, y_train),
# report its cross-validated weighted accuracy and predict X_test.
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

# SVR with a linear kernel; same parameter values as the SVR entry of models_opt
best_model = SVR(
    kernel='linear',
    C=0.10270990995387987,
    epsilon=0.0010205465165897824,
)
best_model.fit(X_train, y_train)

# 5-fold cross-validated weighted accuracy on the training data
cv = cross_validate(
    best_model,
    X_train,
    y_train,
    cv=5,
    scoring=make_scorer(weighted_accuracy),
)
print(cv['test_score'].mean())

pred = best_model.predict(X_test)

In [None]:

submission = pd.DataFrame({
    'index': X_test.index,
    'end_of_day_return':pred
})

submission.to_csv('submission.csv', index=False, sep=',')