In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, RidgeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingRegressor, StackingRegressor, StackingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import make_scorer
from sklearn.base import RegressorMixin, ClassifierMixin, BaseEstimator
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor


### Data Collection

In [None]:
import requests

# Remote locations of the exercise's training and test CSV files
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/exercise/module6_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/exercise/module6_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and save the response body to ``file_name``.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the downloaded bytes are written to.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # fail loudly instead of saving an error page
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both CSVs into the working directory; later cells read them by these file names
download_file(train_data_url, 'module6_exercise_train.csv')
download_file(test_data_url, 'module6_exercise_test.csv')

In [None]:
# Load the downloaded CSVs, using their 'index' column as the DataFrame index
data_train = pd.read_csv('module6_exercise_train.csv', index_col='index')
data_test = pd.read_csv('module6_exercise_test.csv', index_col='index')

### Data Analysis

In [None]:
# Display the training DataFrame (features plus the target column)
data_train

In [None]:
# Display the test DataFrame
data_test

In [None]:
# Summary statistics of the training columns
data_train.describe()

In [None]:
# Number of missing values in each training column
data_train.isnull().sum()

In [None]:
# Number of missing values in each test column
data_test.isnull().sum()

In [None]:
# Histogram of the target with a KDE overlay; labels are set on the Axes seaborn returns
plt.figure(figsize=(10, 6))
sns.histplot(data_train['end_of_day_return'], bins=50, kde=True).set(
    title='Distribution of End of Day Return',
    xlabel='End of Day Return',
    ylabel='Frequency',
)
plt.show()

### Model Building and Evaluate

In [None]:
# Split target and features WITHOUT mutating data_train: `pop` removed the column
# in place, so re-running this cell (or the target histogram cell) raised KeyError.
y = data_train['end_of_day_return'].copy()
X = data_train.drop(columns=['end_of_day_return'])


In [None]:
# Quick look at the extracted target values
print(y)

In [None]:
def weighted_accuracy(y_true, y_pred):
    """Sign accuracy weighted by the magnitude of the true value.

    A prediction counts as correct when its sign equals the sign of the true
    value, and each sample contributes ``|y_true|`` to the score::

        sum(|y_true| * [sign(y_true) == sign(y_pred)]) / sum(|y_true|)

    Args:
        y_true: Array-like of true values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        float: Weighted accuracy in ``[0, 1]``. ``0.0`` when the total weight
        is zero (empty input or all-zero targets) instead of ``nan`` from 0/0.

    Raises:
        ValueError: If the two inputs do not hold the same number of values.
    """
    # Work on plain 1-D arrays so pandas inputs are compared by position,
    # never by index label (differently-labelled Series cannot be compared).
    true_values = np.asarray(y_true, dtype=float).ravel()
    pred_values = np.asarray(y_pred, dtype=float).ravel()
    if true_values.shape != pred_values.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {true_values.size} and {pred_values.size}"
        )

    weights = np.abs(true_values)
    total_weight = weights.sum()
    if total_weight == 0:
        return 0.0

    # Correct predictions are those whose sign matches the true sign
    correct_predictions = np.sign(true_values) == np.sign(pred_values)
    return float(np.sum(weights * correct_predictions) / total_weight)

In [None]:
# Function to plot the evaluation results
def plot_results(mse_train, mse_test, w_acc_train, w_acc_test):
    """Plot per-fold train/test MSE and weighted accuracy side by side.

    Each argument is a sequence holding one score per fold. Every series is
    drawn as a line with markers, plus a shaded band spanning its minimum to
    its maximum value.
    """
    fig, (mse_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # (axes, metric name, train scores, test scores) for each panel
    panels = (
        (mse_ax, "MSE", mse_train, mse_test),
        (acc_ax, "weighted_accuracy", w_acc_train, w_acc_test),
    )

    for ax, metric, train_scores, test_scores in panels:
        ax.plot(train_scores, label=f"Train {metric}", marker='o')
        ax.plot(test_scores, label=f"Test {metric}", marker='o')

        # Shade the min-max range of each series across folds
        for scores, shade in ((train_scores, 'blue'), (test_scores, 'orange')):
            ax.fill_between(range(len(scores)), np.min(scores), np.max(scores),
                            color=shade, alpha=0.1)

        ax.set_title(f"{metric} over Folds")
        ax.set_xlabel("Fold")
        ax.set_ylabel(metric)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.show()

def plot_multi_model_results(results):
    """Compare models with grouped bars of mean train/test scores.

    Args:
        results: Mapping ``model name -> scores`` where ``scores`` holds the
            per-fold lists ``'mse_train'``, ``'mse_test'``, ``'w_acc_train'``
            and ``'w_acc_test'``.

    The top panel shows MSE and the bottom panel weighted accuracy. Bar height
    is the mean over folds; the error bar runs from the minimum to the maximum
    fold score.
    """
    fig, (mse_ax, acc_ax) = plt.subplots(2, 1, figsize=(15, 20))

    # (axes, title, y-axis label, train key, test key) for each panel
    panels = (
        (mse_ax, 'Mean Squared Error (MSE) Comparison', 'MSE', 'mse_train', 'mse_test'),
        (acc_ax, 'weighted_accuracy Comparison', 'weighted_accuracy', 'w_acc_train', 'w_acc_test'),
    )

    bar_width = 0.35
    centers = np.arange(len(results))
    model_names = list(results.keys())

    for ax, title, y_label, train_key, test_key in panels:
        ax.set_title(title, fontsize=16)
        ax.set_ylabel(y_label, fontsize=12)
        ax.set_xlabel('Models', fontsize=12)
        ax.grid(True, linestyle='--', alpha=0.7)

        # (legend label, scores key, horizontal shift, bar colour)
        series = (
            ('Train', train_key, -bar_width / 2, 'skyblue'),
            ('Test', test_key, bar_width / 2, 'lightgreen'),
        )

        for pos, name in enumerate(model_names):
            scores = results[name]

            # Only the first model's bars carry a label, so the legend has one entry per split
            for legend_label, key, shift, colour in series:
                ax.bar(centers[pos] + shift, np.mean(scores[key]), bar_width,
                       label=legend_label if pos == 0 else "",
                       color=colour, alpha=0.7)

            # Asymmetric error bars: mean-to-min below, mean-to-max above
            for _, key, shift, _ in series:
                fold_values = scores[key]
                avg = np.mean(fold_values)
                ax.errorbar(centers[pos] + shift, avg,
                            yerr=[[avg - np.min(fold_values)], [np.max(fold_values) - avg]],
                            fmt='none', ecolor='black', capsize=5)

        ax.set_xticks(centers)
        ax.set_xticklabels(model_names, rotation=45, ha='right')
        ax.legend(loc='upper left')

    plt.tight_layout()
    plt.show()

#### Simple Baseline

In [None]:
# Function to handle train-test evaluation in a fold
def train_and_evaluate(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training split and score it on both splits.

    Returns:
        tuple: ``(mse_train, mse_test, w_acc_train, w_acc_test)``.
    """
    model.fit(X_train, y_train)

    # Predictions on the data the model was fitted on, then on the held-out data
    fitted_pred = model.predict(X_train)
    heldout_pred = model.predict(X_test)

    return (
        mean_squared_error(y_train, fitted_pred),
        mean_squared_error(y_test, heldout_pred),
        weighted_accuracy(y_train, fitted_pred),
        weighted_accuracy(y_test, heldout_pred),
    )


def run_multi_model_cv(X, y, models, n_splits=5):
    """Cross-validate several models on the same K-fold splits.

    Args:
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Target Series (rows are selected with ``.iloc``).
        models: Mapping ``display name -> estimator``. Each estimator is refit
            in place on every fold.
        n_splits: Number of folds passed to ``KFold``.

    Returns:
        dict: ``{name: {'mse_train': [...], 'mse_test': [...],
        'w_acc_train': [...], 'w_acc_test': [...]}}`` with one value per fold.

    Side effects:
        Prints the model with the best mean test weighted accuracy together
        with its min/max fold score, or a short notice when no model has a
        valid score (empty ``models`` or only nan scores).
    """
    fold = KFold(n_splits=n_splits)
    metric_names = ('mse_train', 'mse_test', 'w_acc_train', 'w_acc_test')
    results = {name: {metric: [] for metric in metric_names} for name in models}

    for train_index, test_index in fold.split(X, y):
        X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_test = y.iloc[train_index].copy(), y.iloc[test_index].copy()

        for name, model in models.items():
            fold_scores = train_and_evaluate(X_train, X_test, y_train, y_test, model)
            # train_and_evaluate returns its scores in the order of metric_names
            for metric, score in zip(metric_names, fold_scores):
                results[name][metric].append(score)

    # Rank models by mean test weighted accuracy, ignoring models without a
    # usable score so the summary below never has to format None or nan.
    mean_w_acc = {}
    for name, scores in results.items():
        w_acc_test_scores = scores['w_acc_test']
        if not w_acc_test_scores:
            continue
        mean_score = np.mean(w_acc_test_scores)
        if not np.isnan(mean_score):
            mean_w_acc[name] = mean_score

    if not mean_w_acc:
        print("No valid w_acc test scores to report.")
        return results

    # max() keeps the first model on ties, like a strict '>' comparison would
    best_model = max(mean_w_acc, key=mean_w_acc.get)
    best_scores = results[best_model]['w_acc_test']

    print(f"Best mean w_acc test score: {mean_w_acc[best_model]:.4f} by model: {best_model}")
    print(f"Min w_acc test score: {min(best_scores):.4f}, Max w_acc test score: {max(best_scores):.4f}")
    return results


In [None]:
# Baseline regressors compared in cross-validation. Estimators with a random
# component are seeded so repeated runs give the same scores.
models = {
    'Ridge': Ridge(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(n_jobs=-1, random_state=42),
    'Bagging Regressor': BaggingRegressor(random_state=42),
    'SVR': SVR(),
    'Lasso': Lasso(max_iter=5000),
    'KNN Regressor': KNeighborsRegressor(n_jobs=-1),
    'XGBRegressor': XGBRegressor(n_jobs=-1, random_state=42),
    'LGBMRegressor': LGBMRegressor(n_jobs=-1, verbose=-1, random_state=42)
}

In [None]:
# Step 1: cross-validate every baseline model on the same folds
# (also prints the model with the best mean test weighted accuracy)
results = run_multi_model_cv(X, y, models)

In [None]:
# Plot mean MSE and mean weighted accuracy (train vs. test) for every baseline model
plot_multi_model_results(results)

#### Properly handle the weighted_accuracy objective
Should we create different classes? Use a custom loss?

Create, compare, and optimize different models.

#### Hyperparameter Optimization



In [None]:
!pip install scikit-optimize


In [None]:
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from skopt.callbacks import DeltaYStopper
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint
from skopt.space import Integer, Real, Categorical
from skopt.callbacks import DeltaYStopper
from xgboost import XGBRegressor
from scipy.stats import randint, loguniform, uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

# Define distributions to sample from

def y_sign_strat(y):
    """Binary label for stratifying on the sign of ``y``.

    Returns 1 where the value is positive or zero and 0 where it is negative,
    as integers in the same container type that ``np.sign`` returns.
    """
    signs = np.sign(y)
    return (signs >= 0).astype(int)

# Ridge search space: 40 log-spaced alpha values from 1e-3 to 1e3, combined
# with three solvers. The same dict is bound to both `param_ridge` and
# `param_distributions`.
param_ridge = param_distributions = {
    "alpha": np.logspace(-3, 3, 40),
    "solver": ["auto", "lsqr", "saga"],
}

# Scorer wrapper so scikit-learn searches maximise weighted_accuracy
weighted_acc_scorer = make_scorer(weighted_accuracy, greater_is_better=True)

r1 = Ridge()

# Folds stratified on the sign of the target (the target itself is continuous,
# so it cannot be passed to StratifiedKFold directly). The splits are built
# here and materialised into a list: the splitter no longer depends on a `cv`
# variable defined in a later cell, and the search can be fitted more than
# once (a bare `.split(...)` generator is exhausted after the first fit).
ridge_cv_splits = list(
    StratifiedKFold(n_splits=5).split(X, y_sign_strat(y))
)

random_search = RandomizedSearchCV(
    estimator=r1,
    param_distributions=param_distributions,
    n_iter=50,
    cv=ridge_cv_splits,
    scoring=weighted_acc_scorer,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

In [None]:
# Run the Ridge hyperparameter search on the full training data
random_search.fit(X,y)

In [None]:
# Search table kept under its own name: `results` already holds the baseline
# cross-validation scores dict that plot_multi_model_results() consumes.
ridge_search_results = (
    pd.DataFrame(random_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)

# Top five candidates by mean cross-validated weighted accuracy
print(ridge_search_results[["mean_test_score", "std_test_score", "param_alpha", "param_solver"]].head())
print("\nBest params:", random_search.best_params_)
print("Best score:", random_search.best_score_)

In [None]:
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from skopt.callbacks import DeltaYStopper
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from skopt.space import Integer, Real, Categorical
from skopt.callbacks import DeltaYStopper
from xgboost import XGBRegressor
from scipy.stats import randint, loguniform, uniform
from sklearn.model_selection import RandomizedSearchCV, KFold




# XGBoost search space, sampled by RandomizedSearchCV.
# scipy conventions: randint(low, high) excludes `high`; uniform(loc, scale)
# covers [loc, loc + scale]; loguniform(a, b) is log-uniform on [a, b].
param_xgb = {
    "max_depth": randint(2, 10),
    "min_child_weight": randint(1, 10),
    "gamma": loguniform(1e-8, 1),
    "subsample": uniform(0.5, 0.5),  # i.e. between 0.5 and 1.0
    "colsample_bytree": uniform(0.5, 0.5),  # i.e. between 0.5 and 1.0
    "learning_rate": loguniform(1e-3, 0.3),
    "reg_alpha": loguniform(1e-6, 1),
    "reg_lambda": loguniform(1e-3, 10),
}


# Base XGBoost regressor whose remaining hyperparameters come from `param_xgb`.
# n_jobs=1 here; the surrounding search runs with n_jobs=-1 — presumably so
# parallelism happens across candidates rather than inside each fit (confirm).
xgb = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=1000,
    tree_method="hist",
    random_state=42,
    n_jobs=1
)


cv = KFold(n_splits=5)

# `scoring` must be a scorer called as scorer(estimator, X, y). The raw metric
# weighted_accuracy(y_true, y_pred) has the wrong signature and makes every fit
# fail, so the make_scorer wrapper is used, as in the other searches.
xgb_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_xgb,
    n_iter=80,
    scoring=weighted_acc_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
    error_score="raise",
)

In [None]:
# Run the XGBoost hyperparameter search on the full training data
xgb_search.fit(X, y)

In [None]:
# Search table kept under its own name: `results` already holds the baseline
# cross-validation scores dict that plot_multi_model_results() consumes.
xgb_search_results = (
    pd.DataFrame(xgb_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)

# Top five candidates by mean cross-validated weighted accuracy
print(xgb_search_results[["mean_test_score", "std_test_score", "param_max_depth", "param_learning_rate"]].head())
print("\nBest params:", xgb_search.best_params_)
print("Best score:", xgb_search.best_score_)

In [None]:
# NOTE(review): this repeats the Ridge search fit from an earlier cell, between
# the XGBoost and random-forest sections — likely a leftover; confirm it is needed.
random_search.fit(X,y)

In [None]:
# Base random forest: 300 trees and a fixed seed; the other hyperparameters are
# sampled from `param_rf` by the search below.
rf = RandomForestRegressor(
    n_estimators=300,
    n_jobs=-1,
    random_state=42
)


# Random-forest search space.
# scipy conventions: randint(low, high) excludes `high`; uniform(loc, scale)
# covers [loc, loc + scale].
param_rf = {
    "max_depth": randint(3, 15),
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 20),
    "max_features": uniform(0.3, 0.7),  # i.e. a fraction between 0.3 and 1.0
}

# Randomized search over `param_rf`: 50 sampled candidates, 3-fold CV, scored
# with weighted accuracy. error_score="raise" surfaces fit failures immediately.
rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_rf,
    n_iter=50,
    cv=3,
    scoring=weighted_acc_scorer,
    n_jobs=-1,
    verbose=1,
    random_state=42,
    error_score="raise"
)

In [None]:
# Run the random-forest hyperparameter search on the full training data
rf_search.fit(X,y)

In [None]:
# Search table kept under its own name: `results` already holds the baseline
# cross-validation scores dict that plot_multi_model_results() consumes.
rf_search_results = (
    pd.DataFrame(rf_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)

# Show the useful columns for the top five candidates
print(rf_search_results[["mean_test_score", "std_test_score", "param_max_depth", "param_min_samples_split", "param_min_samples_leaf", "param_max_features"]].head())

# Best parameters and score
print("\nBest params:", rf_search.best_params_)
print("Best score:", rf_search.best_score_)

Note: after `scikit-optimize` has finished installing (see the `pip install` cell above), re-run the cell that imports `skopt`.

### Submission:

In [None]:
# Reload the raw CSVs so the submission model starts from untouched data
data_train = pd.read_csv('module6_exercise_train.csv', index_col='index')
X_test = pd.read_csv('module6_exercise_test.csv', index_col='index')
# pop() removes the target column from data_train in place and returns it
y_train = data_train.pop('end_of_day_return')
X_train = data_train.copy()

In [None]:
# Final model, fitted on the complete training set.
# NOTE(review): alpha and solver are hard-coded — presumably copied from the
# Ridge random search's best parameters; confirm they are still current.
ridge_best = Ridge(alpha=702, solver="lsqr", tol=1e-6)
ridge_best.fit(X_train, y_train)

In [None]:
# Predictions for the training rows (sanity check) and the test rows (submission)
y_pred_train = ridge_best.predict(X_train)
y_pred_test  = ridge_best.predict(X_test)

In [None]:
# In-sample score: computed on the same rows the model was fitted on, so it is
# optimistic and not an estimate of test performance
train_w_acc = weighted_accuracy(y_train, y_pred_train)
print("Train weighted accuracy:", train_w_acc)

In [None]:
# Train on complete data (X_train, y_train) and predict on X_test

In [None]:

# Two-column submission table: the test index next to its predicted return
submission = (
    pd.DataFrame({'end_of_day_return': y_pred_test}, index=X_test.index)
    .rename_axis('index')
    .reset_index()
)

submission.to_csv('submission.csv', index=False, sep=',')

In [None]:
# Offer the generated file as a browser download when running in Google Colab.
# Outside Colab the helper module does not exist, so skip the download instead
# of failing the whole run.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping the browser download of submission.csv.")
else:
    files.download("submission.csv")
