In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, RidgeClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingRegressor, StackingRegressor, StackingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import make_scorer
from sklearn.base import RegressorMixin, ClassifierMixin, BaseEstimator
from sklearn.ensemble import VotingClassifier

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor


### Data Collection

In [None]:
import requests

# Remote locations of the module 6 exercise CSVs (training and test sets)
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/exercise/module6_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/exercise/module6_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download `url` and write the response body to `file_name`.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    file_name : str
        Local path the downloaded bytes are written to (opened in 'wb' mode,
        so an existing file is overwritten).
    timeout : float or tuple, default 30
        Timeout in seconds forwarded to ``requests.get``. Without one, a
        server that stops responding would block the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch the training and test CSVs into the working directory
for source_url, local_name in (
    (train_data_url, 'module6_exercise_train.csv'),
    (test_data_url, 'module6_exercise_test.csv'),
):
    download_file(source_url, local_name)

In [None]:
# Load both downloaded CSVs, using their 'index' column as the DataFrame index
data_train = pd.read_csv('module6_exercise_train.csv', index_col='index')
data_test = pd.read_csv('module6_exercise_test.csv', index_col='index')

### Data Analysis

In [None]:
# Display the training frame (features plus the 'end_of_day_return' target)
data_train

In [None]:
# Display the test frame
data_test

In [None]:
# Summary statistics for every numeric column of the training data
data_train.describe()

In [None]:
# Number of missing values per column in the training data
data_train.isnull().sum()

In [None]:
# Number of missing values per column in the test data
data_test.isnull().sum()

In [None]:
# Histogram (50 bins) with a KDE overlay of the training target
target_returns = data_train['end_of_day_return']

plt.figure(figsize=(10, 6))
sns.histplot(target_returns, bins=50, kde=True)
plt.xlabel('End of Day Return')
plt.ylabel('Frequency')
plt.title('Distribution of End of Day Return')
plt.show()

### Model Building and Evaluation

In [None]:
# Separate the target from the features without mutating `data_train`:
# `pop` removed the column in place, so re-running the cell raised a KeyError.
y = data_train['end_of_day_return'].copy()
X = data_train.drop(columns=['end_of_day_return'])

In [None]:
def weighted_accuracy(y_true, y_pred):
    """Sign-agreement accuracy weighted by the magnitude of the true values.

    Each sample counts as correct when ``sign(y_pred) == sign(y_true)`` and
    contributes ``|y_true|`` to the score, so the result is the share of the
    total absolute true value whose sign was predicted correctly.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted values. Both are flattened and compared by
        position, so pandas index labels are ignored and column-vector
        predictions of shape ``(n, 1)`` are accepted.

    Returns
    -------
    float
        Weighted accuracy, or NaN when the total weight is zero (empty input
        or all-zero ``y_true``), in which case the score is undefined.
    """
    # Flat float arrays make the comparison positional: differently-labelled
    # Series no longer raise and (n, 1) predictions no longer broadcast to (n, n).
    true_values = np.asarray(y_true, dtype=float).ravel()
    pred_values = np.asarray(y_pred, dtype=float).ravel()

    weights = np.abs(true_values)
    total_weight = np.sum(weights)
    if total_weight == 0:
        # Nothing to weight by: return NaN explicitly instead of dividing 0 by 0
        return np.nan

    # Correct predictions where the sign of the true and predicted values match
    correct_predictions = np.sign(true_values) == np.sign(pred_values)

    return np.sum(weights * correct_predictions) / total_weight

In [None]:
# Function to plot the evaluation results
def plot_results(mse_train, mse_test, w_acc_train, w_acc_test):
    """Plot per-fold train/test scores for one model in two side-by-side panels.

    The left panel shows MSE and the right panel weighted accuracy. Each
    argument is a sequence with one score per fold. A translucent band marks
    the minimum-to-maximum range of every series.
    """
    # (subplot position, metric name, train scores, test scores)
    panels = (
        (1, "MSE", mse_train, mse_test),
        (2, "weighted_accuracy", w_acc_train, w_acc_test),
    )

    plt.figure(figsize=(12, 6))
    for position, metric, train_scores, test_scores in panels:
        plt.subplot(1, 2, position)
        plt.plot(train_scores, label=f"Train {metric}", marker='o')
        plt.plot(test_scores, label=f"Test {metric}", marker='o')
        # Shade the min-max range of each series across the folds
        for fold_scores, shade in ((train_scores, 'blue'), (test_scores, 'orange')):
            plt.fill_between(range(len(fold_scores)), np.min(fold_scores), np.max(fold_scores),
                             color=shade, alpha=0.1)
        plt.title(f"{metric} over Folds")
        plt.xlabel("Fold")
        plt.ylabel(metric)
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()

def plot_multi_model_results(results):
    """Draw grouped bar charts comparing train/test scores of several models.

    ``results`` maps a model name to a dict holding per-fold score lists under
    the keys 'mse_train', 'mse_test', 'w_acc_train' and 'w_acc_test'. The top
    panel shows MSE and the bottom panel weighted accuracy; bar heights are
    fold means and the error bars span the minimum to the maximum fold score.
    """
    fig, (mse_ax, acc_ax) = plt.subplots(2, 1, figsize=(15, 20))

    # (axis, title, y-label, key of the train scores, key of the test scores)
    panels = (
        (mse_ax, 'Mean Squared Error (MSE) Comparison', 'MSE', 'mse_train', 'mse_test'),
        (acc_ax, 'weighted_accuracy Comparison', 'weighted_accuracy', 'w_acc_train', 'w_acc_test'),
    )
    for axis, title, y_label, _, _ in panels:
        axis.set_title(title, fontsize=16)
        axis.set_ylabel(y_label, fontsize=12)
        axis.set_xlabel('Models', fontsize=12)
        axis.grid(True, linestyle='--', alpha=0.7)

    slots = np.arange(len(results))
    bar_width = 0.35

    for idx, scores in enumerate(results.values()):
        for axis, _, _, train_key, test_key in panels:
            # Train bar sits left of the model's slot, test bar right of it
            pair = (
                (slots[idx] - bar_width/2, scores[train_key], 'skyblue', 'Train'),
                (slots[idx] + bar_width/2, scores[test_key], 'lightgreen', 'Test'),
            )
            for center, fold_scores, colour, tag in pair:
                # Only the first model contributes legend entries
                axis.bar(center, np.mean(fold_scores), bar_width,
                         label=tag if idx == 0 else "", color=colour, alpha=0.7)
            for center, fold_scores, _, _ in pair:
                avg = np.mean(fold_scores)
                axis.errorbar(center, avg,
                              yerr=[[avg - np.min(fold_scores)], [np.max(fold_scores) - avg]],
                              fmt='none', ecolor='black', capsize=5)

    for axis, *_ in panels:
        axis.set_xticks(slots)
        axis.set_xticklabels(results.keys(), rotation=45, ha='right')
        axis.legend(loc='upper left')

    plt.tight_layout()
    plt.show()

#### Simple Baseline

In [None]:
# Function to handle train-test evaluation in a fold
def train_and_evaluate(X_train, X_test, y_train, y_test, model):
    """Fit `model` on one fold and score it on that fold's train and test splits.

    The estimator is fitted in place on (X_train, y_train) and then used to
    predict both splits.

    Returns
    -------
    tuple
        ``(mse_train, mse_test, w_acc_train, w_acc_test)``.
    """
    model.fit(X_train, y_train)

    # Pair each split's targets with the predictions made for it
    splits = (
        (y_train, model.predict(X_train)),
        (y_test, model.predict(X_test)),
    )

    mse_scores = [mean_squared_error(target, predicted) for target, predicted in splits]
    w_acc_scores = [weighted_accuracy(target, predicted) for target, predicted in splits]

    return (*mse_scores, *w_acc_scores)


def run_multi_model_cv(X, y, models, n_splits=5):
    """Cross-validate several models on the same K-fold splits and report the best one.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target values, selected with the same positional indices as ``X``.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is refitted in place on every fold.
    n_splits : int, default 5
        Number of folds passed to ``KFold`` (no shuffling is requested).

    Returns
    -------
    dict
        ``{name: {'mse_train': [...], 'mse_test': [...], 'w_acc_train': [...],
        'w_acc_test': [...]}}`` with one entry per fold in each list.

    The model with the highest mean test weighted accuracy is printed together
    with its minimum and maximum fold score. If ``models`` is empty, or every
    model's mean score is NaN, a short notice is printed instead.
    """
    fold = KFold(n_splits=n_splits)
    metric_names = ('mse_train', 'mse_test', 'w_acc_train', 'w_acc_test')
    results = {name: {metric: [] for metric in metric_names} for name in models}

    for train_index, test_index in fold.split(X, y):
        X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_test = y.iloc[train_index].copy(), y.iloc[test_index].copy()

        for name, model in models.items():
            fold_scores = train_and_evaluate(X_train, X_test, y_train, y_test, model)
            # train_and_evaluate returns its scores in the order of metric_names
            for metric, value in zip(metric_names, fold_scores):
                results[name][metric].append(value)

    # Mean test weighted accuracy per model; NaN means cannot be ranked and are skipped
    mean_w_acc = {}
    for name, result in results.items():
        w_acc_test_scores = result['w_acc_test']
        mean_score = sum(w_acc_test_scores) / len(w_acc_test_scores)
        if not np.isnan(mean_score):
            mean_w_acc[name] = mean_score

    if not mean_w_acc:
        # Previously this case crashed when formatting None with ':.4f'
        print("No model produced a valid w_acc test score.")
        return results

    # max() keeps the first model on ties, like the original strict '>' comparison
    best_model = max(mean_w_acc, key=mean_w_acc.get)
    best_mean_w_acc = mean_w_acc[best_model]
    best_min_w_acc = min(results[best_model]['w_acc_test'])
    best_max_w_acc = max(results[best_model]['w_acc_test'])

    # Print the best mean w_acc test score, min, max, and the associated model
    print(f"Best mean w_acc test score: {best_mean_w_acc:.4f} by model: {best_model}")
    print(f"Min w_acc test score: {best_min_w_acc:.4f}, Max w_acc test score: {best_max_w_acc:.4f}")
    return results


In [None]:
# Step 1: Run cross-validation
# random_state is fixed so the baseline scores are reproducible from run to run
results = run_multi_model_cv(X, y, {"RandomForestRegressor": RandomForestRegressor(n_jobs=-1, random_state=42)})

In [None]:
# Step 2: Plot the per-fold scores of the random-forest baseline
rf_scores = results["RandomForestRegressor"]
plot_results(
    rf_scores["mse_train"],
    rf_scores["mse_test"],
    rf_scores["w_acc_train"],
    rf_scores["w_acc_test"],
)

In [None]:
# Baseline comparison. The second entry was labelled 'Decision Tree Regressor'
# although the estimator is a random forest; the label now matches the model.
# random_state is fixed so the forest's scores are reproducible.
models = {
    'Ridge': Ridge(),
    'Random Forest Regressor': RandomForestRegressor(n_jobs=-1, random_state=42)
    }

In [None]:
# Run cross-validation for the regression models defined in `models`
results = run_multi_model_cv(X, y, models)

In [None]:
# Plot mean MSE and weighted accuracy (train vs. test) for each regression model
plot_multi_model_results(results)

#### Properly handling the weighted_accuracy objective
Should we create different classes? Use a custom loss?

Create, compare and optimize different models.

#### Maximisation of the weighted_accuracy

##### Linear models

We try linear models because the distribution of Y seems to be Gaussian.

In [None]:
# Linear baselines, each built with scikit-learn's default hyperparameters
models = {
    'OLS': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge()
    }

In [None]:
# Run cross-validation for the three linear models defined in `models`
results = run_multi_model_cv(X, y, models)

We run a grid search over the `alpha` parameter of Ridge.

In [None]:
# Candidate regularisation strengths for Ridge
alphas = [100, 500, 750, 1000, 1250]

# One Ridge estimator per candidate, keyed by a name that embeds its alpha
ridge_models = {f"ridge_alpha{alpha}": Ridge(alpha=alpha) for alpha in alphas}

# Show the models that will be evaluated
for model_name, estimator in ridge_models.items():
    print(model_name, ":", estimator)

In [None]:
# Cross-validate every Ridge candidate; the best mean test w_acc is printed
ridge_results = run_multi_model_cv(X, y, ridge_models)

In [None]:
# Plot the per-fold scores of the alpha=750 Ridge model
ridge_750_scores = ridge_results["ridge_alpha750"]
plot_results(
    ridge_750_scores["mse_train"],
    ridge_750_scores["mse_test"],
    ridge_750_scores["w_acc_train"],
    ridge_750_scores["w_acc_test"],
)

We run a grid search over the `alpha` parameter of Lasso.

In [None]:
# Candidate regularisation strengths for Lasso
alphas = [0.001, 0.005, 0.01, 0.1, 1]

# One Lasso estimator per candidate, keyed by a name that embeds its alpha
lasso_models = {f"lasso_alpha{alpha}": Lasso(alpha=alpha) for alpha in alphas}

# Show the models that will be evaluated
for model_name, estimator in lasso_models.items():
    print(model_name, ":", estimator)

In [None]:
# Cross-validate every Lasso candidate; the best mean test w_acc is printed
lasso_results = run_multi_model_cv(X, y, lasso_models)

Our best model is Ridge followed by the Lasso.

We try Gauss-Lasso (ordinary least squares refitted on the features selected by Lasso) and Ridge-Lasso (Ridge refitted on those same features).

In [None]:
def GaussAndRidge_Lasso(X, y, alpha, count, alphas_ridge, n_splits=5):
    """Select features with Lasso across K folds, then refit OLS and Ridge on them.

    A Lasso with penalty ``alpha`` is fitted on each training fold. A feature
    is kept when its coefficient is non-zero in at least ``count`` folds. The
    reduced dataset is then cross-validated with a plain linear regression
    ("Gauss-Lasso") and with one Ridge per value in ``alphas_ridge``
    ("Ridge-Lasso").

    NOTE: the feature subset is chosen using all rows of ``X``, and the
    reduced models are then cross-validated on those same rows, so their
    reported scores may be optimistic.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target values.
    alpha : float
        Lasso penalty used for the selection step.
    count : int
        Minimum number of folds in which a feature must be selected.
    alphas_ridge : iterable of float
        Ridge penalties evaluated on the reduced dataset.
    n_splits : int, default 5
        Number of folds for the selection step and for the follow-up
        cross-validation (previously the latter always used its own default).

    Returns
    -------
    tuple
        ``(results, ridge_results, X_reduced, selected_features)``: the
        Gauss-Lasso CV results, the Ridge-Lasso CV results, the reduced
        feature frame and the selected column labels.

    Raises
    ------
    ValueError
        If no feature is selected in at least ``count`` folds.
    """
    fold = KFold(n_splits=n_splits)
    selection_counts = np.zeros(X.shape[1])
    w_acc_test = []

    for train_index, test_index in fold.split(X, y):
        X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_test = y.iloc[train_index].copy(), y.iloc[test_index].copy()

        # Fit the Lasso on this fold and score it on the held-out part
        lasso = Lasso(alpha=alpha)
        lasso.fit(X_train, y_train)
        w_acc_test.append(weighted_accuracy(y_test, lasso.predict(X_test)))

        # A feature counts as selected in this fold when its coefficient is non-zero
        selection_counts = selection_counts + (lasso.coef_ != 0)

    mean_w_acc_test = sum(w_acc_test) / len(w_acc_test)  # Calculate mean w_acc score
    min_w_acc_test = min(w_acc_test)  # Minimum w_acc score
    max_w_acc_test = max(w_acc_test)  # Maximum w_acc score

    # Print the mean w_acc test score, min and max of the lasso model
    print(f"Lasso (alpha={alpha}):")
    print(f"Mean w_acc test score: {mean_w_acc_test:.4f}")
    print(f"Min w_acc test score: {min_w_acc_test:.4f}, Max w_acc test score: {max_w_acc_test:.4f}")
    print("----------------------------------------------------------")

    # Keep the columns selected in at least `count` folds
    selected_features = X.columns[selection_counts >= count]
    if len(selected_features) == 0:
        # Fail early with a clear message instead of an obscure estimator error
        raise ValueError(
            f"No feature was selected by Lasso (alpha={alpha}) in at least "
            f"{count} of the {n_splits} folds."
        )

    # Reduction of the dataset
    X_reduced = X[selected_features]

    # Training of a classic linear model and ridge models on the reduced dataset
    results = run_multi_model_cv(X_reduced, y, {"Gauss-Lasso": LinearRegression()}, n_splits=n_splits)
    print("----------------------------------------------------------")

    # A distinct loop name keeps the `alpha` parameter (the Lasso penalty) intact
    ridge_models = {
        f"Ridge-Lasso_alpha{ridge_alpha}": Ridge(alpha=ridge_alpha)
        for ridge_alpha in alphas_ridge
    }

    ridge_results = run_multi_model_cv(X_reduced, y, ridge_models, n_splits=n_splits)
    return results, ridge_results, X_reduced, selected_features


In [None]:
best_alpha_lasso = 0.01 # alpha of our best lasso model
# Minimum number of folds in which Lasso must keep a feature; with the default
# n_splits=5 of GaussAndRidge_Lasso, 5 means "selected in every fold".
best_count = 5 # We tested all the possible values by hand

# Ridge alphas evaluated on the Lasso-reduced dataset
alphas_ridge = [1200, 1225, 1250, 1275, 1300]

# Select features, then cross-validate Gauss-Lasso and the Ridge-Lasso candidates
results, ridge_results, X_reduced, selected_features = GaussAndRidge_Lasso(X, y, alpha=best_alpha_lasso, count=best_count, alphas_ridge=alphas_ridge)

In [None]:
# Ridge alpha reused by the later Ridge-Lasso cells
# (presumably the best-scoring value of the search above; verify against its printed output)
best_alpha_ridge_lasso = 1250

In [None]:
# Plot the per-fold scores of the Gauss-Lasso model
gauss_lasso_scores = results["Gauss-Lasso"]
plot_results(
    gauss_lasso_scores["mse_train"],
    gauss_lasso_scores["mse_test"],
    gauss_lasso_scores["w_acc_train"],
    gauss_lasso_scores["w_acc_test"],
)

In [None]:
# Plot the per-fold scores of the alpha=1250 Ridge-Lasso model
ridge_lasso_scores = ridge_results["Ridge-Lasso_alpha1250"]
plot_results(
    ridge_lasso_scores["mse_train"],
    ridge_lasso_scores["mse_test"],
    ridge_lasso_scores["w_acc_train"],
    ridge_lasso_scores["w_acc_test"],
)

Our best model is Ridge-Lasso followed by Gauss-Lasso.

##### Boosting

We run a grid search over the hyperparameters of XGBoost.

In [None]:
# Search grid: tree depth, learning rate and number of boosting rounds
# NOTE(review): 1.05 lies above 1 - confirm that learning rates greater than 1 are intended for XGBoost
max_depths = [8, 9, 10]
learning_rates = [0.95, 1, 1.05]
n_estimators = [33, 35, 37]

# One regressor per grid point, keyed by a name that spells out its settings
xgboost_models = {
    f"Xgboost_depth={depth}_lr={lr}_n_estimators={n_est}": XGBRegressor(
        max_depth=depth, learning_rate=lr, n_estimators=n_est
    )
    for depth in max_depths
    for lr in learning_rates
    for n_est in n_estimators
}

print("Number of models:", len(xgboost_models))

In [None]:
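# Run cross-validation for the XGBoost grid on the full feature set.
# The call is left commented out; uncomment the next line to re-run the search.
# xgboost_results = run_multi_model_cv(X, y, xgboost_models)

# Output recorded from an earlier run (not reproduced when this cell is executed):
# Best mean w_acc test score: 0.5272 by model: Xgboost_depth=9_lr=1_n_estimators=35
# Min w_acc test score: 0.5073, Max w_acc test score: 0.5517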

We repeat the search on the dataset reduced by the Lasso feature selection.

In [None]:
# Search grid for the Lasso-reduced dataset
# NOTE(review): every learning rate here is around 2 - confirm that values greater than 1 are intended for XGBoost
max_depths = [7, 8, 9]
learning_rates = [1.95, 2, 2.05]
n_estimators = [105, 110, 115]

# One regressor per grid point, keyed by a name that spells out its settings
xgboost_models = {
    f"Xgboost_depth={depth}_lr={lr}_n_estimators={n_est}": XGBRegressor(
        max_depth=depth, learning_rate=lr, n_estimators=n_est
    )
    for depth in max_depths
    for lr in learning_rates
    for n_est in n_estimators
}

print("Number of models:", len(xgboost_models))

In [None]:
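# Run cross-validation for the XGBoost grid on the Lasso-reduced feature set.
# The call is left commented out; uncomment the next line to re-run the search.
# xgboost_results = run_multi_model_cv(X_reduced, y, xgboost_models)

# Output recorded from an earlier run (not reproduced when this cell is executed):
# Best mean w_acc test score: 0.5289 by model: Xgboost_depth=8_lr=2_n_estimators=110
# Min w_acc test score: 0.4993, Max w_acc test score: 0.5500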

##### Stacking (Stacked Generalization)

In [None]:
def evaluate_model(X, y, model, n_splits=5):
    """Cross-validate a single model and summarise its test weighted accuracy.

    ``model`` is refitted in place on each training fold produced by
    ``KFold(n_splits=n_splits)`` and scored on the matching held-out fold.
    The mean, minimum and maximum fold scores are printed and returned.

    Returns
    -------
    tuple
        ``(mean_w_acc_test, min_w_acc_test, max_w_acc_test)``.
    """
    splitter = KFold(n_splits=n_splits)
    fold_scores = []

    for fit_idx, holdout_idx in splitter.split(X, y):
        X_fit, X_holdout = X.iloc[fit_idx].copy(), X.iloc[holdout_idx].copy()
        y_fit, y_holdout = y.iloc[fit_idx].copy(), y.iloc[holdout_idx].copy()

        model.fit(X_fit, y_fit)
        holdout_pred = model.predict(X_holdout)
        fold_scores.append(weighted_accuracy(y_holdout, holdout_pred))

    average = sum(fold_scores) / len(fold_scores)
    lowest, highest = min(fold_scores), max(fold_scores)

    # Report the summary of the held-out scores
    print("model:")
    print(f"Mean w_acc test score: {average:.4f}")
    print(f"Min w_acc test score: {lowest:.4f}, Max w_acc test score: {highest:.4f}")

    return average, lowest, highest

Our best models:

In [None]:
# Re-evaluate the three retained models on the Lasso-reduced features,
# printing a separator line between their reports
separator = "----------------------------------------------------------"

gauss_lasso_results = run_multi_model_cv(X_reduced, y, {"Gauss-Lasso": LinearRegression()})
print(separator)
ridge_lasso_results = run_multi_model_cv(
    X_reduced, y, {"Ridge-Lasso": Ridge(alpha=best_alpha_ridge_lasso)}
)
print(separator)
xgboost_results = run_multi_model_cv(
    X_reduced, y, {"Xgboost": XGBRegressor(max_depth=8, learning_rate=2, n_estimators=110)}
)


We try stacking on our best models.

In [None]:
# Base models: the Ridge-Lasso model and the XGBoost configuration used in the previous cell
base_models = [
    ('Ridge-Lasso', Ridge(alpha=best_alpha_ridge_lasso)),
    ('Xgboost', XGBRegressor(max_depth=8, learning_rate=2, n_estimators=110))
]

# Meta-learner: a default Ridge combines the base models' predictions
stacking = StackingRegressor(
    estimators=base_models,
    final_estimator=Ridge(),
    cv=5  # cross-validation to generate the meta-features
)

In [None]:
# Cross-validate the stacked ensemble on the Lasso-reduced features
mean_w_acc_test, min_w_acc_test, max_w_acc_test = evaluate_model(X_reduced, y, stacking)

Finally, our best model is the Ridge-Lasso with an average weighted accuracy of 0.5299.

### Submission:

In [None]:
# Reload the raw CSVs and split the training target from the features.
# `pop` removes the target column from data_train in place, so X_train holds features only.
data_train = pd.read_csv('module6_exercise_train.csv', index_col='index')
X_test = pd.read_csv('module6_exercise_test.csv', index_col='index')
y_train = data_train.pop('end_of_day_return')
X_train = data_train.copy()

In [None]:
# Redo the Lasso feature selection on the reloaded training data, evaluating only the retained Ridge alpha.
# Note: this overwrites `results`, `ridge_results` and `selected_features` from the earlier cells.
alpha_ridge = [best_alpha_ridge_lasso]
results, ridge_results, X_train_reduced, selected_features = GaussAndRidge_Lasso(X_train, y_train, alpha=best_alpha_lasso, count=best_count, alphas_ridge=alpha_ridge)

In [None]:
# Reduce the test dataset to the features selected on the training data
X_test_reduced = X_test[selected_features]

# Compare against X_train_reduced (the frame the final model is fitted on)
# rather than X_reduced, which belongs to the earlier exploration cells.
print(X_train_reduced.columns)
print(X_test_reduced.columns)

In [None]:
# Cross-validate the final Ridge on the reduced training features;
# the same instance is refitted on all training rows in the next cell.
ridge_lasso = Ridge(alpha=best_alpha_ridge_lasso)
mean_w_acc_test, min_w_acc_test, max_w_acc_test = evaluate_model(X_train_reduced, y_train, ridge_lasso)

In [None]:
# Fit on every training row, then predict the reduced test features
# (fit returns the estimator itself, so the two steps can be chained)
y_pred_test = ridge_lasso.fit(X_train_reduced, y_train).predict(X_test_reduced)

In [None]:

# Pair each test row's index with its predicted return and write the CSV
submission_columns = {
    'index': X_test.index,
    'end_of_day_return': y_pred_test,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv('submission.csv', sep=',', index=False)