In [None]:


import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor

import requests


# Remote locations of the module 6 exercise CSV files (training set and test set).
# Both names are notebook-level globals read by the download step.
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/exercise/module6_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/exercise/module6_exercise_test.csv'


def download_file(url, file_name, timeout=30):
    """Download ``url`` with an HTTP GET and write the response body to ``file_name``.

    Args:
        url: Address of the resource to fetch.
        file_name: Path of the local file to create or overwrite with the
            raw response bytes.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host, which would hang the whole notebook run.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving an error page as if it were data.
    response.raise_for_status()
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded {file_name} from {url}")


# Fetch both CSV files into the working directory before reading them.
for source_url, local_csv in (
    (train_data_url, 'module6_exercise_train.csv'),
    (test_data_url, 'module6_exercise_test.csv'),
):
    download_file(source_url, local_csv)

# The "index" column of each file becomes the row index of its DataFrame.
data_train = pd.read_csv('module6_exercise_train.csv', index_col="index")
data_test = pd.read_csv('module6_exercise_test.csv', index_col="index")

print("Shape train:", data_train.shape)
print("Shape test:", data_test.shape)

# Preview: only the first ten column names of each frame are printed.
print("Colonnes train:", data_train.columns[:10].tolist(), "...")
print("Colonnes test:", data_test.columns[:10].tolist(), "...")

# Rich display of the first rows of both frames.
display(data_train.head(), data_test.head())


In [None]:



# Summary statistics of the training frame.
display(data_train.describe())

# Total number of missing cells in each dataset.
print("Valeurs manquantes train:", data_train.isnull().sum().sum())
print("Valeurs manquantes test:", data_test.isnull().sum().sum())

# Histogram (with KDE overlay) of the regression target.
plt.figure(figsize=(10, 6))
sns.histplot(data_train['end_of_day_return'], bins=50, kde=True)
plt.title('Distribution de End-of-Day Return')
plt.xlabel('End-of-Day Return')
plt.ylabel('Frequency')
plt.show()

# Split target and features WITHOUT mutating data_train. Using
# data_train.pop(...) removed the column in place, so running this cell a
# second time failed with a KeyError on the histogram above. Selecting and
# dropping keeps the cell idempotent; X is still an independent copy.
y = data_train['end_of_day_return'].copy()
X = data_train.drop(columns='end_of_day_return')



def weighted_accuracy(y_true, y_pred):
    """Return the sign accuracy of ``y_pred`` weighted by ``|y_true|``.

    Each observation counts as correct when ``np.sign`` of the prediction
    equals ``np.sign`` of the true value, and contributes its absolute true
    value as weight. The result is the correct weight divided by the total
    weight.

    Inputs are compared positionally: they are converted to flat float arrays
    first, so pandas indexes are ignored and an ``(n, 1)`` prediction array is
    treated like an ``(n,)`` one instead of being broadcast against ``y_true``.

    Args:
        y_true: Array-like of true values.
        y_pred: Array-like of predicted values, same number of elements.

    Returns:
        The weighted accuracy as a Python float, or NaN when the total weight
        is zero (empty input or all true values equal to zero), because the
        ratio is undefined in that case.

    Raises:
        ValueError: If the two inputs do not hold the same number of elements.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    weights = np.abs(y_true)
    total_weight = np.sum(weights)
    if total_weight == 0:
        # Explicit NaN instead of a 0/0 division that emits a RuntimeWarning.
        return float("nan")

    correct_predictions = np.sign(y_true) == np.sign(y_pred)
    return float(np.sum(weights * correct_predictions) / total_weight)



def plot_results(mse_train, mse_test, w_acc_train, w_acc_test):
    """Plot per-fold train/test curves for MSE and weighted accuracy side by side.

    Args:
        mse_train: Sequence of training MSE values, one per fold.
        mse_test: Sequence of test MSE values, one per fold.
        w_acc_train: Sequence of training weighted-accuracy values, one per fold.
        w_acc_test: Sequence of test weighted-accuracy values, one per fold.
    """
    # One row per panel:
    # (subplot position, train values, test values, train label, test label, title, y-axis label)
    panels = (
        (1, mse_train, mse_test,
         "Train MSE", "Test MSE", "MSE over Folds", "MSE"),
        (2, w_acc_train, w_acc_test,
         "Train weighted_accuracy", "Test weighted_accuracy",
         "Weighted Accuracy over Folds", "Weighted Accuracy"),
    )

    plt.figure(figsize=(12, 6))
    for position, train_values, test_values, train_label, test_label, heading, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(train_values, label=train_label, marker='o')
        plt.plot(test_values, label=test_label, marker='o')
        plt.title(heading)
        plt.xlabel("Fold")
        plt.ylabel(y_label)
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()


def plot_multi_model_results(results):
    """Draw grouped bar charts of the mean train/test scores of several models.

    The upper chart compares mean MSE, the lower one mean weighted accuracy.

    Args:
        results: Mapping of model name to a dict holding per-fold score lists
            under the keys 'mse_train', 'mse_test', 'w_acc_train' and
            'w_acc_test'.
    """
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 20))

    bar_width = 0.35
    positions = np.arange(len(results))

    # One row per chart: (axis, title, y-axis label, train score key, test score key)
    panels = (
        (ax1, 'Comparaison MSE', 'MSE', 'mse_train', 'mse_test'),
        (ax2, 'Weighted Accuracy Comparaison', 'Weighted Accuracy',
         'w_acc_train', 'w_acc_test'),
    )

    for axis, heading, y_label, train_key, test_key in panels:
        axis.set_title(heading, fontsize=16)
        axis.set_ylabel(y_label)
        axis.grid(True, alpha=0.7)

        for rank, scores in enumerate(results.values()):
            # Only the first pair of bars is labelled so the legend shows a
            # single "Train" and a single "Test" entry.
            train_label, test_label = ("Train", "Test") if rank == 0 else ("", "")
            axis.bar(positions[rank] - bar_width / 2, np.mean(scores[train_key]), bar_width,
                     color='pink', alpha=0.7, label=train_label)
            axis.bar(positions[rank] + bar_width / 2, np.mean(scores[test_key]), bar_width,
                     color='blue', alpha=0.7, label=test_label)

        axis.set_xticks(positions)
        axis.set_xticklabels(results.keys(), rotation=45, ha='right')
        axis.legend(loc='upper left')

    plt.tight_layout()
    plt.show()


In [None]:


from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error


def train_and_evaluate(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training split and score it on both splits.

    Args:
        X_train: Training features.
        X_test: Held-out features.
        y_train: Training target.
        y_test: Held-out target.
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns:
        Tuple ``(mse_train, mse_test, w_acc_train, w_acc_test)``.
    """
    model.fit(X_train, y_train)

    # Score the training split first, then the held-out split.
    split_scores = []
    for features, target in ((X_train, y_train), (X_test, y_test)):
        predictions = model.predict(features)
        split_scores.append((
            mean_squared_error(target, predictions),
            weighted_accuracy(target, predictions),
        ))

    (mse_train, w_acc_train), (mse_test, w_acc_test) = split_scores
    return mse_train, mse_test, w_acc_train, w_acc_test



def run_multi_model_cv(X, y, models, n_splits=5):
    """Cross-validate several models on the same shuffled K-fold splits.

    Every model is fitted and scored on every fold, the model with the highest
    mean test weighted accuracy is printed, and all per-fold scores are returned.

    Args:
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Target Series aligned positionally with ``X``.
        models: Mapping of model name to estimator.
        n_splits: Number of folds of the shuffled ``KFold`` (seeded with 42).

    Returns:
        Dict mapping each model name to a dict of per-fold score lists under
        'mse_train', 'mse_test', 'w_acc_train' and 'w_acc_test'.
    """
    # Same order as the tuple returned by train_and_evaluate.
    metric_names = ('mse_train', 'mse_test', 'w_acc_train', 'w_acc_test')
    results = {name: {metric: [] for metric in metric_names} for name in models}

    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for train_index, test_index in splitter.split(X, y):
        X_train, y_train = X.iloc[train_index].copy(), y.iloc[train_index].copy()
        X_test, y_test = X.iloc[test_index].copy(), y.iloc[test_index].copy()

        for name, model in models.items():
            fold_scores = train_and_evaluate(X_train, X_test, y_train, y_test, model)
            for metric, value in zip(metric_names, fold_scores):
                results[name][metric].append(value)

    # Keep the first model whose mean test weighted accuracy is strictly the highest.
    best_model, best_mean_w_acc = None, -1
    for name, result in results.items():
        candidate = np.mean(result['w_acc_test'])
        if candidate > best_mean_w_acc:
            best_model, best_mean_w_acc = name, candidate

    print(f"meilleur modèle: {best_model} avec mean weighted accuracy {best_mean_w_acc:.4f}")
    return results



# Untuned reference models, keyed by the name shown in the comparison charts.
baseline_models = dict(
    Ridge=Ridge(),
    RandomForestRegressor=RandomForestRegressor(random_state=42, n_jobs=-1),
)

# Cross-validate every baseline on the same folds, then chart the mean scores.
results_baseline = run_multi_model_cv(X, y, baseline_models)
plot_multi_model_results(results_baseline)


In [None]:

import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer

from scipy.stats import randint as sp_randint




# X, y and weighted_accuracy are normally defined by the earlier cells of this
# notebook. The optional data_preprocessing module is only used as a fallback
# when one of them is missing, so it can never silently overwrite the notebook
# state, and a missing prerequisite fails here with a clear message instead of
# a later, unrelated-looking NameError.
_missing_names = [name for name in ("X", "y", "weighted_accuracy") if name not in globals()]
if _missing_names:
    try:
        from data_preprocessing import X, y, weighted_accuracy
    except ImportError as exc:
        raise NameError(
            f"Noms manquants: {_missing_names}. "
            "Assurez-vous que X, y, et weighted_accuracy sont disponibles dans votre environnement."
        ) from exc


# Scorer wrapper so the search maximises the weighted sign accuracy.
w_acc_scorer = make_scorer(weighted_accuracy, greater_is_better=True)


rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)


param_distributions = {
    # Use sp_randint distributions or short candidate lists
    'n_estimators': [200, 500, 800],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}


cv_fold = KFold(n_splits=3, shuffle=True, random_state=42)


# NOTE: despite its name this is a randomized search (20 sampled candidates),
# not an exhaustive grid search; the variable name is kept for compatibility.
grid_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_distributions,
    n_iter=20,
    scoring=w_acc_scorer,
    cv=cv_fold,
    verbose=2,
    random_state=42,
    n_jobs=-1
)


grid_search.fit(X, y)


print("\n--- Résultats de la Recherche Aléatoire (Random Forest Optimisé) ---")
print(f"Meilleure Weighted Accuracy CV: {grid_search.best_score_:.4f}")
print(f"Meilleurs Hyperparamètres: {grid_search.best_params_}")


best_rf_model = grid_search.best_estimator_
# Legacy alias: the tuned estimator is a Random Forest, not a LightGBM model.
# The name is kept only so code that still refers to it keeps working.
best_lgbm_model = best_rf_model

print(f"\nLe modèle optimisé (Random Forest) a atteint une Weighted Accuracy moyenne de {grid_search.best_score_:.4f} sur la Cross-Validation.")

In [None]:


from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.metrics import make_scorer



# Check the prerequisites up front instead of wrapping the whole cell in
# `except NameError`: the blanket handler reported ANY NameError raised below
# (for instance a typo) as "X, y not defined" and discarded the traceback.
# The print-and-skip behaviour for genuinely missing inputs is preserved.
_missing_names = [name for name in ("X", "y", "weighted_accuracy") if name not in globals()]
if _missing_names:
    print("Erreur: Assurez-vous que X, y, et weighted_accuracy sont définis.")
else:
    # Fixed Random Forest settings evaluated by this stability check.
    best_params = {
        'n_estimators': 200,
        'max_depth': 10,
        'min_samples_split': 5,
        'min_samples_leaf': 1,
        'max_features': 'log2',
        'random_state': 42,
        'n_jobs': -1
    }

    # Imputation and scaling live inside the pipeline so they are re-fitted on
    # the training part of every fold.
    stability_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("model", RandomForestRegressor(**best_params))
    ])

    w_acc_scorer = make_scorer(weighted_accuracy, greater_is_better=True)

    # One weighted-accuracy score per fold of a 5-fold cross-validation.
    cv_scores = cross_val_score(
        estimator=stability_pipeline,
        X=X,
        y=y,
        scoring=w_acc_scorer,
        cv=5,
        n_jobs=-1
    )

    mean_score = np.mean(cv_scores)
    std_dev = np.std(cv_scores)

    print(f"Scores par fold (Weighted Accuracy): {cv_scores}")
    print(f"Score Moyen CV: {mean_score:.4f}")
    print(f"Écart-type des scores: {std_dev:.4f}")

    # Verdict on the fold-to-fold spread: below 0.01, below 0.03, or larger.
    if std_dev < 0.01:
        print("Très bien")
    elif std_dev < 0.03:
        print("Ca va ")
    else:
        print("TERRIBLE")

In [None]:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


# Random Forest settings of the model used for the submission.
final_rf_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='log2',
    random_state=42,
    n_jobs=-1,
)

# Mean imputation and scaling are fitted together with the model on the full
# training data.
final_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ("model", RandomForestRegressor(**final_rf_params)),
])
final_pipeline.fit(X, y)

# The test features are read from the CSV file already present on disk.
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module6/exercise/module6_exercise_test.csv'
X_test = pd.read_csv('module6_exercise_test.csv', index_col='index')
y_pred_test = final_pipeline.predict(X_test)

# Submission file: one row per test index with its predicted return.
submission = pd.DataFrame({'index': X_test.index, 'end_of_day_return': y_pred_test})
submission.to_csv('submission_module6.csv', index=False)


