In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import geopandas as gpd

## Data loading

In [None]:
# Load the raw accident table. The "Unnamed: 0" column is dropped right away
# (presumably an index column written out when the CSV was saved - TODO confirm).
accidents_csv = "../data/Dylan.csv"
df_lucie = pd.read_csv(accidents_csv).drop(columns="Unnamed: 0")

In [None]:
# Load the commune reference table (semicolon-separated CSV).
communes_csv = "../data/correspondance-code-insee-code-postal.csv"
df_comm = pd.read_csv(communes_csv, sep=";")

In [None]:
# Source column -> short name used in the rest of the notebook.
# NOTE(review): X_CENTROID is renamed to "lat" and Y_CENTROID to "long"; an X
# coordinate usually runs east-west (longitude) - confirm the mapping is intended.
COMMUNE_COLUMNS = {
    "POPULATION": "population",
    "INSEE_COM": "code_commune",
    "X_CENTROID": "lat",
    "Y_CENTROID": "long",
    "SUPERFICIE": "area",
}

# Keep only the mapped columns, rename them, and remove fully identical rows.
df_comm = df_comm[list(COMMUNE_COLUMNS)].rename(columns=COMMUNE_COLUMNS).drop_duplicates()

In [None]:
# Collapse the accident table to one row per commune code ("com").
# count() gives the number of non-null values per column within each group;
# the count of "Num_Acc" is kept as the number of accidents.
accident_counts = df_lucie.groupby("com").count().reset_index()
df_lucie = accident_counts.rename(columns={"com": "code_commune", "Num_Acc": "accident_num"}).drop(
    columns="piste_cyclable"
)

In [None]:
# Left join: every commune of the reference table is kept, with its accident
# count when one exists. The key is spelled out because, without `on=`, pandas
# joins on *all* columns the two frames share, so any extra common column would
# silently change the join.
df = df_comm.merge(df_lucie, how="left", on="code_commune")
# Communes with no matching row in the accident counts get NaN from the join,
# i.e. zero recorded accidents.
df["accident_num"] = df["accident_num"].fillna(0)

In [None]:
# Lost data: accidents whose commune code is absent from the commune table,
# and which therefore cannot appear in the merged frame.
unmatched_rows = df_lucie[~df_lucie.code_commune.isin(df_comm.code_commune)]
lost_data_num = unmatched_rows.accident_num.sum()
print(f"Données perdues: {lost_data_num}")

## Data preprocessing

### Filter features

In [None]:
# Columns kept for modelling; "accident_num" is the column later split off as
# the target, the other two are the predictors.
FEATURES = ["population", "area", "accident_num"]

# Restrict the merged frame to those columns and preview the first rows.
df = df[FEATURES]
df.head()

### Train/test split

In [None]:
def stratified_sample(df, test_size, random_state=42):
    """
    Split ``df`` into train and test sets, stratified on binned ``accident_num``.

    The ``accident_num`` column is discretised with ``np.histogram`` (Doane bin
    estimator); a fraction ``test_size`` of the rows of each bin is sampled into
    the test set and all remaining rows form the train set.

    The input frame is not modified: bin ids live in a separate array instead of
    a temporary ``bin`` column written into ``df``.

    :param df: DataFrame with a numeric ``accident_num`` column. Test rows are
        removed from the train set by index label, so labels should be unique.
    :param test_size: Fraction of each bin to sample into the test set.
    :param random_state: Seed passed to ``DataFrame.sample`` for every bin.

    Returns:
        A tuple ``(df_train, df_test)``; both frames have the same columns as ``df``.
    """
    counts, edges = np.histogram(df["accident_num"], bins="doane")
    # np.digitize numbers bins from 1 and sends values equal to the last edge to
    # an extra bin (len(counts) + 1); np.fmin folds them back into the last bin.
    bin_ids = np.fmin(np.digitize(df["accident_num"], edges), len(counts))
    # Group by the external array rather than by a column of df, so the sampled
    # groups never contain a helper column that has to be dropped afterwards.
    df_test = df.groupby(bin_ids, group_keys=False).apply(
        lambda group: group.sample(frac=test_size, random_state=random_state)
    )
    df_train = df.drop(index=df_test.index)
    return df_train, df_test

In [None]:
# Hold out 15% of each accident-count bin as the test set.
df_train, df_test = stratified_sample(df, test_size=0.15)

In [None]:
# Separate predictors (every column except the target) from the target values.
# Index.difference returns the remaining column names in sorted order.
TARGET = "accident_num"

X_train = df_train[df_train.columns.difference([TARGET])]
y_train = df_train[TARGET].values

X_test = df_test[df_test.columns.difference([TARGET])]
y_test = df_test[TARGET].values

### Scaling

In [None]:
# Single scaler instance: fitted on the training predictors in the scaling cell
# below, then reused unchanged to transform the test predictors.
scaler = StandardScaler()

In [None]:
# Predictor columns that are passed through `scaler`.
NUMERIC_FEATURES = ["area", "population"]

In [None]:
# Assign into explicit copies instead of muting pandas' chained-assignment
# warning globally: the option toggle was not restored if scaling raised, and it
# reset the option to "warn" whatever its previous value was.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training predictors only, then apply the same fitted
# statistics to the test predictors.
# NOTE: X_train / X_test are overwritten, so re-running this cell without
# re-running the split cell would refit the scaler on already scaled values.
X_train[NUMERIC_FEATURES] = scaler.fit_transform(X_train[NUMERIC_FEATURES])
X_test[NUMERIC_FEATURES] = scaler.transform(X_test[NUMERIC_FEATURES])

## Model selection

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.linear_model import LinearRegression

In [None]:
def grid_search(
    model: BaseEstimator,
    params: dict[str, list],
    X: pd.DataFrame | np.ndarray,
    y: pd.DataFrame | np.ndarray,
    scoring="neg_root_mean_squared_error",
) -> tuple[GridSearchCV, pd.DataFrame]:
    """
    Run a cross-validated grid search and summarise its results.

    :param model: The estimator whose hyper-parameters are searched.
    :param params: The parameter grid, mapping parameter names to candidate values.
    :param X: The independent variables.
    :param y: The dependent variable.
    :param scoring: The scoring passed on to ``GridSearchCV``.

    Returns:
        A tuple containing the fitted GridSearchCV estimator and a dataframe of
        results sorted by ``rank_test_score``, with the parameters and the
        mean/std of the test and train scores.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=True,
    )
    search.fit(X, y)

    # Columns kept in the summary table.
    summary_columns = ["params", "mean_test_score", "std_test_score", "mean_train_score", "std_train_score"]
    ranked_results = pd.DataFrame(search.cv_results_).sort_values("rank_test_score")
    return search, ranked_results[summary_columns]

In [None]:
# Linear regression baseline: the only searched option is whether to fit an intercept.
linear_param_grid = {"fit_intercept": [True, False]}
linear_model, linear_perf = grid_search(LinearRegression(), linear_param_grid, X_train, y_train)

In [None]:
# Display the grid-search summary table for the linear model.
linear_perf