# Machine Learning

In [None]:
# Data handling
import pandas as pd

# ML libraries
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler

# Dataviz libraries
import plotly.express as px

In [3]:
# Read the raw Walmart weekly-sales CSV (path is relative to the notebook) and preview its first rows.
dataset = pd.read_csv(filepath_or_buffer="../data/Walmart_Store_sales.csv")
display(dataset.head(n=5))

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092


### Preprocessing

In [None]:
# Rows without a target value cannot be used for supervised learning.
# `subset` is passed as a list (also accepted by older pandas releases, whereas a
# bare string is only supported by recent ones) and the result is re-assigned
# instead of mutating the frame in place.
dataset = dataset.dropna(subset=["Weekly_Sales"])

drop_outliers_columns_list = [
    "Temperature",
    "CPI",
    "Fuel_Price",
    "Unemployment"
]

# Drop rows lying more than 3 standard deviations away from the column mean.
# The statistics are recomputed for each column on the rows that survived the
# filters of the previous columns. NaN values compare False on both sides, so
# rows with a missing value in the column are kept.
for column in drop_outliers_columns_list :

    column_mean = dataset[column].mean()
    column_std = dataset[column].std()

    high_range = column_mean + 3 * column_std
    low_range = column_mean - 3 * column_std

    outlier_condition = (dataset[column] > high_range) | (dataset[column] < low_range)

    dataset = dataset.loc[~outlier_condition]

# Work on an explicit copy: at this point `dataset` is a filtered selection of the
# loaded frame, and adding columns to it directly triggers SettingWithCopyWarning.
dataset = dataset.copy()

# Dates are written day-first (e.g. "18-02-2011"). Missing dates become NaT, so
# the derived calendar features are NaN for those rows.
dataset["Date"] = pd.to_datetime(dataset["Date"], dayfirst=True)
dataset["Year"] = dataset["Date"].dt.year
dataset["Month"] = dataset["Date"].dt.month
dataset["Day"] = dataset["Date"].dt.day
dataset = dataset.drop(columns="Date")

## Baseline model : multivariate linear regression

Separating features from target.

In [21]:
# Split the frame column-wise: the target goes to `y`, every other column to `X`.
target_variable = "Weekly_Sales"

y = dataset[target_variable]
X = dataset.drop(columns=[target_variable])

### Train-test split

In [22]:
# Hold out 20 % of the rows for testing; the fixed seed keeps the split reproducible.
X_train_unproc, X_test_unproc, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Preprocessing : impute, encode and standardize values

In [23]:
# Each column is routed to one of two sub-pipelines according to its type.
categorical_features = ["Store", "Holiday_Flag"]
numerical_features = ["Temperature", "Fuel_Price", "CPI", "Unemployment", "Year", "Month", "Day"]

# Numerical columns: fill gaps with the column mean, then standardize.
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode, dropping the first level of each feature.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first")),
])

preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
])

# Fit on the training rows only, then apply the fitted transformation to the test rows.
X_train = preprocessor.fit_transform(X_train_unproc)
X_test = preprocessor.transform(X_test_unproc)

### Train model

In [24]:
# Baseline: ordinary least-squares regression fitted on the preprocessed training set.
linear_regression = LinearRegression()
linear_regression.fit(X=X_train, y=y_train)

### Performance assessment

In [25]:
# Rows collected here are turned into the model-comparison table further down.
data_comparaison = []

# `score` returns the coefficient of determination (R2). Each value is computed
# once so that the printed and the recorded numbers cannot diverge.
train_score = linear_regression.score(X_train, y_train)
test_score = linear_regression.score(X_test, y_test)

print("R2 score on training set : ", train_score)
print("R2 score on test set : ", test_score)
print()

# With no `scoring` argument, cross_val_score relies on the estimator's own
# `score` method, i.e. R2 for a regressor (not an accuracy).
scores = cross_val_score(linear_regression, X_train, y_train, cv=10)
print("The cross-validated R2 score is : ", scores.mean())
print("The standard deviation is : ", scores.std())

data_comparaison.append([
    "Linear",
    "",
    train_score,
    test_score,
    scores.mean(),
    scores.std()
])

R2 score on training set :  0.9731935046590678
R2 score on test set :  0.9344654257543307

The cross-validated accuracy is :  0.9405178371914926
The standard deviation is :  0.04255055442642696


The R2 scores on the training and test sets are fairly high (0.973 and 0.934), but the model seems to overfit a bit. Our 10-fold cross-validation shows that the mean R2 score is 0.940 and can vary by $\pm 0.0425$ simply by changing a few examples between the training and test sets. We'll try to reduce overfitting through regularization.

### Features importance

In [26]:
def create_features_importance_df_fig(preprocessor, model_coef) :
    """Build a feature-importance table and bar chart from linear-model coefficients.

    Parameters
    ----------
    preprocessor : fitted ColumnTransformer
        Its ``transformers_`` entries are walked in order to recover the names of
        the preprocessed columns. The transformer named ``'num'`` is taken to keep
        its input column names; any other estimator is asked for its names through
        ``get_feature_names_out()``.
    model_coef : numpy.ndarray
        Coefficients of the fitted model, one per preprocessed column.

    Returns
    -------
    df : pandas.DataFrame
        Columns ``features`` and ``coefficients`` (absolute values), sorted by
        increasing coefficient.
    fig : plotly figure
        Horizontal bar chart of ``df``.
    """
    column_names = []

    for name, step, features_list in preprocessor.transformers_:
        if isinstance(step, str):
            # String placeholders ('drop' / 'passthrough', e.g. the remainder entry)
            # have no get_feature_names_out(): dropped columns produce no output,
            # passed-through columns keep their identifier.
            if step == "passthrough":
                column_names.extend(str(feature) for feature in features_list)
            continue

        if name == 'num':
            features = features_list
        else :
            features = step.get_feature_names_out()

        column_names.extend(features)

    df = pd.DataFrame(index=column_names, data=model_coef.transpose()).reset_index()
    df.columns = ["features", "coefficients"]
    # Importance is the magnitude of the coefficient, whatever its sign.
    df["coefficients"] = df["coefficients"].abs()
    df = df.sort_values(by="coefficients")

    fig = px.bar(
        df,
        x="coefficients",
        y="features"
    )

    fig.update_layout(
        showlegend = False,
        margin = {'l': 120},
        width=1000,
        height=700
    )

    return df, fig

In [27]:
# Plot the absolute coefficients of the baseline linear regression.
_, fig = create_features_importance_df_fig(
    preprocessor=preprocessor,
    model_coef=linear_regression.coef_,
)
fig.show()

Our baseline tells us that the store is the most important feature for predicting weekly sales. Besides the store, unemployment and CPI seem to have some importance too.

## Hyperparameter tuning

In [28]:
# Ridge (L2) regression: the regularization strength is chosen by a 10-fold
# cross-validated grid search.
# NOTE(review): the grid jumps from 0.05 to 0.6 — possibly 0.06 / 0.07 were intended; confirm.
ridge = Ridge()

params = dict(alpha=[0.01, 0.02, 0.03, 0.04, 0.05, 0.6, 0.7])

gridsearch_ridge = GridSearchCV(estimator=ridge, param_grid=params, cv=10)
gridsearch_ridge.fit(X_train, y_train)

# One row for the comparison table: best hyperparameters, train / test scores of
# the search object, then mean and std of the CV score of the best candidate.
data_comparaison += [[
    "Ridge",
    gridsearch_ridge.best_params_,
    gridsearch_ridge.score(X_train, y_train),
    gridsearch_ridge.score(X_test, y_test),
    gridsearch_ridge.cv_results_["mean_test_score"][gridsearch_ridge.best_index_],
    gridsearch_ridge.cv_results_["std_test_score"][gridsearch_ridge.best_index_],
]]

# Plot the absolute coefficients of the best Ridge model.
_, fig = create_features_importance_df_fig(
    preprocessor,
    gridsearch_ridge.best_estimator_.coef_,
)
fig.show()

In [29]:
# Lasso (L1) regression: the regularization strength is chosen by a 10-fold
# cross-validated grid search over alpha = 100, 125, ..., 975.
lasso = Lasso()

params = {"alpha": list(range(100, 1000, 25))}

gridsearch_lasso = GridSearchCV(estimator=lasso, param_grid=params, cv=10)
gridsearch_lasso.fit(X_train, y_train)

# One row for the comparison table: best hyperparameters, train / test scores of
# the search object, then mean and std of the CV score of the best candidate.
data_comparaison += [[
    "Lasso",
    gridsearch_lasso.best_params_,
    gridsearch_lasso.score(X_train, y_train),
    gridsearch_lasso.score(X_test, y_test),
    gridsearch_lasso.cv_results_["mean_test_score"][gridsearch_lasso.best_index_],
    gridsearch_lasso.cv_results_["std_test_score"][gridsearch_lasso.best_index_],
]]

# Plot the absolute coefficients of the best Lasso model.
_, fig = create_features_importance_df_fig(
    preprocessor,
    gridsearch_lasso.best_estimator_.coef_,
)
fig.show()

In [30]:
# Assemble the collected rows into a table; the bare last expression displays it.
score_comparaison = pd.DataFrame(
    data_comparaison,
    columns=[
        "model",
        "best_hyperparameters",
        "train_score",
        "test_score",
        "cv_mean_score",
        "cv_mean_std",
    ],
)
score_comparaison

Unnamed: 0,model,best_hyperparameters,train_score,test_score,cv_mean_score,cv_mean_std
0,Linear,,0.973194,0.934465,0.940518,0.042551
1,Ridge,{'alpha': 0.02},0.973124,0.937152,0.940656,0.043713
2,Lasso,{'alpha': 450},0.972741,0.943108,0.94199,0.044039


We can't really see a big difference in performance between these three models, even with regularization. One reason is the size of the dataset itself, which is very small. Another may be the features we have selected: thanks to L1 regularization (Lasso), which shrinks the coefficients of uninformative features to 0, we can easily see which ones matter least to the model. We'll try some feature combinations, starting by dropping the 3 least important ones according to the Lasso regularization: Year, Holiday_Flag and Day.

### Features selection

In [None]:
# Rebuild the train / test matrices on a reduced feature set.
target_variable = "Weekly_Sales"
features_to_drop = ["Year", "Holiday_Flag", "Day", "Temperature"]

y = dataset[target_variable]
X = dataset.drop(columns=[target_variable, *features_to_drop])

# Same split parameters as for the full feature set.
X_train_unproc, X_test_unproc, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

categorical_features = ["Store"]
numerical_features = ["CPI", "Unemployment", "Month", "Fuel_Price"]

# The numerical / categorical sub-pipelines defined in an earlier cell are reused as-is.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
])

# Fit on the training rows only, then apply the fitted transformation to the test rows.
X_train = preprocessor.fit_transform(X_train_unproc)
X_test = preprocessor.transform(X_test_unproc)

After a few tries, it seems that Temperature isn't an important feature either, so we decided to drop it too.

In [32]:
# Refit the unregularized baseline on the reduced feature set.
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

# 10-fold cross-validation scores of the baseline on the training set.
scores = cross_val_score(linear_regression, X_train, y_train, cv=10)

# Start a fresh comparison table for this feature set, with the baseline as its first row.
data_comparaison = [[
    "Linear",
    "",
    linear_regression.score(X_train, y_train),
    linear_regression.score(X_test, y_test),
    scores.mean(),
    scores.std(),
]]

In [33]:
# Ridge (L2) regression on the reduced feature set, tuned by a 10-fold
# cross-validated grid search.
# NOTE(review): the grid jumps from 0.05 to 0.6 — possibly 0.06 / 0.07 were intended; confirm.
ridge = Ridge()

params = dict(alpha=[0.008, 0.009, 0.01, 0.02, 0.03, 0.04, 0.05, 0.6, 0.7])

gridsearch_ridge = GridSearchCV(estimator=ridge, param_grid=params, cv=10)
gridsearch_ridge.fit(X_train, y_train)

# One row for the comparison table: best hyperparameters, train / test scores of
# the search object, then mean and std of the CV score of the best candidate.
data_comparaison += [[
    "Ridge",
    gridsearch_ridge.best_params_,
    gridsearch_ridge.score(X_train, y_train),
    gridsearch_ridge.score(X_test, y_test),
    gridsearch_ridge.cv_results_["mean_test_score"][gridsearch_ridge.best_index_],
    gridsearch_ridge.cv_results_["std_test_score"][gridsearch_ridge.best_index_],
]]

In [34]:
# Lasso (L1) regression on the reduced feature set, tuned by a 10-fold
# cross-validated grid search over alpha = 100, 125, ..., 975.
lasso = Lasso()

params = {"alpha": list(range(100, 1000, 25))}

gridsearch_lasso = GridSearchCV(estimator=lasso, param_grid=params, cv=10)
gridsearch_lasso.fit(X_train, y_train)

# One row for the comparison table: best hyperparameters, train / test scores of
# the search object, then mean and std of the CV score of the best candidate.
data_comparaison += [[
    "Lasso",
    gridsearch_lasso.best_params_,
    gridsearch_lasso.score(X_train, y_train),
    gridsearch_lasso.score(X_test, y_test),
    gridsearch_lasso.cv_results_["mean_test_score"][gridsearch_lasso.best_index_],
    gridsearch_lasso.cv_results_["std_test_score"][gridsearch_lasso.best_index_],
]]

In [35]:
# Assemble the rows collected for the reduced feature set; the bare last expression displays the table.
score_comparaison = pd.DataFrame(
    data_comparaison,
    columns=[
        "model",
        "best_hyperparameters",
        "train_score",
        "test_score",
        "cv_mean_score",
        "cv_mean_std",
    ],
)
score_comparaison

Unnamed: 0,model,best_hyperparameters,train_score,test_score,cv_mean_score,cv_mean_std
0,Linear,,0.969,0.928336,0.944151,0.033592
1,Ridge,{'alpha': 0.02},0.968941,0.929988,0.944275,0.034548
2,Lasso,{'alpha': 150},0.968953,0.930425,0.944446,0.034455


Even if we still overfit a little, we have managed to reduce the standard deviation (by about 0.01) while maintaining a high average score thanks to regularization and feature selection. To go further, we would need more data or to create new insights by cleverly combining our features.