In [23]:
# data imports
import numpy as np
import pandas as pd

# visual imports
import matplotlib.pyplot as plt 
import seaborn as sns

# data processing imports
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# model imports 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# metrics imports
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [24]:
df=pd.read_csv('data/Copy_of_sales-sales.csv')

In [25]:
# df.columns = df.columns.str.strip()
df.head()

Unnamed: 0.1,Unnamed: 0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
0,425390,366,4,2013-04-18,517,1,0,0,0,4422
1,291687,394,6,2015-04-11,694,1,0,0,0,8297
2,411278,807,4,2013-08-29,970,1,1,0,0,9729
3,664714,802,2,2013-05-28,473,1,1,0,0,6513
4,540835,726,4,2013-10-10,1068,1,1,0,0,10882


In [26]:
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df["year"] = df["date"].dt.year

if "store_ID" in df.columns:
    df = df.rename(columns={"store_ID": "store_id"})


In [27]:
y = df["sales"]
X = df.drop(columns=["sales"])


In [28]:
numeric_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = []

# treat store_id as categorical
if "store_id" in numeric_features:
    numeric_features.remove("store_id")
    categorical_features.append("store_id")

In [29]:
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [31]:
model = Pipeline([
    ("preprocess", preprocess),
    ("model", LinearRegression())
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [32]:
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Linear Regression — Store-Year Model")
print("MAE:", round(mae, 2))
print("RMSE:", round(rmse, 2))
print("R2:", round(r2, 4))

Linear Regression — Store-Year Model
MAE: 581.93
RMSE: 841.29
R2: 0.9521


In [33]:
# rf_model = Pipeline([
#     ("preprocess", preprocess),
#     ("model", RandomForestRegressor(
#         n_estimators=3,
#         random_state=42,
#         n_jobs=2
#     ))
# ])

# rf_model.fit(X_train, y_train)
# rf_pred = rf_model.predict(X_test)

In [34]:
# mae = mean_absolute_error(y_test, rf_pred)
# rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
# r2 = r2_score(y_test, rf_pred)

# print("Random Forest")
# print("MAE:", round(mae, 2))
# print("RMSE:", round(rmse, 2))
# print("R2:", round(r2, 4))

In [None]:
from sklearn.feature_selection import RFE

# Run RFE on the preprocessed (array) features and on the final estimator,
# because ColumnTransformer requires DataFrame column names but RFE passes arrays.
preprocessor = model.named_steps["preprocess"]
X_train_trans = preprocessor.transform(X_train)

rfe_score = RFE(estimator=model.named_steps["model"], n_features_to_select=1)
rfe_score.fit(X_train_trans, y_train)
feature_ranking = rfe_score.ranking_

# Build transformed feature names: numeric features + one-hot names for categorical features
ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
try:
    cat_ohe_names = list(ohe.get_feature_names_out(categorical_features))
except AttributeError:
    # fallback for older sklearn versions
    cat_ohe_names = []
    for i, cats in enumerate(ohe.categories_):
        col = categorical_features[i]
        cat_ohe_names.extend([f"{col}_{c}" for c in cats])

feature_names = numeric_features + cat_ohe_names

rfe_df = pd.DataFrame({"feature": feature_names, "ranking": feature_ranking}).sort_values("ranking")
print("RFE ranking (1 = most important):")
print(rfe_df.reset_index(drop=True))

ValueError: Specifying the columns using strings is only supported for dataframes.