In [8]:
import os

os.chdir("../")
os.chdir("./artifacts/features_dataTransformation/")

%pwd

'd:\\MyCase\\Projects\\DSAI\\portfolio\\Sales_Forecasting_and_Analytics\\artifacts\\features_dataTransformation'

In [9]:
import pandas as pd

train_df = pd.read_csv("train_final.csv")
test_df = pd.read_csv("test_final.csv")

In [10]:
train_df["transactions"] = train_df["transactions"].fillna(0)  
test_df["transactions"] = test_df["transactions"].fillna(0)

In [11]:
train_df.columns, test_df.columns

(Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion', 'city',
        'state', 'type_x', 'cluster',
        ...
        'city_Santo Domingo', 'type_x_B', 'type_x_C', 'type_x_D', 'type_x_E',
        'type_y_Bridge', 'type_y_Event', 'type_y_Holiday', 'type_y_Regular Day',
        'type_y_Transfer'],
       dtype='object', length=101),
 Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion', 'city',
        'state', 'type_x', 'cluster',
        ...
        'city_Santo Domingo', 'type_x_B', 'type_x_C', 'type_x_D', 'type_x_E',
        'type_y_Bridge', 'type_y_Event', 'type_y_Holiday', 'type_y_Regular Day',
        'type_y_Transfer'],
       dtype='object', length=101))

In [12]:
train_df.shape, test_df.shape

((3013362, 101), (28512, 101))

In [14]:
train_features.head(2)

Unnamed: 0,state_Imbabura,state_Imbabura.1,family_PRODUCE,family_PRODUCE.1,state_Guayas,state_Guayas.1,city_Cuenca,city_Cuenca.1,family_PET SUPPLIES,family_PET SUPPLIES.1,...,state_Manabi,state_Manabi.1,month,type_y_Regular Day,type_y_Regular Day.1,family_LAWN AND GARDEN,family_LAWN AND GARDEN.1,family_HOME AND KITCHEN II,family_HOME AND KITCHEN II.1,sales
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0.0


In [13]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor


# Fill missing values
train_df["transactions"] = train_df["transactions"].fillna(0)
test_df["transactions"] = test_df["transactions"].fillna(0)
train_df["type_y"] = train_df["type_y"].fillna("Regular Day")
test_df["type_y"] = test_df["type_y"].fillna("Regular Day")
train_df["dcoilwtico"] = train_df["dcoilwtico"].bfill()
test_df["dcoilwtico"] = test_df["dcoilwtico"].bfill()

# Parse date and extract features
for df in [train_df, test_df]:
    df["date"] = pd.to_datetime(df["date"])
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    df["day_of_week"] = df["date"].dt.dayofweek
    df["is_weekend"] = (df["day_of_week"] >= 5).astype(int)
    df["day_of_year"] = df["date"].dt.dayofyear
    df["is_month_start"] = df["date"].dt.is_month_start.astype(int)
    df["is_month_end"] = df["date"].dt.is_month_end.astype(int)

# Interactions
train_df["onpromotion_trend"] = train_df["onpromotion"] * train_df["day_of_year"]
test_df["onpromotion_trend"] = test_df["onpromotion"] * test_df["day_of_year"]
train_df["month_sales_interaction"] = train_df["month"] * train_df["sales"]

# One-hot encode
cat_cols = ["family", "state", "city", "type_x", "type_y"]
# train_df = pd.get_dummies(train_df, columns=cat_cols, drop_first=True, dtype=int)
# test_df = pd.get_dummies(test_df, columns=cat_cols, drop_first=True, dtype=int)

encoded = pd.get_dummies(train_df[cat_cols], drop_first=True, dtype=int)
train_df = pd.concat([train_df, encoded], axis=1)

encoded = pd.get_dummies(test_df[cat_cols], drop_first=True, dtype=int)
test_df = pd.concat([test_df, encoded], axis=1)

# Align columns
common_cols = set(train_df.columns) & set(test_df.columns)
train_features = train_df[list(common_cols) + ["sales"]]
test_features = test_df[list(common_cols)]

# Scale selected features
scale_cols = ["onpromotion", "transactions", "dcoilwtico", "onpromotion_trend"]
scaler = MinMaxScaler()
train_features[scale_cols] = scaler.fit_transform(train_features[scale_cols])
test_features[scale_cols] = scaler.transform(test_features[scale_cols])

# Drop non-numeric columns
train_features = train_features.select_dtypes(include=[np.number])
test_features = test_features.select_dtypes(include=[np.number])

# Train model
cat_columns = ["family", "state", "city", "type_x", "type_y", "sales"]
target_col = "sales"   # or whatever target is

X_train = train_features.drop(columns=cat_columns, axis=1)
y_train = train_features[target_col]
# {'subsample': 0.8, 'reg_lambda': 2, 'reg_alpha': 0.1, 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.6}

model = XGBRegressor(n_estimators=500, 
                     learning_rate=0.05, 
                     max_depth=7, 
                     subsample=0.8, 
                     colsample_bytree=0.6, 
                     reg_alpha=0.1, 
                     reg_lambda=2, 
                     gamma=0, 
                     random_state=42)


model.fit(X_train, y_train)

# Evaluate on train (no val split here)
y_pred = model.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
mae = mean_absolute_error(y_train, y_pred)
r2 = r2_score(y_train, y_pred)

# üëá RMSLE Evaluation
rmsle = np.sqrt(np.mean((np.log1p(y_train) - np.log1p(y_pred)) ** 2))

print("‚úÖ Training Evaluation:")
print(f"RMSE: {rmse:.4f}")
print(f"MAE : {mae:.4f}")
print(f"R2  : {r2:.4f}")
print(f"RMSLE: {rmsle:.4f}")

KeyError: "['family' 'state' 'city' 'type_x' 'type_y' 'sales'] not found in axis"

In [None]:
from catboost import CatBoostRegressor

# Train CatBoost model with similar hyperparameters
catboost_model = CatBoostRegressor(
  iterations=1000,
  learning_rate=0.05,
  depth=7,
  subsample=0.8,
  colsample_bylevel=0.6,
  reg_lambda=2,
  random_seed=42,
  verbose=0
)

catboost_model.fit(X_train, y_train)

# Evaluate on train set
y_pred_cb = catboost_model.predict(X_train)
rmse_cb = np.sqrt(mean_squared_error(y_train, y_pred_cb))
mae_cb = mean_absolute_error(y_train, y_pred_cb)
r2_cb = r2_score(y_train, y_pred_cb)
rmsle_cb = np.sqrt(np.mean((np.log1p(y_train) - np.log1p(y_pred_cb)) ** 2))

print("‚úÖ CatBoost Training Evaluation:")
print(f"RMSE: {rmse_cb:.4f}")
print(f"MAE : {mae_cb:.4f}")
print(f"R2  : {r2_cb:.4f}")
print(f"RMSLE: {rmsle_cb:.4f}")

In [None]:
# Predict on test set
test_df["sales"] = catboost_model.predict(test_features)

# Ensure no negative predictions
test_df["sales"] = test_df["sales"].clip(lower=0)

# Prepare submission
submission = test_df[["id", "sales"]].copy()
submission.to_csv("submission.csv", index=False)
print("‚úÖ submission.csv saved successfully!")


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

def get_best_hyperparameters(X, y, n_iter=30, cv=3, random_state=42, verbose=1):
    """Performs hyperparameter tuning using RandomizedSearchCV for XGBRegressor."""
    
    param_dist = {
        "n_estimators": [100, 300, 500, 700],
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "max_depth": [3, 5, 7, 10],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "gamma": [0, 0.1, 0.3, 0.5],
        "reg_alpha": [0, 0.1, 0.5],
        "reg_lambda": [1, 1.5, 2]
    }

    xgb = XGBRegressor(objective="reg:squarederror", random_state=random_state)

    random_search = RandomizedSearchCV(
        estimator=xgb,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        verbose=verbose,
        n_jobs=-1,
        scoring='neg_root_mean_squared_error'
    )

    print("üîç Searching for best hyperparameters...")
    random_search.fit(X, y)
    print("‚úÖ Best Hyperparameters Found!")
    print(random_search.best_params_)

    return random_search.best_estimator_, random_search.best_params_


In [None]:
# Call hyperparameter tuning
best_model, best_params = get_best_hyperparameters(X_train, y_train)

# Predict on test features
test_df["sales"] = best_model.predict(test_features).clip(lower=0)
submission = test_df[["id", "sales"]]
submission.to_csv("submission.csv", index=False)


In [None]:
# Predict on test set
test_df["sales"] = model.predict(test_features)

# Ensure no negative predictions
test_df["sales"] = test_df["sales"].clip(lower=0)

# Prepare submission
submission = test_df[["id", "sales"]].copy()
submission.to_csv("submission.csv", index=False)
print("‚úÖ submission.csv saved successfully!")
