In [5]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import math
import pickle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------------- LOAD DATA ----------------
df = pd.read_csv("../data_clean/saas_sales_clean.csv")
print("✅ Data loaded:", df.shape)

# ---------------- CLEAN & FEATURE PREP ----------------
df['Order Date'] = pd.to_datetime(df['Order Date'], errors='coerce')
df = df.dropna(subset=['Sales'])
df['Sales'] = pd.to_numeric(df['Sales'], errors='coerce')

# Time-based features
df['Year'] = df['Order Date'].dt.year
df['Month'] = df['Order Date'].dt.month
df['Day'] = df['Order Date'].dt.day
df['Weekday'] = df['Order Date'].dt.weekday

# Drop identifiers and irrelevant columns
drop_cols = ['Order ID', 'License', 'Order Date', 'Month_Year']
X = df.drop(columns=drop_cols + ['Sales'], errors='ignore')
y = df['Sales']

# ---------------- FEATURE TYPES ----------------
numeric_feats = X.select_dtypes(include=[np.number]).columns.tolist()
cat_feats = X.select_dtypes(exclude=[np.number]).columns.tolist()
print("Numeric features:", numeric_feats)
print("Categorical features:", cat_feats)

# ---------------- PREPROCESSING ----------------
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='MISSING')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_feats),
    ('cat', categorical_transformer, cat_feats)
])

# ---------------- MODELS ----------------
models = {
    "LinearRegression": LinearRegression(),
    "ElasticNet": ElasticNet(random_state=42),
    "SVR": SVR(),
    "RandomForest": RandomForestRegressor(random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingRegressor(random_state=42)
}

# ---------------- SPLIT ----------------
df = df.sort_values(by="Order Date")
split = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]
print("Train/Test split:", X_train.shape, X_test.shape)

# ---------------- TRAIN AND EVALUATE ----------------
results = []
fitted_models = {}

for name, model in models.items():
    pipe = Pipeline([('pre', preprocessor), ('model', model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    rmse = math.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    results.append({"Model": name, "MAE": mae, "RMSE": rmse, "R2": r2})
    fitted_models[name] = pipe
    print(f"{name}: RMSE={rmse:.4f} | MAE={mae:.4f} | R2={r2:.4f}")

# ---------------- BEST MODEL ----------------
res_df = pd.DataFrame(results).sort_values(by="RMSE")
best_name = res_df.iloc[0]["Model"]
best_model = fitted_models[best_name]

print("\n🏆 Best model:", best_name)
print(res_df)

# ---------------- SAVE MODEL ----------------
with open("best_sales_model.pkl", "wb") as f:
    pickle.dump(best_model, f)
print("💾 Saved best model → best_sales_model.pkl")

# ---------------- SAMPLE PREDICTIONS ----------------
preds = best_model.predict(X_test)
sample = pd.DataFrame({"Actual": y_test, "Predicted": preds})
sample.to_csv("sample_predictions.csv", index=False)
print("📄 Saved predictions sample → sample_predictions.csv")


✅ Data loaded: (9994, 23)
Numeric features: ['Date Key', 'Customer ID', 'Quantity', 'Discount', 'Profit', 'Year', 'Month', 'Day', 'Weekday']
Categorical features: ['Contact Name', 'Country', 'City', 'Region', 'Subregion', 'Customer', 'Industry', 'Segment', 'Product']
Train/Test split: (7995, 18) (1999, 18)
LinearRegression: RMSE=521.6896 | MAE=245.5658 | R2=0.2065
ElasticNet: RMSE=522.4722 | MAE=225.1916 | R2=0.2041
SVR: RMSE=604.1409 | MAE=198.5013 | R2=-0.0641
RandomForest: RMSE=238.9629 | MAE=80.5812 | R2=0.8335
GradientBoosting: RMSE=243.7621 | MAE=98.9834 | R2=0.8268

🏆 Best model: RandomForest
              Model         MAE        RMSE        R2
3      RandomForest   80.581223  238.962945  0.833515
4  GradientBoosting   98.983444  243.762088  0.826761
0  LinearRegression  245.565849  521.689603  0.206515
1        ElasticNet  225.191601  522.472207  0.204133
2               SVR  198.501347  604.140905 -0.064121
💾 Saved best model → best_sales_model.pkl
📄 Saved predictions sample 

In [7]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import math
import pickle
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------------- LOAD DATA ----------------
df = pd.read_csv("../data_clean/saas_sales_clean.csv")
print("✅ Data loaded:", df.shape)

# ---------------- BASIC CLEANING ----------------
df["Order Date"] = pd.to_datetime(df["Order Date"], errors="coerce")
df = df.dropna(subset=["Sales"])
df["Sales"] = pd.to_numeric(df["Sales"], errors="coerce")

# ---------------- FEATURE ENGINEERING ----------------
df["Year"] = df["Order Date"].dt.year
df["Month"] = df["Order Date"].dt.month
df["Day"] = df["Order Date"].dt.day
df["Weekday"] = df["Order Date"].dt.weekday

# Extra features
df["Profit_Margin"] = df["Profit"] / (df["Sales"] + 1e-6)
df["Sales_per_Quantity"] = df["Sales"] / (df["Quantity"] + 1e-6)
df["Is_Weekend"] = df["Weekday"].apply(lambda x: 1 if x >= 5 else 0)

drop_cols = ["Order ID", "License", "Order Date", "Month_Year"]
df = df.drop(columns=drop_cols, errors="ignore")

# ---------------- DEFINE FEATURES ----------------
X = df.drop(columns=["Sales"])
y = np.log1p(df["Sales"])  # log transform target

numeric_feats = X.select_dtypes(include=[np.number]).columns.tolist()
cat_feats = X.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numeric features:", numeric_feats)
print("Categorical features:", cat_feats)

# ---------------- PREPROCESSING ----------------
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="MISSING")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_feats),
    ("cat", categorical_transformer, cat_feats)
])

# ---------------- BASE MODELS ----------------
base_models = {
    "RandomForest": RandomForestRegressor(random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)
}

# ---------------- TRAIN TEST SPLIT ----------------
df = df.sort_values(by="Year")
split = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]
print("Train/Test split:", X_train.shape, X_test.shape)

# ---------------- TRAIN BASE MODELS ----------------
results = {}
for name, model in base_models.items():
    pipe = Pipeline([("pre", preprocessor), ("model", model)])
    pipe.fit(X_train, y_train)
    preds = np.expm1(pipe.predict(X_test))  # inverse log transform
    actuals = np.expm1(y_test)
    mae = mean_absolute_error(actuals, preds)
    rmse = math.sqrt(mean_squared_error(actuals, preds))
    r2 = r2_score(actuals, preds)
    results[name] = {"MAE": mae, "RMSE": rmse, "R2": r2}
    print(f"{name}: RMSE={rmse:.4f} | MAE={mae:.4f} | R2={r2:.4f}")

# ---------------- CHOOSE BEST MODEL ----------------
best_name = max(results, key=lambda k: results[k]["R2"])
print("\n🏆 Best base model:", best_name)
print(pd.DataFrame(results).T)

# ---------------- HYPERPARAMETER TUNING ----------------
if best_name == "XGBoost":
    model = XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)
    param_dist = {
        "n_estimators": [100, 200, 300],
        "max_depth": [3, 5, 7, 9],
        "learning_rate": [0.01, 0.05, 0.1],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "min_child_weight": [1, 3, 5]
    }
elif best_name == "RandomForest":
    model = RandomForestRegressor(random_state=42, n_jobs=-1)
    param_dist = {
        "n_estimators": [100, 200, 300],
        "max_depth": [10, 20, None],
        "min_samples_split": [2, 5, 10]
    }
else:  # GradientBoosting
    model = GradientBoostingRegressor(random_state=42)
    param_dist = {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.05, 0.1],
        "max_depth": [3, 5, 7]
    }

pipe = Pipeline([("pre", preprocessor), ("model", model)])
search = RandomizedSearchCV(
    pipe, param_distributions={"model__" + k: v for k, v in param_dist.items()},
    n_iter=10, cv=3, scoring="r2", random_state=42, verbose=1, n_jobs=-1
)
search.fit(X_train, y_train)
best_model = search.best_estimator_

# ---------------- FINAL EVALUATION ----------------
preds = np.expm1(best_model.predict(X_test))
actuals = np.expm1(y_test)
mae = mean_absolute_error(actuals, preds)
rmse = math.sqrt(mean_squared_error(actuals, preds))
r2 = r2_score(actuals, preds)

print("\n🎯 Tuned Model Performance:")
print(f"RMSE={rmse:.4f} | MAE={mae:.4f} | R2={r2:.4f}")

# ---------------- SAVE MODEL ----------------
with open("enhanced_best_sales_model.pkl", "wb") as f:
    pickle.dump(best_model, f)
print("💾 Saved tuned model → enhanced_best_sales_model.pkl")

# ---------------- SAVE SAMPLE PREDICTIONS ----------------
sample = pd.DataFrame({"Actual": actuals, "Predicted": preds})
sample.to_csv("enhanced_predictions.csv", index=False)
print("📄 Saved predictions sample → enhanced_predictions.csv")


✅ Data loaded: (9994, 23)
Numeric features: ['Date Key', 'Customer ID', 'Quantity', 'Discount', 'Profit', 'Year', 'Month', 'Day', 'Weekday', 'Profit_Margin', 'Sales_per_Quantity', 'Is_Weekend']
Categorical features: ['Contact Name', 'Country', 'City', 'Region', 'Subregion', 'Customer', 'Industry', 'Segment', 'Product']
Train/Test split: (7995, 21) (1999, 21)
RandomForest: RMSE=153.6565 | MAE=11.0873 | R2=0.9312
GradientBoosting: RMSE=74.7911 | MAE=13.0117 | R2=0.9837
XGBoost: RMSE=139.6858 | MAE=14.4862 | R2=0.9431

🏆 Best base model: GradientBoosting
                        MAE        RMSE        R2
RandomForest      11.087295  153.656459  0.931164
GradientBoosting  13.011734   74.791115  0.983691
XGBoost           14.486191  139.685812  0.943112
Fitting 3 folds for each of 10 candidates, totalling 30 fits

🎯 Tuned Model Performance:
RMSE=140.3077 | MAE=10.0780 | R2=0.9426
💾 Saved tuned model → enhanced_best_sales_model.pkl
📄 Saved predictions sample → enhanced_predictions.csv
