In [1]:
# Import Data Manipulation Libraries
import numpy as np
import pandas as pd

# Import Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import warnings
# NOTE: action="ignore" installs a blanket filter, so every warning category
# (deprecations, convergence notices, pandas copy warnings, ...) is hidden
# for the rest of the session.
import warnings
warnings.filterwarnings(action="ignore")

# Import logging
# Root logger configuration: records at INFO level and above are written to
# "regression_model.log" using "<timestamp> - <level> - <message>" lines.
# filemode="w" truncates the log file whenever this configuration runs, and
# force=True removes handlers already attached to the root logger, so
# re-running this cell replaces the previous configuration instead of
# being ignored.
import logging
logging.basicConfig(level=logging.INFO,
                    filemode="w",
                    filename="regression_model.log",
                    format="%(asctime)s - %(levelname)s - %(message)s",
                    force=True)

# Import Machine Learning Libraries
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score,root_mean_squared_error
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler,LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost 
from xgboost import XGBRegressor

from collections import OrderedDict

In [2]:
# Data ingestion
def data_ingestion(path=r'C:\SupplyChain_PredictionModel\data\raw\SupplyChain_Dataset.csv'):
    """Read the supply-chain CSV and remove the columns not used downstream.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the location this notebook was
        originally written against; pass another path to run the notebook on
        a different machine or against a different extract.

    Returns
    -------
    pandas.DataFrame
        The file contents without the columns listed in ``columns_to_drop``.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist (raised by ``pandas.read_csv``).
    KeyError
        If any column in ``columns_to_drop`` is missing from the file.
    """
    columns_to_drop = [
        "Customer_Email",
        "Customer_Password",
        "Customer_Fname",
        "Customer_Lname",
        "Product_Image",
        "Product_Description",
        "Order_Id",
        "Customer_Id",
        'Customer_City',
        'Customer_Country',
        'Customer_Segment',
        'Customer_State',
        'Customer_Street',
        'Customer_Zipcode',
        'Order_City',
        'Order_Country',
        'Order_State',
        'Order_Zipcode',
        'Product_Status',
        "Order_Customer_Id",
        'Category_Id',
        'Latitude',
        'Longitude',
        'Order_Item_Id',
        'Product_Category_Id',
        'shipping_date_(DateOrders)',
        'order_date_(DateOrders)',
        'Product_Card_Id',
        'Order_Item_Cardprod_Id',
        'Department_Id',
        "Delivery_Status",
        "Order_Status",
        "Product_Name",
        'Order_Item_Discount_Rate'
    ]

    raw_df = pd.read_csv(path)
    # Return a new frame rather than mutating in place.
    return raw_df.drop(columns=columns_to_drop)

In [3]:
def data_exploration(df):
    """Build per-feature summary reports for a DataFrame.

    Columns are split by dtype: every non-``object`` column is summarised as
    numerical, every ``object`` column as categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise. It is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``numerical_stats_report`` has one row per non-object column with
        max, min, mean, median, quartiles, IQR, skewness, kurtosis and an
        outlier flag based on the 1.5 * IQR whisker rule.
        ``categorical_stats_report`` has one row per object column with the
        number of unique values, the mode and the value counts as a dict.
        Both reports keep their column headers even when they have no rows.
    """
    numerical_report_columns = [
        "Features", "Maximum", "Minimum", "Mean", "Median",
        "Q1", "Q3", "IQR", "Skewness", "Kurtosis", "Outlier Comment",
    ]
    categorical_report_columns = ["Features", "Unique_Values", "Mode", "Value_Counts"]

    # Segregate numerical and categorical columns
    numerical_cols = df.select_dtypes(exclude='object').columns
    categorical_cols = df.select_dtypes(include='object').columns

    # Numerical stats
    numerical_stats = []
    for col in numerical_cols:
        series = df[col]

        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_whisker = q1 - 1.5 * iqr
        upper_whisker = q3 + 1.5 * iqr

        has_outliers = ((series < lower_whisker) | (series > upper_whisker)).any()

        numerical_stats.append({
            "Features": col,
            "Maximum": series.max(),
            "Minimum": series.min(),
            "Mean": series.mean(),
            "Median": series.median(),
            "Q1": q1,
            "Q3": q3,
            "IQR": iqr,
            "Skewness": series.skew(),
            "Kurtosis": series.kurtosis(),
            "Outlier Comment": "Has Outliers" if has_outliers else "No Outliers",
        })

    numerical_stats_report = pd.DataFrame(numerical_stats, columns=numerical_report_columns)

    # Categorical stats
    categorical_stats = []
    for col in categorical_cols:
        series = df[col]

        # mode() ignores missing values, so it is empty for a column that
        # holds only nulls; report "no mode" instead of raising.
        modes = series.mode()
        mode_value = modes.iloc[0] if not modes.empty else None

        categorical_stats.append({
            "Features": col,
            "Unique_Values": series.nunique(),
            "Mode": mode_value,
            "Value_Counts": series.value_counts().to_dict(),
        })

    categorical_stats_report = pd.DataFrame(categorical_stats, columns=categorical_report_columns)

    return numerical_stats_report, categorical_stats_report


In [4]:
def data_preprocessing(df, target_col='Sales', test_size=0.3, random_state=1):
    """Split the data, then encode and scale the features.

    All encoders and scalers are fitted on the training split only and then
    applied to the test split, so no test information leaks into the fit.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset including the target column. It is not modified.
    target_col : str, optional
        Name of the column to predict.
    test_size : float, optional
        Fraction of rows held out for testing.
    random_state : int, optional
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the feature frames have
        object columns replaced by ordinal codes and every non-object column
        min-max scaled using the training range.
    """
    # Imported here so this function is self-contained; the notebook's import
    # cell only brings in LabelEncoder from sklearn.preprocessing.
    from sklearn.preprocessing import OrdinalEncoder

    # Split the data into X and y
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state
    )

    # The split returns selections of X; copy them so the column assignments
    # below operate on independent frames.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Encoding categorical columns
    categorical_cols = X_train.select_dtypes(include='object').columns

    for col in categorical_cols:
        # Categories seen during fit get the same sorted integer codes that
        # LabelEncoder would assign; categories that only occur in the test
        # split are mapped to -1 instead of raising ValueError.
        encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        X_train[col] = encoder.fit_transform(X_train[[col]]).ravel()
        X_test[col] = encoder.transform(X_test[[col]]).ravel()

    # Scaling numerical columns
    # NOTE: the dtype selection runs after encoding, so the freshly encoded
    # categorical columns are scaled as well.
    numerical_cols = X_train.select_dtypes(exclude='object').columns

    if len(numerical_cols) > 0:
        # MinMaxScaler works feature by feature, so one scaler over all
        # columns gives the same result as one scaler per column.
        scaler = MinMaxScaler()
        scaled_train = scaler.fit_transform(X_train[numerical_cols])
        scaled_test = scaler.transform(X_test[numerical_cols])
        for position, col in enumerate(numerical_cols):
            X_train[col] = scaled_train[:, position]
            X_test[col] = scaled_test[:, position]

    return X_train, X_test, y_train, y_test


In [None]:
def model_training(X_train, X_test, y_train, y_test, random_state=1):
    """Fit a set of baseline regressors and score them on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Preprocessed feature matrices.
    y_train, y_test : array-like
        Target values matching the feature matrices.
    random_state : int or None, optional
        Seed given to every estimator that uses randomness, so repeated runs
        produce the same scores. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    dict
        Maps each model name to ``{"RMSE": ..., "R2 Score": ...}`` computed
        on the test split.
    """
    models = {
        "Linear Regression": LinearRegression(),
        "Ridge Regression": Ridge(),
        "Lasso Regression": Lasso(),
        "Decision Tree Regressor": DecisionTreeRegressor(random_state=random_state),
        "Random Forest Regressor": RandomForestRegressor(random_state=random_state),
        "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=random_state),
        "AdaBoost Regressor": AdaBoostRegressor(random_state=random_state),
        "Bagging Regressor": BaggingRegressor(random_state=random_state),
        "XGBoost Regressor": XGBRegressor(random_state=random_state)
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        rmse = root_mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        results[name] = {"RMSE": rmse, "R2 Score": r2}
        logging.info("%s - RMSE: %.4f, R2 Score: %.4f", name, rmse, r2)

    return results

In [6]:
# K-fold cross-validation
def k_fold_cv(X_train, y_train, folds=10, random_state=1):
    """Cross-validate a set of regressors on the training data.

    Parameters
    ----------
    X_train : array-like
        Preprocessed training features.
    y_train : array-like
        Training target values.
    folds : int, optional
        Number of cross-validation folds passed to ``cross_val_score``.
    random_state : int or None, optional
        Seed given to every estimator that uses randomness, so repeated runs
        produce the same scores. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns "Model Name", "CV Mean R2 Score" and
        "CV STD", sorted by mean R2 in descending order.
    """
    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=random_state),
        "Random Forest": RandomForestRegressor(random_state=random_state),
        "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
        "AdaBoost": AdaBoostRegressor(random_state=random_state),
        "Bagging": BaggingRegressor(random_state=random_state),
        "XGBoost": XGBRegressor(random_state=random_state),
    }

    rows = []

    for name, model in models.items():
        fold_scores = cross_val_score(
            model, X_train, y_train, cv=folds, scoring="r2"
        )
        rows.append([name, fold_scores.mean(), fold_scores.std()])

    report = pd.DataFrame(
        rows, columns=["Model Name", "CV Mean R2 Score", "CV STD"]
    )
    return report.sort_values("CV Mean R2 Score", ascending=False)


In [None]:
def hyperparameter_tuning(X_train, y_train, folds=5, random_state=1):
    """Grid-search each configured model and return the best estimator of each.

    Parameters
    ----------
    X_train : array-like
        Preprocessed training features.
    y_train : array-like
        Training target values.
    folds : int, optional
        Number of cross-validation folds used by ``GridSearchCV``.
    random_state : int or None, optional
        Seed given to every estimator that uses randomness, so repeated
        searches select the same parameters. Pass ``None`` for unseeded
        behaviour.

    Returns
    -------
    dict
        Maps every model name in the tuning configuration to its
        ``best_estimator_`` (refitted on the full training data by
        ``GridSearchCV``).
    """
    tuning_config = {

        "Linear Regression": {
            "model": LinearRegression(),
            "params": {
                "fit_intercept": [True, False],
                "positive": [True, False]
            }
        },

        "XGBoost": {
            "model": XGBRegressor(random_state=random_state),
            "params": {
                "eta": [0.1, 0.2, 0.3],
                "max_depth": [3, 5, 7],
                "gamma": [0, 10, 20],
                "reg_lambda": [0, 1],
                "n_estimators": [100, 200]
            }
        },

        "Decision Tree": {
            "model": DecisionTreeRegressor(random_state=random_state),
            "params": {
                "max_depth": [None, 5, 10, 15],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 4]
            }
        },

        "Random Forest": {
            "model": RandomForestRegressor(random_state=random_state),
            "params": {
                "n_estimators": [100, 200],
                "max_depth": [5, 10, 15],
                "max_features": ["sqrt", "log2"]
            }
        },

        "Bagging Regressor": {
            "model": BaggingRegressor(random_state=random_state),
            "params": {
                "n_estimators":  [10, 50, 100],
                "max_samples": [0.5, 0.7, 1.0],
                "max_features": [0.5, 0.7, 1.0]
            }
        }
    }

    best_models = {}
    for name, cfg in tuning_config.items():
        grid = GridSearchCV(
            cfg["model"],
            cfg["params"],
            cv=folds,
            scoring="r2",
            n_jobs=-1
        )
        grid.fit(X_train, y_train)

        # Record the winner inside the loop so every tuned model is kept,
        # not only the last one in the configuration.
        best_models[name] = grid.best_estimator_
        logging.info(
            "%s - best params: %s, best CV R2: %.4f",
            name, grid.best_params_, grid.best_score_
        )

    return best_models


In [9]:
def main():
    """Run the full pipeline and print each stage's results.

    Stages, in order: data ingestion, data exploration, preprocessing,
    baseline model training, k-fold cross-validation and hyperparameter
    tuning. The start and completion of every stage is written to the log.
    """
    logging.info("Starting data ingestion...")
    df = data_ingestion()
    logging.info("Data ingestion completed.")

    logging.info("Starting data exploration...")
    numerical_stats_report, categorical_stats_report = data_exploration(df)
    logging.info("Data exploration completed.")
    print("Numerical Stats Report:")
    print(numerical_stats_report)
    print("\nCategorical Stats Report:")
    print(categorical_stats_report)

    logging.info("Starting data preprocessing...")
    X_train, X_test, y_train, y_test = data_preprocessing(df)
    logging.info("Data preprocessing completed.")

    logging.info("Starting model training...")
    results = model_training(
        X_train, X_test, y_train, y_test
    )
    # Completion is logged here as it is for every other stage.
    logging.info("Model training completed.")
    print("\nModel Results:")
    print(results)

    logging.info("Starting k-fold cross-validation...")
    cv_results = k_fold_cv(X_train, y_train)
    logging.info("K-fold cross-validation completed.")
    print("\nK-Fold Cross-Validation Results:")
    print(cv_results)

    logging.info("Starting hyperparameter tuning...")
    best_models = hyperparameter_tuning(X_train, y_train, folds=5)
    logging.info("Hyperparameter tuning completed.")
    print("\nBest Models from Hyperparameter Tuning:")
    print(best_models)


if __name__ == "__main__":
    main()

Numerical Stats Report:
                         Features     Maximum     Minimum        Mean  \
0        Days_for_shipping_(real)     6.00000     0.00000    3.497654   
1   Days_for_shipment_(scheduled)     4.00000     0.00000    2.931847   
2               Benefit_per_order   911.79999 -4274.97998   21.974989   
3              Sales_per_customer  1939.98999     7.49000  183.107609   
4              Late_delivery_risk     1.00000     0.00000    0.548291   
5             Order_Item_Discount   500.00000     0.00000   20.664741   
6        Order_Item_Product_Price  2000.00000    10.00000  141.245016   
7         Order_Item_Profit_Ratio     0.50000    -2.75000    0.120647   
8             Order_Item_Quantity     5.00000     1.00000    2.127638   
9                           Sales  1999.98999     9.99000  203.772097   
10               Order_Item_Total  1939.98999     7.49000  183.107609   
11         Order_Profit_Per_Order   911.80000 -4274.98000   21.974989   
12                  Product

KeyboardInterrupt: 