In [1]:
# Importing Data Manipulation Libraries
import pandas as pd
import numpy as np

# Importing Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import Warnings
# Silences every warning category for the rest of the session, so deprecation
# and similar warnings raised by the libraries imported here are not shown.
import warnings
warnings.filterwarnings('ignore')

# Importing Logging
# The root logger writes INFO-and-above records to 'classification_model.log'.
# filemode='w' truncates that file every time this cell runs, and force=True
# removes any handlers already attached to the root logger before configuring.
# NOTE(review): the functions visible in this notebook report progress with
# print() rather than logging calls -- confirm whether this log file is used.
import logging
logging.basicConfig(level=logging.INFO,
                    filename='classification_model.log',
                    filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    force=True)


from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

from imblearn.over_sampling import SMOTE
from flaml import AutoML

In [2]:
# data ingestion
def data_ingestion(path=r'C:\SupplyChain_PredictionModel\data\raw\SupplyChain_Dataset.csv'):
    """Read the supply-chain CSV and return it without the unused columns.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the raw CSV file. Defaults to the location this notebook
        was originally written against, so calling ``data_ingestion()`` with
        no argument behaves as before.

    Returns
    -------
    pandas.DataFrame
        A new frame containing every column of the CSV except the ones listed
        in ``columns_to_drop`` below.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from the file (``drop`` is
        left at its default of raising on unknown labels, as in the original).
    """
    # Columns removed before modelling: customer/order identifiers, personal
    # details, location fields, raw dates, status fields and free text.
    columns_to_drop = [
        "Customer_Email",
        "Customer_Password",
        "Customer_Fname",
        "Customer_Lname",
        "Product_Image",
        "Product_Description",
        "Order_Id",
        "Customer_Id",
        "Customer_City",
        "Customer_Country",
        "Customer_Segment",
        "Customer_State",
        "Customer_Street",
        "Customer_Zipcode",
        "Order_City",
        "Order_Country",
        "Order_State",
        "Order_Zipcode",
        "Product_Status",
        "Order_Customer_Id",
        "Category_Id",
        "Latitude",
        "Longitude",
        "Order_Item_Id",
        "Product_Category_Id",
        "shipping_date_(DateOrders)",
        "order_date_(DateOrders)",
        "Product_Card_Id",
        "Order_Item_Cardprod_Id",
        "Department_Id",
        "Delivery_Status",
        "Order_Status",
        "Product_Name",
        "Order_Item_Discount_Rate",
    ]

    raw = pd.read_csv(path)
    # Return a new frame instead of mutating in place; the result is the same
    # and the function stays chainable and safe to re-run.
    return raw.drop(columns=columns_to_drop)

In [3]:

def data_exploration(df):
    """Build descriptive-statistics reports for the columns of ``df``.

    Columns are split by dtype: ``object`` columns are treated as categorical,
    every other dtype is treated as numerical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``numerical_stats_report`` has one row per numerical column with the
        keys Features, Maximum, Minimum, Mean, Median, Q1, Q3, IQR, Skewness,
        Kurtosis and Outlier Comment. ``categorical_stats_report`` has one row
        per categorical column with Features, Unique_Values, Mode and
        Value_Counts. A categorical column with no non-missing values gets a
        missing Mode instead of raising.
    """
    # Segregate numerical and categorical columns
    numerical_cols = df.select_dtypes(exclude='object').columns
    categorical_cols = df.select_dtypes(include='object').columns

    # Numerical stats
    numerical_stats = []
    for col in numerical_cols:
        series = df[col]

        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        # Whisker bounds at 1.5 * IQR beyond the quartiles.
        lower_whisker = q1 - 1.5 * iqr
        upper_whisker = q3 + 1.5 * iqr

        has_outliers = ((series < lower_whisker) | (series > upper_whisker)).any()

        # Plain dicts keep insertion order, so the report columns come out in
        # the order written here without needing collections.OrderedDict
        # (which this notebook never imported).
        numerical_stats.append({
            "Features": col,
            "Maximum": series.max(),
            "Minimum": series.min(),
            "Mean": series.mean(),
            "Median": series.median(),
            "Q1": q1,
            "Q3": q3,
            "IQR": iqr,
            "Skewness": series.skew(),
            "Kurtosis": series.kurtosis(),
            "Outlier Comment": "Has Outliers" if has_outliers else "No Outliers",
        })

    numerical_stats_report = pd.DataFrame(numerical_stats)

    # Categorical stats
    categorical_stats = []
    for col in categorical_cols:
        series = df[col]
        # mode() returns an empty Series when the column has no non-missing
        # values; indexing it blindly would raise.
        modes = series.mode()

        categorical_stats.append({
            "Features": col,
            "Unique_Values": series.nunique(),
            "Mode": modes.iloc[0] if not modes.empty else None,
            "Value_Counts": series.value_counts().to_dict(),
        })

    categorical_stats_report = pd.DataFrame(categorical_stats)

    return numerical_stats_report, categorical_stats_report


In [4]:
def split_data(df, target_col='Late_delivery_risk'):
    """Separate the target column from the remaining feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``target_col``.
    target_col : str, optional
        Name of the label column; defaults to ``'Late_delivery_risk'``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without ``target_col`` and ``y`` is
        the ``target_col`` column. ``df`` itself is not modified.
    """
    features = df.drop(columns=target_col)
    target = df[target_col]
    print("Split data completed.")
    return features, target


In [5]:
def split_train_test(X, y, test_size=0.3, random_state=42, stratify=None):
    """Split features and labels into train and test partitions.

    Parameters
    ----------
    X, y
        Features and labels, passed straight to ``train_test_split``.
    test_size : float or int, optional
        Forwarded to ``train_test_split``; defaults to ``0.3`` (the value this
        notebook previously hard-coded).
    random_state : int or None, optional
        Seed for the shuffle; defaults to ``42`` (previously hard-coded).
    stratify : array-like or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original non-stratified split; pass ``y`` to preserve class
        proportions in both partitions.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    print("Split train and test data completed.")
    return X_train, X_test, y_train, y_test

In [6]:
def create_preprocessor(X):
    """Assemble an unfitted ColumnTransformer for the columns of ``X``.

    ``int64``/``float64`` columns are routed to a median imputer. ``object``
    columns are routed to a most-frequent imputer followed by a one-hot
    encoder that ignores categories unseen at fit time.

    NOTE(review): columns of any other dtype are in neither list, and no
    ``remainder`` is set on the ColumnTransformer -- confirm that leaving
    them out of the transformed output is intended.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose dtypes decide which columns go to which branch.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        The configured, not-yet-fitted transformer.
    """
    numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
    categorical_features = X.select_dtypes(include=["object"]).columns

    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
    transformers = [
        ("num", SimpleImputer(strategy="median"), numeric_features),
        ("cat", Pipeline(categorical_steps), categorical_features),
    ]

    column_transformer = ColumnTransformer(transformers)
    print("Preprocessor Created")
    return column_transformer

In [7]:
def apply_preprocessing(preprocessor, X_train, X_test):
    """Fit ``preprocessor`` on the training data and transform both splits.

    The preprocessor is fitted on ``X_train`` only; ``X_test`` is transformed
    with the already-fitted object.

    Parameters
    ----------
    preprocessor
        Object exposing ``fit_transform`` and ``transform``.
    X_train, X_test
        Training and test features.

    Returns
    -------
    tuple
        ``(X_train_processed, X_test_processed)``.
    """
    # Tuple elements are evaluated left to right, so fitting on the training
    # split always happens before the test split is transformed.
    processed = (
        preprocessor.fit_transform(X_train),
        preprocessor.transform(X_test),
    )
    print("Preprocessing Applied")
    return processed

In [8]:
def apply_smote(X_train, y_train):
    """Oversample the training data with SMOTE and report the class counts.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``SMOTE.fit_resample``.
        The class counts are printed with ``np.bincount``, which needs
        non-negative integer labels.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    X_balanced, y_balanced = SMOTE(random_state=42).fit_resample(X_train, y_train)

    print("SMOTE Applied")
    class_counts = np.bincount(y_balanced)
    print("Class Distribution After SMOTE:", class_counts)

    return X_balanced, y_balanced

In [9]:
def train_flaml(X_train, y_train):
    """Run a FLAML AutoML search on the training data and return it fitted.

    The search is configured with a time budget of 60, the accuracy metric,
    the classification task, four candidate learners (rf, extra_tree,
    xgboost, lrl2), a search log written to ``flaml.log`` and seed 42.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``AutoML.fit``.

    Returns
    -------
    flaml.AutoML
        The fitted AutoML object.
    """
    searcher = AutoML()
    searcher.fit(
        X_train=X_train,
        y_train=y_train,
        time_budget=60,
        metric="accuracy",
        task="classification",
        estimator_list=["rf", "extra_tree", "xgboost", "lrl2"],
        log_file_name="flaml.log",
        seed=42,
    )

    print("FLAML Training Completed")
    print("Best Model:", searcher.model.estimator)

    return searcher

In [10]:
def evaluate_model(model, X_test, y_test):
    """Print accuracy and a classification report for ``model`` on test data.

    Parameters
    ----------
    model
        Fitted object exposing ``predict``.
    X_test, y_test
        Test features and their true labels.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("\nAccuracy:", accuracy)

    print("\nClassification Report:\n")
    report = classification_report(y_test, predictions)
    print(report)

In [14]:
import pickle
def main():
    """Run the full pipeline: load, split, preprocess, resample, train, evaluate, save.

    SMOTE is applied to the preprocessed training split only; the test split
    is transformed with the fitted preprocessor and used as-is for
    evaluation. The fitted model and preprocessor are pickled to
    ``best_model.pkl`` and ``preprocessor.pkl`` in the working directory.
    """
    dataset = data_ingestion()
    features, target = split_data(dataset)
    X_train, X_test, y_train, y_test = split_train_test(features, target)

    preprocessor = create_preprocessor(X_train)
    X_train_ready, X_test_ready = apply_preprocessing(preprocessor, X_train, X_test)

    X_balanced, y_balanced = apply_smote(X_train_ready, y_train)
    model = train_flaml(X_balanced, y_balanced)
    evaluate_model(model, X_test_ready, y_test)

    # Persist the model first, then the preprocessor, each to its own file.
    artifacts = (
        ("best_model.pkl", model),
        ("preprocessor.pkl", preprocessor),
    )
    for file_name, artifact in artifacts:
        with open(file_name, "wb") as f:
            pickle.dump(artifact, f)


if __name__ == "__main__":
    main()

Split data completed.
Split train and test data completed.
Preprocessor Created
Preprocessing Applied
SMOTE Applied
Class Distribution After SMOTE: [69232 69232]
[flaml.automl.logger: 02-21 11:43:02] {2375} INFO - task = classification
[flaml.automl.logger: 02-21 11:43:02] {2386} INFO - Evaluation method: holdout
[flaml.automl.logger: 02-21 11:43:02] {2489} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 02-21 11:43:02] {2606} INFO - List of ML learners in AutoML Run: ['rf', 'extra_tree', 'xgboost', 'lrl2']
[flaml.automl.logger: 02-21 11:43:02] {2911} INFO - iteration 0, current learner rf
[flaml.automl.logger: 02-21 11:43:02] {3046} INFO - Estimated sufficient time budget=20149s. Estimated necessary time budget=31s.
[flaml.automl.logger: 02-21 11:43:02] {3097} INFO -  at 0.4s,	estimator rf's best error=1.1447e-01,	best estimator rf's best error=1.1447e-01
[flaml.automl.logger: 02-21 11:43:02] {2911} INFO - iteration 1, current learner xgboost
[flaml.automl.logger: 02-