In [2]:
# Importing Data Manipulation Libraries
import pandas as pd
import numpy as np

# Importing Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import Warnings
# NOTE: the 'ignore' filter below silences every warning category for the
# whole session, including deprecation notices from the ML libraries.
import warnings
warnings.filterwarnings('ignore')

# Importing Logging
# Records at INFO level and above go to 'classification_model.log'.
# filemode='w' truncates that file each time this cell runs, and force=True
# removes handlers already attached to the root logger so that re-running
# the cell actually re-applies this configuration.
import logging
logging.basicConfig(level=logging.INFO,
                    filename='classification_model.log',
                    filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    force=True)


# Importing Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

from imblearn.over_sampling import SMOTE
from flaml import AutoML

In [3]:
# data ingestion
def data_ingestion(csv_path=r'C:\SupplyChain_PredictionModel\data\raw\SupplyChain_Dataset.csv',
                   drop_columns=None):
    """Load the raw dataset from CSV and remove columns not used downstream.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the CSV file. Defaults to the path the notebook was
        originally written against, so ``data_ingestion()`` keeps working
        unchanged; pass another path to run on a different machine.
    drop_columns : iterable of str, optional
        Columns to remove. When ``None`` (the default) the notebook's
        original exclusion list is used.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns.

    Raises
    ------
    KeyError
        If any column to drop is missing from the CSV (``DataFrame.drop``
        is left in its default strict mode so schema drift is noticed).
    """
    if drop_columns is None:
        # Default exclusion list, unchanged from the original notebook.
        drop_columns = [
            "Customer_Email",
            "Customer_Password",
            "Customer_Fname",
            "Customer_Lname",
            "Product_Image",
            "Product_Description",
            "Order_Id",
            "Customer_Id",
            'Customer_City',
            'Customer_Country',
            'Customer_Segment',
            'Customer_State',
            'Customer_Street',
            'Customer_Zipcode',
            'Order_City',
            'Order_Country',
            'Order_State',
            'Order_Zipcode',
            'Product_Status',
            "Order_Customer_Id",
            'Category_Id',
            'Latitude',
            'Longitude',
            'Order_Item_Id',
            'Product_Category_Id',
            'shipping_date_(DateOrders)',
            'order_date_(DateOrders)',
            'Product_Card_Id',
            'Order_Item_Cardprod_Id',
            'Department_Id',
            "Delivery_Status",
            "Order_Status",
            "Product_Name",
            'Order_Item_Discount_Rate'
        ]

    raw_df = pd.read_csv(csv_path)
    # Return a new frame instead of mutating in place: the result is the
    # same, but the call stays chain-friendly and free of hidden state.
    return raw_df.drop(columns=list(drop_columns))

In [4]:
from collections import OrderedDict
def data_exploration(df):
    """Build per-column summary tables for a DataFrame.

    Columns are split by dtype: everything that is not ``object`` goes to
    the numerical report, ``object`` columns go to the categorical report.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise. It is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(numerical_stats_report, categorical_stats_report)``, one row per
        column. Both reports always carry their full set of headers, even
        when no column of that kind exists.
    """
    numerical_fields = [
        "Features", "Maximum", "Minimum", "Mean", "Median", "Q1", "Q3",
        "IQR", "Skewness", "Kurtosis", "Outlier Comment",
    ]
    categorical_fields = ["Features", "Unique_Values", "Mode", "Value_Counts"]

    # Segregate numerical and categorical columns
    numerical_cols = df.select_dtypes(exclude='object').columns
    categorical_cols = df.select_dtypes(include='object').columns

    # Numerical stats
    numerical_stats = []
    for col in numerical_cols:
        series = df[col]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        # Whiskers at 1.5 * IQR beyond the quartiles.
        lower_whisker = q1 - 1.5 * iqr
        upper_whisker = q3 + 1.5 * iqr
        has_outliers = ((series < lower_whisker) | (series > upper_whisker)).any()

        numerical_stats.append({
            "Features": col,
            "Maximum": series.max(),
            "Minimum": series.min(),
            "Mean": series.mean(),
            "Median": series.median(),
            "Q1": q1,
            "Q3": q3,
            "IQR": iqr,
            "Skewness": series.skew(),
            "Kurtosis": series.kurtosis(),
            "Outlier Comment": "Has Outliers" if has_outliers else "No Outliers",
        })

    numerical_stats_report = pd.DataFrame(numerical_stats, columns=numerical_fields)

    # Categorical stats
    categorical_stats = []
    for col in categorical_cols:
        series = df[col]
        # mode() ignores missing values, so an all-missing column yields an
        # empty Series; indexing it with [0] would raise. Report NaN instead.
        modes = series.mode()

        categorical_stats.append({
            "Features": col,
            "Unique_Values": series.nunique(),
            "Mode": modes.iloc[0] if not modes.empty else np.nan,
            "Value_Counts": series.value_counts().to_dict(),
        })

    categorical_stats_report = pd.DataFrame(categorical_stats, columns=categorical_fields)

    return numerical_stats_report, categorical_stats_report


In [5]:
def split_data(df, target_col='Late_delivery_risk'):
    """Separate a frame into predictors and the target series.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, target column included. Left unmodified.
    target_col : str, optional
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X, y)``: ``df`` without ``target_col``, and ``df[target_col]``.
    """
    features = df.drop(columns=target_col)
    target = df[target_col]
    print("Split data completed.")
    return features, target


In [6]:
def split_train_test(X, y, test_size=0.3, random_state=42, stratify=None):
    """Split predictors and target into train and test partitions.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    X, y
        Predictors and target, as accepted by ``train_test_split``.
    test_size : float or int, optional
        Forwarded to ``train_test_split``; defaults to the notebook's
        original 0.3.
    random_state : int, optional
        Seed forwarded to ``train_test_split``; defaults to the original 42.
    stratify : array-like, optional
        Forwarded to ``train_test_split``. ``None`` (the default) keeps the
        original non-stratified split; pass ``y`` to preserve class
        proportions in both partitions.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    print("Split train and test data completed.")
    return X_train, X_test, y_train, y_test

In [7]:
def create_preprocessor(X):
    """Build an unfitted ``ColumnTransformer`` matched to the dtypes of ``X``.

    Numeric columns are median-imputed. Categorical columns are imputed
    with the most frequent value and then one-hot encoded; categories not
    seen during fitting are ignored at transform time
    (``handle_unknown="ignore"``).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose column dtypes decide the routing. Only column names are
        read here; nothing is fitted.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        The unfitted transformer.
    """
    # Select by dtype family instead of the exact "int64"/"float64" names:
    # the ColumnTransformer below is built without a `remainder`, so any
    # column matched by neither selector is dropped from the output. The
    # narrower selection lost e.g. int32/float32 columns without warning.
    num_cols = X.select_dtypes(include=["number"]).columns
    cat_cols = X.select_dtypes(include=["object", "category"]).columns

    numeric_pipeline = SimpleImputer(strategy="median")

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols)
    ])

    print("Preprocessor Created")
    return preprocessor

In [8]:
def apply_preprocessing(preprocessor, X_train, X_test):
    """Fit the preprocessor on the training data and transform both splits.

    ``fit_transform`` is called with ``X_train`` only; ``X_test`` goes
    through ``transform`` on the already fitted preprocessor.

    Parameters
    ----------
    preprocessor
        Object exposing ``fit_transform`` and ``transform``. It is fitted
        in place.
    X_train, X_test
        Training and test predictors.

    Returns
    -------
    tuple
        ``(X_train_processed, X_test_processed)``.
    """
    train_matrix = preprocessor.fit_transform(X_train)
    test_matrix = preprocessor.transform(X_test)

    print("Preprocessing Applied")
    return train_matrix, test_matrix

In [9]:
def apply_smote(X_train, y_train):
    """Oversample the training data with SMOTE (seeded with 42).

    Parameters
    ----------
    X_train, y_train
        Training predictors and labels passed to ``SMOTE.fit_resample``.
        The class counts are printed with ``np.bincount``, which needs
        non-negative integer labels.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    oversampler = SMOTE(random_state=42)
    balanced_X, balanced_y = oversampler.fit_resample(X_train, y_train)

    print("SMOTE Applied")
    class_counts = np.bincount(balanced_y)
    print("Class Distribution After SMOTE:", class_counts)

    return balanced_X, balanced_y

In [10]:
def train_flaml(X_train, y_train):
    """Run a FLAML AutoML search for a classifier and return the fitted object.

    The search uses a ``time_budget`` of 60, optimises ``accuracy``, is
    restricted to the estimators ``rf``, ``extra_tree``, ``xgboost`` and
    ``lrl2``, logs to ``flaml.log`` and is seeded with 42.

    Parameters
    ----------
    X_train, y_train
        Training predictors and labels handed to ``AutoML.fit``.

    Returns
    -------
    flaml.AutoML
        The fitted AutoML instance.
    """
    searcher = AutoML()

    # Same settings as before, passed as explicit keyword arguments.
    searcher.fit(
        X_train=X_train,
        y_train=y_train,
        time_budget=60,
        metric="accuracy",
        task="classification",
        estimator_list=["rf", "extra_tree", "xgboost", "lrl2"],
        log_file_name="flaml.log",
        seed=42,
    )

    print("FLAML Training Completed")
    best_estimator = searcher.model.estimator
    print("Best Model:", best_estimator)

    return searcher

In [11]:
def evaluate_model(model, X_test, y_test):
    """Print accuracy and a classification report for ``model`` on test data.

    Parameters
    ----------
    model
        Fitted object exposing ``predict``.
    X_test, y_test
        Test predictors and their true labels.

    Returns
    -------
    None
        Results are printed only.
    """
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("\nAccuracy:", accuracy)

    print("\nClassification Report:\n")
    report_text = classification_report(y_test, predictions)
    print(report_text)

In [12]:
import pickle
def main():
    """Run the full workflow from loading the data to saving the artifacts.

    Steps, each bracketed by INFO log records: load the data, print the
    exploration reports, separate predictors from the target, split into
    train and test, build and apply the preprocessor, oversample the
    processed training set with SMOTE, train with FLAML, evaluate on the
    processed test set, then pickle the model to ``best_model.pkl`` and the
    fitted preprocessor to ``preprocessor.pkl``.
    """
    # --- Load ---
    logging.info('Data Loading Started')
    dataset = data_ingestion()
    logging.info('Data Loaded Successfully')

    # --- Explore ---
    logging.info('Data Exploration Started')
    num_report, cat_report = data_exploration(dataset)
    print('\nNumerical Stats:')
    print(num_report)
    print('\nCategorical Stats:')
    print(cat_report)
    logging.info('Data Exploration Completed')

    # --- Predictors / target ---
    logging.info('Split the data Started')
    features, target = split_data(dataset)
    logging.info('Split the data Completed')

    # --- Train / test partitions ---
    logging.info('Train Test Split Started')
    features_train, features_test, target_train, target_test = split_train_test(features, target)
    logging.info('Train Test Split Completed')

    # --- Preprocessing (built from, and fitted on, the training split) ---
    logging.info('Creating a Preprocessor')
    preprocessor = create_preprocessor(features_train)
    logging.info('Preprocessor created successfully')

    logging.info('Applying Preprocessing')
    train_processed, test_processed = apply_preprocessing(preprocessor, features_train, features_test)
    logging.info('Preprocessing Completed')

    # --- Oversampling (training data only) ---
    logging.info('Applying Smote')
    train_balanced, target_balanced = apply_smote(train_processed, target_train)
    logging.info('Smote Applied Successfully')

    # --- Model search ---
    logging.info('Training Model')
    model = train_flaml(train_balanced, target_balanced)
    logging.info('Training Completed')

    # --- Evaluation on the untouched test split ---
    logging.info('Model Evaluation Started')
    evaluate_model(model, test_processed, target_test)
    logging.info('Model Evaluation Completed')

    # --- Persist artifacts, model first and then the fitted preprocessor ---
    artifacts = (
        ("best_model.pkl", model),
        ("preprocessor.pkl", preprocessor),
    )
    for file_name, artifact in artifacts:
        with open(file_name, "wb") as handle:
            pickle.dump(artifact, handle)
        
# Run the pipeline only when this code is executed directly (running the
# notebook cell or the script), not when it is imported as a module.
if __name__ == "__main__":
    main()


Numerical Stats:
                         Features     Maximum     Minimum        Mean  \
0        Days_for_shipping_(real)     6.00000     0.00000    3.497654   
1   Days_for_shipment_(scheduled)     4.00000     0.00000    2.931847   
2               Benefit_per_order   911.79999 -4274.97998   21.974989   
3              Sales_per_customer  1939.98999     7.49000  183.107609   
4              Late_delivery_risk     1.00000     0.00000    0.548291   
5             Order_Item_Discount   500.00000     0.00000   20.664741   
6        Order_Item_Product_Price  2000.00000    10.00000  141.245016   
7         Order_Item_Profit_Ratio     0.50000    -2.75000    0.120647   
8             Order_Item_Quantity     5.00000     1.00000    2.127638   
9                           Sales  1999.98999     9.99000  203.772097   
10               Order_Item_Total  1939.98999     7.49000  183.107609   
11         Order_Profit_Per_Order   911.80000 -4274.98000   21.974989   
12                  Product_Price

In [14]:
# Ad-hoc inspection: reload the trimmed dataset and print each remaining
# column's dtype and non-null count.
df = data_ingestion()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 19 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Type                           180519 non-null  object 
 1   Days_for_shipping_(real)       180519 non-null  int64  
 2   Days_for_shipment_(scheduled)  180519 non-null  int64  
 3   Benefit_per_order              180519 non-null  float64
 4   Sales_per_customer             180519 non-null  float64
 5   Late_delivery_risk             180519 non-null  int64  
 6   Category_Name                  180519 non-null  object 
 7   Department_Name                180519 non-null  object 
 8   Market                         180519 non-null  object 
 9   Order_Item_Discount            180519 non-null  float64
 10  Order_Item_Product_Price       180519 non-null  int64  
 11  Order_Item_Profit_Ratio        180519 non-null  float64
 12  Order_Item_Quantity           