In [13]:
# Importing Data Manipulation Libraries
import pandas as pd
import numpy as np

# Importing Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import Warnings
import warnings
warnings.filterwarnings('ignore')

# Importing Logging
import logging
logging.basicConfig(level=logging.INFO,
                    filename='classification_model.log',
                    filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    force=True)

# Import Machine Learning Libraries
from sklearn.model_selection import train_test_split,cross_val_score,KFold,GridSearchCV
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OneHotEncoder,LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from xgboost import XGBClassifier
from collections import OrderedDict


In [14]:
# data ingestion
def data_ingestion(path=r'C:\SupplyChain_PredictionModel\data\raw\SupplyChain_Dataset.csv'):
    """Load the supply-chain CSV and remove the columns that are not modelled.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the raw CSV file. Defaults to the location that was
        previously hard-coded, so existing ``data_ingestion()`` calls keep
        working; pass another path to run on a different machine or dataset.

    Returns
    -------
    pandas.DataFrame
        The loaded data without the columns listed in ``columns_to_drop``.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist (raised by ``pd.read_csv``).
    KeyError
        If any column in ``columns_to_drop`` is missing from the file.
    """
    # Customer identity fields, ids, location fields, dates and status fields.
    # NOTE(review): Delivery_Status / Order_Status are presumably dropped to
    # avoid leaking the target -- confirm with the dataset owner.
    columns_to_drop = [
        "Customer_Email",
        "Customer_Password",
        "Customer_Fname",
        "Customer_Lname",
        "Product_Image",
        "Product_Description",
        "Order_Id",
        "Customer_Id",
        'Customer_City',
        'Customer_Country',
        'Customer_Segment',
        'Customer_State',
        'Customer_Street',
        'Customer_Zipcode',
        'Order_City',
        'Order_Country',
        'Order_State',
        'Order_Zipcode',
        'Product_Status',
        "Order_Customer_Id",
        'Category_Id',
        'Latitude',
        'Longitude',
        'Order_Item_Id',
        'Product_Category_Id',
        'shipping_date_(DateOrders)',
        'order_date_(DateOrders)',
        'Product_Card_Id',
        'Order_Item_Cardprod_Id',
        'Department_Id',
        "Delivery_Status",
        "Order_Status",
        "Product_Name",
        'Order_Item_Discount_Rate',
    ]

    raw = pd.read_csv(path)
    # Return a new frame instead of mutating in place; the result is the same
    # but the function stays free of in-place side effects.
    return raw.drop(columns=columns_to_drop)

In [15]:
def data_exploration(df):
    """Build per-column summary reports for a DataFrame.

    Columns are split by dtype: everything that is not ``object`` is treated
    as numerical, ``object`` columns are treated as categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise. It is only read, never modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``numerical_stats_report`` with one row per numerical column
        (max, min, mean, median, Q1, Q3, IQR, skewness, kurtosis and an
        outlier comment based on the 1.5 * IQR rule), and
        ``categorical_stats_report`` with one row per categorical column
        (number of unique values, mode and value counts).
    """
    # Segregate numerical and categorical columns
    numerical_cols = df.select_dtypes(exclude='object').columns
    categorical_cols = df.select_dtypes(include='object').columns

    # Numerical stats
    numerical_stats = []
    for col in numerical_cols:
        series = df[col]

        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_whisker = q1 - 1.5 * iqr
        upper_whisker = q3 + 1.5 * iqr

        has_outliers = ((series < lower_whisker) | (series > upper_whisker)).any()

        # Plain dicts preserve insertion order, so the report columns come
        # out in the order written here.
        numerical_stats.append({
            "Features": col,
            "Maximum": series.max(),
            "Minimum": series.min(),
            "Mean": series.mean(),
            "Median": series.median(),
            "Q1": q1,
            "Q3": q3,
            "IQR": iqr,
            "Skewness": series.skew(),
            "Kurtosis": series.kurtosis(),
            "Outlier Comment": "Has Outliers" if has_outliers else "No Outliers",
        })

    numerical_stats_report = pd.DataFrame(numerical_stats)

    # Categorical stats
    categorical_stats = []
    for col in categorical_cols:
        series = df[col]

        # mode() ignores missing values, so a column that is entirely missing
        # yields an empty Series; report None instead of raising on [0].
        modes = series.mode()
        most_frequent = modes.iloc[0] if not modes.empty else None

        categorical_stats.append({
            "Features": col,
            "Unique_Values": series.nunique(),
            "Mode": most_frequent,
            "Value_Counts": series.value_counts().to_dict(),
        })

    categorical_stats_report = pd.DataFrame(categorical_stats)

    return numerical_stats_report, categorical_stats_report


In [16]:
def data_preprocessing(df):
    """Split, encode, scale and rebalance the data for classification.

    Steps, in order:

    1. Separate the ``Late_delivery_risk`` target from the features.
    2. 70/30 train/test split with ``random_state=1``.
    3. Label-encode every ``object`` column, fitting on the training split
       only. Categories that appear only in the test split are encoded as
       ``-1`` instead of raising.
    4. Min-max scale every column that is non-``object`` after encoding
       (this includes the freshly encoded categorical columns), fitting on
       the training split only.
    5. Oversample the training split with SMOTE; the test split is untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing a ``Late_delivery_risk`` column. It is not
        modified.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``Late_delivery_risk`` is not a column of ``df``.
    """
    # Code assigned to test-set categories that were never seen during fit.
    unseen_code = -1

    # Split the data into X and y
    X = df.drop(columns=['Late_delivery_risk'])
    y = df['Late_delivery_risk']

    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.3,
        random_state=1
    )

    # Work on explicit copies so the column assignments below never write
    # through to (or warn about) the frame the splits were taken from.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Encoding categorical columns
    categorical_cols = X_train.select_dtypes(include='object').columns

    for col in categorical_cols:
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col])
        # LabelEncoder.transform raises on labels it did not see in fit, which
        # makes the whole pipeline fail whenever a rare category lands only in
        # the test split. Map through the learned classes instead and send
        # anything unknown to ``unseen_code``.
        code_of = {label: code for code, label in enumerate(le.classes_)}
        X_test[col] = X_test[col].map(code_of).fillna(unseen_code).astype(int)

    # Scaling numerical columns. The selection happens after encoding, so the
    # encoded categorical columns are scaled as well (as in the original).
    numerical_cols = X_train.select_dtypes(exclude='object').columns

    for col in numerical_cols:
        ms = MinMaxScaler()
        X_train[col] = ms.fit_transform(X_train[[col]]).ravel()
        X_test[col] = ms.transform(X_test[[col]]).ravel()

    # Apply SMOTE only on training data
    smote = SMOTE(random_state=1)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test


In [21]:
from flaml import AutoML
from sklearn.metrics import f1_score
def automl_model(X_train, y_train, X_test, y_test, time_budget=60):
    """Fit a FLAML AutoML classifier and print its test-set scores.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``AutoML.fit``.
    X_test, y_test
        Held-out features and labels used only for the printed scores.
    time_budget : int, optional
        Search time budget handed to FLAML (seconds, per the FLAML docs).
        Defaults to 60.

    Returns
    -------
    AutoML
        The fitted ``AutoML`` instance.
    """
    automl = AutoML()

    settings = {
        # Honour the caller's budget; this used to be hard-coded to 60, so
        # the ``time_budget`` argument had no effect.
        "time_budget": time_budget,
        "metric": "f1",  # better for imbalanced classes
        "task": "classification",
        "estimator_list": ["lrl1", "extra_tree", "rf", "xgboost"],
        "log_file_name": "flaml_classification.log",
    }
    automl.fit(X_train=X_train, y_train=y_train, **settings)

    y_pred = automl.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))

    return automl

In [18]:
! pip install flaml lightgbm xgboost catboost


Defaulting to user installation because normal site-packages is not writeable
Collecting flaml
  Downloading flaml-2.5.0-py3-none-any.whl.metadata (13 kB)
Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Downloading flaml-2.5.0-py3-none-any.whl (337 kB)
   ---------------------------------------- 0.0/337.7 kB ? eta -:--:--
   ---------------------------------------- 0.0/337.7 kB ? eta -:--:--
   ---------------------------------------- 0.0/337.7 kB ? eta -:--:--
   ---------------------------------------- 0.0/337.7 kB ? eta -:--:--
   - -------------------------------------- 10.2/337.7 kB ? eta -:--:--
   - -------------------------------------- 10.2/337.7 kB ? eta -:--:--
   - -------------------------------------- 10.2/337.7 kB ? eta -:--:--
   --- --------

ERROR: Exception:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\pip\_vendor\urllib3\response.py", line 438, in _error_catcher
    yield
  File "C:\ProgramData\anaconda3\Lib\site-packages\pip\_vendor\urllib3\response.py", line 561, in read
    data = self._fp_read(amt) if not fp_closed else b""
           ^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\pip\_vendor\urllib3\response.py", line 527, in _fp_read
    return self._fp.read(amt) if amt is not None else self._fp.read()
           ^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 98, in read
    data: bytes = self.__fp.read(amt)
                  ^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\http\client.py", line 479, in read
    s = self.fp.read(amt)
        ^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\socket.py", line 708, in readinto
    return self._sock.recv_into(b)
           ^

In [22]:
def main():
    """Run the full pipeline: ingest, explore, preprocess, train and evaluate.

    Progress is written through the ``logging`` module. If any stage raises,
    the traceback is logged before the exception is re-raised, so a failed run
    leaves a record in the log.

    Returns
    -------
    The fitted model object returned by ``automl_model``, so it can be reused
    after the run instead of being discarded.
    """
    logging.info("Starting main function.")
    try:
        df = data_ingestion()
        logging.info("Data ingestion completed.")

        logging.info("Starting data exploration...")
        # The categorical report is computed but only the numerical one is shown.
        numerical_stats_report, categorical_stats_report = data_exploration(df)
        logging.info("Data exploration completed.")
        print(numerical_stats_report)

        logging.info("Starting data preprocessing...")
        X_train, X_test, y_train, y_test = data_preprocessing(df)
        logging.info("Data preprocessing completed.")

        logging.info("Starting model training and evaluation...")
        automl = automl_model(X_train, y_train, X_test, y_test, time_budget=60)
        logging.info("Model training and evaluation completed.")
    except Exception:
        # Record the failing stage's traceback, then let the error surface.
        logging.exception("Pipeline failed.")
        raise

    print(automl)
    return automl


is_main = __name__ == "__main__"
if is_main:
    main()

    

                         Features     Maximum     Minimum        Mean  \
0        Days_for_shipping_(real)     6.00000     0.00000    3.497654   
1   Days_for_shipment_(scheduled)     4.00000     0.00000    2.931847   
2               Benefit_per_order   911.79999 -4274.97998   21.974989   
3              Sales_per_customer  1939.98999     7.49000  183.107609   
4              Late_delivery_risk     1.00000     0.00000    0.548291   
5             Order_Item_Discount   500.00000     0.00000   20.664741   
6        Order_Item_Product_Price  2000.00000    10.00000  141.245016   
7         Order_Item_Profit_Ratio     0.50000    -2.75000    0.120647   
8             Order_Item_Quantity     5.00000     1.00000    2.127638   
9                           Sales  1999.98999     9.99000  203.772097   
10               Order_Item_Total  1939.98999     7.49000  183.107609   
11         Order_Profit_Per_Order   911.80000 -4274.98000   21.974989   
12                  Product_Price  1999.99000     9