In [6]:
import pandas as pd
import numpy as np
import os

# 1. Load the dataset
df = pd.read_csv("/content/DataCoSupplyChainDataset.csv", encoding='latin1')

# 2. Basic exploration
print(df.shape)
print(df.columns)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()
print(df.isnull().sum())

# 3. Drop columns that are not used downstream: customer identifying fields
#    (name, email, password), the product image link, and row identifiers.
cols_to_drop = ['Customer Email', 'Customer Fname', 'Customer Lname',
                'Customer Password', 'Product Image', 'Customer Id',
                'Order Id', 'Order Item Id']
df.drop(columns=cols_to_drop, inplace=True)

# 4. Convert dates to datetime format; values that cannot be parsed become NaT
date_cols = ['order date (DateOrders)', 'shipping date (DateOrders)']
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# 5. Feature Engineering: whole days elapsed between order and shipping.
#    Rows where either date is NaT get NaN here.
elapsed_days = (df['shipping date (DateOrders)'] - df['order date (DateOrders)']).dt.days
# NOTE(review): 'Delivery Delay' and 'Shipping Delay' hold identical values.
# Both names are kept so existing consumers of either column keep working;
# confirm whether they were meant to measure different things.
df['Delivery Delay'] = elapsed_days
df['Shipping Delay'] = elapsed_days
# NOTE(review): this marks every order shipped at least one full day after it
# was placed as late. The dataset also has 'Days for shipping (real)',
# 'Days for shipment (scheduled)' and 'Late_delivery_risk' columns; confirm
# which definition of "late" is intended.
# NOTE(review): the flag is computed before the imputation in step 8, so rows
# with a missing delay compare False and are labelled as not late.
df['Is Late Delivery'] = df['Delivery Delay'] > 0

# 6. Numeric (0/1) version of the late-delivery flag
df['Late Label'] = df['Is Late Delivery'].astype(int)

# 7. Clean categorical features (optional)
df['Shipping Mode'] = df['Shipping Mode'].str.strip().str.title()
df['Customer Segment'] = df['Customer Segment'].str.strip().str.title()

# 8. Fill missing delays with the column median.
#    Assign the result back instead of calling fillna(inplace=True) on
#    df[col]: that form operates on an intermediate object, emits a
#    FutureWarning on pandas 2.x and leaves df unchanged under Copy-on-Write.
for col in ['Delivery Delay', 'Shipping Delay']:
    df[col] = df[col].fillna(df[col].median())

# 9. Summary Stats
print(df[['Delivery Delay', 'Shipping Delay']].describe())
print(df['Late Label'].value_counts())

# 10. Save cleaned dataset (exist_ok avoids a separate existence check)
os.makedirs('data', exist_ok=True)
df.to_csv("data/cleaned_supply_chain.csv", index=False)

(180519, 53)
Index(['Type', 'Days for shipping (real)', 'Days for shipment (scheduled)',
       'Benefit per order', 'Sales per customer', 'Delivery Status',
       'Late_delivery_risk', 'Category Id', 'Category Name', 'Customer City',
       'Customer Country', 'Customer Email', 'Customer Fname', 'Customer Id',
       'Customer Lname', 'Customer Password', 'Customer Segment',
       'Customer State', 'Customer Street', 'Customer Zipcode',
       'Department Id', 'Department Name', 'Latitude', 'Longitude', 'Market',
       'Order City', 'Order Country', 'Order Customer Id',
       'order date (DateOrders)', 'Order Id', 'Order Item Cardprod Id',
       'Order Item Discount', 'Order Item Discount Rate', 'Order Item Id',
       'Order Item Product Price', 'Order Item Profit Ratio',
       'Order Item Quantity', 'Sales', 'Order Item Total',
       'Order Profit Per Order', 'Order Region', 'Order State', 'Order Status',
       'Order Zipcode', 'Product Card Id', 'Product Category Id',
     