In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import random
from datetime import timedelta

# ------------------------------------------
# LOAD DATASET FROM GOOGLE DRIVE
# ------------------------------------------
file_path = "/content/drive/MyDrive/shipment_dataset_10000.xlsx"
df = pd.read_excel(file_path)

# Make a copy
anomalous_df = df.copy()

# 1. INTRODUCE MISSING VALUES (1%)
for col in anomalous_df.columns:
    anomalous_df.loc[anomalous_df.sample(frac=0.01).index, col] = np.nan

# 2. DUPLICATE ROWS

duplicates = anomalous_df.sample(frac=0.005)
anomalous_df = pd.concat([anomalous_df, duplicates], ignore_index=True)

# 3. DATATYPE MISMATCHES (string <-> numeric)
numeric_cols = anomalous_df.select_dtypes(include=[np.number]).columns

for col in numeric_cols[:2]:    # first 2 numeric columns
    idx = anomalous_df.sample(frac=0.01).index
    anomalous_df.loc[idx, col] = anomalous_df.loc[idx, col].astype(str)

# 4. OUTLIERS (Extreme numeric values)

for col in numeric_cols:
    clean_nums = pd.to_numeric(anomalous_df[col], errors="coerce")
    mean = clean_nums.mean()
    std = clean_nums.std()
    outlier_idx = anomalous_df.sample(frac=0.01).index
    anomalous_df.loc[outlier_idx, col] = mean + std * 20

# 5. ERRORS (Country names + Wrong dates)
# Wrong country names
if "Country" in anomalous_df.columns:
    wrong_list = ["Indai", "Unted Stats", "Germnay", "Frnace"]
    idx = anomalous_df.sample(frac=0.01).index
    anomalous_df.loc[idx, "Country"] = random.choice(wrong_list)

# Shipment date BEFORE order date
if "Order Date" in anomalous_df.columns and "Shipment Date" in anomalous_df.columns:
    idx = anomalous_df.sample(frac=0.01).index
    anomalous_df.loc[idx, "Shipment Date"] = anomalous_df.loc[idx, "Order Date"] - timedelta(days=random.randint(1, 10))

# SAVE ANOMALOUS DATASET BACK TO DRIVE
save_path = "/content/drive/MyDrive/anomalous_shipment_dataset.xlsx"
anomalous_df.to_excel(save_path, index=False)

print("Anomalous dataset created and saved to:", save_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


 '6900.0' '7598.0' '591.0' '8222.0' '1550.0' '8342.0' '5114.0' '1508.0'
 '870.0' 'nan' '2189.0' '1630.0' '3415.0' '1364.0' '5081.0' '6774.0'
 '5130.0' '5442.0' '5237.0' '5045.0' '6470.0' '6439.0' '7943.0' '6979.0'
 '1356.0' '3903.0' '1184.0' '5908.0' '6893.0' '62.0' '6486.0' '6422.0'
 '4058.0' '9102.0' '5967.0' '1325.0' '2587.0' '383.0' '3215.0' '4638.0'
 '3919.0' '6051.0' '2182.0' '6451.0' '4175.0' '3763.0' '9710.0' '767.0'
 '4615.0' '5503.0' '105.0' '8208.0' '142.0' '6488.0' '2797.0' '1713.0'
 '2397.0' '114.0' '406.0' '1509.0' '3974.0' '7594.0' '6986.0' '3612.0'
 '9332.0' '1515.0' '4309.0' '3594.0' '1843.0' '3137.0' '4000.0' '9906.0'
 '7729.0' '4904.0' '5513.0' '1816.0' '5931.0' '4278.0' '2659.0' '3971.0'
 '6453.0' '2770.0' '2088.0' '8451.0' '5192.0' '780.0' '3536.0' '1812.0'
 '8879.0' '5079.0' '3949.0' '8350.0']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  anomalous_df.loc[idx, col] = anomalous_df.loc[idx, col].astype(str)
 '5007.0' '18

Anomalous dataset created and saved to: /content/drive/MyDrive/anomalous_shipment_dataset.xlsx


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np

def fix_dataset(path):
    df = pd.read_excel(path)

    # 1. FIX MISSING VALUES
    df.fillna(method='ffill', inplace=True)
    df.fillna(method='bfill', inplace=True)

    # 2. REMOVE DUPLICATES

    df.drop_duplicates(inplace=True)

    # 3. FIX DATATYPE MISMATCHES
    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='ignore')

    # 4. FIX OUTLIERS USING IQR RULE

    numeric_cols = df.select_dtypes(include=[np.number]).columns

    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR

        df[col] = np.clip(df[col], lower, upper)


    # 5. FIX ERRORS

    # Country value corrections
    if "Country" in df.columns:
        corrections = {
            "Indai": "India",
            "Unted Stats": "United States",
            "Germnay": "Germany",
            "Frnace": "France"
        }
        df["Country"] = df["Country"].replace(corrections)

    # Shipment Date must be >= Order Date
    if "Order Date" in df.columns and "Shipment Date" in df.columns:
        wrong_dates = df["Shipment Date"] < df["Order Date"]
        df.loc[wrong_dates, "Shipment Date"] = df.loc[wrong_dates, "Order Date"]

    return df


if __name__ == "__main__":
    input_path = "/content/drive/MyDrive/anomalous_shipment_dataset.xlsx"
    output_path = "/content/drive/MyDrive/cleaned_shipment_dataset.xlsx"

    cleaned = fix_dataset(input_path)
    cleaned.to_excel(output_path, index=False)

    print(f"Cleaned dataset saved as: {output_path}")


  df.fillna(method='ffill', inplace=True)
  df.fillna(method='bfill', inplace=True)
  df[col] = pd.to_numeric(df[col], errors='ignore')


Cleaned dataset saved as: /content/drive/MyDrive/cleaned_shipment_dataset.xlsx
