In [4]:
import os

import pandas as pd
import psycopg2
import psycopg2.errors

# PostgreSQL configuration.
# Every setting can be overridden with an environment variable of the same
# name; the literals below are only fallbacks for a local development database.
# Do not put real credentials in the notebook - export PG_PASSWORD instead.
PG_HOST = os.environ.get("PG_HOST", "localhost")
PG_PORT = os.environ.get("PG_PORT", "5433")
PG_USER = os.environ.get("PG_USER", "postgres")
PG_PASSWORD = os.environ.get("PG_PASSWORD", "admin")
PG_DB = os.environ.get("PG_DB", "robot_vacuum")

def get_conn():
    """Open and return a new psycopg2 connection built from the PG_* settings.

    The caller owns the returned connection and is responsible for closing it.
    """
    settings = {
        "host": PG_HOST,
        "port": PG_PORT,
        "user": PG_USER,
        "password": PG_PASSWORD,
        "dbname": PG_DB,
    }
    return psycopg2.connect(**settings)


In [5]:
# DDL script executed as a single statement batch.
# It creates the robot_vacuum schema if missing, points search_path at it, then
# drops (CASCADE) and recreates all nine tables - running it is destructive for
# any data already loaded into those tables.
# Tables are created parents-first so every REFERENCES target already exists:
# Manufacturer, Customer, Product, Warehouse, DistributionCenter, then "Order",
# WarehouseProductStock, WarehouseDistributionCenter and Review.
# "Order" is quoted because ORDER is a reserved SQL keyword.
# NOTE(review): "Order".CardNumber is a plain VARCHAR column - confirm that only
# masked or tokenized card numbers are ever loaded into it.
DDL = r"""
CREATE SCHEMA IF NOT EXISTS robot_vacuum;
SET search_path TO robot_vacuum;

DROP TABLE IF EXISTS Review CASCADE;
DROP TABLE IF EXISTS WarehouseDistributionCenter CASCADE;
DROP TABLE IF EXISTS WarehouseProductStock CASCADE;
DROP TABLE IF EXISTS "Order" CASCADE;
DROP TABLE IF EXISTS DistributionCenter CASCADE;
DROP TABLE IF EXISTS Warehouse CASCADE;
DROP TABLE IF EXISTS Product CASCADE;
DROP TABLE IF EXISTS Customer CASCADE;
DROP TABLE IF EXISTS Manufacturer CASCADE;

CREATE TABLE Manufacturer (
    ManufacturerID      VARCHAR(20) PRIMARY KEY,
    ManufacturerName    VARCHAR(120) NOT NULL,
    Country             VARCHAR(60),
    LeadTimeDays        INT,
    ReliabilityScore    DECIMAL(5,2)
);

CREATE TABLE Customer (
    CustomerID              VARCHAR(20) PRIMARY KEY,
    CustomerName            VARCHAR(120) NOT NULL,
    CustomerEmail           VARCHAR(120) NOT NULL,
    CustomerStreetAddress   VARCHAR(150),
    CustomerZipCode         VARCHAR(10),
    BillingZipCode          VARCHAR(10),
    Segment                 VARCHAR(40)
);

CREATE TABLE Product (
    ProductID           VARCHAR(20) PRIMARY KEY,
    ProductName         VARCHAR(160) NOT NULL,
    ModelNumber         VARCHAR(80),
    ManufacturerID      VARCHAR(20) NOT NULL REFERENCES Manufacturer(ManufacturerID),
    UnitPrice           DECIMAL(10,2),
    ProductDescription  TEXT
);

CREATE TABLE Warehouse (
    WarehouseID             VARCHAR(20) PRIMARY KEY,
    WarehouseStreetAddress  VARCHAR(150),
    WarehouseZipCode        VARCHAR(10),
    WarehouseCapacity       INT
);

CREATE TABLE DistributionCenter (
    DistributionCenterID            VARCHAR(20) PRIMARY KEY,
    Region                          VARCHAR(60),
    DistributionCenterStreetAddress VARCHAR(150),
    DistributionCenterZipCode       VARCHAR(10),
    FleetSize                       INT
);

CREATE TABLE "Order" (
    OrderID                 VARCHAR(20) PRIMARY KEY,
    CustomerID              VARCHAR(20) NOT NULL REFERENCES Customer(CustomerID),
    ProductID               VARCHAR(20) NOT NULL REFERENCES Product(ProductID),
    WarehouseID             VARCHAR(20) NOT NULL REFERENCES Warehouse(WarehouseID),
    DistributionCenterID    VARCHAR(20) NOT NULL REFERENCES DistributionCenter(DistributionCenterID),

    Quantity                INT,
    UnitPrice               DECIMAL(10,2),
    DiscountAmount          DECIMAL(10,2),
    PromoCode               VARCHAR(40),
    TaxAmount               DECIMAL(10,2),
    ShippingCost            DECIMAL(10,2),
    CostOfGoods             DECIMAL(10,2),
    TotalAmount             DECIMAL(12,2),

    OrderDate               TIMESTAMP,
    ExpectedDeliveryDate    TIMESTAMP,
    ActualDeliveryDate      TIMESTAMP,

    DeliveryStatus          VARCHAR(40),
    PaymentMethod           VARCHAR(20),
    CardNumber              VARCHAR(40),
    CardBrand               VARCHAR(40),

    BillingZipCode          VARCHAR(10),
    DeliveryStreetAddress   VARCHAR(150),
    DeliveryZipCode         VARCHAR(10),
    ShippingCarrier         VARCHAR(80)
);

CREATE TABLE WarehouseProductStock (
    WarehouseID         VARCHAR(20) NOT NULL REFERENCES Warehouse(WarehouseID),
    ProductID           VARCHAR(20) NOT NULL REFERENCES Product(ProductID),
    StockLevel          INT,
    RestockThreshold    INT,
    LastRestockDate     TIMESTAMP,
    LastUpdated         TIMESTAMP,
    PRIMARY KEY (WarehouseID, ProductID)
);

CREATE TABLE WarehouseDistributionCenter (
    WarehouseID         VARCHAR(20) NOT NULL REFERENCES Warehouse(WarehouseID),
    DistributionCenterID VARCHAR(20) NOT NULL REFERENCES DistributionCenter(DistributionCenterID),
    PRIMARY KEY (WarehouseID, DistributionCenterID)
);

CREATE TABLE Review (
    ReviewID            VARCHAR(20) PRIMARY KEY,
    CustomerID          VARCHAR(20) NOT NULL REFERENCES Customer(CustomerID),
    ProductID           VARCHAR(20) NOT NULL REFERENCES Product(ProductID),
    ReviewRating        DECIMAL(2,1),
    ReviewDate          TIMESTAMP,
    ReviewText          TEXT,
    ReviewSentiment     VARCHAR(40)
);
"""

# Run the whole DDL script in one transaction and commit it.
conn = get_conn()
try:
    # The cursor context manager closes the cursor on exit, error or not.
    with conn.cursor() as cur:
        cur.execute(DDL)
    conn.commit()
finally:
    # Always release the connection; closing without a commit discards the
    # open transaction, so a failed script leaves no partial state behind.
    conn.close()
print("âœ… Schema robot_vacuum created & tables ready.")

âœ… Schema robot_vacuum created & tables ready.


In [7]:
csv_path = "../data/RobotVacuumDepot_MasterData.csv"  # adjust path if needed

# Identifier-like columns that the schema stores as VARCHAR. Reading them as
# text stops pandas from inferring a numeric dtype, which would drop leading
# zeros from ZIP codes and can lose digits of long card numbers once a missing
# value forces the column to float.
TEXT_COLUMNS = [
    "CustomerZipCode", "BillingZipCode", "DeliveryZipCode",
    "WarehouseZipCode", "DistributionCenterZipCode", "CardNumber",
]

df = pd.read_csv(csv_path, dtype={col: str for col in TEXT_COLUMNS})

print("Columns in CSV:")
print(df.columns.tolist())
print("Total rows:", len(df))

# Convert datetime-ish columns once; unparseable values become NaT
# (errors="coerce") instead of raising.
for col in ["OrderDate", "ExpectedDeliveryDate", "ActualDeliveryDate",
            "LastRestockDate", "LastUpdated", "ReviewDate"]:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce")


Columns in CSV:
['OrderID', 'OrderDate', 'CustomerID', 'CustomerName', 'CustomerEmail', 'CustomerZipCode', 'CustomerAddress', 'BillingZipCode', 'BillingAddress', 'DeliveryStatus', 'DeliveryAddress', 'DeliveryZipCode', 'ShippingCost', 'ShippingCarrier', 'Region', 'ProductID', 'ProductName', 'ProductDescription', 'ModelNumber', 'ManufacturerID', 'ManufacturerName', 'ProductPrice', 'TaxAmount', 'DiscountAmount', 'TotalAmount', 'StockLevel', 'WarehouseID', 'WarehouseStreetAddress', 'WarehouseZipCode', 'WarehouseCapacity', 'DistributionCenterID', 'DistributionCenterStreetAddress', 'DistributionCenterZipCode', 'Segment', 'LeadTimeDays', 'ReliabilityScore', 'UnitPrice', 'FleetSize', 'RestockThreshold', 'LastRestockDate', 'LastUpdated', 'Quantity', 'PromoCode', 'ExpectedDeliveryDate', 'ActualDeliveryDate', 'PaymentMethod', 'CardNumber', 'CardBrand', 'ReviewID', 'ReviewRating', 'ReviewText', 'ReviewDate', 'ReviewSentiment']
Total rows: 3771


In [8]:
import numpy as np
from datetime import datetime

def to_ts_or_none(value):
    """Normalize a possibly-missing timestamp cell for use as a query parameter.

    Returns None when pandas considers the value missing (NaN, NaT, None),
    a plain ``datetime.datetime`` when the value is a ``pd.Timestamp``,
    and the value itself, unchanged, for anything else.
    """
    is_missing = pd.isna(value)
    if is_missing:
        return None
    return value.to_pydatetime() if isinstance(value, pd.Timestamp) else value


In [9]:
# Distinct manufacturer rows taken from the flat order export.
manufacturers = df[["ManufacturerID", "ManufacturerName", "LeadTimeDays", "ReliabilityScore"]].drop_duplicates()
manufacturers = manufacturers[manufacturers["ManufacturerID"].notna()]

# Loop-invariant statement. Country has no source column in the CSV, so it is
# always bound to NULL.
manufacturer_insert_sql = """
        INSERT INTO robot_vacuum.Manufacturer
        (ManufacturerID, ManufacturerName, Country, LeadTimeDays, ReliabilityScore)
        VALUES (%s,%s,%s,%s,%s)
        ON CONFLICT (ManufacturerID) DO NOTHING
        """

conn = get_conn()
count = 0
try:
    with conn.cursor() as cur:
        for _, r in manufacturers.iterrows():
            row = (r["ManufacturerID"], r["ManufacturerName"], None,
                   r["LeadTimeDays"], r["ReliabilityScore"])
            # psycopg2 sends float NaN as the SQL value 'NaN', not NULL, so
            # missing CSV cells are mapped to None before binding.
            cur.execute(manufacturer_insert_sql,
                        tuple(None if pd.isna(v) else v for v in row))
            count += cur.rowcount  # 0 when ON CONFLICT skipped the row
    conn.commit()
finally:
    # Release the connection even if an insert raised.
    conn.close()
print(f"âœ… Manufacturer rows inserted: {count}")


âœ… Manufacturer rows inserted: 8


In [10]:
# Distinct customer rows taken from the flat order export.
customers = df[[
    "CustomerID", "CustomerName", "CustomerEmail",
    "CustomerAddress", "CustomerZipCode",
    "BillingZipCode", "Segment"
]].drop_duplicates()

customers = customers[customers["CustomerID"].notna()]

# Loop-invariant statement. The CSV column CustomerAddress feeds the table
# column CustomerStreetAddress.
customer_insert_sql = """
        INSERT INTO robot_vacuum.Customer
        (CustomerID, CustomerName, CustomerEmail,
         CustomerStreetAddress, CustomerZipCode, BillingZipCode, Segment)
        VALUES (%s,%s,%s,%s,%s,%s,%s)
        ON CONFLICT (CustomerID) DO NOTHING
        """

conn = get_conn()
count = 0
try:
    with conn.cursor() as cur:
        for _, r in customers.iterrows():
            row = (
                r["CustomerID"], r["CustomerName"], r["CustomerEmail"],
                r["CustomerAddress"], r["CustomerZipCode"],
                r["BillingZipCode"], r["Segment"]
            )
            # psycopg2 sends float NaN as the SQL value 'NaN', not NULL, so
            # missing CSV cells are mapped to None before binding.
            cur.execute(customer_insert_sql,
                        tuple(None if pd.isna(v) else v for v in row))
            count += cur.rowcount  # 0 when ON CONFLICT skipped the row
    conn.commit()
finally:
    # Release the connection even if an insert raised.
    conn.close()
print(f"âœ… Customer rows inserted: {count}")


âœ… Customer rows inserted: 2568


In [11]:
# Distinct product rows taken from the flat order export.
products = df[[
    "ProductID", "ProductName", "ModelNumber",
    "ManufacturerID", "ProductPrice", "ProductDescription"
]].drop_duplicates()

products = products[products["ProductID"].notna()]

# Loop-invariant statement. The CSV column ProductPrice feeds Product.UnitPrice.
# If a ProductID appears with differing attributes, only the first row seen is
# kept (ON CONFLICT DO NOTHING).
product_insert_sql = """
        INSERT INTO robot_vacuum.Product
        (ProductID, ProductName, ModelNumber, ManufacturerID, UnitPrice, ProductDescription)
        VALUES (%s,%s,%s,%s,%s,%s)
        ON CONFLICT (ProductID) DO NOTHING
        """

conn = get_conn()
count = 0
try:
    with conn.cursor() as cur:
        for _, r in products.iterrows():
            row = (
                r["ProductID"], r["ProductName"], r["ModelNumber"],
                r["ManufacturerID"], r["ProductPrice"], r["ProductDescription"]
            )
            # psycopg2 sends float NaN as the SQL value 'NaN', not NULL, so
            # missing CSV cells are mapped to None before binding.
            cur.execute(product_insert_sql,
                        tuple(None if pd.isna(v) else v for v in row))
            count += cur.rowcount  # 0 when ON CONFLICT skipped the row
    conn.commit()
finally:
    # Release the connection even if an insert raised.
    conn.close()
print(f"âœ… Product rows inserted: {count}")


âœ… Product rows inserted: 100


In [12]:
# Distinct warehouse rows taken from the flat order export.
warehouses = df[[
    "WarehouseID", "WarehouseStreetAddress",
    "WarehouseZipCode", "WarehouseCapacity"
]].drop_duplicates()

warehouses = warehouses[warehouses["WarehouseID"].notna()]

# Loop-invariant statement.
warehouse_insert_sql = """
        INSERT INTO robot_vacuum.Warehouse
        (WarehouseID, WarehouseStreetAddress, WarehouseZipCode, WarehouseCapacity)
        VALUES (%s,%s,%s,%s)
        ON CONFLICT (WarehouseID) DO NOTHING
        """

conn = get_conn()
count = 0
try:
    with conn.cursor() as cur:
        for _, r in warehouses.iterrows():
            row = (
                r["WarehouseID"], r["WarehouseStreetAddress"],
                r["WarehouseZipCode"], r["WarehouseCapacity"]
            )
            # psycopg2 sends float NaN as the SQL value 'NaN', not NULL, so
            # missing CSV cells are mapped to None before binding.
            cur.execute(warehouse_insert_sql,
                        tuple(None if pd.isna(v) else v for v in row))
            count += cur.rowcount  # 0 when ON CONFLICT skipped the row
    conn.commit()
finally:
    # Release the connection even if an insert raised.
    conn.close()
print(f"âœ… Warehouse rows inserted: {count}")


âœ… Warehouse rows inserted: 4


In [13]:
# Distinct distribution-center rows taken from the flat order export.
dcs = df[[
    "DistributionCenterID", "Region",
    "DistributionCenterStreetAddress", "DistributionCenterZipCode",
    "FleetSize"
]].drop_duplicates()

dcs = dcs[dcs["DistributionCenterID"].notna()]

# Loop-invariant statement.
dc_insert_sql = """
        INSERT INTO robot_vacuum.DistributionCenter
        (DistributionCenterID, Region, DistributionCenterStreetAddress,
         DistributionCenterZipCode, FleetSize)
        VALUES (%s,%s,%s,%s,%s)
        ON CONFLICT (DistributionCenterID) DO NOTHING
        """

conn = get_conn()
count = 0
try:
    with conn.cursor() as cur:
        for _, r in dcs.iterrows():
            row = (
                r["DistributionCenterID"], r["Region"],
                r["DistributionCenterStreetAddress"],
                r["DistributionCenterZipCode"], r["FleetSize"]
            )
            # psycopg2 sends float NaN as the SQL value 'NaN', not NULL, so
            # missing CSV cells are mapped to None before binding.
            cur.execute(dc_insert_sql,
                        tuple(None if pd.isna(v) else v for v in row))
            count += cur.rowcount  # 0 when ON CONFLICT skipped the row
    conn.commit()
finally:
    # Release the connection even if an insert raised.
    conn.close()
print(f"âœ… DistributionCenter rows inserted: {count}")


âœ… DistributionCenter rows inserted: 17


In [14]:
# Distinct (warehouse, product) stock rows taken from the flat order export.
wps = df[[
    "WarehouseID", "ProductID", "StockLevel",
    "RestockThreshold", "LastRestockDate", "LastUpdated"
]].drop_duplicates()

wps = wps[wps["WarehouseID"].notna() & wps["ProductID"].notna()]

# Loop-invariant statement. When a (WarehouseID, ProductID) pair appears with
# differing stock values, only the first row seen is kept.
wps_insert_sql = """
        INSERT INTO robot_vacuum.WarehouseProductStock
        (WarehouseID, ProductID, StockLevel, RestockThreshold, LastRestockDate, LastUpdated)
        VALUES (%s,%s,%s,%s,%s,%s)
        ON CONFLICT (WarehouseID, ProductID) DO NOTHING
        """

conn = get_conn()
count = 0
try:
    with conn.cursor() as cur:
        for _, r in wps.iterrows():
            row = (
                r["WarehouseID"], r["ProductID"], r["StockLevel"],
                r["RestockThreshold"], to_ts_or_none(r["LastRestockDate"]),
                to_ts_or_none(r["LastUpdated"])
            )
            # psycopg2 sends float NaN as the SQL value 'NaN', not NULL, so
            # missing CSV cells are mapped to None before binding.
            cur.execute(wps_insert_sql,
                        tuple(None if pd.isna(v) else v for v in row))
            count += cur.rowcount  # 0 when ON CONFLICT skipped the row
    conn.commit()
finally:
    # Release the connection even if an insert raised.
    conn.close()
print(f"âœ… WarehouseProductStock rows inserted: {count}")


âœ… WarehouseProductStock rows inserted: 400


In [15]:
# Distinct warehouse <-> distribution-center pairs seen in the order export.
# Both keys are filtered to non-null, so no NaN handling is needed below.
wdc = df[["WarehouseID", "DistributionCenterID"]].drop_duplicates()
wdc = wdc[wdc["WarehouseID"].notna() & wdc["DistributionCenterID"].notna()]

# Loop-invariant statement.
wdc_insert_sql = """
        INSERT INTO robot_vacuum.WarehouseDistributionCenter
        (WarehouseID, DistributionCenterID)
        VALUES (%s,%s)
        ON CONFLICT (WarehouseID, DistributionCenterID) DO NOTHING
        """

conn = get_conn()
count = 0
try:
    with conn.cursor() as cur:
        for _, r in wdc.iterrows():
            cur.execute(wdc_insert_sql,
                        (r["WarehouseID"], r["DistributionCenterID"]))
            count += cur.rowcount  # 0 when ON CONFLICT skipped the row
    conn.commit()
finally:
    # Release the connection even if an insert raised.
    conn.close()
print(f"âœ… WarehouseDistributionCenter rows inserted: {count}")


âœ… WarehouseDistributionCenter rows inserted: 32


In [16]:
# One row per order, taken from the flat export.
orders = df[[
    "OrderID","CustomerID","ProductID","WarehouseID","DistributionCenterID",
    "Quantity","UnitPrice","DiscountAmount","PromoCode","TaxAmount",
    "ShippingCost","TotalAmount","OrderDate","ExpectedDeliveryDate",
    "ActualDeliveryDate","DeliveryStatus","PaymentMethod","CardNumber",
    "CardBrand","BillingZipCode","DeliveryAddress","DeliveryZipCode",
    "ShippingCarrier"
]].drop_duplicates()

# CostOfGoods is not in the CSV; it is loaded as NULL. assign() builds a new
# frame, which avoids writing into a filtered slice (SettingWithCopyWarning).
orders = orders[orders["OrderID"].notna()].assign(CostOfGoods=None)

# Loop-invariant statement. The CSV column DeliveryAddress feeds the table
# column DeliveryStreetAddress.
order_insert_sql = """
        INSERT INTO robot_vacuum."Order"
        (OrderID, CustomerID, ProductID, WarehouseID, DistributionCenterID, Quantity,
         UnitPrice, DiscountAmount, PromoCode, TaxAmount, ShippingCost, CostOfGoods,
         TotalAmount, OrderDate, ExpectedDeliveryDate, ActualDeliveryDate, DeliveryStatus,
         PaymentMethod, CardNumber, CardBrand, BillingZipCode, DeliveryStreetAddress,
         DeliveryZipCode, ShippingCarrier)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        ON CONFLICT (OrderID) DO NOTHING
        """

conn = get_conn()
count = 0
try:
    with conn.cursor() as cur:
        for _, r in orders.iterrows():
            row = (
                r["OrderID"], r["CustomerID"], r["ProductID"], r["WarehouseID"], r["DistributionCenterID"],
                r["Quantity"], r["UnitPrice"], r["DiscountAmount"], r["PromoCode"], r["TaxAmount"],
                r["ShippingCost"], r["CostOfGoods"], r["TotalAmount"],
                to_ts_or_none(r["OrderDate"]),
                to_ts_or_none(r["ExpectedDeliveryDate"]),
                to_ts_or_none(r["ActualDeliveryDate"]),
                r["DeliveryStatus"],
                r["PaymentMethod"], r["CardNumber"], r["CardBrand"], r["BillingZipCode"],
                r["DeliveryAddress"], r["DeliveryZipCode"], r["ShippingCarrier"]
            )
            # psycopg2 sends float NaN as the SQL value 'NaN', not NULL, so
            # missing CSV cells (e.g. an absent PromoCode) are mapped to None
            # before binding.
            cur.execute(order_insert_sql,
                        tuple(None if pd.isna(v) else v for v in row))
            count += cur.rowcount  # 0 when ON CONFLICT skipped the row
    conn.commit()
finally:
    # Release the connection even if an insert raised.
    conn.close()
print(f"âœ… Order rows inserted: {count}")


âœ… Order rows inserted: 3771


In [17]:
# Distinct review rows; order rows without a review have a null ReviewID and
# are filtered out.
reviews = df[[
    "ReviewID","CustomerID","ProductID","ReviewRating",
    "ReviewDate","ReviewText","ReviewSentiment"
]].drop_duplicates()

reviews = reviews[reviews["ReviewID"].notna()]

# Loop-invariant statement.
review_insert_sql = """
            INSERT INTO robot_vacuum.Review
            (ReviewID, CustomerID, ProductID, ReviewRating, ReviewDate, ReviewText, ReviewSentiment)
            VALUES (%s,%s,%s,%s,%s,%s,%s)
            ON CONFLICT (ReviewID) DO NOTHING
            """

conn = get_conn()
inserted = 0
skipped_fk = 0
try:
    with conn.cursor() as cur:
        for _, r in reviews.iterrows():
            row = (
                r["ReviewID"], r["CustomerID"], r["ProductID"],
                r["ReviewRating"], to_ts_or_none(r["ReviewDate"]),
                r["ReviewText"], r["ReviewSentiment"]
            )
            try:
                # psycopg2 sends float NaN as the SQL value 'NaN', not NULL,
                # so missing CSV cells are mapped to None before binding.
                cur.execute(review_insert_sql,
                            tuple(None if pd.isna(v) else v for v in row))
                inserted += cur.rowcount  # 0 when ON CONFLICT skipped the row
            except psycopg2.errors.ForeignKeyViolation:
                # A failed statement aborts the transaction; roll it back so
                # the next row can be inserted, and count the skipped review.
                conn.rollback()
                skipped_fk += 1
            else:
                # Commit per row so a later rollback only discards the one
                # failing review, never previously inserted ones.
                conn.commit()
finally:
    # Release the connection even if an unexpected error was raised.
    conn.close()
print(f"âœ… Inserted Reviews: {inserted}")
print(f"â›” Skipped (missing customer/product FK): {skipped_fk}")


âœ… Inserted Reviews: 3150
â›” Skipped (missing customer/product FK): 0
