# importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
from tabulate import tabulate
import psycopg2
import sqlalchemy as sa
from sqlalchemy import create_engine, Column, Integer, String, Float, Date, ForeignKey, MetaData, Table, inspect, text,Numeric
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relationship
from datetime import datetime
from decimal import Decimal
import psycopg2
import warnings
warnings.filterwarnings('ignore')

# CSV Import

In [3]:
# Try different encodings
try:
    ecom = pd.read_csv('superstore.csv', encoding='utf-8')
except UnicodeDecodeError:
    # Try alternative encodings
    ecom = pd.read_csv('superstore.csv', encoding='latin-1')

In [4]:
ecom.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


# Making a copy of DF

In [5]:
#make a copy of the dataframe
ecom_new = ecom.copy()

# Feature Enginnering 

In [6]:
#change the column names to lowercase and replace spaces with underscores
ecom_new.columns = ecom_new.columns.str.lower().str.replace(' ', '_')

#drop the columns that are not needed
ecom_new.drop(columns=['row_id'], inplace=True)

In [7]:
#create new columns such as category_id and sub_category_id from product_id 
ecom_new["category_id"] = ecom_new["category"].apply(lambda x: x[:3].upper() + "-1000")
ecom_new["subcategory_id"] = ecom_new.apply(lambda row: f"{row['category'][:3].upper()}-{row['sub-category'][:2].upper()}-1000", axis=1)

#create a column unit price for product
ecom_new["unit_price"] = ecom_new["sales"] / ecom_new["quantity"]

In [8]:
ecom_new.head()

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,state,...,category,sub-category,product_name,sales,quantity,discount,profit,category_id,subcategory_id,unit_price
0,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,...,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136,FUR-1000,FUR-BO-1000,130.98
1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,...,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582,FUR-1000,FUR-CH-1000,243.98
2,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,...,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714,OFF-1000,OFF-LA-1000,7.31
3,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,...,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031,FUR-1000,FUR-TA-1000,191.5155
4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,...,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164,OFF-1000,OFF-ST-1000,11.184


In [9]:
ecom_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   order_id        9994 non-null   object 
 1   order_date      9994 non-null   object 
 2   ship_date       9994 non-null   object 
 3   ship_mode       9994 non-null   object 
 4   customer_id     9994 non-null   object 
 5   customer_name   9994 non-null   object 
 6   segment         9994 non-null   object 
 7   country         9994 non-null   object 
 8   city            9994 non-null   object 
 9   state           9994 non-null   object 
 10  postal_code     9994 non-null   int64  
 11  region          9994 non-null   object 
 12  product_id      9994 non-null   object 
 13  category        9994 non-null   object 
 14  sub-category    9994 non-null   object 
 15  product_name    9994 non-null   object 
 16  sales           9994 non-null   float64
 17  quantity        9994 non-null   i

# Define the main function

In [10]:
def create_normalized_database(df, connection_string):
    """
    Create a normalized database using the actual columns in the DataFrame
    
    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame with e-commerce data
    connection_string : str
        SQLAlchemy connection string for database connection
    """
    # Print column names to debug
    print("Available columns in DataFrame:", df.columns.tolist())
    
    # Create database engine
    engine = create_engine(connection_string, echo=False)

# Drop existing tables to start fresh

In [11]:
from sqlalchemy import create_engine, text

# Create a database engine
# Replace with your actual database URL
engine = create_engine('postgresql://postgres:postgres@localhost:5432/superstore_v6')
# Or for SQLite
# engine = create_engine('sqlite:///your_database.db')

# Your existing drop tables SQL
drop_tables_sql = """
    DROP TABLE IF EXISTS order_details CASCADE;
    DROP TABLE IF EXISTS orders CASCADE;
    DROP TABLE IF EXISTS products CASCADE;
    DROP TABLE IF EXISTS customers CASCADE;
    DROP TABLE IF EXISTS segments CASCADE;
    DROP TABLE IF EXISTS regions CASCADE;
    """
    
# Now execute with the defined engine
with engine.begin() as conn:
    conn.execute(text(drop_tables_sql))

OperationalError: (psycopg2.OperationalError) connection to server at "localhost" (127.0.0.1), port 5432 failed: Connection refused
	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (::1), port 5432 failed: Connection refused
	Is the server running on that host and accepting TCP/IP connections?

(Background on this error at: https://sqlalche.me/e/20/e3q8)

# Create new tables with proper schema

In [None]:
# Create tables with schema matching the diagram
create_tables_sql = """
    -- Create regions table
    CREATE TABLE regions (
        region_id INTEGER PRIMARY KEY,
        region VARCHAR(50) NOT NULL
    );

    -- Create segments table
    CREATE TABLE segments (
        segment_id INTEGER PRIMARY KEY,
        segment VARCHAR(50) NOT NULL
    );

    -- Create customers table
    CREATE TABLE customers (
        customer_id VARCHAR(50) PRIMARY KEY,
        customer_name VARCHAR(100) NOT NULL,
        segment_id INTEGER REFERENCES segments(segment_id),
        country VARCHAR(50),
        city VARCHAR(100),
        state VARCHAR(50),
        postal_code VARCHAR(20),
        region_id INTEGER REFERENCES regions(region_id)
    );

    -- Create products table
    CREATE TABLE products (
        product_id VARCHAR(50) PRIMARY KEY,
        product_name VARCHAR(255) NOT NULL,
        base_price DECIMAL(10, 2),
        category_id INTEGER,
        category VARCHAR(50),
        subcategory_id INTEGER,
        subcategory VARCHAR(50)
    );

    -- Create orders table
    CREATE TABLE orders (
        order_id VARCHAR(50) PRIMARY KEY,
        order_date DATE NOT NULL,
        ship_date DATE,
        ship_mode VARCHAR(50),
        customer_id VARCHAR(50) REFERENCES customers(customer_id)
    );

    -- Create order_details table
    CREATE TABLE order_details (
        order_id VARCHAR(50) REFERENCES orders(order_id),
        product_id VARCHAR(50) REFERENCES products(product_id),
        quantity INTEGER NOT NULL,
        discount DECIMAL(5, 2),
        sales DECIMAL(10, 2),
        profit DECIMAL(10, 2),
        PRIMARY KEY (order_id, product_id)
    );
    """
    
    # Execute each statement separately for better error handling
with engine.begin() as conn:
        for stmt in create_tables_sql.split(';'):
            if stmt.strip():
                conn.execute(text(stmt))

# Extract reference data for lookup tables

In [None]:
# Create regions table
regions_df = ecom_new[['region']].drop_duplicates().reset_index(drop=True)
regions_df['region_id'] = regions_df.index + 1  # Create sequential IDs
	
# Create segments table
segments_df = ecom_new[['segment']].drop_duplicates().reset_index(drop=True)
segments_df['segment_id'] = segments_df.index + 1  # Create sequential IDs
	
# Create mapping dictionaries
region_id_map = dict(zip(regions_df['region'], regions_df['region_id']))
segment_id_map = dict(zip(segments_df['segment'], segments_df['segment_id']))

# Prepare customers table

In [None]:
# Create customers table
customers_df = ecom_new[['customer_id', 'customer_name', 'segment', 
                'country', 'city', 'state', 'postal_code', 'region']].drop_duplicates('customer_id')
    
# Map segment and region to IDs
customers_df['segment_id'] = customers_df['segment'].map(segment_id_map)
customers_df['region_id'] = customers_df['region'].map(region_id_map)
    
# Remove redundant columns
customers_df = customers_df.drop(['segment', 'region'], axis=1)

# Prepare products table

In [None]:
# Check which product columns actually exist
product_cols = ['product_id', 'product_name', 'category_id', 'category', 'subcategory_id']
    
# Check for subcategory column variants
if 'subcategory' in ecom_new.columns:
    product_cols.append('subcategory')
elif 'sub-category' in ecom_new.columns:
    product_cols.append('sub-category')
    
# Check for price column variants
if 'base_price' in ecom_new.columns:
    product_cols.append('base_price')
elif 'unit_price' in ecom_new.columns:
    product_cols.append('unit_price')
        
# Get available product columns
available_product_cols = [col for col in product_cols if col in ecom_new.columns]
products_df = ecom_new[available_product_cols].drop_duplicates('product_id')
    
# Rename columns if needed for database schema
rename_map = {}
if 'sub-category' in products_df.columns:
    rename_map['sub-category'] = 'subcategory'
if 'unit_price' in products_df.columns and 'base_price' not in products_df.columns:
    rename_map['unit_price'] = 'base_price'

if rename_map:
    products_df = products_df.rename(columns=rename_map)

# Convert category_id and subcategory_id to integers by extracting numeric parts if they're strings
for col in ['category_id', 'subcategory_id']:
    if col in products_df.columns and products_df[col].dtype == 'object':
        try:
            products_df[col] = products_df[col].str.extract(r'(\d+)').astype(int)
        except:
            pass  # If conversion fails, keep as string

# Prepare orders and order_details tables

In [None]:
# Create orders table
orders_df = ecom_new[['order_id', 'order_date', 'ship_date', 
                   'ship_mode', 'customer_id']].drop_duplicates('order_id')

# Convert date columns to datetime
orders_df['order_date'] = pd.to_datetime(orders_df['order_date'])
orders_df['ship_date'] = pd.to_datetime(orders_df['ship_date'])

# Create order_details table
order_details_df = ecom_new[['order_id', 'product_id', 'quantity', 
                          'discount', 'sales', 'profit']]

# Check for duplicates in order_details
duplicate_count = order_details_df.duplicated(['order_id', 'product_id']).sum()
if duplicate_count > 0:
    print(f"Found {duplicate_count} duplicate order_id/product_id combinations. Aggregating...")
    
    # Aggregate duplicates
    order_details_df = order_details_df.groupby(['order_id', 'product_id']).agg({
        'quantity': 'sum',
        'discount': 'mean',
        'sales': 'sum',
        'profit': 'sum'
    }).reset_index()

Found 8 duplicate order_id/product_id combinations. Aggregating...


# Import data to the database

In [None]:
# Import data in the correct order to satisfy foreign key constraints
regions_df.to_sql('regions', engine, if_exists='append', index=False)
segments_df.to_sql('segments', engine, if_exists='append', index=False)
customers_df.to_sql('customers', engine, if_exists='append', index=False)
products_df.to_sql('products', engine, if_exists='append', index=False)
orders_df.to_sql('orders', engine, if_exists='append', index=False)
order_details_df.to_sql('order_details', engine, if_exists='append', index=False)

986

# Display sample data from each table

In [None]:
# Print 5 rows from each table to verify import
table_names = ['regions', 'segments', 'customers', 'products', 'orders', 'order_details']
    
for table in table_names:
        print(f"\n=== 5 rows from {table} table ===")
        try:
            query = f"SELECT * FROM {table} LIMIT 5;"
            result = pd.read_sql(query, engine)
            
            if len(result) > 0:
                print(", ".join(result.columns))
                for _, row in result.iterrows():
                    print(", ".join(str(val) for val in row.values))
            else:
                print(f"No data in {table} table")
        except Exception as e:
            print(f"Error querying {table}: {str(e)}")
    
print("\nDatabase creation and population complete!")


=== 5 rows from regions table ===
region_id, region
1, South
2, West
3, Central
4, East

=== 5 rows from segments table ===
segment_id, segment
1, Consumer
2, Corporate
3, Home Office

=== 5 rows from customers table ===
customer_id, customer_name, segment_id, country, city, state, postal_code, region_id
CG-12520, Claire Gute, 1, United States, Henderson, Kentucky, 42420, 1
DV-13045, Darrin Van Huff, 2, United States, Los Angeles, California, 90036, 2
SO-20335, Sean O'Donnell, 1, United States, Fort Lauderdale, Florida, 33311, 1
BH-11710, Brosina Hoffman, 1, United States, Los Angeles, California, 90032, 2
AA-10480, Andrew Allen, 1, United States, Concord, North Carolina, 28027, 1

=== 5 rows from products table ===
product_id, product_name, base_price, category_id, category, subcategory_id, subcategory
FUR-BO-10001798, Bush Somerset Collection Bookcase, 130.98, 1000, Furniture, 1000, Bookcases
FUR-CH-10000454, Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back, 243.98, 1000,

# Call the function

In [None]:
# Load your data
# df = pd.read_csv('your_data.csv')  # or however you load your data

# Create the normalized database
create_normalized_database(ecom_new, 'postgresql://postgres:postgres@localhost:5432/superstore_v6')

Available columns in DataFrame: ['order_id', 'order_date', 'ship_date', 'ship_mode', 'customer_id', 'customer_name', 'segment', 'country', 'city', 'state', 'postal_code', 'region', 'product_id', 'category', 'sub-category', 'product_name', 'sales', 'quantity', 'discount', 'profit', 'category_id', 'subcategory_id', 'unit_price']


In [None]:
#exporting all the tables to csv files
regions_df.to_csv('regions.csv', index=False)
segments_df.to_csv('segments.csv', index=False)
customers_df.to_csv('customers.csv', index=False)
products_df.to_csv('products.csv', index=False)
orders_df.to_csv('orders.csv', index=False)
order_details_df.to_csv('order_details.csv', index=False)
