# importing Libraries

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
from tabulate import tabulate
import psycopg2

# importing CSV

In [6]:
# Try different encodings
try:
    ecom = pd.read_csv('superstore.csv', encoding='utf-8')
except UnicodeDecodeError:
    # Try alternative encodings
    ecom = pd.read_csv('superstore.csv', encoding='latin-1')

In [8]:
#create a copy of the dataset
ecom_copy = ecom.copy()

#display the first few rows of the copied dataset
ecom_copy.head()




Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [9]:
#change the column names to lowercase and replace spaces with underscores
ecom_copy.columns = ecom_copy.columns.str.lower().str.replace(' ', '_')

#display the first few rows of the dataset
ecom_copy.head()

# number of rows and columns in the dataset
print(ecom_copy.shape)

# number of unique customers in the dataset
print(ecom_copy['customer_id'].nunique())

#show the data types of the columns
print(ecom_copy.dtypes)

#drop row_id column
ecom_copy.drop('row_id', axis=1, inplace=True)



(9994, 21)
793
row_id             int64
order_id          object
order_date        object
ship_date         object
ship_mode         object
customer_id       object
customer_name     object
segment           object
country           object
city              object
state             object
postal_code        int64
region            object
product_id        object
category          object
sub-category      object
product_name      object
sales            float64
quantity           int64
discount         float64
profit           float64
dtype: object


# Converting Column types

In [10]:
#change the column types to the appropriate data types
ecom_copy['order_date'] = pd.to_datetime(ecom_copy['order_date'])  # Already datetime64[ns]
ecom_copy['ship_date'] = pd.to_datetime(ecom_copy['ship_date'])    # Already datetime64[ns]
ecom_copy['customer_id'] = ecom_copy['customer_id'].astype('category')  # Change from object to category
ecom_copy['product_id'] = ecom_copy['product_id'].astype('category')    # Change from object to category
ecom_copy['category'] = ecom_copy['category'].astype('category')        # Change from object to category
ecom_copy['product_name'] = ecom_copy['product_name'].astype(str)       # Keep as string (object)
ecom_copy['sales'] = ecom_copy['sales'].astype(float)              # Already float64
ecom_copy['quantity'] = ecom_copy['quantity'].astype(int)          # Already int64
ecom_copy['discount'] = ecom_copy['discount'].astype(float)        # Already float64
ecom_copy['profit'] = ecom_copy['profit'].astype(float)            # Already float64
ecom_copy['customer_name'] = ecom_copy['customer_name'].astype('category')  # Change from object to category
ecom_copy['segment'] = ecom_copy['segment'].astype('category')          # Change from object to category
ecom_copy['city'] = ecom_copy['city'].astype('category')                # Change from object to category
ecom_copy['state'] = ecom_copy['state'].astype('category')              # Change from object to category
ecom_copy['country'] = ecom_copy['country'].astype('category')          # Change from object to category
ecom_copy['region'] = ecom_copy['region'].astype('category')            # Change from object to category
ecom_copy['order_id'] = ecom_copy['order_id'].astype('category')        # Change from object to category
ecom_copy['ship_mode'] = ecom_copy['ship_mode'].astype('category')      # Change from object to category
ecom_copy['postal_code'] = ecom_copy['postal_code'].astype(str)         # Change from int64 to string
ecom_copy['sub-category'] = ecom_copy['sub-category'].astype('category')  # Change from object to category

# Convert postal_code from object to category
ecom_copy['postal_code'] = ecom_copy['postal_code'].astype('category')

# Optionally convert product_name to category if there are many duplicates
# First check the cardinality ratio
product_name_ratio = ecom_copy['product_name'].nunique() / len(ecom_copy)
if product_name_ratio < 0.5:  # If less than 50% unique values
    ecom_copy['product_name'] = ecom_copy['product_name'].astype('category')
    




In [11]:
ecom_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   order_id       9994 non-null   category      
 1   order_date     9994 non-null   datetime64[ns]
 2   ship_date      9994 non-null   datetime64[ns]
 3   ship_mode      9994 non-null   category      
 4   customer_id    9994 non-null   category      
 5   customer_name  9994 non-null   category      
 6   segment        9994 non-null   category      
 7   country        9994 non-null   category      
 8   city           9994 non-null   category      
 9   state          9994 non-null   category      
 10  postal_code    9994 non-null   category      
 11  region         9994 non-null   category      
 12  product_id     9994 non-null   category      
 13  category       9994 non-null   category      
 14  sub-category   9994 non-null   category      
 15  product_name   9994 n

# Feature Enginnering

In [12]:
def enrich_ecom_data(df):
    """
    Adds category, subcategory, segment, and region IDs to the DataFrame.
    
    Args:
        df (pandas.DataFrame): Original DataFrame containing columns:
                              'Product ID', 'Segment', 'Region'
        
    Returns:
        pandas.DataFrame: Enriched DataFrame with ID columns
    """
    # Create a copy to avoid modifying the original
    result_df = df.copy()
    
    # 1. Extract category_id and subcategory_id from Product ID
    result_df['category_id'] = result_df['product_id'].apply(
        lambda x: x.split('-')[0] + '-' + x.split('-')[-1][:4]
    )
    
    result_df['subcategory_id'] = result_df['product_id'].apply(
        lambda x: x.split('-')[0] + '-' + x.split('-')[1] + '-' + x.split('-')[-1][:4]
    )
    
    # 2. Define the segment ID mapping
    segment_id_map = {
        'Consumer': 'CONS-1000',
        'Corporate': 'CORP-1000',
        'Home Office': 'HOME-1000'
    }
    
    # Add the segment_id column
    result_df['segment_id'] = result_df['segment'].map(segment_id_map)
    
    # 3. Define the region ID mapping
    region_id_map = {
        'Central': 'CENT-1000',
        'East': 'EAST-1000',
        'South': 'SOUT-1000',
        'West': 'WEST-1000'
    }
    
    # Add the region_id column
    result_df['region_id'] = result_df['region'].map(region_id_map)
    
    return result_df

# Apply to your ecom_copy DataFrame
ecom_copy = enrich_ecom_data(ecom_copy)



# Creating Tables using SQLite 

In [31]:
def create_database_tables(db_name='ecom_v2.db'):
    """
    Step 1: Creates tables in SQLite database without populating them.
    
    Args:
        db_name (str): Name of the SQLite database file
        
    Returns:
        sqlite3.Connection: The database connection
    """
    # Connect to the database (creates it if it doesn't exist)
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    
    print(f"Creating database: {db_name}")
    
    # 1. Lookup tables
    print("Creating lookup tables...")
    
    # Segments lookup table
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS segments (
        segment_id TEXT PRIMARY KEY,
        segment_name TEXT UNIQUE
    )
    ''')
    
    # Regions lookup table
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS regions (
        region_id TEXT PRIMARY KEY,
        region_name TEXT UNIQUE
    )
    ''')
    
    # Categories lookup table
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS categories (
        category_id TEXT PRIMARY KEY,
        category_name TEXT UNIQUE
    )
    ''')
    
    # Subcategories lookup table
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS subcategories (
        subcategory_id TEXT PRIMARY KEY,
        subcategory_name TEXT,
        category_id TEXT,
        FOREIGN KEY (category_id) REFERENCES categories (category_id)
    )
    ''')
    
    # 2. Main tables
    print("Creating main tables...")
    
    # Customers table
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS customers (
        customer_id TEXT PRIMARY KEY,
        customer_name TEXT,
        segment_id TEXT,
        country TEXT,
        city TEXT,
        state TEXT,
        postal_code TEXT,
        region_id TEXT,
        FOREIGN KEY (segment_id) REFERENCES segments (segment_id),
        FOREIGN KEY (region_id) REFERENCES regions (region_id)
    )
    ''')
    
    # Products table
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS products (
        product_id TEXT PRIMARY KEY,
        product_name TEXT,
        category_id TEXT,
        subcategory_id TEXT,
        FOREIGN KEY (category_id) REFERENCES categories (category_id),
        FOREIGN KEY (subcategory_id) REFERENCES subcategories (subcategory_id)
    )
    ''')
    
    # Orders table
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS orders (
        order_id TEXT PRIMARY KEY,
        order_date TEXT,
        ship_date TEXT,
        ship_mode TEXT,
        customer_id TEXT,
        FOREIGN KEY (customer_id) REFERENCES customers (customer_id)
    )
    ''')
    
    # Enhanced Order details table with feature engineering
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS order_details (
        order_id TEXT,
        product_id TEXT,
        quantity INTEGER,
        sales REAL,
        discount REAL,
        profit REAL,
        
        -- Feature engineered fields
        unit_price REAL,
        price_before_discount REAL,
        discount_amount REAL,
        cost_per_unit REAL,
        margin_percentage REAL,
        
        PRIMARY KEY (order_id, product_id),
        FOREIGN KEY (order_id) REFERENCES orders (order_id),
        FOREIGN KEY (product_id) REFERENCES products (product_id)
    )
    ''')
    
    conn.commit()
    print(f"Tables created successfully in {db_name}")
    
    return conn

# Populating the database tables with data

In [32]:
def populate_database_tables(conn, ecom_copy):
    """
    Step 2: Populates the database tables with data from the enriched DataFrame.
    
    Args:
        conn (sqlite3.Connection): The database connection
        ecom_copy (pandas.DataFrame): Enriched DataFrame with the column names as shown
        
    Returns:
        sqlite3.Connection: The database connection
    """
    cursor = conn.cursor()
    
    print("Populating database tables...")
    
    # 1. Populate lookup tables
    print("Populating lookup tables...")
    
    # Segments
    segments_df = ecom_copy[['segment_id', 'segment']].drop_duplicates()
    for _, row in segments_df.iterrows():
        cursor.execute(
            "INSERT OR IGNORE INTO segments (segment_id, segment_name) VALUES (?, ?)",
            (row['segment_id'], row['segment'])
        )
    
    # Regions
    regions_df = ecom_copy[['region_id', 'region']].drop_duplicates()
    for _, row in regions_df.iterrows():
        cursor.execute(
            "INSERT OR IGNORE INTO regions (region_id, region_name) VALUES (?, ?)",
            (row['region_id'], row['region'])
        )
    
    # Categories
    categories_df = ecom_copy[['category_id', 'category']].drop_duplicates()
    for _, row in categories_df.iterrows():
        cursor.execute(
            "INSERT OR IGNORE INTO categories (category_id, category_name) VALUES (?, ?)",
            (row['category_id'], row['category'])
        )
    
    # Subcategories
    subcategories_df = ecom_copy[['subcategory_id', 'sub-category', 'category_id']].drop_duplicates()
    for _, row in subcategories_df.iterrows():
        cursor.execute(
            "INSERT OR IGNORE INTO subcategories (subcategory_id, subcategory_name, category_id) VALUES (?, ?, ?)",
            (row['subcategory_id'], row['sub-category'], row['category_id'])
        )
    
    # 2. Populate main tables
    print("Populating main tables...")
    
    # Customers
    customers_df = ecom_copy[['customer_id', 'customer_name', 'segment_id', 'country', 'city', 
                       'state', 'postal_code', 'region_id']].drop_duplicates()
    for _, row in customers_df.iterrows():
        cursor.execute('''
        INSERT OR IGNORE INTO customers 
        (customer_id, customer_name, segment_id, country, city, state, postal_code, region_id)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ''', (row['customer_id'], row['customer_name'], row['segment_id'], row['country'], 
              row['city'], row['state'], row['postal_code'], row['region_id']))
    
    # Products
    products_df = ecom_copy[['product_id', 'product_name', 'category_id', 'subcategory_id']].drop_duplicates()
    for _, row in products_df.iterrows():
        cursor.execute('''
        INSERT OR IGNORE INTO products
        (product_id, product_name, category_id, subcategory_id)
        VALUES (?, ?, ?, ?)
        ''', (row['product_id'], row['product_name'], row['category_id'], row['subcategory_id']))
    
    # Orders - Converting timestamps to strings to avoid SQLite binding errors
    orders_df = ecom_copy[['order_id', 'order_date', 'ship_date', 'ship_mode', 'customer_id']].drop_duplicates()
    for _, row in orders_df.iterrows():
        # Convert timestamps to strings in ISO format
        order_date_str = row['order_date'].strftime('%Y-%m-%d') if pd.notna(row['order_date']) else None
        ship_date_str = row['ship_date'].strftime('%Y-%m-%d') if pd.notna(row['ship_date']) else None
        
        cursor.execute('''
        INSERT OR IGNORE INTO orders
        (order_id, order_date, ship_date, ship_mode, customer_id)
        VALUES (?, ?, ?, ?, ?)
        ''', (row['order_id'], order_date_str, ship_date_str, 
              row['ship_mode'], row['customer_id']))
    
    # Order details with feature engineering
    for _, row in ecom_copy.iterrows():
        # Handle edge cases
        quantity = row['quantity'] if row['quantity'] > 0 else 1  # Avoid division by zero
        sales = row['sales'] if row['sales'] != 0 else 0.01  # Avoid division by zero
        discount = row['discount'] if 0 <= row['discount'] < 1 else 0  # Ensure discount is valid
        
        # Calculate derived metrics
        unit_price = sales / quantity
        price_before_discount = sales / (1 - discount) if discount < 1 else sales
        discount_amount = price_before_discount - sales
        cost = sales - row['profit']
        cost_per_unit = cost / quantity
        margin_percentage = (row['profit'] / sales) * 100 if sales > 0 else 0
        
        cursor.execute('''
        INSERT OR IGNORE INTO order_details
        (order_id, product_id, quantity, sales, discount, profit, 
         unit_price, price_before_discount, discount_amount, cost_per_unit, margin_percentage)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (row['order_id'], row['product_id'], row['quantity'], 
              row['sales'], row['discount'], row['profit'],
              round(unit_price, 2), round(price_before_discount, 2), 
              round(discount_amount, 2), round(cost_per_unit, 2), round(margin_percentage, 2)))
    
    # Commit changes
    conn.commit()
    print("Tables populated successfully")
    
    return conn


# Testing the DB 

In [35]:
def verify_database_population(db_name='ecom_v2.db'):
    """
    Runs a simple query to verify data was properly populated across all main tables.
    
    Args:
        db_name (str): Name of the SQLite database file
    """
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    
    print(f"Verifying data population in {db_name}...")
    
    # 1. Check record counts in all tables
    tables = ['segments', 'regions', 'categories', 'subcategories', 
              'customers', 'products', 'orders', 'order_details']
    
    print("\nTable record counts:")
    for table in tables:
        cursor.execute(f"SELECT COUNT(*) FROM {table}")
        count = cursor.fetchone()[0]
        print(f"- {table}: {count} records")
    
    # 2. Run a simple join across all tables to verify relationships
    query = """
    SELECT 
        o.order_id,
        o.order_date,
        c.customer_name,
        s.segment_name,
        r.region_name,
        p.product_name,
        cat.category_name,
        subcat.subcategory_name,
        od.quantity,
        od.sales,
        od.profit,
        od.unit_price,
        od.margin_percentage
    FROM order_details od
    JOIN orders o ON od.order_id = o.order_id
    JOIN customers c ON o.customer_id = c.customer_id
    JOIN segments s ON c.segment_id = s.segment_id
    JOIN regions r ON c.region_id = r.region_id
    JOIN products p ON od.product_id = p.product_id
    JOIN categories cat ON p.category_id = cat.category_id
    JOIN subcategories subcat ON p.subcategory_id = subcat.subcategory_id
    LIMIT 5
    """
    
    try:
        results = pd.read_sql_query(query, conn)
        print("\nSuccessfully joined all tables!")
        print("\nSample data from complete join:")
        print(results)
        
        if len(results) > 0:
            print("\n✅ Database verification complete: Data successfully populated across all tables")
        else:
            print("\n❌ Database verification failed: Join query returned no results")
            
    except Exception as e:
        print(f"\n❌ Database verification failed: {str(e)}")
    
    # 3. Extra verification: Check feature-engineered fields calculation
    verification_query = """
    SELECT
        od.order_id,
        od.product_id,
        od.quantity,
        od.sales,
        od.discount,
        od.profit,
        od.unit_price,
        od.price_before_discount,
        od.discount_amount,
        od.cost_per_unit,
        od.margin_percentage,
        -- Verify calculations are correct
        (od.sales / od.quantity) AS calculated_unit_price,
        (od.sales / (1 - od.discount)) AS calculated_price_before_discount,
        ((od.sales / (1 - od.discount)) - od.sales) AS calculated_discount_amount,
        ((od.sales - od.profit) / od.quantity) AS calculated_cost_per_unit,
        ((od.profit / od.sales) * 100) AS calculated_margin_percentage
    FROM order_details od
    WHERE od.quantity > 0 
      AND od.sales > 0
      AND od.discount < 1
    LIMIT 3
    """
    
    try:
        verification_results = pd.read_sql_query(verification_query, conn)
        print("\nFeature engineering verification:")
        
        with pd.option_context('display.max_columns', None, 'display.width', 1000):
            print(verification_results)
            
        # Calculate discrepancies
        if len(verification_results) > 0:
            discrepancies = []
            
            for col in ['unit_price', 'price_before_discount', 'discount_amount', 'cost_per_unit', 'margin_percentage']:
                calc_col = f'calculated_{col}'
                # Check if values are within 0.01 of each other (account for rounding differences)
                diff = abs(verification_results[col] - verification_results[calc_col]).max()
                if diff > 0.01:
                    discrepancies.append(f"{col} (max diff: {diff:.4f})")
            
            if discrepancies:
                print(f"\n⚠️ Feature engineering discrepancies found in: {', '.join(discrepancies)}")
            else:
                print("\n✅ Feature engineering calculations verified correctly")
    except Exception as e:
        print(f"\n❌ Feature engineering verification failed: {str(e)}")
    
    conn.close()
    print("\nDatabase connection closed.")

if __name__ == "__main__":
    # Run the verification
    verify_database_population()

Verifying data population in ecom_v2.db...

Table record counts:
- segments: 3 records
- regions: 4 records
- categories: 3 records
- subcategories: 17 records
- customers: 793 records
- products: 1862 records
- orders: 5009 records
- order_details: 9986 records

Successfully joined all tables!

Sample data from complete join:
         order_id  order_date    customer_name segment_name region_name  \
0  CA-2016-152156  2016-11-08      Claire Gute     Consumer       South   
1  CA-2016-152156  2016-11-08      Claire Gute     Consumer       South   
2  CA-2016-138688  2016-06-12  Darrin Van Huff    Corporate        West   
3  US-2015-108966  2015-10-11   Sean O'Donnell     Consumer       South   
4  US-2015-108966  2015-10-11   Sean O'Donnell     Consumer       South   

                                        product_name    category_name  \
0                  Bush Somerset Collection Bookcase        Furniture   
1  Hon Deluxe Fabric Upholstered Stacking Chairs,...        Furniture   
2

In [36]:
# Reopen the connection
conn = sqlite3.connect('ecom_v2.db')

# Load individual tables
orders_df = pd.read_sql("SELECT * FROM orders", conn)
customers_df = pd.read_sql("SELECT * FROM customers", conn)
products_df = pd.read_sql("SELECT * FROM products", conn)
order_details_df = pd.read_sql("SELECT * FROM order_details", conn)
categories_df = pd.read_sql("SELECT * FROM categories", conn)
subcategories_df = pd.read_sql("SELECT * FROM subcategories", conn)
segments_df = pd.read_sql("SELECT * FROM segments", conn)
regions_df = pd.read_sql("SELECT * FROM regions", conn)

print(tabulate(orders_df.head(), headers='keys', tablefmt='psql'))
print(tabulate(customers_df.head(), headers='keys', tablefmt='psql'))
print(tabulate(products_df.head(), headers='keys', tablefmt='psql'))
print(tabulate(order_details_df.head(), headers='keys', tablefmt='psql'))
print(tabulate(categories_df.head(), headers='keys', tablefmt='psql'))
print(tabulate(subcategories_df.head(), headers='keys', tablefmt='psql'))
print(tabulate(segments_df.head(), headers='keys', tablefmt='psql'))
print(tabulate(regions_df.head(), headers='keys', tablefmt='psql'))


# Close the connection
conn.close()


+----+----------------+--------------+-------------+----------------+---------------+
|    | order_id       | order_date   | ship_date   | ship_mode      | customer_id   |
|----+----------------+--------------+-------------+----------------+---------------|
|  0 | CA-2016-152156 | 2016-11-08   | 2016-11-11  | Second Class   | CG-12520      |
|  1 | CA-2016-138688 | 2016-06-12   | 2016-06-16  | Second Class   | DV-13045      |
|  2 | US-2015-108966 | 2015-10-11   | 2015-10-18  | Standard Class | SO-20335      |
|  3 | CA-2014-115812 | 2014-06-09   | 2014-06-14  | Standard Class | BH-11710      |
|  4 | CA-2017-114412 | 2017-04-15   | 2017-04-20  | Standard Class | AA-10480      |
+----+----------------+--------------+-------------+----------------+---------------+
+----+---------------+-----------------+--------------+---------------+-----------------+----------------+---------------+-------------+
|    | customer_id   | customer_name   | segment_id   | country       | city            |

In [38]:
#convert all the tables columns data types to the appropriate data types

# Convert orders_df columns to appropriate data types
orders_df['order_date'] = pd.to_datetime(orders_df['order_date'])
orders_df['ship_date'] = pd.to_datetime(orders_df['ship_date'])
orders_df['order_id'] = orders_df['order_id'].astype('category')
orders_df['ship_mode'] = orders_df['ship_mode'].astype('category')
orders_df['customer_id'] = orders_df['customer_id'].astype('category')

# Convert customers_df columns to appropriate data types
customers_df['customer_id'] = customers_df['customer_id'].astype('category')
customers_df['customer_name'] = customers_df['customer_name'].astype(str)
customers_df['segment_id'] = customers_df['segment_id'].astype('category')
customers_df['country'] = customers_df['country'].astype('category')
customers_df['city'] = customers_df['city'].astype('category')
customers_df['state'] = customers_df['state'].astype('category')

# Convert products_df columns to appropriate data types
products_df['product_id'] = products_df['product_id'].astype('category')
products_df['product_name'] = products_df['product_name'].astype(str)
products_df['category_id'] = products_df['category_id'].astype('category')
products_df['subcategory_id'] = products_df['subcategory_id'].astype('category')

# Convert categories_df columns to appropriate data types
categories_df['category_id'] = categories_df['category_id'].astype('category')
categories_df['category_name'] = categories_df['category_name'].astype(str)

# Convert subcategories_df columns to appropriate data types
subcategories_df['subcategory_id'] = subcategories_df['subcategory_id'].astype('category')
subcategories_df['subcategory_name'] = subcategories_df['subcategory_name'].astype(str)
subcategories_df['category_id'] = subcategories_df['category_id'].astype('category')

# Convert segments_df columns to appropriate data types
segments_df['segment_id'] = segments_df['segment_id'].astype('category')
segments_df['segment_name'] = segments_df['segment_name'].astype(str)

# Convert regions_df columns to appropriate data types
regions_df['region_id'] = regions_df['region_id'].astype('category')
regions_df['region_name'] = regions_df['region_name'].astype(str)

#convert the order_details_df columns to the appropriate data types
order_details_df['order_id'] = order_details_df['order_id'].astype('category')
order_details_df['product_id'] = order_details_df['product_id'].astype('category')
order_details_df['quantity'] = order_details_df['quantity'].astype(int)
order_details_df['sales'] = order_details_df['sales'].astype(float)
order_details_df['discount'] = order_details_df['discount'].astype(float)
order_details_df['profit'] = order_details_df['profit'].astype(float)
order_details_df['unit_price'] = order_details_df['unit_price'].astype(float)
order_details_df['price_before_discount'] = order_details_df['price_before_discount'].astype(float)
order_details_df['discount_amount'] = order_details_df['discount_amount'].astype(float)
order_details_df['cost_per_unit'] = order_details_df['cost_per_unit'].astype(float)
order_details_df['margin_percentage'] = order_details_df['margin_percentage'].astype(float)


# Display the data types of the columns
print(orders_df.dtypes)
print(customers_df.dtypes)
print(products_df.dtypes)
print(order_details_df.dtypes)
print(categories_df.dtypes)
print(subcategories_df.dtypes)
print(segments_df.dtypes)
print(regions_df.dtypes)




order_id             category
order_date     datetime64[ns]
ship_date      datetime64[ns]
ship_mode            category
customer_id          category
dtype: object
customer_id      category
customer_name      object
segment_id       category
country          category
city             category
state            category
postal_code        object
region_id          object
dtype: object
product_id        category
product_name        object
category_id       category
subcategory_id    category
dtype: object
order_id                 category
product_id               category
quantity                    int64
sales                     float64
discount                  float64
profit                    float64
unit_price                float64
price_before_discount     float64
discount_amount           float64
cost_per_unit             float64
margin_percentage         float64
dtype: object
category_id      category
category_name      object
dtype: object
subcategory_id      category
subcatego