# importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# importing CSV

In [3]:
# Load superstore.csv into `ecom`: attempt UTF-8 first, then fall back to Latin-1
try:
    ecom = pd.read_csv('superstore.csv', encoding='utf-8')
except UnicodeDecodeError:
    # The file could not be decoded as UTF-8; read it again as Latin-1 instead
    ecom = pd.read_csv('superstore.csv', encoding='latin-1')

In [4]:
# Preview both ends of the raw dataset.
# A notebook cell only renders its final expression automatically, so a bare
# `ecom.head()` followed by `ecom.tail()` would silently drop the head preview.
# Passing each frame to display() (injected by the IPython/Jupyter kernel)
# renders both.
display(ecom.head())  # first few rows of the dataset
display(ecom.tail())  # last few rows of the dataset



Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
9989,9990,CA-2014-110422,1/21/2014,1/23/2014,Second Class,TB-21400,Tom Boeckenhauer,Consumer,United States,Miami,...,33180,South,FUR-FU-10001889,Furniture,Furnishings,Ultra Door Pull Handle,25.248,3,0.2,4.1028
9990,9991,CA-2017-121258,2/26/2017,3/3/2017,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,FUR-FU-10000747,Furniture,Furnishings,Tenex B1-RE Series Chair Mats for Low Pile Car...,91.96,2,0.0,15.6332
9991,9992,CA-2017-121258,2/26/2017,3/3/2017,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,TEC-PH-10003645,Technology,Phones,Aastra 57i VoIP phone,258.576,2,0.2,19.3932
9992,9993,CA-2017-121258,2/26/2017,3/3/2017,Standard Class,DB-13060,Dave Brooks,Consumer,United States,Costa Mesa,...,92627,West,OFF-PA-10004041,Office Supplies,Paper,"It's Hot Message Books with Stickers, 2 3/4"" x 5""",29.6,4,0.0,13.32
9993,9994,CA-2017-119914,5/4/2017,5/9/2017,Second Class,CC-12220,Chris Cortes,Consumer,United States,Westminster,...,92683,West,OFF-AP-10002684,Office Supplies,Appliances,"Acco 7-Outlet Masterpiece Power Center, Wihtou...",243.16,2,0.0,72.948


In [5]:
# Take an independent (deep) copy of the loaded frame to work on
ecom_copy = ecom.copy(deep=True)

# Preview the top five rows of the copy
ecom_copy.head(5)



Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [9]:
# Normalise the column labels: lowercase, with spaces replaced by underscores.
# regex=False states explicitly that the space is a literal, not a pattern.
# Hyphens are left untouched, so 'Sub-Category' becomes 'sub-category'.
ecom_copy.columns = ecom_copy.columns.str.lower().str.replace(' ', '_', regex=False)

# Preview the renamed frame. A bare `ecom_copy.head()` in the middle of a cell
# is evaluated and then discarded, so it is passed to display() (provided by
# the IPython/Jupyter kernel) to actually render it.
display(ecom_copy.head())

# number of rows and columns in the dataset
print(ecom_copy.shape)

# number of unique customers in the dataset
print(ecom_copy['customer_id'].nunique())

# data types of the columns
print(ecom_copy.dtypes)



(9994, 21)
793
row_id                    int64
order_id                 object
order_date       datetime64[ns]
ship_date        datetime64[ns]
ship_mode                object
customer_id              object
customer_name            object
segment                  object
country                  object
city                     object
state                    object
postal_code               int64
region                   object
product_id               object
category                 object
sub-category             object
product_name             object
sales                   float64
quantity                  int64
discount                float64
profit                  float64
dtype: object


# Converting Column types

In [17]:
# Parse both date columns into datetime64 values
for date_col in ('order_date', 'ship_date'):
    ecom_copy[date_col] = pd.to_datetime(ecom_copy[date_col])

# Plain scalar casts, grouped by target type
scalar_casts = {
    float: ('sales', 'discount', 'profit'),
    int: ('quantity', 'row_id'),
    str: ('product_name',),
}
for target_type, columns in scalar_casts.items():
    for column in columns:
        ecom_copy[column] = ecom_copy[column].astype(target_type)

# Identifier and label columns are stored as pandas categoricals
categorical_columns = [
    'customer_id', 'product_id', 'category',
    'customer_name', 'segment', 'city', 'state', 'country', 'region',
    'order_id', 'ship_mode', 'sub-category',
]
for column in categorical_columns:
    ecom_copy[column] = ecom_copy[column].astype('category')

# postal_code goes through str first so that its categories are strings
# NOTE(review): if the CSV held zero-padded codes they were parsed as integers,
# so any leading zeros are not restored by this cast -- confirm whether that matters
ecom_copy['postal_code'] = ecom_copy['postal_code'].astype(str).astype('category')

# product_name becomes categorical only when names repeat often enough:
# fewer than 50% of the rows may carry a distinct name
product_name_ratio = ecom_copy['product_name'].nunique() / ecom_copy.shape[0]
if product_name_ratio < 0.5:
    ecom_copy['product_name'] = ecom_copy['product_name'].astype('category')

# Preview the converted frame
ecom_copy.head()




Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,...,postal_code,region,product_id,category,sub-category,product_name,sales,quantity,discount,profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [21]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes
ecom_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   row_id         9994 non-null   int64         
 1   order_id       9994 non-null   category      
 2   order_date     9994 non-null   datetime64[ns]
 3   ship_date      9994 non-null   datetime64[ns]
 4   ship_mode      9994 non-null   category      
 5   customer_id    9994 non-null   category      
 6   customer_name  9994 non-null   category      
 7   segment        9994 non-null   category      
 8   country        9994 non-null   category      
 9   city           9994 non-null   category      
 10  state          9994 non-null   category      
 11  postal_code    9994 non-null   category      
 12  region         9994 non-null   category      
 13  product_id     9994 non-null   category      
 14  category       9994 non-null   category      
 15  sub-category   9994 n

# Feature Engineering

In [34]:
def extract_category_ids(df):
    """
    Derive category and subcategory IDs from product IDs and add them as new columns.

    Each product ID is split on '-'. For example, with product ID "FUR-BO-10001798":
    - category_id:    "FUR-1000"     (first segment + first 4 characters of the last segment)
    - subcategory_id: "FUR-BO-1000"  (first two segments + first 4 characters of the last segment)

    Args:
        df (pandas.DataFrame): DataFrame containing a 'product_id' column of
            '-'-separated strings.

    Returns:
        pandas.DataFrame: A copy of df with 'category_id' and 'subcategory_id'
        columns added. The frame passed in is not modified.
    """
    # Copy the frame that was passed in (not a notebook-level global), so the
    # function works on any DataFrame and leaves its input untouched.
    result_df = df.copy()

    def _category_id(product_id):
        # Split once and reuse the parts instead of re-splitting per segment.
        parts = product_id.split('-')
        return parts[0] + '-' + parts[-1][:4]

    def _subcategory_id(product_id):
        parts = product_id.split('-')
        return parts[0] + '-' + parts[1] + '-' + parts[-1][:4]

    result_df['category_id'] = result_df['product_id'].apply(_category_id)
    result_df['subcategory_id'] = result_df['product_id'].apply(_subcategory_id)

    return result_df

# Attach the category_id / subcategory_id columns to the working frame
ecom_copy = extract_category_ids(ecom_copy)

# Sanity check: print each product ID next to the two IDs derived from it
print(ecom_copy.head()[['product_id', 'category_id', 'subcategory_id']])

        product_id category_id subcategory_id
0  FUR-BO-10001798    FUR-1000    FUR-BO-1000
1  FUR-CH-10000454    FUR-1000    FUR-CH-1000
2  OFF-LA-10000240    OFF-1000    OFF-LA-1000
3  FUR-TA-10000577    FUR-1000    FUR-TA-1000
4  OFF-ST-10000760    OFF-1000    OFF-ST-1000


In [38]:
def add_segment_ids(df):
    """
    Add a segment ID column to a DataFrame based on its 'segment' column.

    Mapping:
    - Consumer → "CONS-1000"
    - Corporate → "CORP-1000"
    - Home Office → "HOME-1000"

    Segment values that are not in the mapping receive a missing value
    (standard `Series.map` behaviour with a dict).

    Args:
        df (pandas.DataFrame): DataFrame containing a 'segment' column.

    Returns:
        pandas.DataFrame: A copy of df with a 'segment_id' column added.
        The frame passed in is not modified.
    """
    # Copy the frame that was passed in (not a notebook-level global); otherwise
    # columns added by an earlier pipeline step would be silently discarded.
    result_df = df.copy()

    # Segment label -> segment ID
    segment_id_map = {
        'Consumer': 'CONS-1000',
        'Corporate': 'CORP-1000',
        'Home Office': 'HOME-1000'
    }

    result_df['segment_id'] = result_df['segment'].map(segment_id_map)

    return result_df

# Apply both the segment and category ID functions to your DataFrame
def enrich_ecom_data(df):
    """
    Run the ID-enrichment steps on a DataFrame in sequence.

    df is passed to extract_category_ids, that result is passed to
    add_segment_ids, and the outcome of the second call is returned.

    Args:
        df (pandas.DataFrame): Original DataFrame

    Returns:
        pandas.DataFrame: The DataFrame returned by add_segment_ids
    """
    # Step 1: category / subcategory IDs; step 2: segment IDs
    with_category_ids = extract_category_ids(df)
    return add_segment_ids(with_category_ids)

# Run the enrichment pipeline on the working frame
ecom_copy = enrich_ecom_data(ecom_copy)

# Sanity check: print the source columns next to every ID column
print(ecom_copy.head()[['segment', 'segment_id', 'product_id', 'category_id', 'subcategory_id']])

     segment segment_id       product_id category_id subcategory_id
0   Consumer  CONS-1000  FUR-BO-10001798    FUR-1000    FUR-BO-1000
1   Consumer  CONS-1000  FUR-CH-10000454    FUR-1000    FUR-CH-1000
2  Corporate  CORP-1000  OFF-LA-10000240    OFF-1000    OFF-LA-1000
3   Consumer  CONS-1000  FUR-TA-10000577    FUR-1000    FUR-TA-1000
4   Consumer  CONS-1000  OFF-ST-10000760    OFF-1000    OFF-ST-1000


In [39]:
# Preview the first rows of the frame, now including the added ID columns
ecom_copy.head()

Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,...,category,sub-category,product_name,sales,quantity,discount,profit,category_id,subcategory_id,segment_id
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136,FUR-1000,FUR-BO-1000,CONS-1000
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582,FUR-1000,FUR-CH-1000,CONS-1000
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714,OFF-1000,OFF-LA-1000,CORP-1000
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031,FUR-1000,FUR-TA-1000,CONS-1000
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164,OFF-1000,OFF-ST-1000,CONS-1000
