In [183]:
import pandas as pd
import numpy as np

## Load Warehouse

In [184]:
df_warehouse = pd.read_csv('data/warehouse.csv', parse_dates=['InvoiceDate'])
display(df_warehouse.info())
display(df_warehouse.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067371 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   Invoice      1067371 non-null  object        
 1   StockCode    1067371 non-null  object        
 2   Description  1062989 non-null  object        
 3   Quantity     1067371 non-null  int64         
 4   InvoiceDate  1067371 non-null  datetime64[ns]
 5   Price        1067371 non-null  float64       
 6   Customer ID  824364 non-null   float64       
 7   Country      1067371 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 65.1+ MB


None

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


## Data Wrangling

In [185]:
def skim_data(data):
    numeric_cols = set(data.select_dtypes(include=[np.number]).columns)
    numeric_stats = {}
    for col in numeric_cols:
        numeric_stats[col] = {
            'neg_%': round((data[col] < 0).mean() * 100, 3),
            'zero_%': round((data[col] == 0).mean() * 100, 3)
        }

    is_missing = (data.isna()) | (data == '')
    skimmed_data = pd.DataFrame({
        'feature': data.columns.values,
        'dtype': data.dtypes.astype(str).values,
        'null_%': round(is_missing.mean() * 100, 3).values,
        'negative_%': [numeric_stats.get(col, {}).get('neg_%', '-') for col in data.columns],
        'zero_%': [numeric_stats.get(col, {}).get('zero_%', '-') for col in data.columns],
        'n_unique': data.nunique().values,
        'unique_%': round(data.nunique() / len(data) * 100, 2).values,
        'sample_values': [list(data[col].dropna().unique()[:5]) for col in data.columns]
    })

    print(f"Total duplicate rows: {data.duplicated().sum()}")

    return skimmed_data

In [186]:
skim_data(df_warehouse)

Total duplicate rows: 34335


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,Invoice,object,0.0,-,-,53628,5.02,"[489434, 489435, 489436, 489437, 489438]"
1,StockCode,object,0.0,-,-,5305,0.5,"[85048, 79323P, 79323W, 22041, 21232]"
2,Description,object,0.411,-,-,5698,0.53,"[15CM CHRISTMAS GLASS BALL 20 LIGHTS, PINK CHE..."
3,Quantity,int64,0.0,2.15,0.0,1057,0.1,"[12, 48, 24, 10, 18]"
4,InvoiceDate,datetime64[ns],0.0,-,-,47635,4.46,"[2009-12-01 07:45:00, 2009-12-01 07:46:00, 200..."
5,Price,float64,0.0,0.0,0.581,2807,0.26,"[6.95, 6.75, 2.1, 1.25, 1.65]"
6,Customer ID,float64,22.767,0.0,0.0,5942,0.56,"[13085.0, 13078.0, 15362.0, 18102.0, 12682.0]"
7,Country,object,0.0,-,-,43,0.0,"[United Kingdom, France, USA, Belgium, Australia]"


## Issues

- Inappropriate column names
- Customer ID type is not string
- Negative value for Quantity
- Zero value for Price
- A lot of null values in Customer ID
- Duplicate rows

### Column names and customer_id type

In [187]:
df_warehouse = (
    df_warehouse
    .rename(columns={
        'Invoice': 'invoice_id',
        'StockCode': 'product_id',
        'Description': 'product_description',
        'Quantity': 'order_amt',
        'InvoiceDate': 'purchase_date',
        'Price': 'product_price',
        'Customer ID': 'customer_id',
        'Country': 'customer_country'
    })
    .assign(
        customer_id=lambda x: x['customer_id'].astype('Int64')
    )
)
display(df_warehouse.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067371 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column               Non-Null Count    Dtype         
---  ------               --------------    -----         
 0   invoice_id           1067371 non-null  object        
 1   product_id           1067371 non-null  object        
 2   product_description  1062989 non-null  object        
 3   order_amt            1067371 non-null  int64         
 4   purchase_date        1067371 non-null  datetime64[ns]
 5   product_price        1067371 non-null  float64       
 6   customer_id          824364 non-null   Int64         
 7   customer_country     1067371 non-null  object        
dtypes: Int64(1), datetime64[ns](1), float64(1), int64(1), object(4)
memory usage: 66.2+ MB


None

In [188]:
display(skim_data(df_warehouse))

Total duplicate rows: 34335


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,invoice_id,object,0.0,-,-,53628,5.02,"[489434, 489435, 489436, 489437, 489438]"
1,product_id,object,0.0,-,-,5305,0.5,"[85048, 79323P, 79323W, 22041, 21232]"
2,product_description,object,0.411,-,-,5698,0.53,"[15CM CHRISTMAS GLASS BALL 20 LIGHTS, PINK CHE..."
3,order_amt,int64,0.0,2.15,0.0,1057,0.1,"[12, 48, 24, 10, 18]"
4,purchase_date,datetime64[ns],0.0,-,-,47635,4.46,"[2009-12-01 07:45:00, 2009-12-01 07:46:00, 200..."
5,product_price,float64,0.0,0.0,0.581,2807,0.26,"[6.95, 6.75, 2.1, 1.25, 1.65]"
6,customer_id,Int64,22.767,0.0,0.0,5942,0.56,"[13085, 13078, 15362, 18102, 12682]"
7,customer_country,object,0.0,-,-,43,0.0,"[United Kingdom, France, USA, Belgium, Australia]"


### Remove negative value from Quantity

In [189]:
df_warehouse = (
    df_warehouse
    .query('order_amt > 0')
)

In [190]:
display(df_warehouse.info())
display(skim_data(df_warehouse))

<class 'pandas.core.frame.DataFrame'>
Index: 1044421 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column               Non-Null Count    Dtype         
---  ------               --------------    -----         
 0   invoice_id           1044421 non-null  object        
 1   product_id           1044421 non-null  object        
 2   product_description  1042728 non-null  object        
 3   order_amt            1044421 non-null  int64         
 4   purchase_date        1044421 non-null  datetime64[ns]
 5   product_price        1044421 non-null  float64       
 6   customer_id          805620 non-null   Int64         
 7   customer_country     1044421 non-null  object        
dtypes: Int64(1), datetime64[ns](1), float64(1), int64(1), object(4)
memory usage: 72.7+ MB


None

Total duplicate rows: 33881


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,invoice_id,object,0.0,-,-,41944,4.02,"[489434, 489435, 489436, 489437, 489438]"
1,product_id,object,0.0,-,-,4985,0.48,"[85048, 79323P, 79323W, 22041, 21232]"
2,product_description,object,0.162,-,-,5469,0.52,"[15CM CHRISTMAS GLASS BALL 20 LIGHTS, PINK CHE..."
3,order_amt,int64,0.0,0.0,0.0,564,0.05,"[12, 48, 24, 10, 18]"
4,purchase_date,datetime64[ns],0.0,-,-,38429,3.68,"[2009-12-01 07:45:00, 2009-12-01 07:46:00, 200..."
5,product_price,float64,0.0,0.0,0.263,2280,0.22,"[6.95, 6.75, 2.1, 1.25, 1.65]"
6,customer_id,Int64,22.864,0.0,0.0,5881,0.56,"[13085, 13078, 15362, 18102, 12682]"
7,customer_country,object,0.0,-,-,43,0.0,"[United Kingdom, France, USA, Belgium, Australia]"


### Null values in Customer ID

In [191]:
df_warehouse = (
    df_warehouse
    .loc[lambda x: x['customer_id'].notna()]
    .assign(customer_id=lambda x: x['customer_id'].astype('str'))
)

In [192]:
df_warehouse.info()
display(skim_data(df_warehouse))

<class 'pandas.core.frame.DataFrame'>
Index: 805620 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   invoice_id           805620 non-null  object        
 1   product_id           805620 non-null  object        
 2   product_description  805620 non-null  object        
 3   order_amt            805620 non-null  int64         
 4   purchase_date        805620 non-null  datetime64[ns]
 5   product_price        805620 non-null  float64       
 6   customer_id          805620 non-null  object        
 7   customer_country     805620 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 55.3+ MB
Total duplicate rows: 26125


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,invoice_id,object,0.0,-,-,36975,4.59,"[489434, 489435, 489436, 489437, 489438]"
1,product_id,object,0.0,-,-,4631,0.57,"[85048, 79323P, 79323W, 22041, 21232]"
2,product_description,object,0.0,-,-,5283,0.66,"[15CM CHRISTMAS GLASS BALL 20 LIGHTS, PINK CHE..."
3,order_amt,int64,0.0,0.0,0.0,438,0.05,"[12, 48, 24, 10, 18]"
4,purchase_date,datetime64[ns],0.0,-,-,34591,4.29,"[2009-12-01 07:45:00, 2009-12-01 07:46:00, 200..."
5,product_price,float64,0.0,0.0,0.009,666,0.08,"[6.95, 6.75, 2.1, 1.25, 1.65]"
6,customer_id,object,0.0,-,-,5881,0.73,"[13085, 13078, 15362, 18102, 12682]"
7,customer_country,object,0.0,-,-,41,0.01,"[United Kingdom, France, USA, Belgium, Australia]"


### Duplicate rows

In [193]:
df_warehouse = (
    df_warehouse
    .drop_duplicates()
)

In [194]:
df_warehouse.info()
display(skim_data(df_warehouse))

<class 'pandas.core.frame.DataFrame'>
Index: 779495 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   invoice_id           779495 non-null  object        
 1   product_id           779495 non-null  object        
 2   product_description  779495 non-null  object        
 3   order_amt            779495 non-null  int64         
 4   purchase_date        779495 non-null  datetime64[ns]
 5   product_price        779495 non-null  float64       
 6   customer_id          779495 non-null  object        
 7   customer_country     779495 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 53.5+ MB
Total duplicate rows: 0


Unnamed: 0,feature,dtype,null_%,negative_%,zero_%,n_unique,unique_%,sample_values
0,invoice_id,object,0.0,-,-,36975,4.74,"[489434, 489435, 489436, 489437, 489438]"
1,product_id,object,0.0,-,-,4631,0.59,"[85048, 79323P, 79323W, 22041, 21232]"
2,product_description,object,0.0,-,-,5283,0.68,"[15CM CHRISTMAS GLASS BALL 20 LIGHTS, PINK CHE..."
3,order_amt,int64,0.0,0.0,0.0,438,0.06,"[12, 48, 24, 10, 18]"
4,purchase_date,datetime64[ns],0.0,-,-,34591,4.44,"[2009-12-01 07:45:00, 2009-12-01 07:46:00, 200..."
5,product_price,float64,0.0,0.0,0.009,666,0.09,"[6.95, 6.75, 2.1, 1.25, 1.65]"
6,customer_id,object,0.0,-,-,5881,0.75,"[13085, 13078, 15362, 18102, 12682]"
7,customer_country,object,0.0,-,-,41,0.01,"[United Kingdom, France, USA, Belgium, Australia]"


## Fact and Dimension Tables

- Fact table: invoice_id, product_id, customer_id, date, order_amt, product_price
- Dimension tables:
    - dim_product: product_id, product_description
    - dim_date: date (purchase_date), year, month, day_of_week, month_name, is_weekend
    - dim_customer: customer_id, customer_country

In [196]:
fct_sales = pd.DataFrame(
    data={
        'invoice_id': df_warehouse['invoice_id'],
        'product_id': df_warehouse['product_id'],
        'customer_id': df_warehouse['customer_id'],
        'date_id': df_warehouse['purchase_date'].dt.date,
        'order_amt': df_warehouse['order_amt'],
        'product_price': df_warehouse['product_price'],
        'purchase_timestamp': df_warehouse['purchase_date'],
    }
)

dim_product = pd.DataFrame(
    data={
        'product_id': df_warehouse['product_id'],
        'product_description': df_warehouse['product_description']
    }
)

dim_customer = pd.DataFrame(
    data={
        'customer_id': df_warehouse['customer_id'],
        'customer_country': df_warehouse['customer_country'],
    }
)

dim_date = (
    pd.DataFrame(
        data={
            'date_id': pd.date_range(
                start=df_warehouse['purchase_date'].min().normalize(),
                end=df_warehouse['purchase_date'].max().normalize(),
                freq='D'
            ),
        }
    )
    .assign(
        year=lambda x: x['date_id'].dt.year,
        month=lambda x: x['date_id'].dt.month,
        day_of_week=lambda x: x['date_id'].dt.day_of_week,
        is_weekend=lambda x: x['day_of_week'].isin([5, 6])
    )
)

In [198]:
fct_sales.info()
display(fct_sales.sample(10, random_state=1))

<class 'pandas.core.frame.DataFrame'>
Index: 779495 entries, 0 to 1067370
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   invoice_id          779495 non-null  object        
 1   product_id          779495 non-null  object        
 2   customer_id         779495 non-null  object        
 3   date_id             779495 non-null  object        
 4   order_amt           779495 non-null  int64         
 5   product_price       779495 non-null  float64       
 6   purchase_timestamp  779495 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(4)
memory usage: 47.6+ MB


Unnamed: 0,invoice_id,product_id,customer_id,date_id,order_amt,product_price,purchase_timestamp
448947,532087,22035,13735,2010-11-10,12,0.42,2010-11-10 15:32:00
407655,528397,84406B,12972,2010-10-22,8,3.25,2010-10-22 08:41:00
433661,530838,85175,13395,2010-11-04,2,0.42,2010-11-04 14:00:00
512375,537201,22242,12472,2010-12-05,12,1.65,2010-12-05 14:19:00
916691,570660,22910,14866,2011-10-11,80,2.55,2011-10-11 14:18:00
11463,490302,20747,17841,2009-12-04,1,14.95,2009-12-04 14:29:00
469190,533581,22566,16445,2010-11-18,1,0.85,2010-11-18 11:29:00
934218,572026,23535,13245,2011-10-20,4,5.95,2011-10-20 11:40:00
797640,560754,16161P,17754,2011-07-20,25,0.42,2011-07-20 15:56:00
11400,490302,84971S,17841,2009-12-04,1,0.85,2009-12-04 14:29:00


In [None]:
dim_product.info()

<class 'pandas.core.frame.DataFrame'>
Index: 779495 entries, 0 to 1067370
Data columns (total 2 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   product_id           779495 non-null  object
 1   product_description  779495 non-null  object
dtypes: object(2)
memory usage: 17.8+ MB


In [None]:
dim_customer.info()

<class 'pandas.core.frame.DataFrame'>
Index: 779495 entries, 0 to 1067370
Data columns (total 2 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   customer_id       779495 non-null  object
 1   customer_country  779495 non-null  object
dtypes: object(2)
memory usage: 17.8+ MB


In [197]:
dim_date.info()
display(dim_date.sample(10, random_state=1))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 739 entries, 0 to 738
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date_id      739 non-null    datetime64[ns]
 1   year         739 non-null    int32         
 2   month        739 non-null    int32         
 3   day_of_week  739 non-null    int32         
 4   is_weekend   739 non-null    bool          
dtypes: bool(1), datetime64[ns](1), int32(3)
memory usage: 15.3 KB


Unnamed: 0,date_id,year,month,day_of_week,is_weekend
680,2011-10-12,2011,10,2,False
257,2010-08-15,2010,8,6,True
526,2011-05-11,2011,5,2,False
496,2011-04-11,2011,4,0,False
427,2011-02-01,2011,2,1,False
349,2010-11-15,2010,11,0,False
434,2011-02-08,2011,2,1,False
189,2010-06-08,2010,6,1,False
355,2010-11-21,2010,11,6,True
537,2011-05-22,2011,5,6,True


In [None]:
dim_date[['purchase_date']]

Unnamed: 0,purchase_date
0,2009-12-01 07:45:00
1,2009-12-01 07:45:00
2,2009-12-01 07:45:00
3,2009-12-01 07:45:00
4,2009-12-01 07:45:00
...,...
1067366,2011-12-09 12:50:00
1067367,2011-12-09 12:50:00
1067368,2011-12-09 12:50:00
1067369,2011-12-09 12:50:00
