# DS2002 Project 1 - Retail Sales ETL Pipeline

## 1. Import Libraries & Read CSV


In [18]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine
# Read the raw sales export; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/Fashion_Retail_Sales.csv')
# Preview the first five rows (the default for head()).
df.head(n=5)


Unnamed: 0,Customer Reference ID,Item Purchased,Purchase Amount (USD),Date Purchase,Review Rating,Payment Method
0,4018,Handbag,4619.0,2023-02-05,,Credit Card
1,4115,Tunic,2456.0,2023-07-11,2.0,Credit Card
2,4019,Tank Top,2102.0,2023-03-23,4.1,Cash
3,4097,Leggings,3126.0,2023-03-15,3.2,Cash
4,3997,Wallet,3003.0,2022-11-27,4.7,Cash


## 2. Data Exploration

In [19]:
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line after the report.
df.info()

# Count of missing values per column.
print(df.isnull().sum())

# Show up to five distinct values per column to get a feel for the contents.
for col in df.columns:
    print(f"{col}: {df[col].unique()[:5]}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3400 entries, 0 to 3399
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Customer Reference ID  3400 non-null   int64  
 1   Item Purchased         3400 non-null   object 
 2   Purchase Amount (USD)  2750 non-null   float64
 3   Date Purchase          3400 non-null   object 
 4   Review Rating          3076 non-null   float64
 5   Payment Method         3400 non-null   object 
dtypes: float64(2), int64(1), object(3)
memory usage: 159.5+ KB
None
Customer Reference ID      0
Item Purchased             0
Purchase Amount (USD)    650
Date Purchase              0
Review Rating            324
Payment Method             0
dtype: int64
Customer Reference ID: [4018 4115 4019 4097 3997]
Item Purchased: ['Handbag' 'Tunic' 'Tank Top' 'Leggings' 'Wallet']
Purchase Amount (USD): [4619. 2456. 2102. 3126. 3003.]
Date Purchase: ['2023-02-05' '2023-07-11' '2023-03

## 3. Data Cleaning

In [20]:
# Remove rows with any missing value, then exact duplicate rows, and finally
# normalise the headers to lower case with underscores instead of spaces
# (e.g. "Purchase Amount (USD)" becomes "purchase_amount_(usd)").
df = (
    df.dropna()
      .drop_duplicates()
      .rename(columns=lambda label: label.replace(' ', '_').lower())
)
df.head()


Unnamed: 0,customer_reference_id,item_purchased,purchase_amount_(usd),date_purchase,review_rating,payment_method
1,4115,Tunic,2456.0,2023-07-11,2.0,Credit Card
2,4019,Tank Top,2102.0,2023-03-23,4.1,Cash
3,4097,Leggings,3126.0,2023-03-15,3.2,Cash
4,3997,Wallet,3003.0,2022-11-27,4.7,Cash
5,4080,Onesie,2914.0,2022-12-11,4.5,Credit Card


## 4. Transform & Prepare Dimensional Data

In [21]:
# Every dimension is the set of distinct values of one source column plus a
# 1-based surrogate key numbered in the order the values first appear in df.

# Product Dimension
products = (
    df[['item_purchased']]
    .drop_duplicates()
    .assign(product_id=lambda frame: range(1, len(frame) + 1))
)

# Customer Dimension
customers = (
    df[['customer_reference_id']]
    .drop_duplicates()
    .assign(customer_id=lambda frame: range(1, len(frame) + 1))
)

# Date Dimension: the original string column is kept as the merge key, and a
# parsed datetime column supplies the year / month / day attributes.
dates = (
    df[['date_purchase']]
    .drop_duplicates()
    .assign(
        date_id=lambda frame: range(1, len(frame) + 1),
        purchase_date=lambda frame: pd.to_datetime(frame['date_purchase']),
        year=lambda frame: frame['purchase_date'].dt.year,
        month=lambda frame: frame['purchase_date'].dt.month,
        day=lambda frame: frame['purchase_date'].dt.day,
    )
)

# Payment Method Dimension
methods = (
    df[['payment_method']]
    .drop_duplicates()
    .assign(payment_id=lambda frame: range(1, len(frame) + 1))
)


## 5. Connect to MySQL

In [22]:
# Build the MySQL connection from environment variables so that no credential
# is stored in the notebook. Only the password is mandatory; the other settings
# fall back to the values this notebook has always used.
# NOTE(review): a password was previously committed in this cell and should be rotated.
db_user = os.environ.get('MYSQL_USER', 'root')
db_password = os.environ.get('MYSQL_PASSWORD')
db_host = os.environ.get('MYSQL_HOST', 'localhost')
db_name = os.environ.get('MYSQL_DATABASE', 'ds2002_retail')

if db_password is None:
    raise RuntimeError('Set the MYSQL_PASSWORD environment variable before running this cell.')

# quote_plus escapes characters such as "@" or "/" that would otherwise be
# misread as URL delimiters when they occur in the user name or password.
engine = create_engine(
    f'mysql+mysqlconnector://{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}/{db_name}'
)


## 6. Load Dimensions

In [23]:
# Write each dimension frame to its MySQL table without the pandas index.
# NOTE(review): if_exists='append' adds rows to an existing table, so re-running
# this cell inserts the same rows again unless the tables reject duplicate keys
# - confirm against the database schema.
products.to_sql(name='product_dim', con=engine, if_exists='append', index=False)
customers.to_sql(name='customer_dim', con=engine, if_exists='append', index=False)

# The raw 'date_purchase' string is only a merge key, so it is not loaded.
dates[['date_id', 'purchase_date', 'year', 'month', 'day']].to_sql(
    name='date_dim', con=engine, if_exists='append', index=False
)

# Last expression of the cell: its return value is what the notebook displays.
methods.to_sql(name='payment_dim', con=engine, if_exists='append', index=False)


2

## 7. Prepare & Load Fact Table

In [24]:
# Attach each dimension's surrogate key to the cleaned sales rows by joining on
# the natural-key column shared with that dimension (merge defaults to an inner join).
df_fact = (
    df.merge(products, on='item_purchased')
      .merge(customers, on='customer_reference_id')
      .merge(dates, on='date_purchase')
      .merge(methods, on='payment_method')
)

# Keep only the keys and measures, giving the amount column its fact-table name.
sales_fact = df_fact[
    ['customer_id', 'product_id', 'date_id', 'payment_id', 'purchase_amount_(usd)', 'review_rating']
].rename(columns={'purchase_amount_(usd)': 'purchase_amount'})

# Append the fact rows; the value returned by to_sql is shown as the cell output.
sales_fact.to_sql(name='sales_fact', con=engine, if_exists='append', index=False)


2487

## 8. Validation: Query Loaded Data

In [25]:
# Validation: total purchase amount per product, largest first, computed in
# MySQL by joining the fact table to the product dimension.
# Inside the SQL text "pd" is a table alias for product_dim, not the pandas module.
query = """
SELECT 
    pd.item_purchased, 
    SUM(sf.purchase_amount) AS total_sales
FROM sales_fact sf
JOIN product_dim pd ON sf.product_id = pd.product_id
GROUP BY pd.item_purchased
ORDER BY total_sales DESC
"""
result = pd.read_sql(sql=query, con=engine)
# Display the top five products by total sales.
result.head(n=5)


Unnamed: 0,item_purchased,total_sales
0,Tunic,16980.0
1,Jeans,12475.0
2,Shorts,12246.0
3,Gloves,11793.0
4,Boots,11174.0
