In [1]:
import pandas as pd
import numpy as np

In [2]:
# Raw CSV export, resolved relative to the notebook's working directory.
FILE_NAME = "online_retail_eda (1).csv"

# `df` holds the table exactly as it was read from disk.
df = pd.read_csv(filepath_or_buffer=FILE_NAME)

# Quick sanity check: overall size first, then a preview of the first rows.
raw_shape = df.shape
print("Rows, Columns:", raw_shape)
df.head(n=5)


Rows, Columns: (200, 9)


Unnamed: 0,Customer_ID,Age,Gender,Region,Product_Category,Purchase_Amount,Purchase_Date,Payment_Method,Is_Returned
0,C0001,56,Male,East,Home & Garden,103.12,2023-03-16,Gift Card,No
1,C0002,69,Female,West,Clothing,20.12,2023-10-21,Bank Transfer,No
2,C0003,46,Female,South,Books,119.63,2023-07-29,Bank Transfer,Yes
3,C0004,32,Female,West,Electronics,144.19,2023-12-13,Credit Card,No
4,C0005,60,Male,East,Clothing,187.36,2023-01-17,Gift Card,No


In [3]:
# Dataset "identity card": name, origin, contents, and the question it serves.
DATASET_NAME = "Online Retail Purchase Dataset (Synthetic)"
# TODO: this is still the template placeholder - replace it with the real dataset URL.
SOURCE_LINK = "PASTE_YOUR_KAGGLE_OR_UCI_LINK_HERE"

DESCRIPTION = (
    "Customer purchases with age, gender, region, product category, "
    "purchase amount, purchase date, payment method, and return status."
)
BUSINESS_PROBLEM = (
    "Helps a retail company understand customer buying patterns and returns "
    "to improve marketing, inventory, and customer experience."
)

# Label/value pairs are printed in order; a leading "\n" in a label inserts a blank line.
for label, value in (
    ("Dataset:", DATASET_NAME),
    ("Source:", SOURCE_LINK),
    ("\nDescription:", DESCRIPTION),
    ("\nBusiness/Social problem:", BUSINESS_PROBLEM),
):
    print(label, value)


Dataset: Online Retail Purchase Dataset (Synthetic)
Source: PASTE_YOUR_KAGGLE_OR_UCI_LINK_HERE

Description: Customer purchases with age, gender, region, product category, purchase amount, purchase date, payment method, and return status.

Business/Social problem: Helps a retail company understand customer buying patterns and returns to improve marketing, inventory, and customer experience.


In [4]:
# Preview the first five rows of the raw table before profiling its columns.
df.head()

Unnamed: 0,Customer_ID,Age,Gender,Region,Product_Category,Purchase_Amount,Purchase_Date,Payment_Method,Is_Returned
0,C0001,56,Male,East,Home & Garden,103.12,2023-03-16,Gift Card,No
1,C0002,69,Female,West,Clothing,20.12,2023-10-21,Bank Transfer,No
2,C0003,46,Female,South,Books,119.63,2023-07-29,Bank Transfer,Yes
3,C0004,32,Female,West,Electronics,144.19,2023-12-13,Credit Card,No
4,C0005,60,Male,East,Clothing,187.36,2023-01-17,Gift Card,No


In [5]:
# Column dtypes and non-null counts for the raw table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Customer_ID       200 non-null    object 
 1   Age               200 non-null    int64  
 2   Gender            200 non-null    object 
 3   Region            200 non-null    object 
 4   Product_Category  200 non-null    object 
 5   Purchase_Amount   200 non-null    float64
 6   Purchase_Date     200 non-null    object 
 7   Payment_Method    200 non-null    object 
 8   Is_Returned       200 non-null    object 
dtypes: float64(1), int64(1), object(7)
memory usage: 14.2+ KB


In [6]:
# include="all" adds the non-numeric columns (count/unique/top/freq) to the numeric summary.
df.describe(include="all")

Unnamed: 0,Customer_ID,Age,Gender,Region,Product_Category,Purchase_Amount,Purchase_Date,Payment_Method,Is_Returned
count,200,200.0,200,200,200,200.0,200,200,200
unique,200,,3,4,5,,164,4,2
top,C0001,,Male,North,Beauty,,2023-10-15,Gift Card,No
freq,1,,96,60,47,,3,59,176
mean,,43.425,,,,119.5264,,,
std,,14.94191,,,,60.033588,,,
min,,18.0,,,,5.0,,,
25%,,31.0,,,,82.4325,,,
50%,,43.5,,,,115.195,,,
75%,,56.0,,,,159.685,,,


In [7]:
# Nulls per column, largest first; only columns that actually have gaps are displayed.
null_counts = df.isna().sum()
missing = null_counts.sort_values(ascending=False)
missing.loc[missing.gt(0)]

Series([], dtype: int64)

In [8]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched by the cleaning steps.
df_clean = df.copy(deep=True)

In [9]:
# Number of rows that exactly repeat an earlier row (all columns compared).
dup_count = df_clean.duplicated().sum()
dup_count

np.int64(0)

In [10]:
# Keep the first occurrence of each fully identical row, then confirm the resulting size.
df_clean = df_clean.drop_duplicates(keep="first")
deduped_shape = df_clean.shape
print("After removing duplicates:", deduped_shape)

After removing duplicates: (200, 9)


Cell 7 — Convert data types

In [12]:
# Parse Purchase_Date into datetime64. errors="coerce" turns unparseable text
# into NaT instead of raising, and the notebook's missing-value check ran
# before this conversion, so report any values that are lost here.
nulls_before = df_clean["Purchase_Date"].isna().sum()
parsed_dates = pd.to_datetime(df_clean["Purchase_Date"], errors="coerce")
n_unparsed = int(parsed_dates.isna().sum() - nulls_before)
if n_unparsed > 0:
    print(f"Warning: {n_unparsed} Purchase_Date value(s) could not be parsed and were set to NaT")
df_clean["Purchase_Date"] = parsed_dates

# Low-cardinality text columns become `category` (better for grouping and analysis).
for col in ["Gender", "Region", "Product_Category", "Payment_Method", "Is_Returned"]:
    df_clean[col] = df_clean[col].astype("category")

df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Customer_ID       200 non-null    object        
 1   Age               200 non-null    int64         
 2   Gender            200 non-null    category      
 3   Region            200 non-null    category      
 4   Product_Category  200 non-null    category      
 5   Purchase_Amount   200 non-null    float64       
 6   Purchase_Date     200 non-null    datetime64[ns]
 7   Payment_Method    200 non-null    category      
 8   Is_Returned       200 non-null    category      
dtypes: category(5), datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 8.2+ KB


Cell 8 — Rename columns

In [13]:
import re

def to_snake(s):
    """Convert a column label to lower snake_case.

    Runs of characters other than ASCII letters and digits collapse to a
    single underscore, camelCase / PascalCase word boundaries are split, and
    leading/trailing underscores are removed.

    Args:
        s: The label to convert. Non-string labels (e.g. integer column
            names) are converted with ``str()`` first.

    Returns:
        The snake_case form, e.g. ``"Customer_ID" -> "customer_id"`` and
        ``"HTTPResponse" -> "http_response"``.
    """
    s = re.sub(r"[^0-9a-zA-Z]+", "_", str(s).strip())
    # Split an acronym from a following capitalised word first
    # ("HTTPResponse" -> "HTTP_Response"); the lower->upper rule below cannot see it.
    s = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", s)
    # Split a lowercase letter or digit followed by an uppercase letter ("userId" -> "user_Id").
    s = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", s)
    return s.lower().strip("_")

# Normalise every column label with to_snake, then display the resulting index.
df_clean.columns = list(map(to_snake, df_clean.columns))
df_clean.columns


Index(['customer_id', 'age', 'gender', 'region', 'product_category',
       'purchase_amount', 'purchase_date', 'payment_method', 'is_returned'],
      dtype='object')

Cell 9 — Create new columns

In [14]:
# datetime features
df_clean["purchase_month"] = df_clean["purchase_date"].dt.month
df_clean["purchase_year"] = df_clean["purchase_date"].dt.year
df_clean["purchase_day"] = df_clean["purchase_date"].dt.day_name()

# flag: returned yes/no to 1/0 (this gives us a 3rd numeric column for Lab 6)
df_clean["is_returned_flag"] = df_clean["is_returned"].astype(str).map({"Yes": 1, "No": 0})

df_clean.head()


Unnamed: 0,customer_id,age,gender,region,product_category,purchase_amount,purchase_date,payment_method,is_returned,purchase_month,purchase_year,purchase_day,is_returned_flag
0,C0001,56,Male,East,Home & Garden,103.12,2023-03-16,Gift Card,No,3,2023,Thursday,0
1,C0002,69,Female,West,Clothing,20.12,2023-10-21,Bank Transfer,No,10,2023,Saturday,0
2,C0003,46,Female,South,Books,119.63,2023-07-29,Bank Transfer,Yes,7,2023,Saturday,1
3,C0004,32,Female,West,Electronics,144.19,2023-12-13,Credit Card,No,12,2023,Wednesday,0
4,C0005,60,Male,East,Clothing,187.36,2023-01-17,Gift Card,No,1,2023,Tuesday,0


Cell 10 — Save cleaned dataset

In [15]:
# Write the cleaned table without the row index; the relative path resolves
# against the current working directory.
output_csv = "cleaned_online_retail.csv"
df_clean.to_csv(path_or_buf=output_csv, index=False)
print("Saved: cleaned_online_retail.csv")


Saved: cleaned_online_retail.csv


Lab Exercise 6 (Stats)

Cell 12 — Compute stats (mean/median/mode/std/variance/range/IQR)

In [16]:
def iqr(series):
    """Return the interquartile range of ``series``.

    Computed as the 75th-percentile value minus the 25th-percentile value,
    both obtained from ``series.quantile``.
    """
    lower_quartile = series.quantile(0.25)
    upper_quartile = series.quantile(0.75)
    return upper_quartile - lower_quartile

# Columns profiled for Lab 6: numeric ones are summarised here,
# `cat_cols` is consumed by the categorical summary.
numeric_cols = ["age", "purchase_amount", "is_returned_flag"]
cat_cols = ["gender", "region"]


def _numeric_summary(name, values):
    """Build one summary row (centre, spread, range) for a numeric column."""
    modes = values.mode()
    return {
        "column": name,
        "mean": values.mean(),
        "median": values.median(),
        # Series.mode() can return several tied values; only the first one is kept.
        "mode": modes.iloc[0] if len(modes) > 0 else np.nan,
        "std": values.std(ddof=1),
        "variance": values.var(ddof=1),
        "range": values.max() - values.min(),
        "IQR": iqr(values),
    }


# Missing values are dropped per column before any statistic is computed.
num_stats = [_numeric_summary(c, df_clean[c].dropna()) for c in numeric_cols]

num_stats_df = pd.DataFrame(num_stats)
num_stats_df


Unnamed: 0,column,mean,median,mode,std,variance,range,IQR
0,age,43.425,43.5,56.0,14.94191,223.260678,51.0,25.0
1,purchase_amount,119.5264,115.195,5.0,60.033588,3604.031666,282.03,77.2525
2,is_returned_flag,0.12,0.0,0.0,0.325777,0.106131,1.0,0.0


Cell 13 — Categorical summary (mode + top counts)

In [17]:
# One summary row per categorical column: most frequent value, number of
# distinct values, and the five largest value counts.
cat_stats = []
for c in cat_cols:
    # Drop missing entries BEFORE converting to text (the numeric summary also
    # drops them); otherwise astype(str) would turn them into the literal
    # string "nan" and count it as a real category.
    s = df_clean[c].dropna().astype(str)
    modes = s.mode()  # computed once; may contain several tied values
    cat_stats.append({
        "column": c,
        "mode": modes.iloc[0] if len(modes) > 0 else "",
        "unique_values": s.nunique(),
        "top_5_counts": s.value_counts().head(5).to_dict()
    })

cat_stats_df = pd.DataFrame(cat_stats)
cat_stats_df


Unnamed: 0,column,mode,unique_values,top_5_counts
0,gender,Male,3,"{'Male': 96, 'Female': 82, 'Other': 22}"
1,region,North,4,"{'North': 60, 'West': 53, 'East': 44, 'South':..."


Cell 14 — Answer the 4 questions

# Written answers to the four reflection questions, kept as one string and printed verbatim.
answers = """
1) Biggest challenges in preparing data:
- Converting the date column correctly.
- Making sure categories are set properly for analysis.
- Creating one extra numeric column for Lab 6 (return flag).

2) Patterns or trends discovered:
- Some regions buy more than others.
- Certain product categories appear more often.
- Purchase amounts vary widely across customers.

3) Transformations with most impact:
- Converting purchase_date to datetime (so we can create month/year/day features).
- Creating is_returned_flag (Yes/No → 1/0) to compare returns easily.
- Setting categorical columns to category type for cleaner grouping.

4) How BI can use these insights:
- Dashboard: sales by region/category/month.
- Return rate by product category/payment method.
- Customer segmentation by age + spending.
"""
print(answers)
