# Exploratory Data Analysis (EDA)

**Dataset:** `../Data/cleaned_ecommerce_dataset.csv`

## Libraries used:
- pandas, numpy  : for data handling
- matplotlib     : for plotting
- seaborn        : for prettier statistical plots

**NOTE:** This code is written to be GENERIC – it tries to auto-detect columns like:
user id, age, gender, location, product category, revenue, timestamp, etc.
If any of these do not exist in your dataset, the code will simply skip those parts
and print a helpful message instead of crashing.


## 0. IMPORT LIBRARIES & BASIC SETTINGS


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Jupyter Notebook users: uncomment the magic command below so that figures
# are rendered inline, directly beneath the cell that creates them.
# %matplotlib inline

# Optional: apply seaborn's "whitegrid" style to the plots drawn by the
# cells that follow.
sns.set(style="whitegrid")


## 1. LOAD CLEANED DATASET & BASIC EXPLORATION


In [None]:
# 1.1 Load the cleaned dataset
file_path = "../Data/cleaned_ecommerce_dataset.csv"
df = pd.read_csv(file_path)

print("===== First 5 rows of the dataset =====")
print(df.head())

print("\n===== Dataset Shape (rows, columns) =====")
print(df.shape)

# 1.3 Identify numerical and categorical columns.
# This is computed before the summaries below so that each summary can be
# skipped cleanly when the dataset has no column of the corresponding kind.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

# 1.2 Basic statistical summary for numerical columns
print("\n===== Basic Statistical Summary (Numerical Columns) =====")
if numeric_cols:
    print(df.describe())
else:
    # Without numeric columns, describe() would silently fall back to a
    # categorical summary, which would be mislabelled by the header above.
    print("No numerical columns found. Skipping numerical summary.")

# Optional: summary for categorical columns as well
print("\n===== Summary for Categorical Columns =====")
if cat_cols:
    print(df.describe(include=["object", "category"]))
else:
    # describe(include=...) raises ValueError when nothing matches the
    # requested dtypes, so report the situation instead of crashing.
    print("No categorical columns found. Skipping categorical summary.")

print("\n===== Numerical Columns =====")
print(numeric_cols)

print("\n===== Categorical Columns =====")
print(cat_cols)


### 1.4 Distribution of each numerical column (histograms)


In [None]:
# Draw one histogram with a KDE overlay for every numerical column.
for col in numeric_cols:
    # Missing values are dropped before plotting.
    non_missing = df[col].dropna()

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(non_missing, kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    fig.tight_layout()
    plt.show()


### 1.5 Count of unique values for each categorical column


In [None]:
print("\n===== Unique Value Counts per Categorical Column =====")
# For each categorical column, report how many distinct values it holds and
# list its ten most frequent values.
for col in cat_cols:
    column_values = df[col]
    distinct_count = column_values.nunique()
    most_frequent = column_values.value_counts().head(10)

    print(f"\nColumn: {col}")
    print(f"Number of unique values: {distinct_count}")
    print("Top 10 value counts:")
    print(most_frequent)


## HELPER: AUTO-DETECT IMPORTANT BUSINESS COLUMNS


In [None]:
import re


def _name_tokens(column_name):
    """Split a column name into lowercase word tokens.

    Handles snake_case, kebab-case, spaces and camelCase, for example
    "customerAge" -> ["customer", "age"] and "page_views" -> ["page", "views"].
    """
    spaced = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", str(column_name))
    return re.findall(r"[a-z0-9]+", spaced.lower())


def _columns_matching(keywords, whole_word=False):
    """Return the columns of ``df`` whose name matches any keyword.

    With ``whole_word=False`` a keyword matches as a case-insensitive
    substring of the column name. With ``whole_word=True`` it must equal one
    of the name's word tokens. Column order is preserved.
    """
    matches = []
    for column in df.columns:
        if whole_word:
            hit = not set(keywords).isdisjoint(_name_tokens(column))
        else:
            lowered = str(column).lower()
            hit = any(keyword in lowered for keyword in keywords)
        if hit:
            matches.append(column)
    return matches


def _first_or_none(candidates):
    """Return the first candidate column, or None when there is none."""
    return candidates[0] if candidates else None


# Try to guess user ID column.
# Prefer a candidate that carries an explicit "id" token (e.g. "customer_id")
# over other user-related columns such as "customer_name".
user_id_candidates = _columns_matching(["user", "customer", "client", "buyer"])
_explicit_id_candidates = [c for c in user_id_candidates if "id" in _name_tokens(c)]
user_id_col = _first_or_none(_explicit_id_candidates or user_id_candidates)

# Try to guess session column
session_candidates = _columns_matching(["session"])
session_col = _first_or_none(session_candidates)

# Try to guess action/activity column
action_candidates = _columns_matching(["action", "activity", "event", "event_type"])
action_col = _first_or_none(action_candidates)

# Try to guess age column.
# "age" is matched as a whole word because as a substring it also hits names
# such as "page", "usage", "average" or "message". Only numeric columns are
# kept because the age column is later binned with pd.cut.
age_candidates = [c for c in _columns_matching(["age", "ages"], whole_word=True)
                  if c in numeric_cols]
age_col = _first_or_none(age_candidates)

# Try to guess gender column
gender_candidates = _columns_matching(["gender", "sex"])
gender_col = _first_or_none(gender_candidates)

# Try to guess location/region column
location_candidates = _columns_matching(["country", "city", "region", "state", "location"])
location_col = _first_or_none(location_candidates)

# Try to guess product category / interest column
category_candidates = _columns_matching(["category", "segment", "interest", "product_category"])
product_cat_col = _first_or_none(category_candidates)

# Try to guess revenue / amount / price column
revenue_candidates = _columns_matching(
    ["revenue", "amount", "price", "sales", "purchase", "order_value", "total"]
)
# Keep only numeric ones for safety
revenue_candidates = [c for c in revenue_candidates if c in numeric_cols]
revenue_col = _first_or_none(revenue_candidates)

# Try to guess timestamp / datetime column
date_candidates = _columns_matching(["date", "time", "timestamp", "datetime"])
# Prefer a column that already has a datetime dtype (timezone-naive or -aware)
datetime_cols = df.select_dtypes(include=["datetime", "datetimetz"]).columns.tolist()
date_col = None
if datetime_cols:
    date_col = datetime_cols[0]
else:
    for _candidate in date_candidates:
        # Numeric columns (durations, counters) would be read as nanoseconds
        # since the epoch, so they are never converted.
        if _candidate in numeric_cols:
            continue
        _parsed = pd.to_datetime(df[_candidate], errors="coerce")
        # Overwrite the column only when at least one value parsed, so a
        # wrongly guessed column is not replaced by all-NaT values.
        if _parsed.notna().any():
            df[_candidate] = _parsed
            date_col = _candidate
            break

print("\n===== Auto-detected key columns =====")
print(f"user_id_col   : {user_id_col}")
print(f"session_col   : {session_col}")
print(f"action_col    : {action_col}")
print(f"age_col       : {age_col}")
print(f"gender_col    : {gender_col}")
print(f"location_col  : {location_col}")
print(f"product_cat_col: {product_cat_col}")
print(f"revenue_col   : {revenue_col}")
print(f"date_col      : {date_col}")


In [None]:
print("\n===== USER BEHAVIOR ANALYSIS =====")

# 2.1 / 2.2 Unique users and sessions-per-user distribution.
# Both need the user ID column, so they share a single check.
if not user_id_col:
    print("User ID column not found. Cannot compute unique users.")
    print("Cannot compute sessions per user because user ID column was not detected.")
else:
    # 2.1 Total number of unique users
    total_users = df[user_id_col].nunique()
    print(f"Total number of unique users: {total_users}")

    # 2.2 Sessions per user
    rows_by_user = df.groupby(user_id_col)
    if session_col:
        # Count distinct session identifiers for each user.
        sessions_per_user = rows_by_user[session_col].nunique()
        print("\nSessions per user (using session column):")
    else:
        # No explicit session column: use each user's row count as a proxy.
        sessions_per_user = rows_by_user.size()
        print("\nSessions per user (approximated by row count per user):")

    print(sessions_per_user.describe())

    # Histogram of the sessions-per-user distribution
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(sessions_per_user, kde=True, ax=ax)
    ax.set_title("Distribution of Sessions per User")
    ax.set_xlabel("Number of Sessions")
    ax.set_ylabel("Number of Users")
    fig.tight_layout()
    plt.show()

# 2.3 Most common actions or activities
if not action_col:
    print("\nNo action/activity column detected. Skipping action analysis.")
else:
    top_actions = df[action_col].value_counts().head(10)
    print(f"\nMost common actions/activities in column '{action_col}':")
    print(top_actions)


In [None]:
print("\n===== USER SEGMENTATION =====")


def _segment_summary(segment_col, sort_desc=False):
    """Summarise how many users (or rows) fall into each value of a column.

    When a user ID column was detected, the result has the columns
    ``[segment_col, "unique_users"]`` and holds the number of distinct users
    per segment. Otherwise it has ``[segment_col, "count"]`` and holds row
    counts, largest first. ``sort_desc=True`` orders the unique-user table
    from the largest segment to the smallest.
    """
    if user_id_col:
        # observed=False is passed explicitly so that empty categories of a
        # categorical column (e.g. age groups with no users) are still listed,
        # independent of the pandas version's default.
        # Renaming the Series before reset_index() avoids a name clash when
        # the segment column and the user ID column are the same column.
        summary = (
            df.groupby(segment_col, observed=False)[user_id_col]
            .nunique()
            .rename("unique_users")
            .reset_index()
        )
        if sort_desc:
            summary = summary.sort_values(
                "unique_users", ascending=False, kind="stable"
            ).reset_index(drop=True)
    else:
        # value_counts() is already ordered from most to least frequent.
        summary = (
            df[segment_col]
            .value_counts()
            .rename_axis(segment_col)
            .reset_index(name="count")
        )
    return summary


# 3.1 Segment by age groups
if age_col:
    # Create age groups (left-closed bins, e.g. 18 <= age < 25 -> "18-24")
    bins = [0, 18, 25, 35, 45, 60, 120]
    labels = ["<18", "18-24", "25-34", "35-44", "45-59", "60+"]
    # Coerce to numbers first so a text-typed age column yields missing
    # groups instead of making pd.cut raise.
    df["age_group"] = pd.cut(
        pd.to_numeric(df[age_col], errors="coerce"),
        bins=bins,
        labels=labels,
        right=False,
    )

    age_segment = _segment_summary("age_group")

    print("\n--- Segmentation by Age Group ---")
    print(age_segment)
else:
    print("\nNo age column detected. Skipping age segmentation.")

# 3.2 Segment by gender
if gender_col:
    gender_segment = _segment_summary(gender_col)

    print("\n--- Segmentation by Gender ---")
    print(gender_segment)
else:
    print("\nNo gender column detected. Skipping gender segmentation.")

# 3.3 Segment by location / region
if location_col:
    # Sorted by size so that head(20) really is the top 20 locations.
    location_segment = _segment_summary(location_col, sort_desc=True)

    print("\n--- Segmentation by Location/Region ---")
    print(location_segment.head(20))  # Show top 20 for readability
else:
    print("\nNo location/region column detected. Skipping location segmentation.")

# 3.4 Segment by product category / interest
if product_cat_col:
    # Sorted by size so that head(20) really is the top 20 categories.
    category_segment = _segment_summary(product_cat_col, sort_desc=True)

    print(f"\n--- Segmentation by Product Category / Interest ({product_cat_col}) ---")
    print(category_segment.head(20))
else:
    print("\nNo product category/interest column detected. Skipping category segmentation.")


## 4. REVENUE / PURCHASE ANALYSIS


In [None]:
print("\n===== REVENUE / PURCHASE ANALYSIS =====")

if revenue_col:
    # 4.1 Total revenue
    total_revenue = df[revenue_col].sum()
    print(f"Total revenue: {total_revenue:.2f}")

    # 4.2 Average revenue per user (mean of each user's summed revenue)
    if user_id_col:
        revenue_per_user = df.groupby(user_id_col)[revenue_col].sum()
        avg_revenue_per_user = revenue_per_user.mean()
        print(f"Average revenue per user: {avg_revenue_per_user:.2f}")
    else:
        print("User ID column not found. Cannot calculate revenue per user accurately.")

    # 4.3 Top product categories by revenue
    if product_cat_col:
        category_revenue = df.groupby(product_cat_col)[revenue_col].sum().sort_values(ascending=False)
        print(f"\nTop product categories by revenue (using '{product_cat_col}'):")
        print(category_revenue.head(10))
    else:
        category_revenue = None
        print("\nNo product category column detected. Cannot compute category-wise revenue.")

    # 4.4 Top user segments by revenue (age/gender/category)
    # By age group
    if age_col:
        # Build the age groups here only if no "age_group" column exists yet.
        if "age_group" not in df.columns:
            bins = [0, 18, 25, 35, 45, 60, 120]
            labels = ["<18", "18-24", "25-34", "35-44", "45-59", "60+"]
            # Coerce to numbers first so a text-typed age column yields
            # missing groups instead of making pd.cut raise.
            df["age_group"] = pd.cut(
                pd.to_numeric(df[age_col], errors="coerce"),
                bins=bins,
                labels=labels,
                right=False,
            )
        # observed=False is passed explicitly: age_group is categorical and
        # the default for `observed` differs between pandas versions. Empty
        # age groups are therefore always listed, with a revenue of 0.
        age_revenue = (
            df.groupby("age_group", observed=False)[revenue_col]
            .sum()
            .sort_values(ascending=False)
        )
        print("\nTop age group by revenue:")
        print(age_revenue.head(10))

    # By gender
    if gender_col:
        gender_revenue = df.groupby(gender_col)[revenue_col].sum().sort_values(ascending=False)
        print("\nTop gender segment by revenue:")
        print(gender_revenue)

    # By product category
    if product_cat_col:
        print("\nTop product category by revenue (already shown above).")

else:
    print("No revenue/amount/price column detected. Skipping revenue analysis.")


## 5. VISUALIZATIONS


In [None]:
print("\n===== VISUALIZATIONS =====")

# 5.1 Bar chart of the ten leading product categories.
# Ranked by summed revenue when a revenue column exists, otherwise by row count.
if not product_cat_col:
    print("No product category column detected. Skipping Top 10 categories bar chart.")
else:
    if revenue_col:
        revenue_by_category = df.groupby(product_cat_col)[revenue_col].sum()
        top_cat_rev = revenue_by_category.sort_values(ascending=False).head(10)
        y_label = "Total Revenue"
    else:
        top_cat_rev = df[product_cat_col].value_counts().head(10)
        y_label = "Count"

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(x=top_cat_rev.index, y=top_cat_rev.values, ax=ax)
    ax.set_title("Top 10 Product Categories")
    ax.set_xlabel("Product Category")
    ax.set_ylabel(y_label)
    plt.xticks(rotation=45, ha="right")
    fig.tight_layout()
    plt.show()

# 5.2 Histogram of user ages (missing values excluded)
if not age_col:
    print("No age column detected. Skipping age distribution plot.")
else:
    known_ages = df[age_col].dropna()

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(known_ages, kde=True, bins=20, ax=ax)
    ax.set_title("User Age Distribution")
    ax.set_xlabel("Age")
    ax.set_ylabel("Frequency")
    fig.tight_layout()
    plt.show()

# 5.3 Boxplot of the revenue column (missing values excluded)
if not revenue_col:
    print("No revenue column detected. Skipping revenue boxplot.")
else:
    known_revenue = df[revenue_col].dropna()

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=known_revenue, ax=ax)
    ax.set_title("Revenue Distribution")
    ax.set_xlabel(revenue_col)
    fig.tight_layout()
    plt.show()

# 5.4 Gender distribution, shown as a pie chart and then as a bar chart
if not gender_col:
    print("No gender column detected. Skipping gender distribution plot.")
else:
    gender_counts = df[gender_col].value_counts()

    # Pie chart
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.pie(gender_counts.values, labels=gender_counts.index, autopct="%1.1f%%", startangle=90)
    ax.set_title("Gender Distribution")
    fig.tight_layout()
    plt.show()

    # Bar chart (optional)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(x=gender_counts.index, y=gender_counts.values, ax=ax)
    ax.set_title("Gender Distribution (Bar Chart)")
    ax.set_xlabel("Gender")
    ax.set_ylabel("Count")
    fig.tight_layout()
    plt.show()

# 5.5 Monthly revenue line plot (needs both a date and a revenue column)
if date_col is None or revenue_col is None:
    print("Either date column or revenue column not available. Skipping monthly revenue plot.")
else:
    # Make sure the column is datetime; unparseable values become NaT.
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    # Convert each timestamp to its monthly period, then back to a timestamp
    # for that month, so rows can be grouped per calendar month.
    month_periods = df[date_col].dt.to_period("M")
    df["year_month"] = month_periods.dt.to_timestamp()

    monthly_revenue = df.groupby("year_month")[revenue_col].sum().reset_index()

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.lineplot(data=monthly_revenue, x="year_month", y=revenue_col, marker="o", ax=ax)
    ax.set_title("Monthly Revenue Over Time")
    ax.set_xlabel("Month")
    ax.set_ylabel("Total Revenue")
    plt.xticks(rotation=45, ha="right")
    fig.tight_layout()
    plt.show()


## 6. INSIGHTS (AUTO-GENERATED BULLET POINTS)


In [None]:
print("\n===== AUTO-GENERATED INSIGHTS =====")

# Each insight is one bullet-point string; they are printed together at the end.
insights = []

# 6.1 Most popular / high-revenue category
if product_cat_col:
    if revenue_col:
        cat_rev = df.groupby(product_cat_col)[revenue_col].sum()
        if not cat_rev.empty:
            top_cat = cat_rev.idxmax()
            top_cat_val = cat_rev.max()
            insights.append(f"- The product category '{top_cat}' generates the highest revenue (~{top_cat_val:.2f}).")
    else:
        cat_counts = df[product_cat_col].value_counts()
        if not cat_counts.empty:
            top_cat = cat_counts.idxmax()
            insights.append(f"- The most frequently purchased/visited category is '{top_cat}'.")

# 6.2 Top buying age group (requires the "age_group" column to exist already)
if age_col and revenue_col and "age_group" in df.columns:
    # observed=False is passed explicitly because the default for `observed`
    # differs between pandas versions. Groups without revenue are removed by
    # the filter on the next line either way.
    age_rev = df.groupby("age_group", observed=False)[revenue_col].sum()
    age_rev = age_rev[age_rev > 0]
    if not age_rev.empty:
        top_age_group = age_rev.idxmax()
        insights.append(f"- Users in the age group '{top_age_group}' contribute the most to revenue.")

# 6.3 Top buying gender
if gender_col and revenue_col:
    g_rev = df.groupby(gender_col)[revenue_col].sum()
    if not g_rev.empty:
        top_gender = g_rev.idxmax()
        insights.append(f"- The gender segment '{top_gender}' spends the most overall.")

# 6.4 Less active regions or segments (by user count)
if location_col and user_id_col:
    loc_users = df.groupby(location_col)[user_id_col].nunique()
    # Only report a "least active" location when the user counts differ.
    if loc_users.nunique() > 1:
        least_loc = loc_users.idxmin()
        insights.append(f"- The region/location '{least_loc}' has the least number of active users.")

# 6.5 Session behavior
if user_id_col:
    if session_col:
        sess_per_user = df.groupby(user_id_col)[session_col].nunique()
    else:
        # No session column: each user's row count stands in for sessions.
        sess_per_user = df.groupby(user_id_col).size()
    if not sess_per_user.empty:
        session_insight = f"- Average sessions per user: {sess_per_user.mean():.2f}."
        # Mention differences in activity only when the data shows some;
        # if every user has the same count, that sentence would be false.
        if sess_per_user.max() > sess_per_user.min():
            session_insight += " Some users are significantly more active than others."
        insights.append(session_insight)

# 6.6 Generic insight if no specific ones could be created
if not insights:
    insights.append("- Dataset cleaned and basic EDA completed. Specific insights depend on which business columns are present.")

# Print insights
for line in insights:
    print(line)
