In [94]:
# Import required libraries
import pandas as pd # Main library for data manipulation and analysis
# Load the three datasets using pd.read_csv()
# This function reads CSV files and converts them into pandas DataFrames
customers = pd.read_csv('customers.csv')
products = pd.read_csv('products.csv')
sales = pd.read_csv('sales.csv')
# Display basic information about what we've loaded
# len() gives us the number of rows in each DataFrame
print(f"✅ Successfully loaded:")
print(f"   - {len(customers)} customers")      # Total number of customer records
print(f"   - {len(products)} products")        # Total number of products in catalog
print(f"   - {len(sales)} sales transactions") # Total number of sales transactions

# Display the first few rows of each dataset to understand the structure
# .head(n) shows the first n rows of a DataFrame - useful for data exploration
print(f"\n👀 First few rows of each dataset:")
print(f"\nCustomers:")
print(customers.head(3))  # Show first 3 rows of customer data
print(f"\nProducts:")
print(products.head(3))   # Show first 3 rows of product data
print(f"\nSales:")
print(sales.head(3))      # Show first 3 rows of sales data

✅ Successfully loaded:
   - 100 customers
   - 60 products
   - 150 sales transactions

👀 First few rows of each dataset:

Customers:
   customer_id first_name last_name  age state  income registration_date  \
0            1       John     Smith   34    CA   75000        2022-01-15   
1            2      Sarah   Johnson   28    NY   82000        2022-01-22   
2            3    Michael     Brown   45    TX   95000        2022-02-03   

                     email  
0     john.smith@email.com  
1  sarah.johnson@email.com  
2  michael.brown@email.com  

Products:
   product_id                   product_name       category  price  \
0           1  Wireless Bluetooth Headphones    Electronics  79.99   
1           2         Organic Cotton T-Shirt       Clothing  24.99   
2           3   Stainless Steel Water Bottle  Home & Garden  19.99   

  launch_date      brand  
0  2021-03-15  AudioTech  
1  2021-04-20    EcoWear  
2  2021-05-10  HydroLife  

Sales:
   sale_id  customer_id  product_id  

In [95]:
# =============================================================================
# PART 1: DATETIME OPERATIONS
# =============================================================================
# DateTime operations are crucial for time-based analysis in business data
# We'll convert text dates to proper datetime objects and extract useful components

print("\n" + "=" * 60)
print("PART 1: DATETIME OPERATIONS")
print("=" * 60)

print("\n🕐 1.1 Converting and Working with Dates")

# Convert date columns from strings to pandas datetime objects
# pd.to_datetime() is the main function for converting various date formats
# Once converted, we can perform date arithmetic and extract components
print("Converting date columns to datetime...")
# YOUR CODE HERE
customers['registration_date'] = pd.to_datetime(customers['registration_date'], errors='coerce')
products['launch_date'] = pd.to_datetime(products['launch_date'], errors='coerce')
sales['transaction_date'] = pd.to_datetime(sales['transaction_date'], errors='coerce')
print("✅ All date columns converted successfully")
# double checking :
customers.info()
products.info()
sales.info()


PART 1: DATETIME OPERATIONS

🕐 1.1 Converting and Working with Dates
Converting date columns to datetime...
✅ All date columns converted successfully
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   customer_id        100 non-null    int64         
 1   first_name         100 non-null    object        
 2   last_name          100 non-null    object        
 3   age                100 non-null    int64         
 4   state              100 non-null    object        
 5   income             100 non-null    int64         
 6   registration_date  100 non-null    datetime64[ns]
 7   email              100 non-null    object        
dtypes: datetime64[ns](1), int64(3), object(4)
memory usage: 6.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 6 columns):
 #   Column        Non-Nu

In [96]:
# Extract useful components from datetime objects using the .dt accessor
# The .dt accessor gives us access to datetime-specific methods and properties
print("\nExtracting datetime components...")   
# Extract year (2022, 2023, etc.)
customers['registration_year'] = customers['registration_date'].dt.year
products['launch_year'] = products['launch_date'].dt.year
sales['transaction_year'] = sales['transaction_date'].dt.year
# Extract month number (1-12)
customers['registration_month'] = customers['registration_date'].dt.month_name()
products['launch_month'] = products['launch_date'].dt.month_name()
sales['transaction_month'] = sales['transaction_date'].dt.month_name()
# Day of week (Monday=0, Sunday=6)
customers['registration_day'] = customers['registration_date'].dt.dayofweek
products['launch_day'] = products['launch_date'].dt.dayofweek
sales['transaction_day'] = sales['transaction_date'].dt.dayofweek
# Day name as text (Monday, Tuesday, etc.)
customers['registration_day_name'] = customers['registration_date'].dt.day_name()
products['launch_day_name'] = products['launch_date'].dt.day_name()
sales['transaction_day_name'] = sales['transaction_date'].dt.day_name()
print("✅ Extracted: year, month, day of week, and day name")
#double checking
print(f"\n updated dataframes :")
print("\nCustomers:")
print(customers.head())

print("\nProducts:")
print(products.head())

print("\nSales:")
print(sales.head())


Extracting datetime components...
✅ Extracted: year, month, day of week, and day name

 updated dataframes :

Customers:
   customer_id first_name last_name  age state  income registration_date  \
0            1       John     Smith   34    CA   75000        2022-01-15   
1            2      Sarah   Johnson   28    NY   82000        2022-01-22   
2            3    Michael     Brown   45    TX   95000        2022-02-03   
3            4      Emily     Davis   31    FL   68000        2022-02-14   
4            5      David    Wilson   52    IL  110000        2022-02-28   

                     email  registration_year registration_month  \
0     john.smith@email.com               2022            January   
1  sarah.johnson@email.com               2022            January   
2  michael.brown@email.com               2022           February   
3    emily.davis@email.com               2022           February   
4   david.wilson@email.com               2022           February   

   registrat

In [97]:
# Find the date range of our sales data by using min() and max()
# This helps us understand the time span our analysis covers
# YOUR CODE HERE
print(sales["transaction_date"].min())
print(sales["transaction_date"].max())
print("\n🔍 1.2 Basic Time Analysis")

2022-02-10 14:30:00
2024-03-05 10:30:00

🔍 1.2 Basic Time Analysis


In [98]:
# Basic Time Analysis;
# Find the busiest day of the week by counting transactions
# .value_counts() counts occurrences of each unique value
# YOUR CODE HERE
counts_transactions_by_day = sales['transaction_day_name'].value_counts()
top_value_day = counts_transactions_by_day.index[0]
top_count_day = counts_transactions_by_day.iloc[0]
print("the buisest day is:", top_value_day, "with", top_count_day, "transactions")

#Determine which month had the most transactions
counts_transactions_by_month = sales['transaction_month'].value_counts()
top_value_month = counts_transactions_by_month.index[0]
top_count_month = counts_transactions_by_month.iloc[0]
print("the month with the most transactions is :", top_value_month, "with", top_count_month, "transactions")

#Count how many sales happened in each year

counts_transactions_by_year = sales['transaction_year'].value_counts().reset_index(name = "count")
print(f" \n Sales by year :")
print(counts_transactions_by_year)



the buisest day is: Friday with 24 transactions
the month with the most transactions is : February with 16 transactions
 
 Sales by year :
   transaction_year  count
0              2023     72
1              2022     65
2              2024     13


In [99]:
# =============================================================================
# PART 2: GROUPBY AND AGGREGATION
# =============================================================================

In [100]:
products.head()

Unnamed: 0,product_id,product_name,category,price,launch_date,brand,launch_year,launch_month,launch_day,launch_day_name
0,1,Wireless Bluetooth Headphones,Electronics,79.99,2021-03-15,AudioTech,2021,March,0,Monday
1,2,Organic Cotton T-Shirt,Clothing,24.99,2021-04-20,EcoWear,2021,April,1,Tuesday
2,3,Stainless Steel Water Bottle,Home & Garden,19.99,2021-05-10,HydroLife,2021,May,0,Monday
3,4,Leather Crossbody Bag,Accessories,89.99,2021-06-01,StyleCraft,2021,June,1,Tuesday
4,5,Yoga Mat Premium,Sports & Outdoors,34.99,2021-06-15,FitZone,2021,June,1,Tuesday


In [110]:
# 1.Product Analysis:
# Group products by category and calculate average price for each category
# .groupby() splits the data into groups based on the 'category' column
# .mean() calculates the average for each group
# .round(2) rounds to 2 decimal places for currency formatting
# .sort_values(ascending=False) sorts from highest to lowest price
products_category_by_average_price = products.groupby("category")["price"].mean().round(2).reset_index(name = "average_price").sort_values(by = "average_price", ascending = False)

print("Product Category and Its Average Price ")
print(products_category_by_average_price)

Product Category and Its Average Price 
            category  average_price
4  Sports & Outdoors         105.90
2        Electronics          59.99
1           Clothing          52.24
0        Accessories          35.07
3      Home & Garden          30.16


In [102]:
# Count how many products exist in each category
# .value_counts() counts occurrences of each unique category

products_category_by_number_of_products = products["category"].value_counts().reset_index(name = "number_of_products").sort_values(by = "number_of_products" , ascending = False)

print("products categorys and the number of products in each category")
print(products_category_by_number_of_products)

products categorys and the number of products in each category
            category  number_of_products
0        Electronics                  13
1           Clothing                  12
2      Home & Garden                  12
3        Accessories                  12
4  Sports & Outdoors                  11


In [112]:
# 2. Customer Analysis:
# Group customers by state and calculate average age
# This shows us the demographic profile of customers in different regions

state_by_average_age = customers.groupby("state")["age"].mean().round(2).reset_index(name = "average_age").sort_values(by = "average_age", ascending = False)
print("average customer age in each state")
print(state_by_average_age)

average customer age in each state
  state  average_age
1    FL        39.00
4    TX        38.30
2    IL        37.25
3    NY        36.50
0    CA        34.10


In [109]:
#Calculate total spending per customer (sum up all their purchases)

In [108]:
total_spending_per_customer = (
    sales.groupby("customer_id")["total_amount"]
    .sum()
    .round(2)
    .reset_index(name = "total_spent")
    .sort_values(by = "total_spent", ascending = False)
)
print(total_spending_per_customer)

    customer_id  total_spent
30           31       379.97
12           13       359.96
40           41       329.98
83           84       319.98
29           30       284.98
..          ...          ...
69           70        33.98
88           89        29.99
68           69        28.99
71           72        24.99
76           77        12.99

[90 rows x 2 columns]


In [131]:
sales.head()

Unnamed: 0,sale_id,customer_id,product_id,transaction_date,quantity,total_amount,transaction_year,transaction_month,transaction_day,transaction_day_name
0,1,1,1,2022-02-10 14:30:00,1,79.99,2022,February,3,Thursday
1,2,2,3,2022-02-15 10:15:00,2,39.98,2022,February,1,Tuesday
2,3,3,5,2022-02-20 16:45:00,1,34.99,2022,February,6,Sunday
3,4,4,2,2022-02-25 11:20:00,3,74.97,2022,February,4,Friday
4,5,5,4,2022-03-01 13:10:00,1,89.99,2022,March,1,Tuesday


In [117]:
# Find customers with the most purchases (transaction frequency)
# .groupby().size() counts the number of transactions per customer

purchases_by_customers = (
    sales.groupby(("customer_id"))
    .size()
    .reset_index(name = "number_of_purchases")
    .sort_values(by = "number_of_purchases", ascending = False)
)
print(purchases_by_customers.head())


    customer_id  number_of_purchases
0             1                    2
31           32                    2
33           34                    2
34           35                    2
35           36                    2


In [None]:
#3.Sales Analysis:

In [120]:
# Calculate total sales revenue by month
# Group sales by month and sum the total_amount for each month

total_sales_by_month = (
    sales.groupby("transaction_month")["total_amount"]
    .sum()
    .round(2)
    .reset_index(name = "total_revenue")
    .sort_values(by = "total_revenue", ascending = False)
)
print(total_sales_by_month)



   transaction_month  total_revenue
3           February        1669.66
2           December        1458.71
10           October        1235.80
8                May        1177.79
11         September        1150.76
7              March        1115.77
9           November        1022.76
6               June         986.68
5               July         930.80
0              April         924.77
1             August         870.74
4            January         864.79


In [123]:
# Calculate average transaction amount by day of week
# This shows us if people spend more on certain days

average_transaction_amount_by_day_of_week = (
    sales.groupby("transaction_day_name")["total_amount"]
    .mean()
    .round(2)
    .reset_index(name = "average_transaction_amount")
    .sort_values(by = "average_transaction_amount", ascending = False)
)
print(average_transaction_amount_by_day_of_week)


  transaction_day_name  average_transaction_amount
4             Thursday                      102.27
0               Friday                      100.94
3               Sunday                       99.55
6            Wednesday                       96.07
1               Monday                       90.93
5              Tuesday                       74.03
2             Saturday                       61.12


In [134]:
# Determine which product category generates the most total revenue

sales_with_category = pd.merge(sales, products[["product_id","category"]], on = "product_id", how = "left")

total_revenue_by_category = (
    sales_with_category.groupby("category")["total_amount"]
    .sum()
    .round(2)
    .reset_index(name = "total_revenue")
    .sort_values(by = "total_revenue", ascending = False)
)
print(total_revenue_by_category)



            category  total_revenue
4  Sports & Outdoors        3544.64
2        Electronics        3524.29
1           Clothing        2745.39
0        Accessories        2053.25
3      Home & Garden        1541.46


In [None]:
# =============================================================================
# PART 3: Basic Data Insights
# =============================================================================

In [None]:
# 1. Simple Comparisons:

In [135]:
# Find which state has the most customers
customer_per_state = (
    customers.groupby("state")["customer_id"]
    .count()
    .reset_index(name = "total_customers")
    .sort_values(by = "total_customers", ascending = False)
)
print(customer_per_state)


  state  total_customers
0    CA               20
1    FL               20
2    IL               20
3    NY               20
4    TX               20


In [136]:
# Identify the top 5 customers by total spending
total_spending_by_customer = (
    sales.groupby("customer_id")["total_amount"]
    .sum()
    .round(2)
    .reset_index(name = "total_spent")
    .sort_values(by = "total_spent", ascending = False)
)
print(total_spending_by_customer.head())

    customer_id  total_spent
30           31       379.97
12           13       359.96
40           41       329.98
83           84       319.98
29           30       284.98


In [137]:
# Determine which day of the week has the highest average transaction value
#previously done in part 2 / sales analysis so we r just going to call the top value
print(average_transaction_amount_by_day_of_week.head(1))

  transaction_day_name  average_transaction_amount
4             Thursday                      102.27
