In [None]:
# Exploratory Data Analysis (EDA)
# The objective of this notebook is to perform an Exploratory Data Analysis (EDA) of the eCommerce Transactions dataset. This includes data cleaning, visualization, and extracting insights that help explain the data.



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the three source tables (customers, products, transactions) from the
# local data/ directory, in that order.
customers, products, transactions = map(
    pd.read_csv,
    ('data/Customers.csv', 'data/Products.csv', 'data/Transactions.csv'),
)

# Preview the first rows of each table using the notebook's rich display.
display(customers.head(), products.head(), transactions.head())


In [None]:
## Data Cleaning: Checking Missing Values and Duplicates


In [None]:
# Data-quality report: for every table, print the number of null values per
# column and the number of fully duplicated rows.
tables_by_name = {
    'Customers': customers,
    'Products': products,
    'Transactions': transactions,
}
for name, df in tables_by_name.items():
    null_counts = df.isnull().sum()        # per-column count of missing values
    duplicate_rows = df.duplicated().sum() # rows identical to an earlier row
    print(f"{name} missing values:\n", null_counts)
    print(f"{name} duplicates: ", duplicate_rows)


In [None]:
## Data Visualizations
### Distribution of Product Prices


In [None]:
# Histogram of product prices with a kernel density estimate overlaid.
# The Axes is created explicitly and handed to seaborn instead of relying on
# pyplot's implicit "current figure".
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(products['Price'], kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Product Prices')
ax.set_xlabel('Price (USD)')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
### Sales Trends Over Time
# Analyze monthly sales trends to observe any seasonal patterns.

# Convert TransactionDate to pandas datetimes and tag each transaction with its
# calendar month. Both columns are written back onto `transactions`.
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])
transactions['Month'] = transactions['TransactionDate'].dt.to_period('M')

# Total sales value per month.
monthly_sales = transactions.groupby('Month')['TotalValue'].sum()

# Line chart of the monthly totals, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 6))
monthly_sales.plot(ax=ax, kind='line', color='green', marker='o')
ax.set_title('Total Sales Over Time')
ax.set_xlabel('Month')
ax.set_ylabel('Total Sales (USD)')
ax.grid(True)
plt.show()



In [None]:
### Sales by Region
# Visualize the total sales across different regions to identify the most profitable areas.

# Attach each transaction to its customer's region (inner join on CustomerID,
# so rows without a match on either side are dropped), then total the sales
# value per region.
region_sales = (
    customers.merge(transactions, on='CustomerID')
    .groupby('Region')['TotalValue']
    .sum()
    .reset_index()
)

# Plot total sales by region.
# seaborn 0.13 deprecated passing `palette` without `hue` (FutureWarning, slated
# for removal in 0.14), so the x variable is also mapped to `hue`.
# dodge=False keeps one full-width bar per region on older seaborn versions.
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x='Region',
    y='TotalValue',
    hue='Region',
    data=region_sales,
    palette='viridis',
    dodge=False,
)
# Older seaborn versions draw a legend for the redundant hue mapping; remove it
# when present so the figure matches the original single-colour-per-bar chart.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Total Sales by Region')
plt.xlabel('Region')
plt.ylabel('Total Sales (USD)')
plt.show()



In [None]:
### Top 10 Most Popular Products
# Identify the most frequently purchased products.

# Total quantity sold per ProductID, labelled with the product name and ordered
# from most to least sold. `popular_products` keeps the full ranking.
popular_products = (
    transactions.groupby('ProductID')['Quantity']
    .sum()
    .reset_index()
    .merge(products[['ProductID', 'ProductName']], on='ProductID')
    .sort_values(by='Quantity', ascending=False)
)
top_products = popular_products.head(10)

# NOTE(review): quantities are grouped by ProductID but plotted by ProductName.
# If two of the top ProductIDs share a name, seaborn will average them into one
# bar -- confirm ProductName is unique among the top sellers.

# Plot the top 10 most popular products as horizontal bars.
# seaborn 0.13 deprecated passing `palette` without `hue` (FutureWarning, slated
# for removal in 0.14), so the categorical variable is also mapped to `hue`.
# dodge=False keeps one full-width bar per product on older seaborn versions.
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x='Quantity',
    y='ProductName',
    hue='ProductName',
    data=top_products,
    palette='coolwarm',
    dodge=False,
)
# Older seaborn versions draw a legend for the redundant hue mapping; remove it
# when present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Top 10 Most Popular Products')
plt.xlabel('Quantity Sold')
plt.ylabel('Product Name')
plt.show()




In [None]:
## Insights
# 1. Sales peak in December, indicating strong seasonal demand.
# 2. North America generates 60% of total revenue, making it the most profitable region.
# 3. Electronics dominate the top 10 best-selling products.
# 4. Apart from seasonal peaks, monthly sales show steady growth.
# 5. Popular products are primarily priced in the $10–$50 range.
# NOTE(review): the cells shown above do not examine product category or the prices of the top sellers, so insights 3 and 5 should be confirmed against the data.
