In [10]:
!pip install pandas matplotlib seaborn


Defaulting to user installation because normal site-packages is not writeable


DEPRECATION: Loading egg at c:\program files\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330


In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Read the three input CSV files (paths are relative to the working directory)
# into one DataFrame each.
customers, products, transactions = map(
    pd.read_csv,
    ("customers.csv", "products.csv", "transactions.csv"),
)

In [None]:
# Parse the date columns into pandas datetimes, in place, so that the .dt
# accessor and date-based grouping can be used on them later.
for frame, date_column in (
    (customers, 'SignupDate'),
    (transactions, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])


In [None]:
# Basic Information and Summary
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
# after every report.
print("Customers Dataset Info:")
customers.info()
print(customers.describe(include='all'))

print("\nProducts Dataset Info:")
products.info()
print(products.describe(include='all'))

print("\nTransactions Dataset Info:")
transactions.info()
# No include='all' here, so describe() applies its default column selection.
print(transactions.describe())


In [None]:
# Report the number of missing entries per column for each dataset.
for label, frame in (
    ("Customers", customers),
    ("Products", products),
    ("Transactions", transactions),
):
    print(f"\nMissing Values in {label}:")
    print(frame.isna().sum())

In [None]:
# Merge datasets for a consolidated view: one row per transaction, enriched
# with the matching customer and product columns (left joins keep every
# transaction even when no match is found).
# validate='many_to_one' makes pandas raise MergeError if CustomerID or
# ProductID is duplicated in its lookup table; without it a duplicate key
# would silently repeat transaction rows and inflate every TotalValue sum
# computed from merged_data.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='left', validate='many_to_one')
    .merge(products, on='ProductID', how='left', validate='many_to_one')
)


In [None]:
# EDA: Visualizations
# Select seaborn's "whitegrid" style before any figure is drawn.
sns.set(style="whitegrid")

In [None]:
# 1. Distribution of Customers by Region (Pie Chart)
# One slice per region, sized by the number of customers in that region.
region_counts = customers['Region'].value_counts()
region_palette = sns.color_palette('viridis', len(customers['Region'].unique()))

fig, ax = plt.subplots(figsize=(8, 8))
region_counts.plot.pie(ax=ax, autopct='%1.1f%%', startangle=140, colors=region_palette)
ax.set_title("Distribution of Customers by Region")
ax.set_ylabel("")  # drop the default y-label that pandas derives from the Series name
plt.show()

In [None]:
# 2. Revenue by Product Category (Pie Chart)
# Sum TotalValue per category and order the slices from largest to smallest.
revenue_per_category = merged_data.groupby('Category')['TotalValue'].sum()
category_revenue = revenue_per_category.sort_values(ascending=False)
category_palette = sns.color_palette('pastel', len(category_revenue))

fig, ax = plt.subplots(figsize=(8, 8))
category_revenue.plot.pie(ax=ax, autopct='%1.1f%%', startangle=140, colors=category_palette)
ax.set_title("Revenue by Product Category")
ax.set_ylabel("")  # drop the default y-label that pandas derives from the Series name
plt.show()

In [None]:
# 3. Monthly Sales Trend
# Bucket every transaction into its calendar month (stored as a new MonthYear
# column on merged_data) and sum TotalValue per month.
merged_data['MonthYear'] = merged_data['TransactionDate'].dt.to_period('M')
monthly_sales = merged_data.groupby('MonthYear')['TotalValue'].sum()
plt.figure(figsize=(12, 6))
monthly_sales.plot(marker='o', linestyle='-', color='green')
plt.title("Monthly Sales Trend")
plt.xlabel("Month-Year")
plt.ylabel("Total Sales (USD)")
# Pass True explicitly: a bare plt.grid() toggles the grid, which switches it
# off when the active style (seaborn "whitegrid") already draws one.
plt.grid(True)
plt.show()

In [None]:
# 4. Top 10 Customers vs Rest by Total Spending
# Aggregate on CustomerID so that two different customers who happen to share
# a name are not merged into one slice; CustomerName is carried along only to
# serve as the slice label.
total_spending = (
    merged_data
    .groupby(['CustomerID', 'CustomerName'])['TotalValue']
    .sum()
    .droplevel('CustomerID')
)
top_customers = total_spending.nlargest(10)
# Clamp at zero: with ten or fewer customers the remainder is mathematically
# 0, but float rounding can make it slightly negative, and the pie plot
# rejects negative values.
others_spending = max(total_spending.sum() - top_customers.sum(), 0.0)
spending_data = pd.concat([top_customers, pd.Series({'Others': others_spending})])

plt.figure(figsize=(10, 8))
spending_data.plot(kind='pie', autopct='%1.1f%%', startangle=140, colors=sns.color_palette('Set3', len(spending_data)))
plt.title("Top 10 Customers vs Rest by Total Spending")
plt.ylabel("")
plt.show()

In [None]:
# 5. Top 10 Products vs Rest by Total Revenue
# Sum TotalValue per product name, keep the ten largest and fold everything
# else into a single "Others" slice.
total_revenue = merged_data.groupby('ProductName')['TotalValue'].sum()
top_products = total_revenue.nlargest(10)
# Clamp at zero: with ten or fewer products the remainder is mathematically
# 0, but float rounding can make it slightly negative, and the pie plot
# rejects negative values.
others_revenue = max(total_revenue.sum() - top_products.sum(), 0.0)
revenue_data = pd.concat([top_products, pd.Series({'Others': others_revenue})])

plt.figure(figsize=(10, 8))
revenue_data.plot(kind='pie', autopct='%1.1f%%', startangle=140, colors=sns.color_palette('Set2', len(revenue_data)))
plt.title("Top 10 Products vs Rest by Total Revenue")
plt.ylabel("")
plt.show()

In [None]:
# 6. Per Day Transaction Value
# Group on the calendar day (timestamp truncated to midnight) so transactions
# recorded at different times of the same day are summed together; grouping on
# the raw timestamps would yield one point per distinct timestamp instead.
# Requires TransactionDate to be a datetime column (converted earlier in the
# notebook).
transaction_day = transactions['TransactionDate'].dt.normalize()
daily_transactions = transactions.groupby(transaction_day)['TotalValue'].sum()
plt.figure(figsize=(12, 6))
daily_transactions.plot(marker='o', linestyle='-', color='blue')
plt.title("Per Day Transaction Value")
plt.xlabel("Date")
plt.ylabel("Total Transaction Value (USD)")
# Pass True explicitly: a bare plt.grid() toggles the grid, which switches it
# off when the active style (seaborn "whitegrid") already draws one.
plt.grid(True)
plt.show()

In [None]:
# Print key insights
# NOTE: these statements are fixed text, not computed from the data; re-check
# them against the charts whenever the input files change.
print("\n--- Key Insights ---")
print("1. Distribution of customers shows the concentration of customer base by region.")
print("2. Revenue is driven heavily by specific product categories, indicating high-performing segments.")
print("3. Sales show a trend of seasonality, with peaks at specific times of the year.")
print("4. Top customers contribute significantly to the overall revenue, suggesting the need for VIP strategies.")
# The product analysis in this notebook ranks products by revenue (TotalValue),
# not by units sold, so the insight is worded accordingly.
print("5. Certain products dominate in terms of revenue, which could guide inventory planning.")
