In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" style for the figures drawn below.
sns.set_style('whitegrid')
# IPython line magic: selects matplotlib's inline backend so figures appear in the cell output.
%matplotlib inline

# Read the customer, product and transaction tables (paths are relative to the working directory)
cust, prod, trans = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Print the first rows of each table, one table after another
print(*(table.head() for table in (cust, prod, trans)), sep="\n")

# Parse the date columns with pd.to_datetime; TransactionDate is later bucketed by month via `.dt`
for table, date_col in ((cust, "SignupDate"), (trans, "TransactionDate")):
    table[date_col] = pd.to_datetime(table[date_col])

# Data-quality checks: null count per column, one table after another ...
print(*(table.isnull().sum() for table in (cust, prod, trans)), sep="\n")
# ... then the number of fully duplicated rows in each table, on a single line
print(*(table.duplicated().sum() for table in (cust, prod, trans)))

# Summary statistics for the transactions table
transaction_summary = trans.describe()
print(transaction_summary)

# Merge data
# Attach customer and product attributes to every transaction. Left joins keep
# transactions whose CustomerID/ProductID has no match in the lookup table (the
# joined attributes are then NaN). `validate="many_to_one"` makes pandas raise
# MergeError if a key is duplicated in `cust` or `prod`; without it such a
# duplicate would silently repeat transaction rows and inflate every revenue
# and quantity total computed from `data`.
data = (
    trans
    .merge(cust, on="CustomerID", how="left", validate="many_to_one")
    .merge(prod, on="ProductID", how="left", validate="many_to_one")
)

# Total TotalValue per region, largest first
region_totals = data.groupby("Region")["TotalValue"].sum()
rev_by_region = region_totals.sort_values(ascending=False)
print(rev_by_region)

# The ten product names with the highest summed Quantity
units_per_product = data.groupby("ProductName")["Quantity"].sum()
top_prod = units_per_product.sort_values(ascending=False).head(10)
print(top_prod)

# Monthly sales: tag each row with the calendar month of its transaction, then total per month
data = data.assign(MonthYear=data["TransactionDate"].dt.to_period("M"))
sales_time = data.groupby("MonthYear")["TotalValue"].sum()

# Per-customer totals (value and quantity), ranked by total value, highest first
cust_beh = (
    data.groupby("CustomerID")[["TotalValue", "Quantity"]]
    .sum()
    .sort_values(by="TotalValue", ascending=False)
)
print(cust_beh.head(10))

# Box plot of per-transaction TotalValue
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=trans['TotalValue'], color="lightblue", ax=ax)
ax.set_title("Transaction Values")
plt.show()

# Annotated heatmap of the Quantity / TotalValue correlation matrix
qty_value_corr = trans[['Quantity', 'TotalValue']].corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(qty_value_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# Repeat customers: percentage of CustomerIDs that occur in more than one row of `data`
rows_per_customer = data['CustomerID'].value_counts()
repeat_rate = rows_per_customer.gt(1).mean() * 100
print(f"Repeat Customer Rate: {repeat_rate:.2f}%")

# Region trends: TotalValue summed per (Region, MonthYear), with months spread across columns
region_month_totals = data.groupby(['Region', 'MonthYear'])['TotalValue'].sum()
region_trends = region_month_totals.unstack()

# Plots
# Bar charts for revenue by region and for the top products.
# The bar lengths are passed as plain arrays (`.values`), which carry no name
# that could become an axis label, so both axes are labelled explicitly here.
# NOTE(review): seaborn >= 0.13 reportedly emits a FutureWarning when `palette`
# is passed without `hue`; confirm the installed seaborn version before
# changing the call signature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=rev_by_region.index, y=rev_by_region.values, palette='viridis', ax=ax)
ax.set_title("Revenue by Region")
ax.set_xlabel("Region")
ax.set_ylabel("Total Revenue")
plt.xticks(rotation=45)
plt.show()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_prod.values, y=top_prod.index, palette='muted', ax=ax)
ax.set_title("Top Products")
ax.set_xlabel("Total Quantity Sold")
ax.set_ylabel("Product")
plt.show()

# Monthly sales trend. The figure size is set once on the axes that pandas is
# told to draw on, instead of on both plt.figure() and .plot().
fig, ax = plt.subplots(figsize=(14, 7))
sales_time.plot(ax=ax, marker='o', color="orange")
ax.set_title("Sales Over Time")
ax.set_xlabel("Month")
ax.set_ylabel("Total Revenue")
plt.xticks(rotation=45)
plt.show()

# One line per region: transposing puts the months on the x-axis and the
# regions in the columns, which pandas draws as separate series.
fig, ax = plt.subplots(figsize=(14, 7))
region_trends.T.plot(ax=ax, marker='o', cmap='tab10')
ax.set_title("Region Trends Over Time")
ax.set_xlabel("Month")
ax.set_ylabel("Total Revenue")
plt.xticks(rotation=45)
plt.show()

# Histogram (30 bins) of per-transaction TotalValue with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(trans['TotalValue'], bins=30, kde=True, color='teal', ax=ax)
ax.set_title("Transaction Value Distribution")
plt.show()

# Top ten customers by total value. `cust_beh` is already sorted by TotalValue
# (descending), so the first ten rows are the ten highest; take them once
# rather than slicing separately for the bar lengths and the labels.
top_customers = cust_beh.head(10)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_customers['TotalValue'].values, y=top_customers.index, palette='deep', ax=ax)
ax.set_title("Top Customers")
# The bar lengths are a bare array, so the value axis needs an explicit label.
ax.set_xlabel("Total Revenue")
ax.set_ylabel("Customer ID")
plt.show()

# Insights
def insights(top_region=None, top_product=None, repeat_pct=None):
    """Print the headline findings of the analysis.

    Every argument defaults to None, in which case the value is taken from the
    notebook-level results, so a bare ``insights()`` call keeps working.

    Args:
        top_region: Name of the region with the highest revenue. Defaults to
            the label of the largest entry in ``rev_by_region``.
        top_product: Name of the most purchased product. Defaults to the label
            of the largest entry in ``top_prod``.
        repeat_pct: Repeat-customer rate in percent. Defaults to ``repeat_rate``.
    """
    # Resolve defaults lazily: the notebook globals are only read when the
    # caller did not supply a value.
    if top_region is None:
        top_region = rev_by_region.idxmax()
    if top_product is None:
        top_product = top_prod.idxmax()
    if repeat_pct is None:
        repeat_pct = repeat_rate

    print("\nInsights:")
    print(f"1. {top_region} has the highest revenue.")
    print(f"2. {top_product} is the most purchased product.")
    # NOTE(review): statements 3 and 5 are fixed text, not computed from the
    # data; confirm them against the sales-over-time chart and the box plot.
    print("3. Monthly sales show seasonal trends.")
    print(f"4. Repeat customer rate is {repeat_pct:.2f}%.")
    print("5. Transaction values have significant outliers.")

# Print the insights summary defined above.
insights()