In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by
# path. NOTE: `google.colab` is only importable inside a Colab runtime, so this
# cell will fail when the script is run anywhere else.
from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): a bare `pip install ...` is not Python syntax. It only works
# when the cell is executed by IPython with automagic enabled (where it is
# treated as the `%pip` line magic); running this file with plain `python`
# raises a SyntaxError on this line.
pip install pandas numpy matplotlib seaborn scikit-learn plotly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

#%% [markdown]
### *Task 1: EDA and Business Insights*
#%%
# Read the three raw CSV exports from the mounted Drive folder.
transactions = pd.read_csv('/content/drive/MyDrive/DataScienceAssignment/Transactions.csv')
customers = pd.read_csv('/content/drive/MyDrive/DataScienceAssignment/Customers.csv')
products = pd.read_csv('/content/drive/MyDrive/DataScienceAssignment/Products.csv')

# The product table's 'Price' column becomes 'ProductPrice' before merging
# (presumably to avoid a clash with a 'Price' column in Transactions.csv --
# TODO confirm against the data).
products = products.rename(columns={'Price': 'ProductPrice'})

# Inner-join customer attributes, then product attributes, onto each
# transaction row. Rows without a matching CustomerID/ProductID are dropped
# by the default inner join.
merged_df = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

# Parse both date columns into datetime64 so they support time-based operations.
for date_col in ('TransactionDate', 'SignupDate'):
    merged_df[date_col] = pd.to_datetime(merged_df[date_col])

#%% [markdown]
#### *Updated EDA Visualizations*
#%%
# 1. Sales Distribution by Region
# Horizontal bar chart of summed TotalValue per Region, largest region first.
# The figure is written to 'sales_by_region.png' and then displayed.
plt.figure(figsize=(10,6))
region_sales = merged_df.groupby('Region')['TotalValue'].sum().sort_values(ascending=False)

# seaborn >= 0.13 deprecates passing `palette` without `hue` (removal is
# announced for 0.14). Mapping the bar category to `hue` as well keeps one
# viridis colour per bar; dodge=False keeps the bars full-width and centred
# on seaborn releases that would otherwise offset bars per hue level.
ax = sns.barplot(
    x=region_sales.values,
    y=region_sales.index,
    hue=region_sales.index,
    palette='viridis',
    dodge=False,
)
# Some seaborn versions draw a legend for the (redundant) hue mapping; the
# original chart had none, so remove it when present.
redundant_legend = ax.get_legend()
if redundant_legend is not None:
    redundant_legend.remove()

plt.title('Total Sales by Region', fontsize=14)
plt.xlabel('Total Sales (USD)', fontsize=12)
plt.ylabel('Region', fontsize=12)
# Save before show(): show() may leave no current figure to save afterwards.
plt.savefig('sales_by_region.png', bbox_inches='tight')
plt.show()

# 2. Product Category Distribution
# Pie chart of how many merged transaction rows fall into each product
# Category; saved to 'category_distribution.png' and then displayed.
category_dist = merged_df['Category'].value_counts()

fig, ax = plt.subplots(figsize=(10,6))
ax.pie(
    category_dist,
    labels=category_dist.index,
    colors=sns.color_palette('pastel'),
    autopct='%1.1f%%',   # percentage label with one decimal on each wedge
    startangle=90,       # first wedge starts at the top of the circle
)
ax.set_title('Product Category Distribution', fontsize=14)
fig.savefig('category_distribution.png', bbox_inches='tight')
plt.show()

# 3. Monthly Sales Trend
# Line chart of TotalValue summed per calendar month of TransactionDate;
# saved to 'monthly_sales_trend.png' and then displayed.
plt.figure(figsize=(12,6))
# Resample with an explicit MonthEnd offset instead of the string alias 'M':
# pandas >= 2.2 deprecates 'M' in favour of 'ME', while older releases reject
# 'ME'. The offset object yields the same month-end bins on every version.
monthly_sales = (
    merged_df
    .set_index('TransactionDate')
    .resample(pd.offsets.MonthEnd())['TotalValue']
    .sum()
)
sns.lineplot(x=monthly_sales.index, y=monthly_sales.values, marker='o')
plt.title('Monthly Sales Trend', fontsize=14)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Total Sales (USD)', fontsize=12)
# Tilt the date tick labels so neighbouring months do not overlap.
plt.xticks(rotation=45)
plt.savefig('monthly_sales_trend.png', bbox_inches='tight')
plt.show()

# 4. Customer Spending vs Product Pricing
# Scatter of each merged row's TotalValue against its ProductPrice, coloured
# by Region; saved to 'value_vs_price.png' and then displayed.
fig, ax = plt.subplots(figsize=(10,6))

# Appearance options kept together: per-region colours, semi-transparent
# markers so overlapping points remain visible.
scatter_style = dict(hue='Region', palette='tab10', alpha=0.6)
sns.scatterplot(data=merged_df, x='ProductPrice', y='TotalValue', ax=ax, **scatter_style)

ax.set_title('Customer Spending vs Product Pricing', fontsize=14)
ax.set_xlabel('Product Price (USD)', fontsize=12)
ax.set_ylabel('Transaction Value (USD)', fontsize=12)
fig.savefig('value_vs_price.png', bbox_inches='tight')
plt.show()

# 5. Customer Transaction Frequency
# Horizontal bar chart of the 20 CustomerIDs that occur most often in
# merged_df; saved to 'transaction_frequency.png' and then displayed.
plt.figure(figsize=(10,6))
# value_counts() sorts by count descending, so head(20) is the top 20.
transaction_counts = merged_df['CustomerID'].value_counts().head(20)

# seaborn >= 0.13 deprecates passing `palette` without `hue` (removal is
# announced for 0.14). Mapping the bar category to `hue` as well keeps one
# rocket colour per bar; dodge=False keeps the bars full-width and centred
# on seaborn releases that would otherwise offset bars per hue level.
ax = sns.barplot(
    x=transaction_counts.values,
    y=transaction_counts.index,
    hue=transaction_counts.index,
    palette='rocket',
    dodge=False,
)
# Some seaborn versions draw a legend for the (redundant) hue mapping; the
# original chart had none, so remove it when present.
redundant_legend = ax.get_legend()
if redundant_legend is not None:
    redundant_legend.remove()

plt.title('Top 20 Customers by Transaction Frequency', fontsize=14)
plt.xlabel('Number of Transactions', fontsize=12)
plt.ylabel('Customer ID', fontsize=12)
# Save before show(): show() may leave no current figure to save afterwards.
plt.savefig('transaction_frequency.png', bbox_inches='tight')
plt.show()