# Loading and Using Local Datasets for Unsupervised Learning

This notebook demonstrates how to load three CSV datasets from the notebook's working directory and analyze them with K-means clustering and PCA.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Plot styling: apply the 'seaborn-v0_8' matplotlib style sheet to all figures.
FIGURE_STYLE = 'seaborn-v0_8'
plt.style.use(FIGURE_STYLE)

# Reproducibility: seed NumPy's global random state once, before any analysis runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Dataset 1: Customer Data Analysis

In [None]:
# Read the customer table; the path is relative to the current working directory.
customers = pd.read_csv('customer_data.csv')

# Preview the data: first rows, dimensions, then per-column descriptive statistics.
preview_rows = customers.head()
summary_stats = customers.describe()
print(
    "📊 Customer Dataset:",
    preview_rows,
    f"\nShape: {customers.shape}",
    "\nSummary Statistics:",
    summary_stats,
    sep="\n",
)

In [None]:
# Customer segmentation using K-means
features = ['annual_spending', 'visit_frequency', 'avg_order_value', 'days_since_last_purchase']
X_customers = customers[features]

# One colour per segment. The cluster count is derived from this list so the
# model, the scatter plots and the bar chart can never disagree; to try a
# different number of segments, add or remove colours here.
colors = ['red', 'blue', 'green', 'orange']
n_customer_segments = len(colors)
segment_ids = list(range(n_customer_segments))

# Standardize features so that no single column dominates the distance metric
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_customers)

# Apply K-means and store the cluster label of every customer in a new column
kmeans = KMeans(n_clusters=n_customer_segments, random_state=42, n_init=10)
customers['segment'] = kmeans.fit_predict(X_scaled)


def _scatter_segments(ax, frame, x_col, y_col, palette, label_col='segment'):
    """Draw one scatter series per segment on ``ax`` and add a legend.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    frame : DataFrame holding ``x_col``, ``y_col`` and ``label_col``.
    x_col, y_col : names of the columns plotted on the x and y axes.
    palette : sequence of colours; entry ``k`` is used for label ``k``.
    label_col : name of the column holding the integer segment labels.
    """
    for segment_id, color in enumerate(palette):
        # .loc with a boolean mask selects the rows in a single indexing step
        rows = frame.loc[frame[label_col] == segment_id]
        ax.scatter(rows[x_col], rows[y_col],
                   c=color, alpha=0.7, label=f'Segment {segment_id}')
    ax.legend()


# Visualize segments
fig, (ax_spend, ax_purchase, ax_counts) = plt.subplots(1, 3, figsize=(15, 5))

_scatter_segments(ax_spend, customers, 'annual_spending', 'visit_frequency', colors)
ax_spend.set_xlabel('Annual Spending')
ax_spend.set_ylabel('Visit Frequency')
ax_spend.set_title('Customer Segments')

_scatter_segments(ax_purchase, customers, 'avg_order_value', 'days_since_last_purchase', colors)
ax_purchase.set_xlabel('Average Order Value')
ax_purchase.set_ylabel('Days Since Last Purchase')
ax_purchase.set_title('Purchase Behavior')

# value_counts() omits labels that never occur; reindexing guarantees exactly
# one bar height per segment id (0 for an empty segment) so the heights always
# line up with the x positions and colours.
segment_counts = customers['segment'].value_counts().reindex(segment_ids, fill_value=0)
ax_counts.bar(segment_ids, segment_counts, color=colors)
ax_counts.set_xticks(segment_ids)
ax_counts.set_xlabel('Segment')
ax_counts.set_ylabel('Number of Customers')
ax_counts.set_title('Segment Distribution')

fig.tight_layout()
plt.show()

# Per-segment feature means, used to interpret what each cluster represents
print("\n🎯 Segment Analysis:")
print(customers.groupby('segment')[features].mean().round(2))

## Dataset 2: Marketing Campaign Analysis

In [None]:
# Load marketing data
marketing = pd.read_csv('marketing_data.csv')
print("📈 Marketing Dataset:")
print(marketing.head())
print(f"\nShape: {marketing.shape}")

# Encode categorical variables as integer codes on a copy, leaving the raw
# columns in `marketing` untouched.
# NOTE(review): pd.Categorical infers categories in sorted order and encodes
# missing values as -1. If 'age_group' / 'income_level' are ordinal, confirm the
# sorted order matches the intended ranking (or pass `categories=[...]` explicitly).
marketing_encoded = marketing.copy()
marketing_encoded['age_group_encoded'] = pd.Categorical(marketing['age_group']).codes
marketing_encoded['income_level_encoded'] = pd.Categorical(marketing['income_level']).codes

# Select numerical features for clustering
features_marketing = ['email_opens', 'click_rate', 'conversion_rate', 'spend_amount',
                     'age_group_encoded', 'income_level_encoded']
X_marketing = marketing_encoded[features_marketing]

# Standardize so that no single column dominates the distance metric
scaler_marketing = StandardScaler()
X_marketing_scaled = scaler_marketing.fit_transform(X_marketing)

# One colour per campaign segment. The cluster count is derived from this list
# so the model, the scatter plots and the bar chart always agree; to try a
# different number of segments, add or remove colours here.
colors = ['red', 'blue', 'green']
n_campaign_segments = len(colors)
campaign_segment_ids = list(range(n_campaign_segments))

kmeans_marketing = KMeans(n_clusters=n_campaign_segments, random_state=42, n_init=10)
marketing['campaign_segment'] = kmeans_marketing.fit_predict(X_marketing_scaled)

# Visualize
plt.figure(figsize=(12, 4))

# Each scatter panel is described once: (x column, y column, x label, y label, title)
scatter_panels = [
    ('click_rate', 'conversion_rate', 'Click Rate', 'Conversion Rate',
     'Campaign Performance Segments'),
    ('email_opens', 'spend_amount', 'Email Opens', 'Spend Amount',
     'Engagement vs Spending'),
]
for panel_number, (x_col, y_col, x_label, y_label, panel_title) in enumerate(scatter_panels, start=1):
    plt.subplot(1, 3, panel_number)
    for segment_id, segment_color in zip(campaign_segment_ids, colors):
        # .loc with a boolean mask selects the rows in a single indexing step
        segment_rows = marketing.loc[marketing['campaign_segment'] == segment_id]
        plt.scatter(segment_rows[x_col], segment_rows[y_col],
                    c=segment_color, alpha=0.7, label=f'Segment {segment_id}')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(panel_title)
    plt.legend()

plt.subplot(1, 3, 3)
# groupby() omits labels that never occur; reindexing guarantees exactly one
# value per segment id (NaN, i.e. no bar, for an empty segment) so the heights
# always line up with the x positions and colours.
segment_performance = (
    marketing.groupby('campaign_segment')['conversion_rate']
    .mean()
    .reindex(campaign_segment_ids)
)
plt.bar(campaign_segment_ids, segment_performance, color=colors)
plt.xticks(campaign_segment_ids)
plt.xlabel('Campaign Segment')
plt.ylabel('Average Conversion Rate')
plt.title('Segment Performance')

plt.tight_layout()
plt.show()

# Per-segment means of the engagement metrics, used to interpret each cluster
print("\n📊 Campaign Segment Analysis:")
print(marketing.groupby('campaign_segment')[['email_opens', 'click_rate', 'conversion_rate', 'spend_amount']].mean().round(3))

## Dataset 3: Product Sales Analysis with PCA

In [None]:
# Read the product table and show a short preview of it
products = pd.read_csv('product_sales.csv')
print("🛍️ Product Sales Dataset:", products.head(), f"\nShape: {products.shape}", sep="\n")

# Turn the category labels into integer codes so they can join the numeric features
# NOTE(review): the codes impose an arbitrary numeric order on what looks like a
# nominal column, and that order feeds into PCA below — confirm this is intended.
category_codes = pd.Categorical(products['category']).codes
products['category_encoded'] = category_codes

# Numeric feature matrix used for PCA
features_products = ['price', 'units_sold', 'revenue', 'rating', 'reviews_count', 'category_encoded']
X_products = products[features_products]

# Put every feature on a comparable scale before decomposing
scaler_products = StandardScaler()
X_products_scaled = scaler_products.fit_transform(X_products)

# Fit PCA keeping all components, and project the products onto them
pca = PCA()
X_pca = pca.fit_transform(X_products_scaled)

# Quantities shared by the three panels below
variance_ratio = pca.explained_variance_ratio_
cumsum = np.cumsum(variance_ratio)
component_numbers = range(1, len(variance_ratio) + 1)

fig, (ax_scatter, ax_variance, ax_cumulative) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: products projected on the first two components, one colour per category
categories = products['category'].unique()
colors = plt.cm.Set1(np.linspace(0, 1, len(categories)))
for category, category_color in zip(categories, colors):
    in_category = (products['category'] == category).to_numpy()
    ax_scatter.scatter(
        X_pca[in_category, 0],
        X_pca[in_category, 1],
        c=[category_color],
        alpha=0.7,
        label=category,
    )
ax_scatter.set_xlabel(f'PC1 ({variance_ratio[0]:.1%} variance)')
ax_scatter.set_ylabel(f'PC2 ({variance_ratio[1]:.1%} variance)')
ax_scatter.set_title('Products in PCA Space')
ax_scatter.legend()

# Panel 2: share of variance carried by each individual component
ax_variance.bar(component_numbers, variance_ratio)
ax_variance.set_xlabel('Principal Component')
ax_variance.set_ylabel('Variance Explained')
ax_variance.set_title('PCA Variance Explained')

# Panel 3: running total of explained variance, with a dashed reference line at 0.8
ax_cumulative.plot(component_numbers, cumsum, 'bo-')
ax_cumulative.axhline(y=0.8, color='red', linestyle='--', alpha=0.7)
ax_cumulative.set_xlabel('Number of Components')
ax_cumulative.set_ylabel('Cumulative Variance Explained')
ax_cumulative.set_title('Cumulative Variance')
ax_cumulative.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

print("\n🔍 PCA Results:")
print(f"• First 2 components explain {cumsum[1]:.1%} of variance")
print(f"• First 3 components explain {cumsum[2]:.1%} of variance")

# Group the products with K-means in the space of the first three components
leading_components = X_pca[:, :3]
kmeans_products = KMeans(n_clusters=3, random_state=42, n_init=10)
products['product_cluster'] = kmeans_products.fit_predict(leading_components)

# Average profile of each product cluster
cluster_profile = products.groupby('product_cluster')[['price', 'units_sold', 'revenue', 'rating']].mean()
print("\n📦 Product Cluster Analysis:")
print(cluster_profile.round(2))

## Summary: Working with Your Own Datasets

### 📁 Available Datasets:
- **customer_data.csv**: Customer behavior and spending patterns
- **marketing_data.csv**: Campaign performance metrics
- **product_sales.csv**: Product sales and ratings data

### 🔧 Key Steps for Any Dataset:
1. **Load data**: `pd.read_csv('filename.csv')`
2. **Explore**: Check shape, summary statistics, missing values
3. **Preprocess**: Handle categorical variables, standardize features
4. **Apply algorithms**: K-means for clustering, PCA for dimensionality reduction
5. **Visualize results**: Scatter plots, bar charts, variance plots
6. **Interpret**: Extract business insights from clusters/components

### 💡 Tips:
- Always standardize features before clustering
- Use the elbow method to choose the number of clusters (the `n_clusters` values in this notebook are hard-coded, not tuned)
- PCA helps visualize high-dimensional data
- Domain knowledge is crucial for interpreting results