In [1]:
# Cell 1: Import all required libraries
import pandas as pd
import numpy as np
import json
import zipfile
import os
from datetime import datetime

# Machine Learning
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score

# Persistence
import joblib

# Visualization (optional)
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

print("✓ All libraries imported successfully")
print("✓ User Clustering Agent initialized")


✓ All libraries imported successfully
✓ User Clustering Agent initialized


In [2]:
# Cell 2: Load data and perform initial exploration
def load_and_explore_data(file_path="user_data_vectors.xlsx", sheet_name="user_vectors"):
    """
    Read one sheet of an Excel workbook and print a short profile of it.

    The profile lists the frame's shape, its column names, the per-column
    null counts (only columns that have nulls) and the rounded output of
    ``describe()``.

    Parameters:
    file_path: path of the Excel workbook to read
    sheet_name: sheet inside the workbook to load

    Returns:
    The loaded DataFrame, or None if any step of loading/reporting raised
    (the error text is printed rather than propagated).
    """
    try:
        frame = pd.read_excel(file_path, sheet_name=sheet_name)

        # Size summary
        print("✓ Data loaded successfully")
        shape = frame.shape
        print(f"  - Shape: {shape}")
        print(f"  - Users: {shape[0]}")
        print(f"  - Features: {shape[1]}")

        # Column listing
        print("\n📊 Column Information:")
        print(f"  - Columns: {list(frame.columns)}")

        # Null report: keep only the columns that actually contain nulls
        null_counts = frame.isnull().sum()
        columns_with_nulls = null_counts[null_counts > 0]
        if columns_with_nulls.empty:
            print("\n✓ No missing values detected")
        else:
            print("\n⚠️  Missing values found:")
            print(columns_with_nulls)

        # Descriptive statistics
        print("\n📈 Basic Statistics:")
        print(frame.describe().round(3))

        return frame

    except Exception as exc:
        # Callers test the result against None instead of catching exceptions.
        print(f"❌ Error loading data: {exc!s}")
        return None

# Load the data
df_raw = load_and_explore_data()


✓ Data loaded successfully
  - Shape: (327, 13)
  - Users: 327
  - Features: 13

📊 Column Information:
  - Columns: ['User_ID', 'Age', 'Monthly_Income', 'Average_Rating', 'Average_Z_Score', 'Average_Cost', 'Gender', 'weather', 'Marital_Status', 'C_Type', 'purchase_sensitivity', 'Veg_Ratio', 'Non_Veg_Ratio']

✓ No missing values detected

📈 Basic Statistics:
       User_ID      Age  Monthly_Income  Average_Rating  Average_Z_Score  \
count  327.000  327.000         327.000         327.000          327.000   
mean   164.000   -0.086           0.041           4.006           -0.051   
std     94.541    0.777           0.949           0.413            0.664   
min      1.000   -2.501          -1.205           3.000           -2.151   
25%     82.500   -0.531          -0.731           3.780           -0.361   
50%    164.000   -0.039          -0.472           4.000            0.012   
75%    245.500    0.207           0.908           4.250            0.281   
max    327.000    4.392         

In [3]:
# Cell 3: Feature selection and column exclusion
def prepare_features(df):
    """
    Select the clustering features from the raw user table.

    Drops the identifier and the redundant columns, then keeps the expected
    numerical, categorical and behavioral columns that are actually present
    in ``df``. The input frame is not modified.

    Parameters:
    df: DataFrame holding the raw user vectors

    Returns:
    X: DataFrame with the selected feature columns
       (order: numerical, categorical, behavioral)
    available_features: list of the column names in X, in the same order
    numerical_features: numerical columns present in X
    categorical_features: categorical columns present in X
    behavioral_features: behavioral columns present in X

    The three group lists are filtered to the columns that exist, so their
    concatenation always equals ``available_features``. Expected columns
    that are absent are reported with a warning and skipped.
    """
    # Columns to drop as specified
    columns_to_drop = [
        'User_ID',                    # Always drop User_ID
        'Non_Veg_Ratio',             # Drop one from the collinear pair (keeping Veg_Ratio)
        'Average_Cost'               # Drop one from the correlated pair (keeping Average_Z_Score)
    ]

    print("🔄 Preparing features...")
    print(f"  - Dropping columns: {columns_to_drop}")

    # drop() returns a new frame, so the caller's data stays untouched
    df_processed = df.drop(columns=columns_to_drop, errors='ignore')
    present_columns = set(df_processed.columns)

    # Expected feature groups.
    # Numerical features (described by the notebook as already standardized)
    expected_numerical = ['Age', 'Monthly_Income', 'Average_Rating', 'Average_Z_Score', 'purchase_sensitivity']
    # Categorical features stored as integers
    # NOTE(review): the notebook's test user passes C_Type=5, which looks like
    # an integer code rather than a one-hot column — confirm the encoding.
    expected_categorical = ['Gender', 'weather', 'Marital_Status', 'C_Type']
    # Behavioral features
    expected_behavioral = ['Veg_Ratio']

    # Keep only the columns that exist so every returned list agrees with X
    numerical_features = [col for col in expected_numerical if col in present_columns]
    categorical_features = [col for col in expected_categorical if col in present_columns]
    behavioral_features = [col for col in expected_behavioral if col in present_columns]
    available_features = numerical_features + categorical_features + behavioral_features

    missing_features = [
        col
        for col in expected_numerical + expected_categorical + expected_behavioral
        if col not in present_columns
    ]
    if missing_features:
        print(f"  ⚠️  Expected features missing from data (skipped): {missing_features}")

    print(f"  - Numerical features: {numerical_features}")
    print(f"  - Categorical features: {categorical_features}")
    print(f"  - Behavioral features: {behavioral_features}")
    print(f"  - Total features for clustering: {len(available_features)}")

    # Final feature matrix
    X = df_processed[available_features]

    print("✓ Feature matrix prepared")
    print(f"  - Shape: {X.shape}")
    print(f"  - Features: {list(X.columns)}")

    return X, available_features, numerical_features, categorical_features, behavioral_features

# Prepare features
X, feature_names, num_features, cat_features, behav_features = prepare_features(df_raw)


🔄 Preparing features...
  - Dropping columns: ['User_ID', 'Non_Veg_Ratio', 'Average_Cost']
  - Numerical features: ['Age', 'Monthly_Income', 'Average_Rating', 'Average_Z_Score', 'purchase_sensitivity']
  - Categorical features: ['Gender', 'weather', 'Marital_Status', 'C_Type']
  - Behavioral features: ['Veg_Ratio']
  - Total features for clustering: 10
✓ Feature matrix prepared
  - Shape: (327, 10)
  - Features: ['Age', 'Monthly_Income', 'Average_Rating', 'Average_Z_Score', 'purchase_sensitivity', 'Gender', 'weather', 'Marital_Status', 'C_Type', 'Veg_Ratio']


In [4]:
# Cell 4: Create preprocessing pipeline
def create_preprocessing_pipeline(numerical_features, categorical_features, behavioral_features):
    """
    Build the ColumnTransformer used as the first step of the clustering pipeline.

    Every listed column is forwarded unchanged ('passthrough'); any column
    that is not listed is dropped. No scaling or encoding happens here.

    Parameters:
    numerical_features: list of numerical column names
    categorical_features: list of categorical column names
    behavioral_features: list of behavioral column names

    Returns:
    An unfitted ColumnTransformer selecting the three groups, in that order.
    """
    # One flat, ordered column list: numerical, then categorical, then behavioral
    selected_columns = [*numerical_features, *categorical_features, *behavioral_features]

    passthrough_step = ('passthrough', 'passthrough', selected_columns)
    column_selector = ColumnTransformer(transformers=[passthrough_step], remainder='drop')

    print("🔧 Preprocessing pipeline created")
    print("  - Strategy: Passthrough (features already preprocessed)")
    print(f"  - Processing {len(selected_columns)} features")

    return column_selector

# Create preprocessing pipeline
preprocessor = create_preprocessing_pipeline(num_features, cat_features, behav_features)


🔧 Preprocessing pipeline created
  - Strategy: Passthrough (features already preprocessed)
  - Processing 10 features


In [5]:
# Cell 5: Train K-means++ clustering model
def train_kmeans_model(X, preprocessor, n_clusters=4, random_state=42):
    """
    Fit a preprocessing + K-means++ pipeline and report basic quality metrics.

    Parameters:
    X: DataFrame of features to cluster
    preprocessor: transformer used as the first pipeline step
    n_clusters: number of clusters to fit
    random_state: seed passed to KMeans

    Returns:
    kmeans_pipeline: the fitted Pipeline ('preprocessor' -> 'kmeans')
    labels: cluster label of every row of X
    centroids: cluster centers in the transformed feature space
    inertia: KMeans inertia of the fitted model
    silhouette_avg: mean silhouette score, or NaN when the number of
        distinct labels is outside the range where the score is defined
    """
    # Imported locally under a private alias. The notebook's training cell
    # stores the returned score in a global that is also called
    # `silhouette_score`, which hides the sklearn function of the same name;
    # looking it up globally would make every later call of this function
    # (e.g. from the retraining utility) fail with "'float' object is not
    # callable".
    from sklearn.metrics import silhouette_score as _silhouette_score

    print("🚀 Training K-means++ clustering model...")

    # Create the full pipeline
    kmeans_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('kmeans', KMeans(
            n_clusters=n_clusters,
            init='k-means++',
            n_init=10,
            max_iter=300,
            random_state=random_state
        ))
    ])

    # Fit the model
    kmeans_pipeline.fit(X)

    fitted_kmeans = kmeans_pipeline.named_steps['kmeans']
    labels = fitted_kmeans.labels_
    centroids = fitted_kmeans.cluster_centers_
    inertia = fitted_kmeans.inertia_

    # Reuse the preprocessing step the pipeline has just fitted instead of
    # fitting the transformer a second time.
    X_transformed = kmeans_pipeline.named_steps['preprocessor'].transform(X)

    # The silhouette score needs between 2 and n_samples - 1 distinct labels.
    n_distinct = len(np.unique(labels))
    if 2 <= n_distinct <= len(labels) - 1:
        silhouette_avg = _silhouette_score(X_transformed, labels)
    else:
        silhouette_avg = float('nan')

    print("✓ Model training completed")
    print(f"  - Clusters: {n_clusters}")
    print(f"  - Inertia: {inertia:.3f}")
    print(f"  - Silhouette Score: {silhouette_avg:.3f}")

    # Cluster distribution
    unique, counts = np.unique(labels, return_counts=True)
    print(f"  - Cluster distribution: {dict(zip(unique, counts))}")

    return kmeans_pipeline, labels, centroids, inertia, silhouette_avg

# Train the model
# NOTE(review): the last name unpacked below is `silhouette_score`, the same
# name the import cell binds to sklearn.metrics.silhouette_score. Once this
# line has run, the global `silhouette_score` holds the returned score value,
# and later cells (validation, final summary) read that value under this name.
# Any code that calls `silhouette_score(...)` through the global name after
# this point will therefore not reach the sklearn function.
model_pipeline, cluster_labels, centroids, inertia_score, silhouette_score = train_kmeans_model(X, preprocessor)


🚀 Training K-means++ clustering model...
✓ Model training completed
  - Clusters: 4
  - Inertia: 1718.219
  - Silhouette Score: 0.279
  - Cluster distribution: {np.int32(0): np.int64(80), np.int32(1): np.int64(66), np.int32(2): np.int64(106), np.int32(3): np.int64(75)}


In [6]:
# Cell 6: Generate persona metadata
def generate_persona_metadata(X, cluster_labels, centroids, feature_names):
    """
    Generate persona metadata for each cluster

    One metadata dict is produced for every distinct value found in
    ``cluster_labels`` (in ascending order), so the result follows the actual
    number of clusters instead of assuming four.

    Parameters:
    X: DataFrame of the features that were clustered
    cluster_labels: cluster label for every row of X
    centroids: accepted for interface compatibility; not used here
    feature_names: feature columns whose per-cluster mean is reported

    Returns:
    list of dicts, one per cluster, with the keys 'cluster_id',
    'persona_name', 'cluster_size', 'percentage_of_total', one
    'avg_<feature>' per feature and one '<feature>_distribution' per
    categorical feature present in X. All numbers are plain Python
    int/float so the result can be serialized with json.
    """
    print("👥 Generating persona metadata...")

    # Define persona names (can be customized based on business context)
    # Clusters without an entry fall back to "Cluster_<id>".
    persona_names = {
        0: "Established Urban Professionals",
        1: "Premium Self-Employed Segment",
        2: "Young Urban Students",
        3: "Price-Sensitive Employees"
    }

    # Categorical features that get a value distribution per cluster
    categorical_features = ['Gender', 'weather', 'Marital_Status', 'C_Type']

    # Create DataFrame with features and labels
    df_with_labels = X.copy()
    df_with_labels['cluster_id'] = cluster_labels

    total_users = len(X)
    cluster_ids = sorted(int(label) for label in np.unique(df_with_labels['cluster_id']))

    metadata = []

    for cluster_id in cluster_ids:
        cluster_data = df_with_labels[df_with_labels['cluster_id'] == cluster_id]
        cluster_size = len(cluster_data)

        # Per-feature means; non-numeric columns are skipped
        cluster_means = cluster_data.drop('cluster_id', axis=1).mean(numeric_only=True)

        # Create metadata structure
        cluster_metadata = {
            'cluster_id': cluster_id,
            'persona_name': persona_names.get(cluster_id, f"Cluster_{cluster_id}"),
            'cluster_size': cluster_size,
            'percentage_of_total': round(cluster_size / total_users * 100, 2)
        }

        # Add feature averages
        for feature in feature_names:
            if feature in cluster_means.index:
                cluster_metadata[f'avg_{feature}'] = round(float(cluster_means[feature]), 4)

        # Add categorical distributions for specific features
        for cat_feature in categorical_features:
            if cat_feature in cluster_data.columns:
                value_counts = cluster_data[cat_feature].value_counts()
                cluster_metadata[f'{cat_feature}_distribution'] = {
                    str(value): round(float(count) / cluster_size, 4)
                    for value, count in value_counts.items()
                }

        metadata.append(cluster_metadata)

    print("✓ Persona metadata generated")
    for meta in metadata:
        # Report the real cluster id, not the position in the list
        print(f"  - Cluster {meta['cluster_id']}: {meta['persona_name']} ({meta['cluster_size']} users, {meta['percentage_of_total']}%)")

    return metadata

# Generate metadata
persona_metadata = generate_persona_metadata(X, cluster_labels, centroids, feature_names)


👥 Generating persona metadata...
✓ Persona metadata generated
  - Cluster 0: Established Urban Professionals (80 users, 24.46%)
  - Cluster 1: Premium Self-Employed Segment (66 users, 20.18%)
  - Cluster 2: Young Urban Students (106 users, 32.42%)
  - Cluster 3: Price-Sensitive Employees (75 users, 22.94%)


In [7]:
# Cell 7: Create output generation functions
def generate_cluster_labels_csv(cluster_labels):
    """
    Write the per-row cluster assignments to 'cluster_labels.csv'.

    Parameters:
    cluster_labels: sequence holding one cluster id per row

    Returns:
    DataFrame with the columns 'row_index' (0..n-1) and 'cluster_id';
    the same table is saved to disk without the DataFrame index.
    """
    n_rows = len(cluster_labels)
    columns = {
        'row_index': range(n_rows),
        'cluster_id': cluster_labels,
    }
    assignments = pd.DataFrame(columns)

    assignments.to_csv('cluster_labels.csv', index=False)
    print("✓ cluster_labels.csv generated")
    return assignments

def generate_cluster_metadata_json(metadata):
    """
    Persist the cluster metadata as JSON and as CSV.

    Parameters:
    metadata: list of per-cluster dicts (must be JSON-serializable)

    Returns:
    DataFrame built from ``metadata``; it is the table written to
    'cluster_metadata.csv'. The JSON copy goes to 'cluster_metadata.json'.
    """
    json_path = 'cluster_metadata.json'
    csv_path = 'cluster_metadata.csv'

    # JSON first: the raw list of dicts, indented for readability
    with open(json_path, 'w') as handle:
        json.dump(metadata, handle, indent=2)
    print("✓ cluster_metadata.json generated")

    # Then the tabular copy, one row per cluster
    metadata_table = pd.DataFrame(metadata)
    metadata_table.to_csv(csv_path, index=False)
    print("✓ cluster_metadata.csv generated")
    return metadata_table

def generate_cluster_preferences_csv(centroids, feature_names):
    """
    Write the cluster centers to 'cluster_preferences.csv'.

    Parameters:
    centroids: 2-D array-like, one row per cluster and one column per feature
    feature_names: column names for the centroid values

    Returns:
    DataFrame with a leading 'cluster_id' column (0..k-1) followed by one
    column per feature; the same table is saved without the index.
    """
    n_clusters = len(centroids)

    centroid_table = pd.DataFrame(centroids, columns=feature_names)
    # The row position doubles as the cluster id
    centroid_table.insert(loc=0, column='cluster_id', value=range(n_clusters))

    centroid_table.to_csv('cluster_preferences.csv', index=False)
    print("✓ cluster_preferences.csv generated")
    return centroid_table

def create_download_package():
    """
    Bundle the generated output files into a timestamped ZIP archive.

    The archive is created in the current working directory. Only the output
    files that exist at call time are added; missing ones are skipped.

    Returns:
    Name of the ZIP file that was written.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    zip_filename = f'user_clustering_outputs_{timestamp}.zip'

    candidate_files = (
        'cluster_labels.csv',
        'cluster_metadata.json',
        'cluster_metadata.csv',
        'cluster_preferences.csv',
    )

    with zipfile.ZipFile(zip_filename, 'w') as archive:
        # filter() is lazy, so existence is checked as each file is reached
        for path in filter(os.path.exists, candidate_files):
            archive.write(path)

    print(f"✓ Download package created: {zip_filename}")
    return zip_filename

# Generate all outputs
labels_df = generate_cluster_labels_csv(cluster_labels)
metadata_df = generate_cluster_metadata_json(persona_metadata)
preferences_df = generate_cluster_preferences_csv(centroids, feature_names)
zip_file = create_download_package()


✓ cluster_labels.csv generated
✓ cluster_metadata.json generated
✓ cluster_metadata.csv generated
✓ cluster_preferences.csv generated
✓ Download package created: user_clustering_outputs_20250714_161308.zip


In [8]:
# Cell 8: Save model for future inference
def save_model_pipeline(pipeline, feature_names, filename='user_clustering_agent.pkl'):
    """
    Save the trained pipeline for future inference

    The saved package is a dict with the keys 'pipeline', 'feature_names',
    'training_date' (ISO timestamp of the save) and 'model_info'.

    Parameters:
    pipeline: fitted pipeline to persist
    feature_names: ordered feature names the pipeline was trained on
    filename: destination path of the joblib file

    Returns:
    The filename that was written.
    """
    # Describe the model that is actually being saved. The settings are read
    # from the pipeline's 'kmeans' step when it exists; the previous fixed
    # values (4 clusters, seed 42, k-means++) remain as fallbacks.
    steps = getattr(pipeline, 'named_steps', None) or {}
    kmeans_step = steps.get('kmeans') if hasattr(steps, 'get') else None

    init_method = getattr(kmeans_step, 'init', 'k-means++')
    if not isinstance(init_method, str):
        # init may also be an array or a callable; keep the label readable
        init_method = 'k-means++'

    model_package = {
        'pipeline': pipeline,
        'feature_names': feature_names,
        'training_date': datetime.now().isoformat(),
        'model_info': {
            'n_clusters': getattr(kmeans_step, 'n_clusters', 4),
            'algorithm': init_method,
            'random_state': getattr(kmeans_step, 'random_state', 42)
        }
    }

    joblib.dump(model_package, filename)
    print(f"✓ Model saved to {filename}")
    return filename

# Save the model
model_file = save_model_pipeline(model_pipeline, feature_names)


✓ Model saved to user_clustering_agent.pkl


In [9]:
# Cell 9: Create inference function for new users
def load_model_and_predict(new_user_data, model_path='user_clustering_agent.pkl'):
    """
    Load saved model and predict cluster for new user

    Parameters:
    new_user_data: dict with keys matching the feature names
    model_path: path to saved model file

    Returns:
    cluster_id: int, assigned cluster
    distances: array, distances to all centroids
    confidence: float, 1 / (1 + distance to the nearest centroid)

    On any error the message is printed and (None, None, None) is returned.

    Features missing from ``new_user_data`` are filled with 0.0 and keys that
    are not model features are ignored; both cases are reported with a
    warning so that a mistyped key does not pass unnoticed.
    """
    try:
        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code; only load model files from a trusted source.
        model_package = joblib.load(model_path)
        pipeline = model_package['pipeline']
        feature_names = model_package['feature_names']

        # Convert input to DataFrame
        user_df = pd.DataFrame([new_user_data])

        # Ensure all required features are present
        missing_features = [f for f in feature_names if f not in user_df.columns]
        if missing_features:
            print(f"⚠️  Missing features defaulted to 0.0: {missing_features}")
        for feature in missing_features:
            user_df[feature] = 0.0  # Default value for missing features

        unused_keys = [col for col in user_df.columns if col not in feature_names]
        if unused_keys:
            print(f"⚠️  Ignoring keys that are not model features: {unused_keys}")

        # Reorder columns to match training
        user_df = user_df[feature_names]

        # Predict cluster
        cluster_id = pipeline.predict(user_df)[0]

        # Get distances to all centroids
        transformed = pipeline.named_steps['preprocessor'].transform(user_df)
        distances = pipeline.named_steps['kmeans'].transform(transformed)[0]

        # Confidence shrinks from 1 towards 0 as the nearest centroid gets
        # farther away.
        confidence = float(1 / (1 + distances.min()))

        return int(cluster_id), distances, confidence

    except Exception as e:
        print(f"❌ Error in prediction: {str(e)}")
        return None, None, None

# Test the inference function
test_user = {
    'Age': 0.5,
    'Monthly_Income': 0.2,
    'Average_Rating': 4.2,
    'Average_Z_Score': 0.1,
    'purchase_sensitivity': 2,
    'Gender': 1,
    'weather': 1,
    'Marital_Status': 0,
    'C_Type': 5,
    'Veg_Ratio': 0.8
}

predicted_cluster, distances, confidence = load_model_and_predict(test_user)
print(f"🔮 Test Prediction:")
if predicted_cluster is None:
    # load_model_and_predict returns (None, None, None) after printing its
    # error, so there is nothing to round or format here.
    print("  - Prediction unavailable (see error above)")
else:
    print(f"  - Cluster: {predicted_cluster}")
    print(f"  - Distances: {distances.round(3)}")
    print(f"  - Confidence: {confidence:.3f}")


🔮 Test Prediction:
  - Cluster: 2
  - Distances: [3.463 8.533 1.205 5.02 ]
  - Confidence: 0.454


In [10]:
# Cell 10: Validation and quality checks
def validate_clustering_results(X, labels, centroids, silhouette_avg):
    """
    Perform validation and quality checks

    Checks performed:
    1. none of the excluded columns is present in X
    2. the silhouette score is greater than 0
    3. the smallest cluster has more than 10% of the largest cluster's size
    The per-centroid mean/std printout is informational only.

    Parameters:
    X: DataFrame of the features that were clustered
    labels: cluster label per row
    centroids: iterable of centroid arrays
    silhouette_avg: silhouette score of the clustering

    Returns:
    True when all three checks pass, False otherwise.
    """
    min_balance_ratio = 0.1  # smallest / largest cluster size must exceed this

    print("🔍 Validation and Quality Checks:")

    # Check 1: Verify excluded columns are not in outputs
    excluded_columns = ['User_ID', 'Non_Veg_Ratio', 'Average_Cost']
    present_features = list(X.columns)
    leaked_columns = [col for col in excluded_columns if col in present_features]
    columns_ok = not leaked_columns
    print(f"  ✓ Excluded columns verification:")
    print(f"    - Excluded: {excluded_columns}")
    print(f"    - Present features: {present_features}")
    if columns_ok:
        print(f"    - Status: ✓ No excluded column present")
    else:
        print(f"    - Status: ⚠️ Excluded columns present: {leaked_columns}")

    # Check 2: Silhouette score validation (a NaN score fails the comparison)
    silhouette_ok = bool(silhouette_avg > 0)
    print(f"  ✓ Silhouette score: {silhouette_avg:.3f}")
    if silhouette_ok:
        print(f"    - Status: ✓ Good (> 0)")
    else:
        print(f"    - Status: ⚠️ Needs improvement")

    # Check 3: Cluster balance
    unique, counts = np.unique(labels, return_counts=True)
    cluster_balance = dict(zip(unique, counts))
    print(f"  ✓ Cluster balance: {cluster_balance}")

    if counts.size == 0:
        # No labels at all: min()/max() would raise, treat as a failed check
        balance_ok = False
        print(f"    - Status: ⚠️ No cluster labels provided")
    else:
        balance_ratio = counts.min() / counts.max()
        balance_ok = bool(balance_ratio > min_balance_ratio)
        if balance_ok:
            print(f"    - Status: ✓ Well balanced (ratio: {balance_ratio:.3f})")
        else:
            print(f"    - Status: ⚠️ Imbalanced (ratio: {balance_ratio:.3f})")

    # Check 4: Centroid interpretability (printed for inspection only)
    print(f"  ✓ Centroid analysis:")
    for i, centroid in enumerate(centroids):
        print(f"    - Cluster {i}: mean={centroid.mean():.3f}, std={centroid.std():.3f}")

    return columns_ok and silhouette_ok and balance_ok

# Run validation
validation_passed = validate_clustering_results(X, cluster_labels, centroids, silhouette_score)


🔍 Validation and Quality Checks:
  ✓ Excluded columns verification:
    - Excluded: ['User_ID', 'Non_Veg_Ratio', 'Average_Cost']
    - Present features: ['Age', 'Monthly_Income', 'Average_Rating', 'Average_Z_Score', 'purchase_sensitivity', 'Gender', 'weather', 'Marital_Status', 'C_Type', 'Veg_Ratio']
  ✓ Silhouette score: 0.279
    - Status: ✓ Good (> 0)
  ✓ Cluster balance: {np.int32(0): np.int64(80), np.int32(1): np.int64(66), np.int32(2): np.int64(106), np.int32(3): np.int64(75)}
    - Status: ✓ Well balanced (ratio: 0.623)
  ✓ Centroid analysis:
    - Cluster 0: mean=1.134, std=1.256
    - Cluster 1: mean=2.275, std=3.942
    - Cluster 2: mean=1.463, std=1.820
    - Cluster 3: mean=1.902, std=2.941


In [11]:
# Cell 11: Retraining utility function
def retrain_clustering_agent(file_path="user_data_vectors.xlsx",
                           sheet_name="user_vectors",
                           n_clusters=4,
                           random_state=42):
    """
    Run the whole workflow again: load, prepare, train, export, save, package.

    Parameters:
    file_path: Excel workbook to load
    sheet_name: sheet inside the workbook
    n_clusters: number of clusters passed to the training step
    random_state: seed passed to the training step

    Returns:
    False when the data could not be loaded, True once every step has run.
    """
    print("🔄 Retraining User Clustering Agent...")

    # Step 1: load; the loader reports its own error and hands back None
    fresh_frame = load_and_explore_data(file_path, sheet_name)
    if fresh_frame is None:
        return False

    # Step 2: feature matrix plus the three feature groups
    features, names, numeric_cols, categorical_cols, behavioral_cols = prepare_features(fresh_frame)

    # Step 3: new preprocessing step and model fit
    selector = create_preprocessing_pipeline(numeric_cols, categorical_cols, behavioral_cols)
    fitted_pipeline, assignments, centers, _inertia, _silhouette = train_kmeans_model(
        features, selector, n_clusters, random_state
    )

    # Step 4: persona metadata and the exported files
    personas = generate_persona_metadata(features, assignments, centers, names)
    generate_cluster_labels_csv(assignments)
    generate_cluster_metadata_json(personas)
    generate_cluster_preferences_csv(centers, names)

    # Step 5: persist the model, then bundle the outputs
    save_model_pipeline(fitted_pipeline, names)
    create_download_package()

    print("✓ Retraining completed successfully")
    return True

# The retraining function is now available for use
print("🔧 Retraining utility function created")
print("  - Usage: retrain_clustering_agent()")
print("  - Will reload data, retrain model, and generate new outputs")


🔧 Retraining utility function created
  - Usage: retrain_clustering_agent()
  - Will reload data, retrain model, and generate new outputs


In [12]:
# Cell 12: Display final results and summary
def display_final_summary():
    """
    Display comprehensive summary of the clustering results

    Reads the notebook-level variables ``cluster_labels``, ``feature_names``,
    ``inertia_score``, ``silhouette_score`` (the score value produced by the
    training cell), ``persona_metadata`` and ``zip_file``; it takes no
    arguments and returns nothing.
    """
    banner = "=" * 60

    print(banner)
    print("🎯 USER CLUSTERING AGENT - FINAL RESULTS")
    print(banner)

    # Count the clusters that were actually produced instead of assuming 4
    n_clusters = len(np.unique(cluster_labels))

    print(f"\n📊 DATASET SUMMARY:")
    print(f"  - Total users clustered: {len(cluster_labels)}")
    print(f"  - Features used: {len(feature_names)}")
    print(f"  - Clusters created: {n_clusters}")

    print(f"\n🎯 MODEL PERFORMANCE:")
    print(f"  - Inertia: {inertia_score:.3f}")
    print(f"  - Silhouette Score: {silhouette_score:.3f}")
    print(f"  - Algorithm: K-means++")

    print(f"\n👥 CLUSTER DISTRIBUTION:")
    for i, meta in enumerate(persona_metadata):
        # Prefer the id stored in the metadata; fall back to list position
        print(f"  - Cluster {meta.get('cluster_id', i)}: {meta['persona_name']}")
        print(f"    └─ Users: {meta['cluster_size']} ({meta['percentage_of_total']}%)")

    print(f"\n📁 GENERATED FILES:")
    files = [
        'cluster_labels.csv',
        'cluster_metadata.json',
        'cluster_metadata.csv',
        'cluster_preferences.csv',
        'user_clustering_agent.pkl',
        zip_file
    ]

    # Only files that exist on disk are listed, with their size
    for file in files:
        if os.path.exists(file):
            size = os.path.getsize(file)
            print(f"  ✓ {file} ({size} bytes)")

    print(f"\n🚀 READY FOR DEPLOYMENT:")
    print(f"  - Model saved and ready for inference")
    print(f"  - Download package created: {zip_file}")
    print(f"  - Retraining function available")

    print("\n" + banner)
    print("✅ USER CLUSTERING AGENT IMPLEMENTATION COMPLETE")
    print(banner)

# Display final summary
display_final_summary()


🎯 USER CLUSTERING AGENT - FINAL RESULTS

📊 DATASET SUMMARY:
  - Total users clustered: 327
  - Features used: 10
  - Clusters created: 4

🎯 MODEL PERFORMANCE:
  - Inertia: 1718.219
  - Silhouette Score: 0.279
  - Algorithm: K-means++

👥 CLUSTER DISTRIBUTION:
  - Cluster 0: Established Urban Professionals
    └─ Users: 80 (24.46%)
  - Cluster 1: Premium Self-Employed Segment
    └─ Users: 66 (20.18%)
  - Cluster 2: Young Urban Students
    └─ Users: 106 (32.42%)
  - Cluster 3: Price-Sensitive Employees
    └─ Users: 75 (22.94%)

📁 GENERATED FILES:
  ✓ cluster_labels.csv (1873 bytes)
  ✓ cluster_metadata.json (3195 bytes)
  ✓ cluster_metadata.csv (1337 bytes)
  ✓ cluster_preferences.csv (859 bytes)
  ✓ user_clustering_agent.pkl (4306 bytes)
  ✓ user_clustering_outputs_20250714_161308.zip (7754 bytes)

🚀 READY FOR DEPLOYMENT:
  - Model saved and ready for inference
  - Download package created: user_clustering_outputs_20250714_161308.zip
  - Retraining function available

✅ USER CLUSTERIN