[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colab-samples/blob/main/notebooks/basic_notebook_features/text_cells.ipynb)




In [None]:
# Install PyCaret from GitHub master branch (notebook shell command).
# `--upgrade` replaces any version that is already installed and `-q` keeps
# pip's output quiet.
# NOTE(review): `@master` is a moving target, so re-running this cell later may
# install a different PyCaret revision — consider pinning a tag or commit if
# reproducible runs matter.
!pip install git+https://github.com/pycaret/pycaret.git@master --upgrade -q

In [None]:
# Load the Mall Customer Segmentation dataset and reduce it to the numeric
# feature columns that the clustering cells operate on.
import pandas as pd
import warnings

# Silence every warning so library chatter does not clutter the cell output.
warnings.filterwarnings('ignore')

# Raw CSV hosted on GitHub; reading it requires network access.
url = 'https://raw.githubusercontent.com/SteffiPeTaffy/machineLearningAZ/master/Machine%20Learning%20A-Z%20Template%20Folder/Part%204%20-%20Clustering/Section%2025%20-%20Hierarchical%20Clustering/Mall_Customers.csv'
df = pd.read_csv(url)

# Show the raw column names before anything is removed.
print(f"Columns in dataset: {df.columns.tolist()}")

# CustomerID is presumably a row identifier rather than a feature, so it is
# removed; errors='ignore' makes the drop a no-op when the column is absent.
df = df.drop(columns=['CustomerID'], errors='ignore')

# Keep only numeric columns for clustering (non-numeric columns are discarded).
df = df.select_dtypes(include=['number'])

print(f"\n✅ Mall Customers Dataset loaded: {df.shape}")
print(f"Features for clustering: {list(df.columns)}")
print("\nDataset Preview:")
# Last expression of the cell: the notebook renders the first rows as a table.
df.head()

In [None]:
# Explore the data before clustering: summary statistics for every column of
# the `df` frame prepared in the loading cell.
print("📊 Dataset Statistics:")
print(df.describe())

# Report the real row and column counts rather than hard-coding them, so the
# message stays correct if the loaded columns ever change.
print(f"\n💡 We have {len(df)} customers with {df.shape[1]} features:")
print("   • Age: Customer age")
print("   • Annual Income (k$): Yearly income in thousands")
print("   • Spending Score (1-100): Shopping behavior score")

In [None]:
from pycaret.clustering import *

# Preprocessing switches handed to setup(): normalization, transformation and
# a PCA step limited to 2 components.
preprocessing_options = {
    'normalize': True,
    'transformation': True,
    'pca': True,
    'pca_components': 2,
}

# Initialize the clustering experiment on the prepared DataFrame with a fixed
# session_id (789) and verbose output switched off.
clust_exp = setup(data=df, session_id=789, verbose=False, **preprocessing_options)

In [None]:
# Display all available clustering algorithms.
print("🔍 Available Clustering Models:")
# Last expression of the cell, so the notebook renders whatever models() returns.
models()

In [None]:
# Create a K-Means clustering model with 5 clusters (one per customer segment).
print("🤖 Creating K-Means model with 5 customer segments...")
kmeans_model = create_model('kmeans', num_clusters=5)
print("✅ K-Means model created!")

In [None]:
# Check if GPU is available: asks nvidia-smi for the name of each GPU it can
# see, as CSV without the header row.
# NOTE(review): informational only — the setup() call in this notebook does not
# pass any GPU option, and the K-Means model has already been created by the
# time this cell runs.
!nvidia-smi --query-gpu=name --format=csv,noheader

In [None]:
# Assign cluster labels to customers: assign_model returns the data with a
# 'Cluster' column, which the cells below group and count on.
clustered_data = assign_model(kmeans_model)

# nunique() counts the distinct labels directly.
print(f"✅ Customers segmented into {clustered_data['Cluster'].nunique()} clusters")
print("\nCluster Distribution:")
# sort_index() orders the counts by cluster label instead of by frequency.
print(clustered_data['Cluster'].value_counts().sort_index())

print("\n📊 Sample of clustered customers:")
# Last expression of the cell: rendered as a table by the notebook.
clustered_data.head(10)

In [None]:
# Generate clustering visualizations for the fitted K-Means model.
print("📈 Generating cluster visualizations...")

# One plot_model call per plot type, in the same order as before.
for plot_kind in ('elbow', 'silhouette', 'cluster', 'distribution'):
    plot_model(kmeans_model, plot=plot_kind)

In [None]:
# Analyze each cluster's characteristics: per-cluster feature means and sizes.
print("🔍 Cluster Characteristics:")
clusters_grouped = clustered_data.groupby('Cluster')
# numeric_only=True keeps the mean well-defined even if a non-numeric column
# is present in the assigned data.
cluster_summary = clusters_grouped.mean(numeric_only=True)
print(cluster_summary)

# Number of customers per cluster, indexed by the same labels as the summary.
cluster_sizes = clusters_grouped.size()

print("\n💡 Cluster Insights:")
# groupby sorts its keys, so rows come out in the same order as sorting the
# unique labels. The means are read from cluster_summary rather than being
# recomputed per cluster.
for cluster, feature_means in cluster_summary.iterrows():
    # PyCaret's labels already read "Cluster 0", "Cluster 1", ...; only add the
    # prefix when it is missing so the output never says "Cluster Cluster 0".
    label = str(cluster)
    if not label.startswith('Cluster'):
        label = f"Cluster {label}"
    print(f"\n   {label}: {cluster_sizes[cluster]} customers")
    print(f"      Avg Age: {feature_means['Age']:.1f} years")
    print(f"      Avg Income: ${feature_means['Annual Income (k$)']:.1f}k")
    print(f"      Avg Spending Score: {feature_means['Spending Score (1-100)']:.1f}/100")

In [None]:
# Interactive dashboard for cluster evaluation of the fitted K-Means model
# (`kmeans_model` is created in an earlier cell with
# create_model('kmeans', num_clusters=5)).
evaluate_model(kmeans_model)

In [None]:
# Try another clustering algorithm - DBSCAN (default hyperparameters).
print("🧩 Creating DBSCAN model...")
dbscan_model = create_model('dbscan')

dbscan_clustered = assign_model(dbscan_model)

# DBSCAN labels points that belong to no cluster as noise (-1, which PyCaret
# surfaces as "Cluster -1"). Noise is not a cluster, so leave it out of the
# count. Comparing on the string form handles both label representations.
dbscan_labels = dbscan_clustered['Cluster'].astype(str)
is_noise = dbscan_labels.str.endswith('-1')
n_dbscan_clusters = dbscan_labels[~is_noise].nunique()
print(f"\nDBSCAN found {n_dbscan_clusters} clusters")
if is_noise.any():
    print(f"   ({int(is_noise.sum())} customers flagged as noise)")
plot_model(dbscan_model, plot='cluster')

In [None]:
# Save the K-Means model under `model_name`, then print a short recap.
model_name = 'customer_segmentation_kmeans'
save_model(kmeans_model, model_name)

print(f"✅ Model successfully saved as '{model_name}.pkl'")
print(f"📦 Model can be loaded using: loaded = load_model('{model_name}')")

# The sample count and cluster count are read from the objects built earlier
# in the notebook, so the recap cannot drift from what was actually run.
print("\n🎯 Clustering Summary:")
print(f"   • Dataset: Mall Customers ({len(df)} samples)")
print("   • Algorithm: K-Means")
print(f"   • Number of Clusters: {kmeans_model.n_clusters}")
print("   • Use Case: Customer segmentation for marketing")