<a href="https://www.kaggle.com/code/sarraverse/03-clustering?scriptVersionId=284146635" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# CLUSTERING MODELS (3 MODELS)


In [None]:
import pandas as pd
import ast
from collections import Counter
import numpy as np
import time
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, MiniBatchKMeans
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction import FeatureHasher
from scipy.sparse import vstack
import joblib

# Load the cleaned job postings CSV from the Kaggle input directory and report its size.
print("Loading cleaned dataset...")
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/job-skills/all_jobs_mapped.csv",
)
print(f"Dataset shape: {df.shape}")

# Ensure skill_list is properly formatted as list
def to_skill_list(x):
    """Coerce one raw ``skill_list`` cell into a list of skill strings.

    Args:
        x: The cell value. Handled cases, in order:
            * an existing ``list`` is returned unchanged (same object);
            * a missing value (``None``/NaN) yields ``[]``;
            * a string holding a Python list literal (e.g. ``"['a', 'b']"``)
              is parsed, and each item is stringified, stripped and lower-cased;
            * anything else is stringified and split on commas.

    Returns:
        A list of skills. Except for the pass-through case, entries are
        stripped, lower-cased and never empty.
    """
    if isinstance(x, list):
        return x
    if pd.isna(x):
        return []
    try:
        val = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises on malformed input; anything
        # else (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        val = None
    if isinstance(val, list):
        cleaned = (str(s).strip().lower() for s in val)
        # Drop blank entries so this path agrees with the comma-split fallback.
        return [s for s in cleaned if s]
    return [s.strip().lower() for s in str(x).split(",") if s.strip()]

# Parse every skill_list cell into a Python list of skill strings.
df["skill_list"] = df["skill_list"].map(to_skill_list)
# Missing categories become empty strings so the whole column is str-typed.
df["skill_categories"] = df["skill_categories"].fillna("").astype(str)

# Function to build feature matrix
def build_minibatch_feature_matrix(df, chunk_size=50000, min_freq=50, n_features=1000):
    """Build a hashed sparse skill-feature matrix covering every row of ``df``.

    Skills that occur fewer than ``min_freq`` times across the whole dataset are
    dropped. The remaining skills of each row are hashed into ``n_features``
    columns with ``FeatureHasher``, so distinct skills may share a column.

    Args:
        df: DataFrame with a ``skill_list`` column whose cells are iterables of
            skill strings.
        chunk_size: Number of rows processed per batch; must be positive.
        min_freq: Minimum dataset-wide occurrence count for a skill to be kept.
        n_features: Number of hash buckets (columns of the result).

    Returns:
        ``(X_sparse, skill_vocab, hasher)`` where ``X_sparse`` has one row per
        row of ``df`` (an empty ``(0, n_features)`` matrix if ``df`` is empty),
        ``skill_vocab`` is the sorted list of kept skills, and ``hasher`` is the
        ``FeatureHasher`` used for the transformation.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    print("üîç Building global skill vocabulary from ALL data...")

    # Step 1: count every skill across the dataset, one chunk at a time.
    all_skills = Counter()
    for i in range(0, len(df), chunk_size):
        chunk = df.iloc[i:i+chunk_size]
        for skills in chunk["skill_list"]:
            all_skills.update(skills)
        if i % 100000 == 0:
            print(f"  Processed {i:,} jobs...")

    # Keep only skills that are frequent enough.
    keep_skills = {k for k, v in all_skills.items() if v >= min_freq}
    skill_vocab = sorted(keep_skills)

    print(f"‚úÖ Global vocabulary: {len(skill_vocab)} skills (appearing ‚â• {min_freq} times)")

    # Step 2: hash the kept skills of each row into a fixed-width sparse row.
    print("üîÑ Transforming data with FeatureHasher...")

    def skills_to_dict(skills_list):
        # Binary indicator per kept skill; duplicates within a row collapse to 1.
        return {skill: 1 for skill in skills_list if skill in keep_skills}

    hasher = FeatureHasher(n_features=n_features, input_type='dict', alternate_sign=False)

    # Transform chunk by chunk so only one chunk of dicts is alive at a time.
    X_chunks = []
    for i in range(0, len(df), chunk_size):
        chunk = df.iloc[i:i+chunk_size]
        chunk_dicts = [skills_to_dict(skills) for skills in chunk["skill_list"]]
        X_chunks.append(hasher.transform(chunk_dicts))

    if X_chunks:
        X_sparse = vstack(X_chunks)
    else:
        # vstack() rejects an empty list, so an empty dataset gets an explicit
        # zero-row matrix with the expected width instead of an opaque error.
        from scipy.sparse import csr_matrix
        X_sparse = csr_matrix((0, n_features), dtype=float)

    # Count the index arrays too; the values array alone understates the footprint.
    sparse_nbytes = X_sparse.data.nbytes + X_sparse.indices.nbytes + X_sparse.indptr.nbytes

    print(f"‚úÖ Feature matrix built: {X_sparse.shape}")
    print(f"üíæ Memory usage: {sparse_nbytes / (1024**2):.2f} MB")

    return X_sparse, skill_vocab, hasher

# Hash the skills of the entire dataset into one sparse feature matrix.
X_sparse, skill_vocab, hasher = build_minibatch_feature_matrix(
    df,
    chunk_size=50000,
    min_freq=50,
    n_features=1000,
)

# Some models need dense input, so densify only the leading rows (capped at 20,000).
sample_size = min(X_sparse.shape[0], 20000)
X_dense_sample = X_sparse[:sample_size].toarray()
print(f"Created dense sample: {X_dense_sample.shape} for models requiring dense matrices")

# Model C1: K-Means
print("\n=== Model C1: K-Means Clustering ===")
k_values = [5, 8, 10]
# Holds k -> cluster labels and "silhouette_{k}" -> score for every k tried.
results_kmeans = {}

# KMeans works with sparse matrices, so use larger sample.
# The subset does not depend on k, so slice it once instead of on every iteration.
kmeans_sample_size = min(50000, X_sparse.shape[0])
X_kmeans = X_sparse[:kmeans_sample_size]

for k in k_values:
    print(f"Running KMeans with k={k}...")
    km = KMeans(n_clusters=k, random_state=42, n_init=3)

    labels = km.fit_predict(X_kmeans)
    results_kmeans[k] = labels

    # Evaluate with silhouette score (computed on the same subset that was clustered)
    score = silhouette_score(X_kmeans, labels)
    results_kmeans[f"silhouette_{k}"] = score
    print(f"KMeans k={k}: silhouette={score:.3f}")

# Find best K for KMeans (highest silhouette; the first k wins a tie)
best_k_kmeans = max(k_values, key=lambda k: results_kmeans[f"silhouette_{k}"])
print(f"\nBest KMeans: k={best_k_kmeans} (silhouette={results_kmeans[f'silhouette_{best_k_kmeans}']:.3f})")

# Model C2: DBSCAN
print("\n=== Model C2: DBSCAN Clustering ===")
# DBSCAN works with sparse matrices
dbscan_sample_size = min(20000, X_sparse.shape[0])
X_dbscan = X_sparse[:dbscan_sample_size]

print(f"DBSCAN running on {X_dbscan.shape[0]} samples")

eps_values = [0.5, 0.7]
min_samples_values = [10, 15]

# Keyed by "eps{eps}_min{ms}"; each value holds the labels and summary statistics.
results_dbscan = {}

for eps in eps_values:
    for ms in min_samples_values:
        print(f"Testing DBSCAN eps={eps}, min_samples={ms}...")
        db = DBSCAN(eps=eps, min_samples=ms, metric='euclidean', n_jobs=-1)
        labels = db.fit_predict(X_dbscan)

        # DBSCAN marks noise with -1; every other label is a real cluster id.
        mask = labels != -1
        n_clustered = int(mask.sum())
        n_clusters = len(set(labels[mask]))
        n_noise = (labels == -1).sum()
        noise_percentage = (n_noise / len(labels)) * 100

        # Store results
        key = f"eps{eps}_min{ms}"
        results_dbscan[key] = {
            'labels': labels,
            'n_clusters': n_clusters,
            'n_noise': n_noise,
            'noise_percentage': noise_percentage
        }

        print(f"  Clusters: {n_clusters}, Noise: {n_noise} ({noise_percentage:.1f}%)")

        # Silhouette is scored on non-noise points only. It needs at least two
        # clusters and fewer clusters than scored points, so the bound is checked
        # against the non-noise count rather than the full sample size.
        if 1 < n_clusters < n_clustered:
            score = silhouette_score(X_dbscan[mask], labels[mask])
            results_dbscan[key]['silhouette'] = score
            print(f"  Silhouette: {score:.3f}")

# Model C3: Agglomerative Clustering
print("\n=== Model C3: Agglomerative Clustering ===")
k_values = [5, 8, 10]
# Holds k -> cluster labels and "silhouette_{k}" -> score for every k tried.
results_agglo = {}

for k in k_values:
    print(f"Running Agglomerative with k={k}...")

    # Fit on the dense sample, since this model is not given the sparse matrix.
    ac = AgglomerativeClustering(n_clusters=k)
    labels = ac.fit_predict(X_dense_sample)
    score = silhouette_score(X_dense_sample, labels)

    results_agglo.update({k: labels, f"silhouette_{k}": score})
    print(f"Agglomerative k={k}: silhouette={score:.3f}")

# Pick the k with the highest silhouette (the first k wins a tie).
best_k_agglo = max(
    k_values,
    key=lambda k: results_agglo[f"silhouette_{k}"],
)
print(f"\nBest Agglomerative: k={best_k_agglo} (silhouette={results_agglo[f'silhouette_{best_k_agglo}']:.3f})")

# Compare all models: print the section banner in a single call.
print("\n" + "="*50, "CLUSTERING MODEL COMPARISON", "="*50, sep="\n")

def compare_all_models(results_kmeans, results_dbscan, results_agglo, k_values, use_dbscan=True):
    """Print every model's silhouette score and pick the highest-scoring one.

    Args:
        results_kmeans: Dict containing a ``"silhouette_{k}"`` entry for each k
            in ``k_values``.
        results_dbscan: Dict mapping a configuration key to a result dict with
            ``n_clusters`` and ``noise_percentage``; configurations without a
            ``silhouette`` entry are ignored.
        results_agglo: Same layout as ``results_kmeans``.
        k_values: Cluster counts evaluated for KMeans and Agglomerative.
        use_dbscan: Whether DBSCAN results take part in the comparison.

    Returns:
        ``(best_model, best_config, best_score)``. ``best_model`` is one of
        ``"KMeans"``, ``"Agglomerative"`` or ``"DBSCAN"``; ``best_config`` is the
        winning k or DBSCAN key. On ties the model seen first wins (KMeans, then
        Agglomerative, then DBSCAN). If nothing could be scored the result is
        ``(None, None, -1)``.

    Raises:
        KeyError: If a ``"silhouette_{k}"`` entry is missing for some k.
    """
    # Start below every attainable silhouette (range [-1, 1]) so a model scoring
    # exactly -1 can still be selected. NaN never compares greater, so it is skipped.
    best_model, best_config, best_score = None, None, float("-inf")

    # KMeans and Agglomerative share one result layout, so score them in one loop.
    k_based_models = (
        ("KMeans", "KMeans Results:", results_kmeans),
        ("Agglomerative", "\nAgglomerative Results:", results_agglo),
    )
    for model_name, heading, results in k_based_models:
        print(heading)
        for k in k_values:
            score = results[f"silhouette_{k}"]
            print(f"  k={k}: silhouette={score:.3f}")
            if score > best_score:
                best_model, best_config, best_score = model_name, k, score

    # Compare DBSCAN models
    if use_dbscan:
        print("\nDBSCAN Results:")
        for key, result in results_dbscan.items():
            if 'silhouette' not in result:
                continue
            score = result['silhouette']
            print(f"  {key}: silhouette={score:.3f} (clusters: {result['n_clusters']}, noise: {result['noise_percentage']:.1f}%)")
            if score > best_score:
                best_model, best_config, best_score = "DBSCAN", key, score

    if best_model is None:
        # Keep the historical sentinel for callers when no model was scored.
        best_score = -1

    print("="*50)
    print(f"üèÜ BEST MODEL: {best_model}")
    print(f"üìä Best Configuration: {best_config}")
    print(f"üéØ Best Silhouette Score: {best_score:.3f}")

    return best_model, best_config, best_score

# Compare models
best_model, best_config, best_score = compare_all_models(
    results_kmeans=results_kmeans,
    results_dbscan=results_dbscan,
    results_agglo=results_agglo,
    k_values=k_values,
    use_dbscan=True,
)

# Apply Best Model to Full Dataset
print("\n" + "="*50, "Applying Best Model to Full Dataset...", "="*50, sep="\n")

# Timestamp taken before fitting; the K-Means branch reports elapsed minutes from it.
start_time = time.time()

# Apply the winning configuration and write the labelled rows to CSV.
# K-Means is refit on the full matrix with MiniBatchKMeans; the other two models
# stay on the sample they were evaluated on.
if best_model == "KMeans":
    n_clusters = best_config
    batch_size = 100_000

    print(f"Applying MiniBatch K-Means with k={n_clusters} on full dataset...")
    mbk = MiniBatchKMeans(
        n_clusters=n_clusters,
        batch_size=batch_size,
        random_state=42,
        n_init=10
    )

    full_labels = mbk.fit_predict(X_sparse)

    # Save results
    df_full = df.copy()
    df_full['cluster'] = full_labels

    end_time = time.time()
    print(f"‚úÖ MiniBatch K-Means completed in {(end_time-start_time)/60:.1f} minutes")
    print(f"üìä Total points: {len(full_labels):,}")
    print(f"  Clusters: {n_clusters}")

    df_full.to_csv("all_jobs_clustered_full_kmeans.csv", index=False)
    print("üíæ Saved: all_jobs_clustered_full_kmeans.csv")

    # Save model
    joblib.dump({
        'best_model': 'MiniBatchKMeans',
        'parameters': {
            'n_clusters': n_clusters,
            'batch_size': batch_size
        },
        'feature_hasher': hasher,
        # Rows were filtered to this vocabulary before hashing, so it must be
        # saved with the hasher to featurize new data the same way.
        'skill_vocab': skill_vocab,
        'trained_model': mbk,
    }, "clustering_results_kmeans.pkl")

elif best_model == "Agglomerative":
    print("‚ö†Ô∏è Note: Agglomerative clustering on full dataset is memory-intensive.")
    print("Applying to sample dataset instead...")

    n_clusters = best_config
    # The grid search already fitted this exact configuration on X_dense_sample
    # and kept its labels, so reuse them instead of repeating the fit.
    labels_sample = results_agglo[n_clusters]

    # Save results for sample
    df_sample = df.iloc[:sample_size].copy()
    df_sample['cluster'] = labels_sample

    df_sample.to_csv("all_jobs_clustered_sample_agglo.csv", index=False)
    print("üíæ Saved: all_jobs_clustered_sample_agglo.csv")

elif best_model == "DBSCAN":
    print("‚ö†Ô∏è Note: DBSCAN on full dataset is computationally expensive.")
    print("Applying to sample dataset instead...")

    # best_config is the results_dbscan key of the winning run. Its labels were
    # produced on X_dbscan with the same parameters, so take them directly rather
    # than parsing eps/min_samples back out of the key and refitting.
    labels_sample = results_dbscan[best_config]['labels']

    # Save results for sample
    df_sample = df.iloc[:dbscan_sample_size].copy()
    df_sample['cluster'] = labels_sample

    df_sample.to_csv("all_jobs_clustered_sample_dbscan.csv", index=False)
    print("üíæ Saved: all_jobs_clustered_sample_dbscan.csv")

print("üéâ Clustering pipeline completed successfully!")