# Career Clustering Analysis
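This notebook performs clustering analysis on career data to identify similar career paths. It vectorizes each career's name, skills, and description with TF-IDF, projects the vectors to 2-D with UMAP, groups them with K-means, and saves an interactive scatter plot, per-cluster word clouds, and the clustered table under `../output`.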

In [None]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import umap.umap_ as umap
from wordcloud import WordCloud
import plotly.express as px

# Set plot style
# plt.style.use('seaborn')  # Commented out as it's not available
%matplotlib inline

In [None]:
# Load and prepare data
# Reads the careers table and builds one free-text field per row
# ('combined_features') from the Career, Skills and Description columns.
data_path = os.path.join('..', 'data', 'careers.csv')
df = pd.read_csv(data_path)

# Fail early with a readable message if a text column is absent.
text_columns = ['Career', 'Skills', 'Description']
missing_columns = [col for col in text_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"{data_path} is missing required column(s): {missing_columns}")

# With plain `+`, one missing value makes the whole concatenation NaN, and
# TfidfVectorizer refuses NaN documents. Treat missing text as empty instead.
# Work on a copy so the original columns keep their values.
text_parts = df[text_columns].fillna('').astype(str)
df['combined_features'] = (
    text_parts['Career'] + ' ' + text_parts['Skills'] + ' ' + text_parts['Description']
)

print(f"Loaded {len(df)} careers")
df.head()

In [None]:
# Text Vectorization
# Turn each row's combined text into a TF-IDF vector: English stop words are
# dropped and the vocabulary is capped at 1000 terms.
tfidf_settings = {'max_features': 1000, 'stop_words': 'english'}
vectorizer = TfidfVectorizer(**tfidf_settings)
X = vectorizer.fit_transform(df['combined_features'])
print(f"Vectorized data shape: {X.shape}")

In [None]:
# Dimensionality Reduction with UMAP
# Project the TF-IDF matrix with UMAP (fixed seed) and store the first two
# embedding coordinates on the frame as 'x' and 'y'.
reducer = umap.UMAP(random_state=42)
X_umap = reducer.fit_transform(X)
for axis_index, column_name in enumerate(('x', 'y')):
    df[column_name] = X_umap[:, axis_index]

In [None]:
# Find optimal number of clusters
# Fit K-means on the UMAP embedding for each candidate k (2 up to 10, and
# always fewer than the number of rows) and record the inertia of each fit.
K = range(2, min(11, len(df)))
inertias = []

for k in K:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_umap)
    inertias.append(kmeans.inertia_)

# Plot elbow curve
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(K, inertias, 'bo-')
elbow_ax.set_xlabel('Number of Clusters (k)')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method For Optimal k')
elbow_ax.set_xticks(K)
elbow_ax.grid(True)
plt.show()

In [None]:
# Apply K-means clustering
# Cluster the UMAP embedding and attach each row's label as df['cluster'].
n_clusters = 5  # Adjust based on elbow plot
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(X_umap)
df['cluster'] = kmeans.labels_

# Display cluster distribution
# Row count per cluster, ordered by cluster id.
cluster_counts = df['cluster'].value_counts(sort=False).sort_index()
print("Cluster distribution:", cluster_counts, sep="\n")

In [None]:
# Create interactive visualization
# px.scatter treats a numeric colour column as continuous (gradient + colour
# bar). Cluster ids are labels, so plot from a copy whose 'cluster' column is
# text; `df` itself keeps the integer labels for the cells that follow.
plot_df = df.assign(cluster=df['cluster'].astype(str))
# Legend in numeric id order rather than string order ('10' would sort before '2').
cluster_order = [str(label) for label in sorted(df['cluster'].unique())]

fig = px.scatter(
    plot_df,
    x='x', y='y',
    color='cluster',
    category_orders={'cluster': cluster_order},
    hover_name='Career',
    hover_data=['Domain', 'Skills'],
    title='Career Clusters Visualization',
    width=1000,
    height=600
)

# Save the interactive plot
output_dir = os.path.join('..', 'output')
os.makedirs(output_dir, exist_ok=True)
fig.write_html(os.path.join(output_dir, 'career_clusters.html'))

# Show the plot in the notebook
fig.show()

In [None]:
# Generate word clouds for each cluster
# For every cluster id, join the combined text of its rows, render a word
# cloud, and save it as a PNG in the word-cloud output directory.
wordclouds_dir = os.path.join('..', 'output', 'wordclouds')
os.makedirs(wordclouds_dir, exist_ok=True)

for cluster in sorted(df['cluster'].unique()):
    text = ' '.join(df.loc[df['cluster'] == cluster, 'combined_features'])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    # Separate handle name so the Plotly `fig` from the scatter cell is not overwritten.
    wc_fig, wc_ax = plt.subplots(figsize=(10, 6))
    wc_ax.imshow(wordcloud, interpolation='bilinear')
    wc_ax.axis('off')
    wc_ax.set_title(f'Cluster {cluster} - Common Terms')
    wc_fig.savefig(os.path.join(wordclouds_dir, f'cluster_{cluster}_wordcloud.png'),
                   bbox_inches='tight', dpi=300)
    # Close each figure after saving so figures do not accumulate in memory.
    plt.close(wc_fig)
    print(f"Generated word cloud for cluster {cluster}")

# Report the directory actually written to (the old message omitted the '..'
# prefix and the space after the colon).
print(f"\nAll word clouds saved to: {wordclouds_dir}")

In [None]:
# Save the clustered data
# Writes the full frame (original columns plus combined_features, x, y and
# cluster) to CSV without the index.
output_csv = os.path.join('..', 'output', 'clustered_careers.csv')
# Create the target directory here so this cell does not rely on an earlier
# cell having made it.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df.to_csv(output_csv, index=False)
print(f"\nClustered data saved to: {output_csv}")

print("\nAnalysis complete! Open the HTML file in the output folder to view the interactive visualization.")