In [2]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util

from collections import defaultdict
import torch
import numpy as np
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist

import matplotlib.pyplot as plt

  from tqdm.autonotebook import tqdm, trange


In [4]:

# Read the candidate spreadsheet and keep only the ID plus the text that gets embedded.
seva_data_path = "../toy_data/TOY_seva_modified20apr.xlsx"
candidate_info_df = pd.read_excel(seva_data_path)

# Missing 'Skills' values are replaced with a single space.
skills_jobs_df = pd.DataFrame(
    {
        "SP ID": candidate_info_df["SP ID"],
        "Skills Jobs": candidate_info_df["Skills"].fillna(" "),
    }
)
# Disabled alternative: build 'Skills Jobs' by concatenating several free-text columns.
# skills_jobs_df['Skills Jobs'] = candidate_info_df['Work Experience/Designation'].fillna(' ') + " " + candidate_info_df['Any Additional Skills'].fillna(' ') + " " + \
#         candidate_info_df['Computer Skills'].fillna(' ') + " " + candidate_info_df['Skills'].fillna(' ') + " " + candidate_info_df['Skills.1'].fillna(' ')

In [5]:
# Create Skills and Jobs Embeddings
model = SentenceTransformer('sentence-transformers/all-roberta-large-v1', device='cpu')
print('EMBEDDING SKILLS...')
# encode() is documented to take a str or a list of str, so hand it a plain list.
# NOTE(review): passing the Series itself appears to work only while the frame has
# a default 0..n-1 index (encode indexes its input by integer) - confirm against
# the installed sentence-transformers version.
skills_texts = skills_jobs_df['Skills Jobs'].tolist()
embeddings = model.encode(skills_texts, convert_to_tensor=True, show_progress_bar=True, device='cpu')
# .cpu() changes nothing for a CPU tensor, but keeps the NumPy conversion valid
# if the device above is ever switched to an accelerator.
embeddings_np = embeddings.cpu().numpy()



EMBEDDING SKILLS...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

In [None]:
# Perform Hierarchical Clustering
# Complete-linkage agglomerative clustering on cosine distances between embeddings.
Z = sch.linkage(
    embeddings_np,
    method='complete',
    metric='cosine',
    optimal_ordering=True) # set to false when large scale (slows down computation)

# Plot the dendrogram to visualize the hierarchy tree
plt.figure(figsize=(20,10))
# NOTE(review): SciPy documents count_sort and distance_sort as mutually
# exclusive - confirm which of the two orderings is actually intended here.
sch.dendrogram(Z,labels=skills_jobs_df['SP ID'].values,  leaf_rotation=0, count_sort=True, distance_sort=True, orientation='right', leaf_font_size=6)

plt.title("Hierarchical Clustering Dendrogram")
# orientation='right' draws the tree sideways: merge distance runs along the
# x-axis and the leaves (labelled with SP ID) are stacked along the y-axis.
plt.xlabel("Distance")
plt.ylabel("Index of Embeddings")
plt.tight_layout()
plt.savefig('participant-dendrogram.png', format='png', dpi=300)
# plt.show()

In [None]:
def cluster_hierarchy_tree(Z, skills_jobs_df, dist=0.5):
    """Cut the linkage tree at a distance threshold and group the texts by cluster.

    Args:
        Z: Linkage matrix, as produced by ``sch.linkage``.
        skills_jobs_df: DataFrame with a 'Skills Jobs' column. Row ``i`` (by
            position) must be the ``i``-th observation ``Z`` was built from.
        dist: Threshold handed to ``sch.fcluster`` with ``criterion='distance'``.

    Returns:
        A ``defaultdict(list)`` mapping a 0-based cluster index to the list of
        'Skills Jobs' values assigned to that cluster.

        ``fcluster`` numbers flat clusters starting at 1. The label is shifted
        down by one so the keys are exactly ``0 .. len(result) - 1``; code that
        walks ``range(len(result))`` then reaches every cluster and never reads
        a missing key (which, on a defaultdict, would insert an empty list).
    """
    cluster_labels = sch.fcluster(Z, t=dist, criterion='distance')
    texts = skills_jobs_df['Skills Jobs']
    clustered = defaultdict(list)
    for i, label in enumerate(cluster_labels):
        # Positional lookup: labels are ordered like the observations behind Z.
        clustered[int(label) - 1].append(texts.iloc[i])
    return clustered

def print_cluster(clustered):
    """Print one "Cluster <i>: <members>" line for each index 0 .. len(clustered) - 1.

    The length is read once, before any lookup. Each index is looked up with
    ``clustered[i]``, so when ``clustered`` is a defaultdict an index that is
    not already a key is inserted with that defaultdict's default value.
    """
    cluster_count = len(clustered)
    for i in range(cluster_count):
        print(f"Cluster {i}: {clustered[i]}")


# Cut the tree at each threshold, print the clusters, and keep both the
# clusters and their sizes for the cells that follow.
dist_threshs = np.arange(.1, 1, .1)
histograms = dict()
clusters_for_threshs = []
for dist in dist_threshs:
    # ':g' hides the float noise np.arange accumulates (0.30000000000000004 -> 0.3).
    print(f"Clusters at distance threshold {dist:g}")
    clustered = cluster_hierarchy_tree(Z, skills_jobs_df, dist)
    clusters_for_threshs.append(clustered)
    print_cluster(clustered)
    # `clustered` is a defaultdict: reading a key that is absent stores an empty
    # list under it. Count only non-empty entries so such placeholders are never
    # recorded as clusters of size 0.
    histograms[dist] = [len(strings) for strings in clustered.values() if strings]

In [None]:
# Draw one histogram of cluster sizes per distance threshold.
for dist, histogram in histograms.items():
    if not histogram:
        # Nothing to plot for this threshold; max() below would raise on an empty list.
        continue

    # plt.subplots() always opens a new figure. Requesting figure numbers
    # 0, 1, 2, ... instead would draw onto any figure still open under that number.
    _, ax = plt.subplots()
    # Integer-aligned bins: one bar per cluster size, from 1 up to the largest size.
    ax.hist(histogram, bins=range(1, max(histogram)+2), edgecolor='black')
    ax.set_xlabel('Cluster Size')
    ax.set_ylabel('Frequency')
    # ':g' trims float noise in the threshold (0.30000000000000004 -> 0.3).
    ax.set_title(f'Histogram of Cluster Sizes for Dist Thresh: {dist:g}')

# Show all plots at once (after the loop)
plt.show()


In [11]:
import csv

# Step 1: Collect all the cluster data into rows
csv_data = []

# Loop through the list of dictionaries (one per distance threshold)
for clusters in clusters_for_threshs:
    # Walk the keys that are actually present, in ascending order. Indexing by
    # range(len(clusters)) would assume the keys are exactly 0..n-1 and, on a
    # defaultdict, would silently insert an empty list for every key that is not.
    for cluster_id in sorted(clusters):
        members = clusters[cluster_id]
        if not members:
            # Empty placeholder entry, not a real cluster.
            continue
        # Row layout: cluster label first, then every string in that cluster.
        csv_data.append([f"Cluster: {cluster_id}"] + list(members))

# Step 2: Find the maximum row length to ensure all rows have the same number of columns
# (default=0 keeps this from raising when there are no rows at all)
max_length = max((len(row) for row in csv_data), default=0)

# Step 3: Pad each row with empty strings to match the max length
for row in csv_data:
    row.extend([''] * (max_length - len(row)))

# Step 4: Write the padded rows to a CSV file
with open('clusters.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(csv_data)

print("CSV file created successfully!")


CSV file created successfully!
