In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from openai import OpenAI
import os
from dotenv import load_dotenv  
from sklearn.neighbors import NearestNeighbors
import networkx as nx   # library for creating graphs/networks
import faiss

In [None]:
load_dotenv()  # Load environment variables from .env file
# Ensure the OpenAI API key is set in the environment

if 'OPENAI_API_KEY' not in os.environ:
    raise ValueError("OPENAI_API_KEY environment variable not set.")

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def get_openai_embeddings(texts, model='text-embedding-ada-002'): 
    # texts: list of strings
    response = client.embeddings.create(
        input=texts,
        model=model
    )
    # Return a list of embeddings
    return [item.embedding for item in response.data]

def create_embeddings(data, batch_size=512, save_path=None):
    embeddings = []
    for i in range (0, len(data), batch_size): 
        batch = data['task_clean'].iloc[i:i+batch_size].tolist()
        print(f"Processing batch from index {i} to {min(i + batch_size, len(data)) - 1}")
        batch_embeddings = get_openai_embeddings(batch)
        embeddings.extend(batch_embeddings)
        print(f"Processed batch {i // batch_size + 1}/{(len(data) + batch_size - 1) // batch_size}")

    # format of return is [[embedding1], [embedding2], ...]
    E = np.vstack(embeddings)

    if save_path:
        np.save(save_path, E)
        print(f"Embeddings saved to {save_path}")
    return E

In [6]:
# read the data from the csv file
data_path = os.path.join('../data/onet/task_counts.csv')

data = pd.read_csv(data_path, encoding='latin1')

# clean the data by lowering the case and removing extra spaces. The regex `r'\s+'` matches one or more whitespace characters.
data['task_clean'] = data['Task'].str.lower().replace(r'\s+', ' ', regex=True).str.strip()  # remove extra spaces

'''
Step 1:
Get the embedding for each task statement using OpenAI's text-embedding-ada-002 model.
'''
E = create_embeddings(data, batch_size=512, save_path='../data/onet/task_embeddings.npy')

print("Embeddings created successfully. Shape of embeddings:", E.shape)

Processing batch from index 0 to 511
Processed batch 1/64
Processing batch from index 512 to 1023
Processed batch 2/64
Processing batch from index 1024 to 1535
Processed batch 3/64
Processing batch from index 1536 to 2047
Processed batch 4/64
Processing batch from index 2048 to 2559
Processed batch 5/64
Processing batch from index 2560 to 3071
Processed batch 6/64
Processing batch from index 3072 to 3583
Processed batch 7/64
Processing batch from index 3584 to 4095
Processed batch 8/64
Processing batch from index 4096 to 4607
Processed batch 9/64
Processing batch from index 4608 to 5119
Processed batch 10/64
Processing batch from index 5120 to 5631
Processed batch 11/64
Processing batch from index 5632 to 6143
Processed batch 12/64
Processing batch from index 6144 to 6655
Processed batch 13/64
Processing batch from index 6656 to 7167
Processed batch 14/64
Processing batch from index 7168 to 7679
Processed batch 15/64
Processing batch from index 7680 to 8191
Processed batch 16/64
Proces

In [None]:
'''
Step 2: 
Once we have embeddings, we can first normalize each embedding to have an L2 norm of 1 (i.e., make them unit vectors), 
and then we can use FAISS to find the top-k (k=50) nearest neighbors for each embedding based on cosine similarity. 
We want to have the L2 norm so that we can use the inner product between two embedding vectors as a measure of cosine similarity.
This further allows us to perform the index search using FAISS, which is optimized for such operations.
'''
if E.dtype != np.float32 or not E.flags['C_CONTIGUOUS']:
    E = np.ascontiguousarray(E, dtype=np.float32)

d = faiss.normalize_L2(E)
index = faiss.IndexFlatIP(d)
index.add(E)
D, I = index.search(E, k=50)


In [9]:
'''
Step 3: 
We create a graph where each node is a task statement, and we add an edge between two nodes if their cosine similarity is 
above a certain threshold (0.97 in this case). This allows us to identify clusters of similar task statements.
'''
G = nx.Graph()
G.add_nodes_from(range(len(data)))
threshold = 0.97
for i, neighbors in enumerate(I):
    for j, score in zip(neighbors, D[i]):
        if i < j and score >= threshold:
            G.add_edge(i, j)
clusters = list(nx.connected_components(G))
canon = {}
for cid, comp in enumerate(clusters, 1):
    for idx in comp:
        canon[idx] = f"C{cid:05d}"
data['canon_id'] = data.index.map(canon)
data.to_csv('tasks_with_canon.csv', index=False)

In [16]:
# Group by canon id and show all unique task statements in each cluster, get counts
grouped = data.groupby('canon_id')['Task'].apply(lambda x: ', '.join(x.unique())).reset_index()
grouped['count'] = grouped['Task'].apply(lambda x: len(x.split(', ')))
grouped[grouped['count'] > 1].to_csv('task_clusters_geq_1.csv', index=False)

Unnamed: 0,canon_id,Task,count
1,C00002,"Administers, interprets, and explains policies...",6
2,C00003,"Develops, plans, organizes, and administers po...",4
3,C00004,Directs and coordinates activities of workers ...,3
6,C00007,"Reviews and analyzes legislation, laws, and pu...",11
7,C00008,"Develops, directs, and coordinates testing, hi...",6
...,...,...,...
22917,C22918,"Provide nursing, psychiatric, or personal care...",5
22918,C22919,"Collaborate with or assist doctors, psychologi...",7
22919,C22920,"Provide patients with cognitive, intellectual,...",6
22922,C22923,"Provide care for children with physical, devel...",3
