In [1]:
import ast
import os
import time
from collections import namedtuple

import numpy as np
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import cosine_similarity

from utils import openai_auth

Read OpenAI API authentication key from .env file


In [2]:

# Input data parameters
DOCUMENT_PATH = "data/Task Statements2.csv"  # CSV file that is read into the DataFrame
COLUMN_TO_EMBED = "Task"  # column that gets tokenized and embedded

# Output data parameters
OUTPUT_PATH = "data/TaskEmbeddingss.csv"  # CSV file the DataFrame with embeddings is saved to

# Embedding model parameters
EMBEDDING_MODEL = "text-embedding-ada-002"
ENCODING_MODEL = "cl100k_base"  # this is the encoding for text-embedding-ada-002
MAX_TOKENS = 8191  # the maximum for text-embedding-ada-002 is 8191

In [3]:
# Echo the run configuration so it is recorded next to the results.
print(f'Using embedding model: {EMBEDDING_MODEL}')
print(f'Using encoding: {ENCODING_MODEL}')
print(f'Maximum number of tokens: {MAX_TOKENS}')
print('***********************************')

# Load the documents that will be embedded.
df = pd.read_csv(DOCUMENT_PATH)
print(f'Read {len(df)} documents from {DOCUMENT_PATH}')

# Preview the column that will actually be embedded. Using COLUMN_TO_EMBED
# (rather than a hard-coded 'Task') keeps this preview in sync with the
# configuration cell if the column name is ever changed.
df[['ID', COLUMN_TO_EMBED]].head().set_index("ID")


Using embedding model: text-embedding-ada-002

Using encoding: cl100k_base

Maximum number of tokens: 8191

***********************************

Read 19265 documents from data/Task Statements2.csv


Unnamed: 0_level_0,Task
ID,Unnamed: 1_level_1
1,Direct or coordinate an organization's financi...
2,Appoint department heads or managers and assig...
3,Analyze operations to evaluate performance of ...
4,"Direct, plan, or implement policies, objective..."
5,"Prepare budgets for approval, including those ..."


In [4]:
# Tokenize: store the token ids of every COLUMN_TO_EMBED text in an "encoding" column.
encoding = tiktoken.get_encoding(ENCODING_MODEL)
df["encoding"] = df[COLUMN_TO_EMBED].map(encoding.encode)
print(f'Encoded the {COLUMN_TO_EMBED} column into an encoding column')

Encoded the Task column into an encoding column


In [5]:
# Omit encodings that are too long to embed.
df["n_tokens"] = df["encoding"].apply(len)
too_long = df["n_tokens"] > MAX_TOKENS
n_long_encodings = int(too_long.sum())
# .copy() makes the filtered frame independent of the original, so adding the
# "embedding" column in the next step does not raise SettingWithCopyWarning.
df = df[~too_long].copy()
print(f'Omitted {n_long_encodings} encodings that were too long to embed')

Omitted 0 encodings that were too long to embed


In [6]:
openai_auth()

# Embed: get_embedding is invoked once for every row of COLUMN_TO_EMBED.
print(f'This may take about a minute for {len(df)} documents...')
df["embedding"] = df[COLUMN_TO_EMBED].apply(lambda x: get_embedding(x, engine=EMBEDDING_MODEL))
# Report completion only after the embeddings exist; previously this message
# was printed before any embedding had been computed.
print(f'Embedded {COLUMN_TO_EMBED} into an embedding column, by calling the OpenAI API')

Set OpenAI authentication key

Embedded Task into an embedding column, by calling the OpenAI API

This may take about a minute for 19265 documents...


In [None]:
# Write the frame (without its index) to OUTPUT_PATH, then print its first rows.
df.to_csv(path_or_buf=OUTPUT_PATH, index=False)
print(f'Saved embeddings to {OUTPUT_PATH}')
print(df.head(n=5))

In [None]:
# Folder that holds the document-embedding files. Each CSV file in this folder
# contains the embeddings of all documents produced by a single run.
DOCUMENTS_EMBEDDINGS_PATH = "data"
# Name of the embedding column in the document-embedding file.
COLUMN_EMBEDDINGS = "embedding"

In [None]:
# --- Choose the embeddings file ---------------------------------------------
# Sorted so the numbered menu is stable between runs (os.listdir returns
# entries in arbitrary order).
files = sorted(os.listdir(DOCUMENTS_EMBEDDINGS_PATH))
print(f'Found {len(files)} files in {DOCUMENTS_EMBEDDINGS_PATH}')
for i, file in enumerate(files, start=1):
    print(f'{i}. {file}')
if not files:
    raise FileNotFoundError(f'No files found in {DOCUMENTS_EMBEDDINGS_PATH}')

# Keep asking until the answer is a valid menu number. Without this check a
# non-numeric answer raised ValueError and "0" silently selected the last file
# (files[-1]).
while True:
    answer = input('Please enter the number of the document embedding file you want to use: ')
    try:
        file_index = int(answer)
    except ValueError:
        print(f'"{answer}" is not a number.')
        continue
    if 1 <= file_index <= len(files):
        break
    print(f'Please enter a number between 1 and {len(files)}.')
file_path = os.path.join(DOCUMENTS_EMBEDDINGS_PATH, files[file_index - 1])

# --- Read the document embeddings -------------------------------------------
df = pd.read_csv(file_path)
# The embeddings are stored as the string form of a list. ast.literal_eval only
# parses Python literals, whereas eval() would execute any code found in the CSV.
df[COLUMN_EMBEDDINGS] = df[COLUMN_EMBEDDINGS].apply(ast.literal_eval).apply(np.array)  # convert string to np array
print(f'Read {len(df)} documents embeddings from {file_path}')

# Text column shown next to each score: the column that was embedded when the
# file contains it, otherwise the 'description' column used previously.
text_column = COLUMN_TO_EMBED if COLUMN_TO_EMBED in df.columns else 'description'

# --- Interactive query loop -------------------------------------------------
openai_auth()
while True:
    print('***********************************')
    print('Enter "exit" to exit the script')
    query = input('Please enter a query: ')  # get query from user
    if query == "exit":
        break
    if not query.strip():
        # Nothing to embed; ask again instead of sending an empty request.
        continue
    tic = time.perf_counter()
    query_embedding = get_embedding(query, engine=EMBEDDING_MODEL)
    toc = time.perf_counter()
    # Convert to milliseconds BEFORE rounding; rounding the seconds first
    # reported every sub-half-second call as 0ms.
    print(f'Embedding query took {round((toc - tic) * 1000)}ms')
    print('Top matches ordered by cosine similarity of vector embeddings:')
    df['similarity'] = df[COLUMN_EMBEDDINGS].apply(lambda x: cosine_similarity(x, query_embedding))
    print(df.sort_values(by='similarity', ascending=False)[['similarity', text_column]].head(10))


# Using Transformers: MPNet from Hugging Face

In [17]:
import pandas as pd
import torch
import numpy as np
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [2]:
!pip install sentence_transformers transformers
from sentence_transformers import SentenceTransformer

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=1377078597f31dc019810605a9b3082771719af6e523f5c6938c974d17854bd4
  Stored in directory: /root/.cache/pip/wheels/83/71/2b/40d17d21937fed496fb99145227eca8f20b4891240ff60c86f
Successfully built sentence_transformers
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.2.2
[0m

In [4]:
# Read the task statements workbook into a DataFrame and preview the first rows.
job_data = pd.read_excel(
    io="/kaggle/input/tasks-for-jobs/Task Statements(1).xlsx"
)
job_data.head()

Task Statements


Unnamed: 0,O*NET-SOC Code,Title,Task ID,Task,Task Type,Incumbents Responding,Date,Domain Source
0,11-1011.00,Chief Executives,8823,Direct or coordinate an organization's financi...,Core,87.0,07/2014,Incumbent
1,11-1011.00,Chief Executives,8831,Appoint department heads or managers and assig...,Core,87.0,07/2014,Incumbent
2,11-1011.00,Chief Executives,8825,Analyze operations to evaluate performance of ...,Core,87.0,07/2014,Incumbent
3,11-1011.00,Chief Executives,8826,"Direct, plan, or implement policies, objective...",Core,87.0,07/2014,Incumbent
4,11-1011.00,Chief Executives,8827,"Prepare budgets for approval, including those ...",Core,87.0,07/2014,Incumbent


In [5]:
# Keep only the three columns used for the semantic search, then preview them.
job = job_data.loc[:, ['Title', 'Task', 'Task Type']]
job.head()

Unnamed: 0,Title,Task,Task Type
0,Chief Executives,Direct or coordinate an organization's financi...,Core
1,Chief Executives,Appoint department heads or managers and assig...,Core
2,Chief Executives,Analyze operations to evaluate performance of ...,Core
3,Chief Executives,"Direct, plan, or implement policies, objective...",Core
4,Chief Executives,"Prepare budgets for approval, including those ...",Core


In [14]:
# Sanity check: (rows, columns) of the trimmed task table.
job.shape

(19265, 3)

In [8]:
# Load the all-mpnet-base-v2 sentence-embedding model by its hub identifier.
model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-mpnet-base-v2'
)

In [10]:
# Encode every task statement in one call; the progress bar reports the batches.
embeddings = model.encode(
    sentences=job['Task'].tolist(),
    show_progress_bar=True,
)

Batches:   0%|          | 0/603 [00:00<?, ?it/s]

In [11]:
# Inspect the embedding matrix returned by model.encode (NumPy abbreviates the display).
embeddings

array([[ 0.00111515, -0.01212561,  0.00674503, ..., -0.00209855,
         0.00597192,  0.00498587],
       [ 0.05327297,  0.03539166, -0.01899687, ...,  0.01133153,
         0.01639626, -0.00217442],
       [-0.04514264,  0.0388556 ,  0.00111903, ...,  0.00812997,
         0.01865058,  0.00662475],
       ...,
       [-0.02771702, -0.05803879, -0.00317996, ...,  0.00536576,
        -0.05789695, -0.00941573],
       [-0.01636924,  0.00543328,  0.00593053, ..., -0.02465145,
        -0.0617161 , -0.03174858],
       [-0.02750919,  0.0237188 , -0.02141184, ..., -0.0302878 ,
        -0.02668518, -0.02936576]], dtype=float32)

In [13]:
# Print the embedding width first, then the number of embedded tasks.
print(*reversed(embeddings.shape))

768 19265


In [15]:
def cosine_similarity(a, b):
    """Return the cosine similarity of two 1-D vectors.

    NOTE: this definition replaces the ``cosine_similarity`` imported from
    ``openai.embeddings_utils`` at the top of the notebook for every cell that
    runs after it.

    Args:
        a: 1-D array-like of numbers.
        b: 1-D array-like of numbers with the same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. When either vector has zero magnitude the
        similarity is undefined; ``0.0`` is returned instead of the NaN (and
        RuntimeWarning) that the 0/0 division would otherwise produce, so that
        downstream sorting is not polluted by NaN values.
    """
    dot_product = np.dot(a, b)

    # compute magnitudes
    magnitude_a = np.sqrt(np.dot(a, a))
    magnitude_b = np.sqrt(np.dot(b, b))

    # guard against a zero vector before dividing
    denominator = magnitude_a * magnitude_b
    if denominator == 0:
        return 0.0

    # compute cosine similarity
    return dot_product / denominator

In [18]:
# Embed the example query "nurse" with the same model used for the tasks.
example_job_title = np.array(
    model.encode(sentences="nurse", show_progress_bar=True)
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
# Score every task embedding against the query embedding, keeping the row order.
cosine_sims = list(
    map(lambda task_vec: cosine_similarity(example_job_title, task_vec), embeddings)
)

In [20]:
# Sanity check: there should be one similarity score per task embedding.
len(cosine_sims)

19265

In [22]:
# Number of best matches to keep.
n = 5
# argsort of the negated scores yields positions in descending similarity.
# Slice with n (the slice used to be hard-coded to 5, so changing n had no effect).
top_n_indices = np.argsort(-np.array(cosine_sims))[:n]

In [23]:
# Show the similarity score of each selected match, best first.
for idx in top_n_indices:
    score = cosine_sims[idx]
    print(score)

0.6141493
0.6006719
0.59101975
0.57436407
0.5660505


In [27]:
# top_n_indices holds row POSITIONS (it comes from np.argsort), so select with
# .iloc. Label-based .loc only gave the same rows because the frame happens to
# have a default 0..N-1 index.
selected_rows = job['Task'].iloc[top_n_indices]
for i, desc in enumerate(selected_rows, start=1):
    print('Job Responsibility number', i, 'is:\n', desc)

Job Responsibility number 1 is:
 Observe nurses and visit patients to ensure proper nursing care.
Job Responsibility number 2 is:
 Write nursing orders.
Job Responsibility number 3 is:
 Supervise and monitor unit nursing staff.
Job Responsibility number 4 is:
 Perform nursing duties, such as administering medications, measuring vital signs, collecting specimens, or drawing blood samples.
Job Responsibility number 5 is:
 Assist nurses or physicians in the operation of medical equipment or provision of patient care.


In [29]:
# top_n_indices holds row POSITIONS (it comes from np.argsort), so select with
# .iloc instead of label-based .loc, which relied on the default 0..N-1 index.
selected_df = job[['Title', 'Task', 'Task Type']].iloc[top_n_indices]
selected_df

Unnamed: 0,Title,Task,Task Type
8807,Registered Nurses,Observe nurses and visit patients to ensure pr...,Core
8929,Clinical Nurse Specialists,Write nursing orders.,Core
8891,Critical Care Nurses,Supervise and monitor unit nursing staff.,Core
10218,Psychiatric Aides,"Perform nursing duties, such as administering ...",Core
10170,Nursing Assistants,Assist nurses or physicians in the operation o...,Core
