In [None]:
# %pip install --upgrade --user --quiet google-cloud-aiplatform google-cloud-storage

In [34]:
import pandas as pd
from google.cloud import storage, aiplatform
from vertexai.preview.language_models import TextEmbeddingModel
import vertexai
import tqdm
import time

# --- Notebook configuration -------------------------------------------------
# The brace-wrapped values look like placeholders: fill them in before running.
CSV_FILE_PATH = "talent_acquisition_profiles.csv"
BUCKET_NAME = "{bucket-name}"
LOCATION = "us-central1"
PROJECT_ID = "{project-name}"

# Point the Vertex AI SDK at the project and region configured above.
vertexai.init(location=LOCATION, project=PROJECT_ID)

# Load the profiles CSV into the DataFrame used by every later cell.
df = pd.read_csv(CSV_FILE_PATH)

In [35]:
# Preview the first five rows of the loaded profiles.
df.head(n=5)

Unnamed: 0,id,name,total_experience,current_designation,overview
0,1,Michael Carpenter,2,Data Engineer,managed database solutions in PostgreSQL/Mongo...
1,2,Katie Turner,10,Sr Data Engineer,integrated streaming data processing using Kaf...
2,3,Michael Graham,10,Sr Data Engineer,drove cloud migrations and optimizations. deve...
3,4,Paul Martin,13,Lead Data Engineer,architected enterprise data solutions on AWS/G...
4,5,Robert Sullivan,7,Sr Data Engineer,enhanced data warehousing strategies with Reds...


In [36]:
# Build one free-text description per row ("<name> <years> years <designation>
# <overview>"); this is the text that is embedded further down.
df['combined_details'] = [
    f"{name} {years} years {designation} {summary}"
    for name, years, designation, summary in zip(
        df['name'],
        df['total_experience'],
        df['current_designation'],
        df['overview'],
    )
]

# Handle to the pretrained text-embedding model used by the embedding helper.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

In [37]:
# Preview again to check the new `combined_details` column.
df.head(n=5)

Unnamed: 0,id,name,total_experience,current_designation,overview,combined_details
0,1,Michael Carpenter,2,Data Engineer,managed database solutions in PostgreSQL/Mongo...,Michael Carpenter 2 years Data Engineer manage...
1,2,Katie Turner,10,Sr Data Engineer,integrated streaming data processing using Kaf...,Katie Turner 10 years Sr Data Engineer integra...
2,3,Michael Graham,10,Sr Data Engineer,drove cloud migrations and optimizations. deve...,Michael Graham 10 years Sr Data Engineer drove...
3,4,Paul Martin,13,Lead Data Engineer,architected enterprise data solutions on AWS/G...,Paul Martin 13 years Lead Data Engineer archit...
4,5,Robert Sullivan,7,Sr Data Engineer,enhanced data warehousing strategies with Reds...,Robert Sullivan 7 years Sr Data Engineer enhan...


In [38]:
def get_embeddings_wrapper(texts, batch_size=5, delay_seconds=1, embedding_model=None):
    """Embed ``texts`` in batches and collect the resulting vectors.

    Args:
        texts: Sequence of strings to embed; must support ``len()`` and slicing.
        batch_size: Number of texts sent per ``get_embeddings`` call.
        delay_seconds: Pause inserted *between* consecutive requests to
            throttle calls. No pause is made before the first request, so a
            single-batch call (e.g. one query prompt) returns without waiting.
        embedding_model: Object exposing ``get_embeddings(list_of_texts)``.
            Defaults to the notebook-level ``model``.

    Returns:
        A list holding the ``.values`` of every embedding returned by the
        model, in the order the batches were sent (assumes the model returns
        one result per input text -- TODO confirm against the SDK).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return [].
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if embedding_model is None:
        # Resolved at call time so the notebook-level model is still the default.
        embedding_model = model

    embeddings = []
    for start in tqdm.tqdm(range(0, len(texts), batch_size)):
        # Throttle only between requests; `start` is 0 for the first batch.
        if start and delay_seconds > 0:
            time.sleep(delay_seconds)
        batch_embeddings = embedding_model.get_embeddings(texts[start:start + batch_size])
        embeddings.extend(embedding.values for embedding in batch_embeddings)
    return embeddings

# Embed every combined description and store each vector next to its row.
combined_texts = df.loc[:, 'combined_details'].to_list()
embedding_vectors = get_embeddings_wrapper(combined_texts)
df['embedding'] = embedding_vectors

100%|██████████| 11/11 [00:12<00:00,  1.12s/it]


In [39]:
# Preview again to check the new `embedding` column.
df.head(n=5)

Unnamed: 0,id,name,total_experience,current_designation,overview,combined_details,embedding
0,1,Michael Carpenter,2,Data Engineer,managed database solutions in PostgreSQL/Mongo...,Michael Carpenter 2 years Data Engineer manage...,"[-0.005255071446299553, -0.013083773665130138,..."
1,2,Katie Turner,10,Sr Data Engineer,integrated streaming data processing using Kaf...,Katie Turner 10 years Sr Data Engineer integra...,"[-0.013578824698925018, -0.02418570965528488, ..."
2,3,Michael Graham,10,Sr Data Engineer,drove cloud migrations and optimizations. deve...,Michael Graham 10 years Sr Data Engineer drove...,"[-0.03245685249567032, -0.03547470644116402, -..."
3,4,Paul Martin,13,Lead Data Engineer,architected enterprise data solutions on AWS/G...,Paul Martin 13 years Lead Data Engineer archit...,"[-0.02958768606185913, -0.03361005708575249, -..."
4,5,Robert Sullivan,7,Sr Data Engineer,enhanced data warehousing strategies with Reds...,Robert Sullivan 7 years Sr Data Engineer enhan...,"[-0.032230257987976074, -0.04934456944465637, ..."


In [33]:
# Serialise the selected columns as JSON Lines (one record per line) and write
# them to a local file, which the next cell uploads.
export_columns = [
    "id",
    "name",
    "total_experience",
    "current_designation",
    "overview",
    "embedding",
]
export_frame = df[export_columns]
jsonl_string = export_frame.to_json(orient="records", lines=True)

with open("candidates.json", "w") as out_file:
    out_file.write(jsonl_string)

In [7]:
BUCKET_URI = f"gs://no-latency-labs-documents"
! gsutil cp candidates.json {BUCKET_URI}

Copying file://candidates.json [Content-Type=application/json]...
/ [1 files][569.5 KiB/569.5 KiB]                                                
Operation completed over 1 objects/569.5 KiB.                                    


In [8]:
from google.cloud import aiplatform

# Bind the Vertex AI SDK to the configured project and region.
aiplatform.init(location=LOCATION, project=PROJECT_ID)

# Index creation, kept commented out for reference; the index is re-attached
# by its ID below instead of being created again.
# my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
#     display_name=f"job-applicants-idx",
#     contents_delta_uri=BUCKET_URI,
#     dimensions=768,
#     approximate_neighbors_count=20,
#     distance_measure_type="DOT_PRODUCT_DISTANCE",
# )

# Handle to the existing index, looked up by ID.
my_index_id = "6429895620722425856"
my_index = aiplatform.MatchingEngineIndex(index_name=my_index_id)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/499192289487/locations/us-central1/indexes/6429895620722425856/operations/6679298977122222080
MatchingEngineIndex created. Resource name: projects/499192289487/locations/us-central1/indexes/6429895620722425856
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/499192289487/locations/us-central1/indexes/6429895620722425856')


In [9]:
# Create the endpoint (public endpoint enabled) that the index is deployed to.
index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    public_endpoint_enabled=True,
    display_name="job-applicants-endpoint",
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/499192289487/locations/us-central1/indexEndpoints/3561665608040841216/operations/5808978349132873728
MatchingEngineIndexEndpoint created. Resource name: projects/499192289487/locations/us-central1/indexEndpoints/3561665608040841216
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/499192289487/locations/us-central1/indexEndpoints/3561665608040841216')


In [11]:
# Identifier under which the index is deployed; the query cell passes it back.
DEPLOYED_INDEX_ID = "job_applicants_index"

# Deploy the index to the endpoint. Left as the last expression so the call's
# return value is shown as the cell output.
index_endpoint.deploy_index(
    deployed_index_id=DEPLOYED_INDEX_ID,
    index=my_index,
)

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/499192289487/locations/us-central1/indexEndpoints/3561665608040841216
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/499192289487/locations/us-central1/indexEndpoints/3561665608040841216/operations/41556076331532288
MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/499192289487/locations/us-central1/indexEndpoints/3561665608040841216


<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7fe4a45cd8d0> 
resource name: projects/499192289487/locations/us-central1/indexEndpoints/3561665608040841216

In [45]:
# Alternative prompt kept for quick experimentation:
# user_prompt = "We are looking for a junior data engineer with around 2 to 4 years of experience"

user_prompt = "We are looking for a data analyst with experience in creating dashboards and writing SQL queries"

# Embed the prompt with the same helper used for the candidate profiles.
test_embeddings = get_embeddings_wrapper([user_prompt])

# Re-attach to the existing endpoint by resource name.
new_index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/499192289487/locations/us-central1/indexEndpoints/3561665608040841216')

# Query through the re-attached handle (previously created but never used), so
# this cell does not depend on the `index_endpoint` object from the
# endpoint-creation cell.
response = new_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=5,
)

100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


In [46]:
import numpy as np

# Print "<distance> <name>-<overview>-<total_experience>" for every neighbour
# returned for the first (and only) query.
for neighbor in response[0]:
    # Named `candidate_id` so the builtin `id` is not shadowed.
    candidate_id = np.int64(neighbor.id)
    similar = df.query("id == @candidate_id", engine="python")
    if similar.empty:
        # The index returned an id with no matching row; report it instead of
        # failing with an IndexError on `.values[0]`.
        print(f"{neighbor.distance:.4f} <no profile found for id {neighbor.id}>")
        continue
    # Bracket access avoids clashing with DataFrame attributes such as `.name`.
    print(f"{neighbor.distance:.4f} {similar['name'].values[0]}-{similar['overview'].values[0]}-{similar['total_experience'].values[0]}")

0.7609 Olivia Dunn-assisted in data cleanup and preliminary analysis. created visual reports for stakeholder presentations. supported analytics projects with SQL and Excel. engaged in ongoing analytics tools and techniques training.-4
0.7484 Amanda Whitney-supported analytics projects with SQL and Excel. engaged in ongoing analytics tools and techniques training. assisted in data cleanup and preliminary analysis. created visual reports for stakeholder presentations.-5
0.7420 Sarah Spears-engaged in ongoing analytics tools and techniques training. assisted in data cleanup and preliminary analysis. created visual reports for stakeholder presentations. supported analytics projects with SQL and Excel.-3
0.7390 Mr. James Mitchell DDS-created visual reports for stakeholder presentations. supported analytics projects with SQL and Excel. engaged in ongoing analytics tools and techniques training. assisted in data cleanup and preliminary analysis.-2
0.7372 Brent Goodwin-supported analytics proj