In [1]:
import os
import pandas as pd
#load from json .creds/PINECONE_API
import json
# Parse the local JSON secrets file once, then pull out the individual keys.
with open('.creds/PINECONE_API') as creds_file:
    creds = json.load(creds_file)

PINECONE_API_KEY = creds['PINECONE_API_KEY']
PINECONE_ENVIRONMENT = creds['PINECONE_ENVIRONMENT']
OPENAI_API_KEY = creds['OPENAI_API_KEY']

# Also publish the OpenAI key through the process environment.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [2]:
path_jobs = "data/linkedin/job_posting.json"
path_profiles = "data/linkedin/profiles.json"

import json


def load_json(path):
    """Return the parsed contents of the JSON file at `path`.

    The file is opened explicitly as UTF-8 rather than with the platform's
    default encoding, so non-ASCII text decodes the same way on every machine.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Read profiles.json
profiles = load_json(path_profiles)

# Read job_posting.json
job_postings = load_json(path_jobs)


# Create upload function

* We have a list of dictionaries as the template.

1. We need to create an embedding or summary for each job posting and candidate.
2. Append that embedding to the dictionary (i.e. reuse the list and add a new key "embedding").


In [3]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Name of the OpenAI embedding model used by the embedding client below.
model_name = 'text-embedding-ada-002'

# Embedding client; the API key is passed explicitly.
embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

For every record (identified by its id), create "text" chunks and append them to a list. Then adapt the keys of the example in order to test it.

In [4]:
import pinecone

index_name = 'linkedin'

# Configure the Pinecone client with the credentials loaded earlier.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)

# Create the index only when it is not already listed.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(
        name=index_name,
        metric='cosine',
        dimension=1536,  # 1536 dim of text-embedding-ada-002
    )

# Handle used by the later cells for upserts and queries.
index = pinecone.GRPCIndex(index_name)

  from tqdm.autonotebook import tqdm


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken

# Encoder used to measure text length in tokens.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of tokens `tokenizer` produces for `text`.

    The encoder is called with `disallowed_special=()`.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Splitter whose chunk size (400) and overlap (20) are measured with
# tiktoken_len, i.e. in tokens rather than characters.
splitter_separators = ["\n\n", "\n", " ", ""]
text_splitter = RecursiveCharacterTextSplitter(
    separators=splitter_separators,
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

In [6]:
# Display the loaded profiles (the cell's last expression is rendered as output).
# NOTE(review): this shows every profile, including names and profile URLs —
# consider displaying a small slice instead.
profiles

[{'id': 1,
  'name': 'Saurav Joshi',
  'url': 'https://www.linkedin.com/in/saurav-joshi-ab58a9179/',
  'title': "M.S. Data Science @ USC'24 | GSoC'22 @ DBpedia | ICPC'20 Regionalist",
  'location': 'Los Angeles, CA',
  'summary': '',
  'connections': '427',
  'experience_count': '2',
  'experience_collection': [{'title': 'Research Scientist',
    'location': 'Marina del Rey, California',
    'company': 'USC Information Sciences Institute',
    'description': 'Developed a graph database by aggregating information from WikiData, PDF, and FoodKG, and created a backend design for a knowledge-powered understanding application. Extracted meaningful insights from PDFs and applied machine learning algorithms and statistical knowledge to build a recommendation system that recommended food products that are more sustainable. Awarded Best Data Science Team Leader at USC Information Sciences Institute CKIDS Datafest 2022.'},
   {'title': 'Data Scientist',
    'location': 'Mumbai, Maharashtra',
   

In [7]:
# Display only the first profile to inspect the structure of one record.
profiles[0]

{'id': 1,
 'name': 'Saurav Joshi',
 'url': 'https://www.linkedin.com/in/saurav-joshi-ab58a9179/',
 'title': "M.S. Data Science @ USC'24 | GSoC'22 @ DBpedia | ICPC'20 Regionalist",
 'location': 'Los Angeles, CA',
 'summary': '',
 'connections': '427',
 'experience_count': '2',
 'experience_collection': [{'title': 'Research Scientist',
   'location': 'Marina del Rey, California',
   'company': 'USC Information Sciences Institute',
   'description': 'Developed a graph database by aggregating information from WikiData, PDF, and FoodKG, and created a backend design for a knowledge-powered understanding application. Extracted meaningful insights from PDFs and applied machine learning algorithms and statistical knowledge to build a recommendation system that recommended food products that are more sustainable. Awarded Best Data Science Team Leader at USC Information Sciences Institute CKIDS Datafest 2022.'},
  {'title': 'Data Scientist',
   'location': 'Mumbai, Maharashtra',
   'company': 'GS

In [10]:
from uuid import uuid4

# Number of accumulated chunks that triggers an embed-and-upsert round trip.
batch_limit = 100


def _profile_to_text(profile):
    """Flatten one profile dict into the text that is chunked and embedded.

    Each top-level field is written as "key value ". Each entry of
    "experience_collection" is written as its own run of "key value " pairs
    followed by a newline.
    """
    parts = []
    for key, value in profile.items():
        if key == "experience_collection":
            for experience in value:
                for sub_key, sub_value in experience.items():
                    # str() keeps this consistent with the top-level branch and
                    # avoids a TypeError when a value is not a string.
                    parts.append(sub_key + " " + str(sub_value) + " ")
                parts.append("\n")
        else:
            parts.append(key + " " + str(value) + " ")
    return "".join(parts)


def _upsert_batch(batch_texts, batch_metadatas):
    """Embed `batch_texts` and upsert them into `index` with their metadata."""
    # NOTE(review): ids are random UUIDs, so re-running this cell presumably
    # adds new vectors instead of overwriting the earlier ones — confirm, and
    # consider deterministic ids if re-runs should be idempotent.
    ids = [str(uuid4()) for _ in range(len(batch_texts))]
    embeds = embed.embed_documents(batch_texts)
    index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))


texts = []
metadatas = []

for record in profiles:
    # first get metadata fields for this record
    metadata = {
        'id': str(record['id']),
        'name': str(record['name']),
        'url': str(record['url']),
        'source': 'linkedin',
        'location': record['location'],
        'experience_count': record['experience_count'],
        'connections': record['connections'],
    }

    # now we create chunks from the flattened record text
    record_texts = text_splitter.split_text(_profile_to_text(record))

    # create individual metadata dicts for each chunk
    record_metadatas = [
        {"chunk": j, "text": text, **metadata}
        for j, text in enumerate(record_texts)
    ]

    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)

    # if we have reached the batch_limit we can add texts
    if len(texts) >= batch_limit:
        _upsert_batch(texts, metadatas)
        texts = []
        metadatas = []

# upload whatever is left over after the last full batch
if texts:
    _upsert_batch(texts, metadatas)

********************
********************
********************
********************
********************
********************
********************
********************
********************
********************


In [11]:
# Print a completion message.
print("Finished")

Finished


In [12]:
# Ask the index for its statistics; as the cell's last expression the result is displayed.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

We don't need another index for the job postings, since a job posting will be the user query against which the profiles are matched. We might need one if we do a two-way search.

## Semantic Search

Let's take a Data Analyst job posting in Los Angeles as a sample query.

In [13]:
# Keep the first job posting as a dict, and flatten it into one
# "key value key value ..." string to use as the query text.
job_posting_metadata = job_postings[0]
job_posting_query = "".join(
    field + " " + str(content) + " "
    for field, content in job_posting_metadata.items()
)
print(job_posting_query)

id 1 title Data Analyst company London Approach description The ideal candidate for the Data Analyst/Financial Analyst position will use their passion for data and analytics to provide insights to the business covering a range of topics. They will be responsible for conducting both recurring and ad hoc analysis for business users.Salary: $100-120kSchedule: 4 days a week onsite & 1 day remote (usually Friday)ResponsibilitiesUnderstand the day-to-day issues that our business faces, which can be better understood with dataCompile and analyze data related to business' issuesDevelop clear visualizations to convey complicated data in a straightforward fashionCollaboratively build financial models and reportsReview and track trendsInvestigate and research discrepanciesMaintain system and code information in systemCompile and analyze dataPrepare and analyze operations and financialsBuild trend reportsQualificationsBachelor's or Master's degree in Accounting or Finance2+ years of relevant data 

In [14]:
# Display the job posting dict that the query text was built from.
job_posting_metadata

{'id': 1,
 'title': 'Data Analyst',
 'company': 'London Approach',
 'description': "The ideal candidate for the Data Analyst/Financial Analyst position will use their passion for data and analytics to provide insights to the business covering a range of topics. They will be responsible for conducting both recurring and ad hoc analysis for business users.Salary: $100-120kSchedule: 4 days a week onsite & 1 day remote (usually Friday)ResponsibilitiesUnderstand the day-to-day issues that our business faces, which can be better understood with dataCompile and analyze data related to business' issuesDevelop clear visualizations to convey complicated data in a straightforward fashionCollaboratively build financial models and reportsReview and track trendsInvestigate and research discrepanciesMaintain system and code information in systemCompile and analyze dataPrepare and analyze operations and financialsBuild trend reportsQualificationsBachelor's or Master's degree in Accounting or Finance2+

In [15]:
# Split the flattened job posting into token-bounded chunks.
jb_record_texts = text_splitter.split_text(job_posting_query)

# Describe each chunk with the job posting's own fields. `metadata` must not be
# used here: at this point it still holds the last profile processed by the
# upload loop, which would attach a candidate's data to the job posting.
job_posting_fields = {
    key: value
    for key, value in job_posting_metadata.items()
    if key != "description"  # the description is already carried by the chunk "text"
}
record_metadatas = [
    {"chunk": j, "text": text, **job_posting_fields}
    for j, text in enumerate(jb_record_texts)
]

# Texts to embed for the query.
jb_texts = list(jb_record_texts)

In [16]:
# Only the first chunk of the job posting is used as the query, so embed just
# that chunk with the query-embedding call instead of embedding every chunk
# and discarding all but the first.
# NOTE(review): text beyond the first chunk never influences the search —
# confirm that is intended.
query_vector = embed.embed_query(jb_texts[0])

In [17]:
# Fetch the three stored vectors closest to the job-posting embedding and
# include their metadata in the response.
query_response = index.query(vector=query_vector, top_k=3, include_metadata=True)

In [18]:
# Display the matches returned by the unfiltered query.
query_response

{'matches': [{'id': '493a919a-14a4-4ab1-aa01-4d70696a9e30',
              'metadata': {'chunk': 0.0,
                           'connections': '700',
                           'experience_count': '1',
                           'id': '2',
                           'location': 'Austin, TX',
                           'name': 'Shamit Kikani',
                           'source': 'linkedin',
                           'text': 'id 2 name Shamit Kikani url '
                                   'https://www.linkedin.com/in/shamitkikani/ '
                                   'title Data Analyst | University of '
                                   'Michigan | SpaceX location Austin, TX '
                                   'summary Industrial Engineer specializing '
                                   'in Data Science and Operations Research. I '
                                   'aspire to work towards practical '
                                   'implementation of data science, machine '
  

The top retrieved result is a data analyst, the 2nd is a data scientist, and the 3rd is a cyber profile.

## Metadata Filtering

Profiles that are most similar to the data analyst job role and are located in Los Angeles.

In [21]:
# Same top-3 query as before, restricted to vectors whose "location" metadata
# equals "Los Angeles, CA".
location_filter = {"location": {"$eq": "Los Angeles, CA"}}
query_response = index.query(
    vector=query_vector,
    top_k=3,
    include_metadata=True,
    filter=location_filter,
)

In [22]:
# Display the matches returned by the location-filtered query.
query_response

{'matches': [{'id': '2a9cf887-93a1-42f2-b464-3c0791de4ae3',
              'metadata': {'chunk': 0.0,
                           'connections': '427',
                           'experience_count': '2',
                           'id': '1',
                           'location': 'Los Angeles, CA',
                           'name': 'Saurav Joshi',
                           'source': 'linkedin',
                           'text': 'id 1 name Saurav Joshi url '
                                   'https://www.linkedin.com/in/saurav-joshi-ab58a9179/ '
                                   "title M.S. Data Science @ USC'24 | GSoC'22 "
                                   "@ DBpedia | ICPC'20 Regionalist location "
                                   'Los Angeles, CA summary  connections 427 '
                                   'experience_count 2 title Research '
                                   'Scientist location Marina del Rey, '
                                   'California company USC Info

# 1. Similarity of JD and Candidate: For each JD find the most similar profiles ============ Done

# 2. Adjust the LangChain template to the JD and candidate

# 3. Evaluator candidate.

# 4. Where do we save the output and how we use that output?