In [28]:
import os
import pandas as pd
#load from json .creds/PINECONE_API
import json
with open('.creds/PINECONE_API') as f:
    creds = json.load(f)
    PINECONE_API_KEY = creds['PINECONE_API_KEY']
    PINECONE_ENVIRONMENT = creds['PINECONE_ENVIRONMENT']
    OPENAI_API_KEY = creds['OPENAI_API_KEY']

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [5]:
path_jobs = "data/linkedin/job_posting.json"
path_profiles = "data/linkedin/profiles.json"

import json

# Read profiles.json
with open(path_profiles, 'r') as f:
    profiles = json.load(f)

# Read job_posting.json
with open(path_jobs, 'r') as f:
    job_postings = json.load(f)


In [10]:
type(job_postings)

list

# Create upload function

* We have list with a dictonary as the template.

1. We need to create an embeddigns or summary for each jobposting and candidate
2. Append that embedding to the dictionary (to use the list and create a new key "embedding")


In [12]:
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY
)

Create for all data of id a "text" chunk and append it to a list. Then modify the keys of the example to test

In [30]:
index_name = 'linkedin'

import pinecone


pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT
)

if index_name not in pinecone.list_indexes():
    # we create a new index
    pinecone.create_index(
        name=index_name,
        metric='cosine',
        dimension=1536  # 1536 dim of text-embedding-ada-002
    )

index = pinecone.GRPCIndex(index_name)

In [31]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [34]:
profiles

[{'id': 1,
  'name': 'Saurav Joshi',
  'url': 'https://www.linkedin.com/in/saurav-joshi-ab58a9179/',
  'title': "M.S. Data Science @ USC'24 | GSoC'22 @ DBpedia | ICPC'20 Regionalist",
  'location': 'Los Angeles, California',
  'summary': '',
  'connections': '427',
  'experience_count': '2',
  'experience_collection': [{'title': 'Research Scientist',
    'location': 'Marina del Rey, California',
    'company': 'USC Information Sciences Institute',
    'description': 'Developed a graph database by aggregating information from WikiData, PDF, and FoodKG, and created a backend design for a knowledge-powered understanding application. Extracted meaningful insights from PDFs and applied machine learning algorithms and statistical knowledge to build a recommendation system that recommended food products that are more sustainable. Awarded Best Data Science Team Leader at USC Information Sciences Institute CKIDS Datafest 2022.'},
   {'title': 'Data Scientist',
    'location': 'Mumbai, Maharasht

In [33]:
metadata

{'id': '5',
 'name': 'Philman Tjong, EIT',
 'url': 'https://www.linkedin.com/in/philmanstjong/',
 'source': 'linkedin',
 'location': 'El Monte, California',
 'experience_count': '2',
 'experience_collection': [{'title': 'Electrical Engineer Associate 1',
   'location': 'Los Angeles County, California',
   'company': 'LADWP',
   'description': 'Currently learning and working with the Control Systems Network Engineering'},
  {'title': 'Electrical and Computer Engineering Instructional Assistant',
   'location': 'San Diego, California',
   'company': 'UCSD',
   'description': 'Assisted in classes pertaining to Power Grid Modernization and Power Grid Resiliency to Adverse Effects + Assisted the professor in the course material, proctoring exams, grading assignments, and tutoring any students on the course knowledge'}],
 'connections': '490'}

In [35]:
profiles[0]

{'id': 1,
 'name': 'Saurav Joshi',
 'url': 'https://www.linkedin.com/in/saurav-joshi-ab58a9179/',
 'title': "M.S. Data Science @ USC'24 | GSoC'22 @ DBpedia | ICPC'20 Regionalist",
 'location': 'Los Angeles, California',
 'summary': '',
 'connections': '427',
 'experience_count': '2',
 'experience_collection': [{'title': 'Research Scientist',
   'location': 'Marina del Rey, California',
   'company': 'USC Information Sciences Institute',
   'description': 'Developed a graph database by aggregating information from WikiData, PDF, and FoodKG, and created a backend design for a knowledge-powered understanding application. Extracted meaningful insights from PDFs and applied machine learning algorithms and statistical knowledge to build a recommendation system that recommended food products that are more sustainable. Awarded Best Data Science Team Leader at USC Information Sciences Institute CKIDS Datafest 2022.'},
  {'title': 'Data Scientist',
   'location': 'Mumbai, Maharashtra',
   'compa

In [44]:
from uuid import uuid4

batch_limit = 100

texts = []
metadatas = []

for i, record in enumerate(profiles):
    # first get metadata fields for this record
    metadata = {
        'id': str(record['id']),
        'name': str(record['name']),
        'url': str(record['url']),
        'source':'linkedin',
        'location': record['location'],
        'experience_count': record['experience_count'],
        'connections': record['connections'],
    }
    #concatenate all keys and corresponding values of a nested profile[i] in a text feature
    result = ""
    for key, value in profiles[i].items():
        if key == "experience_collection":
            for experience in value:
                for sub_key, sub_value in experience.items():
                    result += sub_key+ " " +sub_value + " "
                result += "\n"
        else:
            result += key + " " + str(value) + " "
    # now we create chunks from the record text
    record_texts = text_splitter.split_text(result)
    print("*"*20)

    # create individual metadata dicts for each chunk
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]

    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    print("*"*20)
    # if we have reached the batch_limit we can add texts
    if len(texts) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embed.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas))
        texts = []
        metadatas = []

if len(texts) > 0:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = embed.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas))

********************
********************
********************
********************
********************
********************
********************
********************
********************
********************


In [45]:
print("Finished")

Finished


In [8]:
# Create metadata for profiles
profile_metadata = {id: profile for id, profile in zip(job_postings.keys(), profiles)}

AttributeError: 'list' object has no attribute 'keys'

In [3]:
# Assume that `profile_vectors` and `job_posting_vectors` are dictionaries where
# the keys are the IDs of the profiles/job postings and the values are the corresponding vectors.


# Create metadata for job postings
job_posting_metadata = {id: job_posting for id, job_posting in zip(job_posting_vectors.keys(), job_postings)}

# Upload metadata to Pinecone
pinecone.upsert_metadata(index_name=index_name, metadata=profile_metadata)
pinecone.upsert_metadata(index_name=index_name, metadata=job_posting_metadata)


Unnamed: 0,title,company,description,onsite_remote,salary,location,criteria,posted_date,link
0,Data Analyst - Recent Graduate,PayPal,"At PayPal (NASDAQ: PYPL), we believe that ever...",onsite,,Buffalo-Niagara Falls Area,"[{'Seniority level': 'Not Applicable'}, {'Empl...",2022-11-22,https://www.linkedin.com/jobs/view/data-analys...
1,Data Analyst - Recent Graduate,PayPal,"At PayPal (NASDAQ: PYPL), we believe that ever...",onsite,,"San Jose, CA","[{'Seniority level': 'Not Applicable'}, {'Empl...",2022-11-22,https://www.linkedin.com/jobs/view/data-analys...
2,Data Analyst,PayPal,"At PayPal (NASDAQ: PYPL), we believe that ever...",onsite,,"Texas, United States","[{'Seniority level': 'Not Applicable'}, {'Empl...",2022-11-17,https://www.linkedin.com/jobs/view/data-analys...
3,Data Analyst,PayPal,"At PayPal (NASDAQ: PYPL), we believe that ever...",onsite,,"Illinois, United States","[{'Seniority level': 'Not Applicable'}, {'Empl...",2022-11-17,https://www.linkedin.com/jobs/view/data-analys...
4,Entry-Level Data Analyst,The Federal Savings Bank,"The Federal Savings Bank, a national bank and ...",onsite,,"Chicago, IL","[{'Seniority level': 'Entry level'}, {'Employm...",2022-11-17,https://www.linkedin.com/jobs/view/entry-level...
...,...,...,...,...,...,...,...,...,...
2840,Junior Data Analyst,Iris Software Inc.,"Iris's client, one of the world's largest fina...",hybrid,,"Texas, United States","[{'Seniority level': 'Mid-Senior level'}, {'Em...",2022-10-21,https://www.linkedin.com/jobs/view/junior-data...
2841,Data Analyst (SQL),Marwood Group,The Marwood Group (Marwood) is a leading healt...,hybrid,"$75,000.00\r\n -\r\n $95...",New York City Metropolitan Area,"[{'Seniority level': 'Mid-Senior level'}, {'Em...",2022-10-26,https://www.linkedin.com/jobs/view/data-analys...
2842,Data Analyst,SmartSense by Digi,"Join a high-performing, tight-knit team at a f...",hybrid,,"Mishawaka, IN","[{'Seniority level': 'Associate'}, {'Employmen...",2022-11-17,https://www.linkedin.com/jobs/view/data-analys...
2843,Data Analyst,Synergy Search,Nashville (Berry Hill) based company looking t...,hybrid,"$85,000.00\r\n -\r\n $95...",Nashville Metropolitan Area,"[{'Seniority level': 'Not Applicable'}, {'Empl...",2022-11-14,https://www.linkedin.com/jobs/view/data-analys...


In [46]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

# 1. Similarity of JD and Candidate: For each JD find the most similar profiles
# 2. adjust the template of langchain to JD and candidate
# 3. Evaluator candidate.
# 4. Where do we save the output and how we use that output?