In [3]:
import os
import pinecone
import json
# Read the API secrets from a local JSON file so they never appear in the notebook.
with open('../.creds') as creds_file:
    creds = json.load(creds_file)

# Pull out the three values used by the cells below; a missing key raises KeyError.
PINECONE_API_KEY = creds['PINECONE_API_KEY']
PINECONE_ENVIRONMENT = creds['PINECONE_ENVIRONMENT']
OPENAI_API_KEY = creds['OPENAI_API_KEY']

# Also export the OpenAI key through the process environment.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
import csv
import json

dialogues_f = "../data/friends_dialogues/dialogues.csv"

# Gather every character's lines in file order. Collecting into lists and
# joining once avoids re-copying an ever-growing string on every CSV row.
lines_by_character = {}

# newline='' is what the csv module asks for: it lets the reader handle line
# endings itself, so quoted fields containing newlines are parsed correctly.
with open(dialogues_f, 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        line = row['dialogue']
        if line is None:
            # Short row: DictReader fills missing columns with None. Skip it
            # rather than let None leak into the concatenated transcript.
            continue
        lines_by_character.setdefault(row['character'], []).append(line)

# One string per character; consecutive lines are separated by two spaces.
dialogues = {
    character: '  '.join(lines)
    for character, lines in lines_by_character.items()
}

# One record per character, in order of first appearance in the CSV.
output = [
    {'character': character, 'concatenated_lines': dialogue}
    for character, dialogue in dialogues.items()
]

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Name of the OpenAI embedding model used for every chunk.
# NOTE(review): the Pinecone index is created with dimension=1536, which must
# match this model's output size — confirm if the model is ever changed.
model_name = 'text-embedding-ada-002'

# LangChain wrapper that turns lists of texts into embedding vectors.
embed = OpenAIEmbeddings(model=model_name, openai_api_key=OPENAI_API_KEY)

In [None]:
# Name of the Pinecone index that stores the dialogue chunks.
index_name = 'friends'

# Authenticate against Pinecone with the credentials loaded earlier.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)

# Create the index only when it does not exist yet, so re-running is harmless.
index_already_exists = index_name in pinecone.list_indexes()
if not index_already_exists:
    pinecone.create_index(name=index_name, dimension=1536, metric='cosine')

# gRPC handle used for the upserts and stats calls further down.
index = pinecone.GRPCIndex(index_name)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken

# Tokenizer used by tiktoken_len to measure text length in tokens.
# NOTE(review): 'cl100k_base' is presumably the encoding of the embedding model — confirm.
tokenizer = tiktoken.get_encoding('cl100k_base')

def tiktoken_len(text):
    """Return how many tokens `text` occupies under the module-level tokenizer.

    `disallowed_special=()` is passed through to `encode`; presumably this
    stops special-token strings in the text from raising — confirm against
    the tiktoken docs.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Splitter that cuts each transcript into chunks. Lengths are measured with
# tiktoken_len, so chunk_size and chunk_overlap are counted in tokens.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],  # paragraph, line, word, then character
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

In [None]:
# Summarise instead of echoing `output`: every record holds a character's
# entire concatenated transcript, which would flood the notebook output.
{
    'n_characters': len(output),
    'first_characters': [record['character'] for record in output[:10]],
}

In [None]:
# Peek at the first record (its 'character' and 'concatenated_lines' fields).
# NOTE(review): the concatenated text may be very long — consider truncating before sharing.
output[0]

In [None]:
from uuid import NAMESPACE_URL, uuid4, uuid5  # uuid4 stays importable for any cell still using it

# Maximum number of chunks embedded and upserted in a single request.
batch_limit = 100


def _upsert_batch(batch_texts, batch_metadatas):
    """Embed one batch of text chunks and upsert the vectors into `index`.

    Args:
        batch_texts: chunk strings to embed.
        batch_metadatas: one metadata dict per chunk, aligned with
            `batch_texts`; each carries 'type', 'character' and 'chunk'.

    Does nothing when the batch is empty.
    """
    if not batch_texts:
        return
    # Deterministic IDs derived from (type, character, chunk number): re-running
    # this cell overwrites the same vectors instead of adding duplicates, which
    # random uuid4 IDs would do on every run.
    ids = [
        str(uuid5(NAMESPACE_URL, f"{meta['type']}/{meta['character']}/{meta['chunk']}"))
        for meta in batch_metadatas
    ]
    embeds = embed.embed_documents(batch_texts)
    # Pass a concrete list of (id, vector, metadata) tuples, not a one-shot iterator.
    index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))


texts = []
metadatas = []

for record in output:
    # Metadata shared by every chunk of this character.
    metadata = {
        'character': str(record['character']),
        'type': 'character'
    }

    record_texts = text_splitter.split_text(str(record['concatenated_lines']))

    # Per-chunk metadata: position within the character's transcript plus the text.
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]

    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    print(f"{metadata['character']}: {len(record_texts)} chunks")

    # Flush in slices of at most batch_limit. One character can yield far more
    # than batch_limit chunks, so a single check after extend() would send
    # them all in one oversized request.
    while len(texts) >= batch_limit:
        _upsert_batch(texts[:batch_limit], metadatas[:batch_limit])
        del texts[:batch_limit]
        del metadatas[:batch_limit]

# Whatever is left is smaller than one full batch.
_upsert_batch(texts, metadatas)

In [None]:
# Completion marker; presumably shows that the preceding upsert cell has finished.
print("Finished")

In [None]:
# Display the index statistics, e.g. to sanity-check the upsert above.
# NOTE(review): confirm the returned fields against the Pinecone client docs.
index.describe_index_stats()

# 1. Similarity of JD and candidate: for each JD, find the most similar profiles.
# 2. Adjust the LangChain template to the JD and candidate.
# 3. Candidate evaluator.
# 4. Where do we save the output, and how do we use that output?