In [3]:
import os
import pinecone
import json
# Read the API secrets from the JSON credentials file one directory above
# the notebook, so no key is hardcoded in the notebook itself.
with open('../.creds') as f:
    creds = json.load(f)

# Pull out the individual settings; a missing entry raises KeyError here.
PINECONE_API_KEY = creds['PINECONE_API_KEY']
PINECONE_ENVIRONMENT = creds['PINECONE_ENVIRONMENT']
OPENAI_API_KEY = creds['OPENAI_API_KEY']

# Also export the OpenAI key through the process environment.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [4]:
dialogues_f = "../data/friends_dialogues/dialogues.csv"

import csv
import json

# Collect every dialogue line per character, in file order.
# Keys are the values of the 'character' column; values are lists of the
# matching 'dialogue' column values.
lines_by_character = {}

# newline='' hands newline handling to the csv module, so line breaks inside
# quoted fields are parsed correctly (as recommended by the csv docs).
# NOTE(review): the text encoding is left at the platform default, as before;
# consider passing an explicit encoding once the file's encoding is confirmed.
with open(dialogues_f, 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        lines_by_character.setdefault(row['character'], []).append(row['dialogue'])

# character -> all of that character's lines joined with two spaces.
# Joining once per character avoids repeated string concatenation in the loop.
dialogues = {
    character: '  '.join(lines)
    for character, lines in lines_by_character.items()
}

# One record per character, in first-appearance order.
output = [
    {'character': character, 'concatenated_lines': dialogue}
    for character, dialogue in dialogues.items()
]

In [5]:
from langchain.embeddings.openai import OpenAIEmbeddings

# OpenAI embedding model used to turn text chunks into vectors.
model_name = 'text-embedding-ada-002'

# LangChain embedding client, authenticated with the key loaded from the
# credentials file.
embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

In [11]:
# Name of the Pinecone index that stores the dialogue vectors.
index_name = 'friends'

# Authenticate the Pinecone client with the credentials loaded earlier.
pinecone.init(environment=PINECONE_ENVIRONMENT, api_key=PINECONE_API_KEY)

# Create the index only when it does not exist yet, so re-running this cell
# does not attempt to create it a second time.
# NOTE(review): dimension=1536 has to match the size of the vectors produced
# by the embedding model - confirm if the model is ever changed.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(name=index_name, dimension=1536, metric='cosine')

# Handle used by later cells to upsert into and inspect the index.
index = pinecone.GRPCIndex(index_name)

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken

# tiktoken encoder for the 'cl100k_base' encoding; tiktoken_len uses it to
# count tokens.
tokenizer = tiktoken.get_encoding('cl100k_base')

def tiktoken_len(text):
    """Return how many tokens `text` encodes to with the module-level `tokenizer`.

    The encoder is called with `disallowed_special=()`, i.e. with an empty
    set of disallowed special tokens.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Splitter that cuts long texts into overlapping chunks. Lengths are measured
# with tiktoken_len, so chunk_size and chunk_overlap are counted in tokens
# rather than characters. Separators are tried in the listed order.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

In [8]:
# Display every per-character record (the rendered output is very long).
output

[{'character': 'Chandler',
  'concatenated_lines': 'Hey.  Someone on the subway licked my neck! Licked my neck!!  What are you guys doing?  Oooh, I’m afraid that does not exist.  I don’t know what it is, I just can’t take a good picture.  Yeah, I’m not in that.  How about those three pointers?  Yes, and we call Ross Lingers In The Bathroom.  They have that on the napkins at the club.  Rach, if you have a crush on this guy, why would you hire him? I mean y’know you can’t date him right?  Dude that is so sad.  Could I play?  No, I am.  Oh my God! Those are my bedroom eyes?! Why did you ever sleep with me?  I can’t help it!  Borrow money from me?  Why is there jelly on your shoe?  Hey!  I’m still right here!  Did you have a crush on me, when you first met me?  There I am!  Yeah, Joey said I uh, I needed to relax so he gave me an antihistamine.  Yeah, and then I fell asleep on the subway and went all the way to Brooklyn. Brooklyn is f-far!!  Ahhh.  Oh yeah, that looks good.  Okay.  I’m mar

In [9]:
# Display only the first record to check its structure.
output[0]

{'character': 'Chandler',
 'concatenated_lines': 'Hey.  Someone on the subway licked my neck! Licked my neck!!  What are you guys doing?  Oooh, I’m afraid that does not exist.  I don’t know what it is, I just can’t take a good picture.  Yeah, I’m not in that.  How about those three pointers?  Yes, and we call Ross Lingers In The Bathroom.  They have that on the napkins at the club.  Rach, if you have a crush on this guy, why would you hire him? I mean y’know you can’t date him right?  Dude that is so sad.  Could I play?  No, I am.  Oh my God! Those are my bedroom eyes?! Why did you ever sleep with me?  I can’t help it!  Borrow money from me?  Why is there jelly on your shoe?  Hey!  I’m still right here!  Did you have a crush on me, when you first met me?  There I am!  Yeah, Joey said I uh, I needed to relax so he gave me an antihistamine.  Yeah, and then I fell asleep on the subway and went all the way to Brooklyn. Brooklyn is f-far!!  Ahhh.  Oh yeah, that looks good.  Okay.  I’m marry

In [10]:
from uuid import uuid4

batch_limit = 100  # maximum number of chunks embedded and upserted per request


def _embed_and_upsert(batch_texts, batch_metadatas):
    """Embed one batch of text chunks and upsert the vectors into `index`.

    Args:
        batch_texts: list of chunk strings to embed.
        batch_metadatas: list of metadata dicts, one per chunk, in the same
            order as `batch_texts`.

    Returns:
        None. An empty batch is skipped without calling the embedding or
        index APIs.
    """
    if not batch_texts:
        return
    # A fresh random id per chunk.
    # NOTE(review): because ids are random, re-running this cell presumably
    # adds new vectors instead of overwriting the earlier ones - confirm.
    ids = [str(uuid4()) for _ in batch_texts]
    embeds = embed.embed_documents(batch_texts)
    # Materialise the zip so the client receives a concrete list of
    # (id, vector, metadata) tuples.
    index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))


# Chunks (and their metadata) waiting to be sent; kept in lockstep.
texts = []
metadatas = []

for record in output:
    # Metadata shared by every chunk of this record.
    metadata = {
        'character': str(record['character']),
        'type': 'character'
    }

    record_texts = text_splitter.split_text(str(record['concatenated_lines']))

    # Per-chunk metadata: position within the record plus the chunk text.
    record_metadatas = [
        {"chunk": j, "text": text, **metadata}
        for j, text in enumerate(record_texts)
    ]

    texts.extend(record_texts)
    metadatas.extend(record_metadatas)

    # Send full batches as soon as they are available. A single record can
    # yield more than batch_limit chunks, so slice instead of sending
    # everything at once; the remainder carries over to the next record.
    while len(texts) >= batch_limit:
        _embed_and_upsert(texts[:batch_limit], metadatas[:batch_limit])
        del texts[:batch_limit]
        del metadatas[:batch_limit]

# Send whatever is left after the last record (no-op when nothing remains).
_embed_and_upsert(texts, metadatas)

********************
********************


NameError: name 'index' is not defined

In [45]:
# Print a completion marker.
print("Finished")

Finished


In [46]:
# Show the index statistics (dimension, fullness, vector counts).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

# 1. Similarity of JD and Candidate: For each JD find the most similar profiles
# 2. Adjust the LangChain template to the JD and the candidate
# 3. Candidate evaluator.
# 4. Where do we save the output, and how do we use that output?