In [1]:
import os
import pinecone
import json
with open('../.creds') as f:
    creds = json.load(f)
    PINECONE_API_KEY = creds['PINECONE_API_KEY']
    PINECONE_ENVIRONMENT = creds['PINECONE_ENVIRONMENT']
    OPENAI_API_KEY = creds['OPENAI_API_KEY']

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

  from tqdm.autonotebook import tqdm


In [8]:
dialogues_f = "../data/friends_dialogues/dialogues.csv"

import csv
import json

# Define a dictionary to hold the concatenated lines
dialogues = {}

# Read the CSV file
with open(dialogues_f, 'r') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if 'S01' in row['filename']:
            if row['character'] in dialogues:
                dialogues[row['character']] += '  ' + row['dialogue']
            else:
                dialogues[row['character']] = row['dialogue']

# Convert to the desired JSON format
output = []
for character, dialogue in dialogues.items():
    output.append({
        'character': character,
        'concatenated_lines': dialogue
    })

In [9]:
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY
)

In [10]:
index_name = 'linkedin'

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT
)

if index_name not in pinecone.list_indexes():
    print("creating")
    pinecone.create_index(
        name=index_name,
        metric='cosine',
        dimension=1536
    )

index = pinecone.GRPCIndex(index_name)
print(index)
index.describe_index_stats()


<pinecone.core.grpc.index_grpc.GRPCIndex object at 0x7f3467ea17e0>


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [13]:
from uuid import uuid4
import sys

batch_limit = 200

texts = []
metadatas = []


for i, record in enumerate(output):
    # first get metadata fields for this record
    metadata = {
        'id': i,
        'character': str(record['character']),
        'type': 'character',
        'source':'friends'
    }
    print(metadata)

    record_texts = text_splitter.split_text(str(record['concatenated_lines']))

    print("*"*20)

    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]

    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    print("*"*20)
    # if we have reached the batch_limit we can add texts

    print(len(texts))
    if len(texts) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embed.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas), namespace='friends')
        texts = []
        metadatas = []

print(len(texts))
if len(texts) > 0:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = embed.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas), namespace='friends')

{'id': 0, 'character': 'Ross', 'type': 'character', 'source': 'friends'}
********************
********************
51
{'id': 1, 'character': 'Monica', 'type': 'character', 'source': 'friends'}
********************
********************
94
{'id': 2, 'character': 'Chandler', 'type': 'character', 'source': 'friends'}
********************
********************
141
{'id': 3, 'character': 'Rachel', 'type': 'character', 'source': 'friends'}
********************
********************
183
{'id': 4, 'character': 'Joey', 'type': 'character', 'source': 'friends'}
********************
********************
217
{'id': 5, 'character': 'Phoebe', 'type': 'character', 'source': 'friends'}
********************
********************
31
{'id': 6, 'character': 'Girls', 'type': 'character', 'source': 'friends'}
********************
********************
32
{'id': 7, 'character': 'Guys', 'type': 'character', 'source': 'friends'}
********************
********************
33
{'id': 8, 'character': 'All', 'type': 'cha

In [45]:
print("Finished")

Finished


In [14]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}, 'friends': {'vector_count': 327}},
 'total_vector_count': 332}

In [17]:
query_text = "Extremely allergic, okay? If I'm anywhere near a dog for more than 5 minutes, my throat will just close up!"
query_vector = embed.embed_documents([query_text])[0]

query_results = index.query(
    vector=query_vector,
    filter={
        "source": {"$eq": "friends"},
        "type": {"$eq": "character"}
    },
    top_k=3,
    include_metadata=True,
    namespace="friends"
)


In [16]:
print(query_results)

{'matches': [{'id': 'd81d9c9c-4bb4-47e1-9a4c-ddb319ead056',
              'metadata': {'character': 'Joey',
                           'chunk': 14.0,
                           'id': 4.0,
                           'source': 'friends',
                           'text': 'Young?  Would you let it go Ross. It was '
                                   "just a dream. It doesn't mean...  All "
                                   'right, relax, relax. Just relax, just '
                                   'relax. Be cool, be cool.  Hey, hey. How '
                                   'was the first day?  Oh... Yeah, you do.  '
                                   'So, tell me. Was it like you and Chandler, '
                                   'and then you and me, or you and me and '
                                   'Chandler?  What?  Listen, the next time '
                                   'you talk to him, can you ask him which one '
                                   'the strongest Power Ran