In [21]:
import os
import time
from pinecone import Pinecone, ServerlessSpec
import cohere
from sentence_transformers import SentenceTransformer
from IPython.display import display, Markdown

# Load API keys from local text files (kept out of the notebook itself).
with open("pinecone_api_key.txt") as f:
    PINECONE_API_KEY = f.read().strip()

with open("cohere_api_key.txt") as f:
    COHERE_API_KEY = f.read().strip()

# Load the models
cohere_client = cohere.Client(COHERE_API_KEY)
sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2')
# The index below is created with this dimension, so embeddings stored in it
# must have the same width.
dimension = sentence_transformer_model.get_sentence_embedding_dimension()

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "terms-index"
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=dimension,
        # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
        # model works well with!
        metric='cosine',
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    # Index creation is asynchronous: poll until the index reports ready so
    # that later cells do not upsert into an index that is still initialising.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)




In [19]:

# Helper functions
def load_data(directory: str):
    """Load every Markdown (``.md``) file under ``directory``, recursively.

    Sub-directories and file names are visited in sorted order so repeated
    runs return the documents in the same order. This matters because the
    position of a document determines the ids of the vectors built from it
    later on.

    Args:
        directory: Root folder to walk.

    Returns:
        A ``(documents, company_names)`` tuple of equal-length lists: the text
        of each ``.md`` file and the base name of the folder that contains it.
    """
    documents = []
    company_names = []
    for foldername, dirnames, filenames in os.walk(directory):
        # Sorting in place makes os.walk descend into sub-folders in a stable order.
        dirnames.sort()
        company_name = os.path.basename(foldername)
        for filename in sorted(filenames):
            if not filename.endswith(".md"):
                continue
            filepath = os.path.join(foldername, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                documents.append(f.read())
            company_names.append(company_name)  # One company name per document
    return documents, company_names

def chunk_data(documents: list, company_names: list, chunk_size: int):
    """Split documents into fixed-size character chunks tagged with the company.

    Each chunk is ``"Company: <name>\\n"`` followed by up to ``chunk_size``
    characters of the document; the prefix is added on top of ``chunk_size``.
    An empty document yields no chunks.

    Args:
        documents: Document texts.
        company_names: Company name for each document, parallel to ``documents``.
        chunk_size: Maximum number of document characters per chunk.

    Returns:
        A flat list of chunk strings, in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if the two input
            lists differ in length (``zip`` would otherwise silently drop the
            unmatched documents).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    if len(documents) != len(company_names):
        raise ValueError(
            f"documents and company_names must have the same length "
            f"({len(documents)} != {len(company_names)})"
        )

    chunks = []
    for doc, company_name in zip(documents, company_names):
        header = f"Company: {company_name}\n"
        chunks.extend(
            header + doc[start:start + chunk_size]
            for start in range(0, len(doc), chunk_size)
        )
    return chunks

def embed_text(model_type: str, model_name: str, texts: list):
    """Embed texts using either Cohere or SentenceTransformer.

    Args:
        model_type: ``"Cohere"`` or ``"SentenceTransformer"``.
        model_name: Model passed to the Cohere embed call. It is not used in
            the SentenceTransformer branch, which always encodes with the
            module-level ``sentence_transformer_model``.
        texts: Strings to embed.

    Returns:
        A list with one embedding per input text.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values
            (previously this case silently returned ``None``).
    """
    if model_type == "Cohere":
        embeddings = []
        for i in range(0, len(texts), 10):  # Batching to avoid hitting rate limits
            batch = texts[i:i+10]
            response = cohere_client.embed(texts=batch, model=model_name)
            embeddings.extend(response.embeddings)
            time.sleep(1)  # Delay to avoid rate limits
        return embeddings
    if model_type == "SentenceTransformer":
        return sentence_transformer_model.encode(texts, convert_to_tensor=True).tolist()
    raise ValueError(
        f"Unsupported model_type {model_type!r}; expected 'Cohere' or 'SentenceTransformer'"
    )

def upsert_index(index, embeddings, metadata, batch_size: int = 100, delay: float = 0.1):
    """Insert embeddings into Pinecone with metadata, in batches.

    Vectors are sent ``batch_size`` at a time instead of one request (and one
    sleep) per vector, which keeps large collections from taking hours.

    Args:
        index: Object exposing ``upsert(vectors=...)``.
        embeddings: Embedding vectors, one per item.
        metadata: Text stored under the ``"text"`` metadata key, parallel to
            ``embeddings``. Pairs are formed with ``zip``, so extra items in
            the longer input are ignored.
        batch_size: Number of vectors per upsert request.
        delay: Seconds to pause after each full batch to avoid rate limits;
            values ``<= 0`` disable the pause.

    Returns:
        The number of vectors sent to the index.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    sent = 0
    pending = []
    for idx, (emb, md) in enumerate(zip(embeddings, metadata)):
        # The id is the item's position, matching the original id scheme.
        pending.append({"id": str(idx), "values": emb, "metadata": {"text": md}})
        if len(pending) < batch_size:
            continue
        index.upsert(vectors=pending)
        sent += len(pending)
        pending = []  # Fresh list: the one just sent is never mutated afterwards
        if delay > 0:
            time.sleep(delay)

    # Flush the final, partially filled batch.
    if pending:
        index.upsert(vectors=pending)
        sent += len(pending)
    return sent

def query_index(index, query_embedding, top_k=5):
    """Look up the ``top_k`` closest matches for ``query_embedding``.

    The query is forwarded to ``index.query`` with metadata included, and the
    response is returned unchanged.
    """
    search_args = {
        "vector": query_embedding,
        "top_k": top_k,
        "include_metadata": True,
    }
    response = index.query(**search_args)
    return response

def generate_answer_with_context(query: str, context: str, model_type: str, model_name: str):
    """Answer ``query`` with a generative model, placing ``context`` in the prompt.

    ``context`` may be an empty string. Only ``model_type == "Cohere"`` is
    implemented; any other value returns a fixed placeholder message.
    """
    if model_type != "Cohere":
        # Add your sentence transformer or other model-based generation here if needed
        return "Direct answer functionality with this model is not yet implemented."

    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    completion = cohere_client.generate(model=model_name, prompt=prompt)
    first_generation = completion.generations[0]
    return first_generation.text.strip()

# Load and Index Data
directory = "only_english_data"
chunk_size = 300
documents, company_names = load_data(directory)
chunks = chunk_data(documents, company_names, chunk_size)
print(f"Number of documents: {len(documents)}")
print(f"Number of chunks: {len(chunks)}")

# Embed text
# Change this to "Cohere" to embed with the Cohere API instead. The Pinecone
# index is created with the SentenceTransformer dimension, so switching models
# requires an index whose dimension matches the new embeddings.
embedding_model_type = "SentenceTransformer"
# Cohere embedding model; embed_text ignores it for "SentenceTransformer".
embedding_model_name = "embed-english-light-v2.0"

embeddings = embed_text(embedding_model_type, embedding_model_name, chunks)

print(f"Dimension of embeddings: {len(embeddings[0])}")

# Fail here, before the slow upsert step, if the embeddings cannot fit the index.
assert len(embeddings[0]) == dimension, (
    f"Embedding dimension {len(embeddings[0])} does not match index dimension {dimension}"
)

# # Create or connect to Pinecone index
# if index_name not in pinecone.list_indexes():
#     pinecone.create_index(index_name, dimension=dimension)




Number of documents: 286
Number of chunks: 46534
Dimension of embeddings: 384


In [20]:
index = pc.Index(index_name)
print(f"Upserting {len(embeddings)} vectors into '{index_name}'...")
upsert_index(index, embeddings, chunks)
# One vector is upserted per embedding, so the embedding count is the number
# of upserted vectors (the previous `index_upserted` name was never defined).
print(f"Number of upserted vectors: {len(embeddings)}")

# Ask a Question
query = "What is the refund policy in AliExpress?"  # Example question
query_embedding = embed_text(embedding_model_type, embedding_model_name, [query])[0]

# Query Pinecone index
results = query_index(index, query_embedding)

# Fetch top-k results (context): join the stored chunk text of every match
context = "\n".join([result["metadata"]["text"] for result in results["matches"]])

# Generate answers: once with the retrieved context (RAG) and once with an
# empty context as a baseline for comparison
rag_answer = generate_answer_with_context(query, context, "Cohere", "command-r-plus")
direct_answer = generate_answer_with_context(query, "", "Cohere", "command-r-plus")

# Display answers
display(Markdown("### RAG Answer:"))
display(Markdown(rag_answer))

display(Markdown("### Direct Answer (No Context):"))
display(Markdown(direct_answer))

here


KeyboardInterrupt: 

In [15]:
# Preview only the first few chunks; displaying the whole list floods the output.
chunks[:5]

["This database has been created by the European Commission. Copyright European Union, 2024.\n \nThe Commission's reuse policy is implemented by Commission Decision 2011/833/EU of 12 December 2011 on the reuse of Commission documents (OJ L 330, 14.12.2011, p. 39 - https://eur-lex.europa.eu/eli/dec/2011/833/oj).\n \nUnless otherwise noted, the reuse of this database is authorised under the Creative Commons Attribution 4.0 International (CC BY 4.0) licence (https://creativecommons.org/licenses/by/4.0/).",
 ' This means that reuse is allowed, provided that appropriate credit is given and any changes are indicated.\n\nThe Commission cooperated with the [Open Terms Archive](https://opentermsarchive.org/) for the creation of part of this database. \n',
 '### Local Services Additional Terms for Providers  \n(EMEA)\n\n1. Applicable Terms. Local Services by Google is a platform provided by Google Ireland Limited (registered number: 368047) with its registered office located at Gordon House, Bar