### Imports

In [1]:
import os
import json
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore

### Loading dataset

In [2]:
import random
from datasets import load_dataset
# Load the English Wikipedia dump, configuration "20231101.en".
dataset = load_dataset(path="wikimedia/wikipedia", name="20231101.en")

In [3]:
# Keep only the "train" split of the loaded dataset.
train_dataset = dataset["train"]

In [4]:
# Randomly select 10k articles. A dedicated, seeded generator makes the sample
# reproducible across kernel restarts without touching the global `random` state.
SAMPLE_SIZE = 10000
SAMPLE_SEED = 42

num_rows = len(train_dataset)
random_indices = random.Random(SAMPLE_SEED).sample(range(num_rows), SAMPLE_SIZE)
random_rows = [train_dataset[idx] for idx in random_indices]

In [5]:
import pandas as pd
# Turn the sampled rows into a table and persist it (without the index column)
# so the sample can be read back from disk by the CSV loader.
df = pd.DataFrame(data=random_rows)
df.to_csv(path_or_buf="wiki10k.csv", index=False)

### Load the dataset using LangChain

In [2]:
import sys
import csv

# Wikipedia articles are far longer than the csv module's default field limit
# (131072 characters), so raise the limit as high as the platform allows.
# `sys.maxsize` does not fit in a C long on platforms where long is 32 bits
# (e.g. Windows) and raises OverflowError there, so fall back to 2**31 - 1.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2**31 - 1)

131072

In [3]:
# Each CSV row becomes one Document; `id`, `url` and `title` are stored as
# metadata instead of being part of the page content.
# The file was written by pandas (UTF-8 by default), so read it as UTF-8
# explicitly rather than relying on the platform's default encoding.
loader = CSVLoader(
    file_path='./wiki10k.csv',
    metadata_columns=['id', 'url', 'title'],
    encoding='utf-8',
)
langchain_docs = loader.load()

### Chunking

In [4]:
# Recursive splitter measured in characters (`len`): 512-character chunk size
# with a 20-character overlap; separators are treated as literal strings.
splitter_settings = {
    "chunk_size": 512,
    "chunk_overlap": 20,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

In [5]:
# Split every loaded article into chunks; each chunk is its own Document.
docs = text_splitter.split_documents(langchain_docs)

In [6]:
# Number of chunks produced by the splitter.
len(docs)

91455

In [10]:
# Inspect the first chunk: its page content and the metadata carried over from the CSV row.
docs[0]

Document(page_content='text: Daniel Remshart Thomas (August 27, 1843 – April 7, 1915) was an American businessman and city councillor based in Savannah, Georgia. He was an original member of the Sinking Fund Commission, which was established by the City of Savannah in 1878 aimed at retiring general bond issues. He served on the city council for almost fourteen years.\n\nLife and career\nThomas was born on August 27, 1843, to John T. Thomas and Jane Ann Remshart. As a child, he had "a delicate constitution and imperfect sight".', metadata={'source': './wiki10k.csv', 'row': 0, 'id': '70400187', 'url': 'https://en.wikipedia.org/wiki/Daniel%20Remshart%20Thomas', 'title': 'Daniel Remshart Thomas'})

### Embedding 

In [11]:
# Pin the embedding model instead of relying on the library default, so the
# vector dimension (768 for this model) stays in sync with the Pinecone index.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [10]:
# Check the embedding dimension before creating the index in Pinecone.
# `embed_documents` expects a list of texts; a bare string is iterated and
# embedded one character at a time, so wrap the chunk in a list and report the
# length of its vector instead of printing the vector itself.
len(embeddings.embed_documents([docs[0].page_content])[0])

### Insert data into Pinecone

In [13]:
import os
from getpass import getpass

# Never paste the key into the notebook. Reuse PINECONE_API_KEY when it is
# already set in the environment (the old assignment overwrote it with a
# placeholder); otherwise prompt for it without echoing the value.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("Pinecone API key: ")

In [18]:
index_name = "wiki10k"

# Embed every chunk and write the vectors to the Pinecone index named above.
docsearch = PineconeVectorStore.from_documents(
    documents=docs,
    embedding=embeddings,
    index_name=index_name,
)

### Retrieval

In [22]:
# Connect to the already-populated index rather than re-inserting the chunks.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

In [23]:
# Wrap the vector store in a retriever using its default search settings.
retriever = docsearch.as_retriever()

In [33]:
# Look up the title of one indexed chunk to base a test question on.
docs[15].metadata["title"]

'Tim Kelly (Alaska politician)'

In [34]:
# Natural-language question to run against the retriever.
query = "Who is Tim Kelly?"

In [35]:
# `get_relevant_documents` is deprecated in LangChain; `invoke` is the supported
# entry point and returns the same list of matching Documents.
matched_docs = retriever.invoke(query)

In [36]:
# Show the Documents returned by the default retriever.
matched_docs

[Document(page_content='text: Timothy Donahue Kelly (August 15, 1944 – August 17, 2009) was an American businessman and politician.\n\nBorn in Sacramento, California, Kelly graduated from Sacramento High School in 1962. He served in the United States Marine Corps and later in the Alaska Air National Guard. He was a legislative aide in California and Nevada. In 1970, he moved to Alaska and settled in Anchorage, Alaska. He was in the banking business.', metadata={'id': '53768941', 'row': 2.0, 'source': './wiki10k.csv', 'title': 'Tim Kelly (Alaska politician)', 'url': 'https://en.wikipedia.org/wiki/Tim%20Kelly%20%28Alaska%20politician%29'}),
 Document(page_content='text: Keith Gerard Taylor (born December 21, 1964) is a former American football safety in the National Football League (NFL) for the Indianapolis Colts, the New Orleans Saints, and the Washington Redskins.  He played college football at the University of Illinois and was drafted in the fifth round of the 1988 NFL Draft.  Taylo

### Using Maximal Marginal Relevance (MMR)

In [37]:
# Rebuild the retriever with the "mmr" search type; note this rebinds `retriever`.
retriever = docsearch.as_retriever(search_type="mmr")

In [38]:
# `get_relevant_documents` is deprecated in LangChain; `invoke` is the supported
# entry point and returns the same list of matching Documents.
matched_docs = retriever.invoke(query)

In [39]:
# Show the Documents returned by the MMR retriever.
matched_docs

[Document(page_content='text: Timothy Donahue Kelly (August 15, 1944 – August 17, 2009) was an American businessman and politician.\n\nBorn in Sacramento, California, Kelly graduated from Sacramento High School in 1962. He served in the United States Marine Corps and later in the Alaska Air National Guard. He was a legislative aide in California and Nevada. In 1970, he moved to Alaska and settled in Anchorage, Alaska. He was in the banking business.', metadata={'id': '53768941', 'row': 2.0, 'source': './wiki10k.csv', 'title': 'Tim Kelly (Alaska politician)', 'url': 'https://en.wikipedia.org/wiki/Tim%20Kelly%20%28Alaska%20politician%29'}),
 Document(page_content="Thune worked on a pilot in 2016 with his creative partner Kevin Parker Flynn, Holy Sh*t, a workplace comedy about a small church avoiding a mega-church takeover. The pilot was produced by Mila Kunis' production company.\n\nThune had a guest starring role on HBO's Love Life as Magnus, one of the relationships opposite Anna Kendr

### Query using the native Pinecone Python client

In [40]:
import os

from pinecone import Pinecone

# Read the key from the environment instead of hardcoding it in the notebook;
# a missing PINECONE_API_KEY fails loudly with a KeyError.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("wiki10k")

In [41]:
# Question to embed and send to the index through the native client.
query = "Who is Tim Kelly?"

In [42]:
# Embed the question with the same `embeddings` object used for the chunks.
vector = embeddings.embed_query(query)

In [45]:
# Dimension of the query vector.
len(vector)

768

In [54]:
# Ask the index for the single closest match to the query vector and have it
# return the stored metadata along with the match.
nearest_match_request = {
    "vector": vector,
    "top_k": 1,
    "include_metadata": True,
}
result = index.query(**nearest_match_request)

In [62]:
# Title stored in the metadata of the top match.
result["matches"][0]["metadata"]["title"]

'Tim Kelly (Alaska politician)'

In [61]:
# Chunk text of the top match, read from the "text" metadata key.
result["matches"][0]["metadata"]["text"]

'text: Timothy Donahue Kelly (August 15, 1944 – August 17, 2009) was an American businessman and politician.\n\nBorn in Sacramento, California, Kelly graduated from Sacramento High School in 1962. He served in the United States Marine Corps and later in the Alaska Air National Guard. He was a legislative aide in California and Nevada. In 1970, he moved to Alaska and settled in Anchorage, Alaska. He was in the banking business.'

### Filter using metadata

In [66]:
# Only consider vectors whose `title` metadata equals the given article title,
# then return the closest of those together with its metadata.
title_filter = {"title": {"$eq": "Tim Kelly (Alaska politician)"}}
result = index.query(
    vector=vector,
    top_k=1,
    include_metadata=True,
    filter=title_filter,
)