Install the requirements

In [None]:
!pip install pandas
!pip install --upgrade langchain openai -q
!pip install unstructured -q
!pip install unstructured[local-inference] -q
!pip install pinecone-client==2.2.4
!pip install openai==0.28
!pip install tiktoken -q
!pip install python-dotenv
!apt-get install poppler-utils


Import the libraries

In [None]:
import numpy as np
import csv
import pandas as pd
from pandas import Series, DataFrame
import os
import openai
import pinecone
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from dotenv import load_dotenv


Sample a small subset of the dataset for the hackathon

In [None]:
# Draw a small random sample (0.35% of the rows) from the full CSV and save it
# into the directory that the document loader reads from.
filename = '/content/ajiofashion.csv'
df = pd.read_csv(filename)
# Fixed seed so that "Restart & Run All" produces the same sample every time.
sampled_df = df.sample(frac=0.0035, random_state=42)
# to_csv does not create missing parent directories, so make sure the
# output directory exists before writing.
os.makedirs('/content/data', exist_ok=True)
sampled_df.to_csv('/content/data/meta.csv', index=False)

JSON data (unstructured) fetched through an API and then converted to CSV (structured) to embed into the vector database

In [None]:
# Optional alternative data source (currently disabled): fetch product JSON
# from the dummyjson API, keep a handful of columns, and write them to a CSV
# under /content/data. Uncomment the lines below to use it.
#import pandas as pd
#import requests
#response=requests.get("https://dummyjson.com/products?skip=0&limit=100")
#df=pd.DataFrame(response.json()['products'])[['title','description','price','brand','category','images']]
#df.to_csv('/content/data/product.csv')

Load the documents with the directory loader

In [None]:
# Folder whose files are handed to the document loader.
directory = "/content/data"

def load_docs(directory):
  """Build a DirectoryLoader for ``directory`` and return what its ``load()`` yields.

  Args:
    directory: Path passed straight to ``DirectoryLoader``.

  Returns:
    The value returned by ``DirectoryLoader(directory).load()``.
  """
  return DirectoryLoader(directory).load()

# Load everything from the data directory; the trailing expression displays
# how many documents came back.
documents = load_docs(directory=directory)
len(documents)

Split the documents into chunks

In [None]:
def split_docs(documents, chunk_size=1000, chunk_overlap=100):
  """Split ``documents`` into chunks with a RecursiveCharacterTextSplitter.

  Args:
    documents: Documents to split.
    chunk_size: Forwarded to the splitter as ``chunk_size`` (default 1000).
    chunk_overlap: Forwarded to the splitter as ``chunk_overlap`` (default 100).

  Returns:
    The value returned by the splitter's ``split_documents(documents)``.
  """
  splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
  return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

# Chunk the loaded documents using the default size/overlap and print the
# resulting number of chunks.
docs = split_docs(documents=documents)
print(len(docs))

Tokenize and embed the data chunks, then store the embeddings in the vector database

In [None]:
# Read settings from a .env file into the process environment before anything
# below looks them up.
load_dotenv()

# Embedding model used to vectorise each chunk.
embeddings = OpenAIEmbeddings()

# Initialise the Pinecone client from environment settings; a missing
# variable raises KeyError here.
pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["ENVIRONMENT"])

index_name = os.environ["INDEX_NAME"]

# NOTE(review): `text` is not used anywhere in this cell; kept because later
# cells may read it -- confirm and remove if it is dead.
text = [docs]

# Embed the chunks and write them to the named Pinecone index.
# NOTE(review): re-running this cell presumably inserts the same chunks again -- verify.
index = Pinecone.from_documents(docs, embeddings, index_name=index_name)