In [None]:
import pandas as pd
from langchain.prompts import ChatPromptTemplate
from PyPDF2 import PdfReader
from IPython.display import Markdown
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.chains import RetrievalQA
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import time
from langchain.llms import Bedrock
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFDirectoryLoader
import json
import boto3

In [None]:
def get_pdf_text(pdf_doc):
    """Extract the text of every page of a PDF.

    Args:
        pdf_doc: The PDF source handed straight to ``PdfReader`` (the caller
            in this notebook passes a file path).

    Returns:
        A list of ``(page_number, text)`` tuples in page order, where
        ``page_number`` starts at 1.
    """
    reader = PdfReader(pdf_doc)
    # start=1 yields human-style page numbers rather than 0-based positions.
    return [
        (number, page.extract_text())
        for number, page in enumerate(reader.pages, start=1)
    ]

In [None]:
def split_text_pdf(data, chunk_size, chunk_overlap):
    """Split a single string into chunks with a recursive character splitter.

    Args:
        data: The text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``; lengths are
            measured with ``len``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``create_documents`` for the one-element list
        ``[data]``; callers in this notebook read ``.page_content`` from each
        returned item.
    """
    # Separators are tried in this order: blank line, newline, then period.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": ["\n\n", "\n", "."],
        "length_function": len,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.create_documents([data])

In [None]:
def process_pd_documents(path):
    """Build a parent/child chunk table from every PDF in a directory.

    Each page of each PDF is split into parent chunks (size 1500, overlap
    300), and every parent chunk is split again into child chunks (size 200,
    overlap 100). One row is produced per child chunk.

    Args:
        path: Directory to scan. Only its direct entries are considered, and
            only those whose extension is ``.pdf`` (any letter case).

    Returns:
        A ``pandas.DataFrame`` with the columns ``title`` (file name without
        extension), ``page_number`` (1-based), ``text_parent`` and
        ``text_child``. The columns are present even when no rows were
        produced, so downstream column access does not fail on an empty
        directory.
    """
    columns = ['title', 'page_number', 'text_parent', 'text_child']
    # Rows are collected locally so that calling the function again starts
    # from scratch instead of appending to the previous run's results.
    records = []

    # os.listdir order is arbitrary; sorting keeps row order (and everything
    # derived from it, such as document labels) stable between runs.
    for file in sorted(os.listdir(path)):
        title, extension = os.path.splitext(file)
        if extension.lower() != ".pdf":
            continue

        try:
            pages = get_pdf_text(os.path.join(path, file))
        except Exception as error:
            # Best effort: one unreadable PDF must not abort the whole run,
            # but the skip is reported instead of passing silently.
            print(f"Skipping {file!r}: could not read PDF ({error})")
            continue

        for page_number, page_text in pages:
            # Periods are stripped before splitting, so the splitter's "."
            # separator never matches here.
            # NOTE(review): confirm that dropping periods is intended.
            # `or ""` guards against a page for which no text was extracted.
            page_text = (page_text or "").replace(".", "")
            for parent_chunk in split_text_pdf(page_text, 1500, 300):
                parent_text = parent_chunk.page_content
                for child_chunk in split_text_pdf(parent_text, 200, 100):
                    records.append({
                        'title': title,
                        'page_number': page_number,
                        'text_parent': parent_text,
                        'text_child': child_chunk.page_content
                    })

    return pd.DataFrame(records, columns=columns)

In [None]:
# Directory containing the source PDFs, given relative to the working directory.
path = '../aerocivil_docs/'
# One row per child chunk, carrying its parent chunk, page number and document title.
df = process_pd_documents(path)

In [None]:
def clean_text(df):
    """Normalize the string cells of a DataFrame.

    The steps run in a fixed order on every string cell:

    1. Missing values become ``'.'`` (which step 3 then removes, leaving an
       empty string).
    2. Newlines, ``# - @ / ` < > { } !`` and double underscores become a
       space.
    3. Every remaining character that is neither a word character nor
       whitespace is removed.
    4. The substring ``pdf`` is removed wherever it occurs.
    5. Runs of spaces collapse to a single space. This runs last so that
       spaces introduced or exposed by the earlier steps are collapsed too.

    Args:
        df: The DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame with the cleaned values.
    """
    cleaning_steps = (
        # Raw strings keep regex escapes such as \w intact instead of relying
        # on Python leaving unknown string escapes alone.
        (r'[\n#\-@/`<>{}!]|__', ' '),
        (r'[^\w\s]', ''),
        (r'pdf', ''),
        (r' +', ' '),
    )

    cleaned = df.fillna('.')
    for pattern, replacement in cleaning_steps:
        cleaned = cleaned.replace(pattern, replacement, regex=True)
    return cleaned

In [None]:
# Rebinds df to the cleaned copy; the uncleaned frame is no longer reachable under this name.
df = clean_text(df)

In [None]:
# Give every distinct title a short label: doc1, doc2, ... in the order unique() returns them.
title_to_doc = {
    title: f'doc{position}'
    for position, title in enumerate(df['title'].unique(), start=1)
}

In [None]:
# Attach each row's short document label by looking its title up in title_to_doc.
df['doc'] = df['title'].map(title_to_doc)

In [None]:
# 'id' mirrors the document title; it is written into the vector metadata in the upsert loop.
df['id'] = df['title']
# Same constant for every row.
# NOTE(review): the meaning of '1102' is not evident from this notebook — confirm.
df['doi'] = '1102'
# Copy of the row index; combined with 'doc' to form the vector id in the upsert loop.
df['chunk-id'] = df.index

In [None]:
import boto3
from langchain.embeddings import BedrockEmbeddings

# Bedrock runtime client pinned to us-east-1.
bedrock_client = boto3.client(service_name='bedrock-runtime', 
                              region_name='us-east-1')
# Embedding wrapper around the Titan text model; the upsert loop calls embed_documents on it.
# NOTE(review): the Pinecone index is created with dimension=1536 — presumably this
# model's output size; confirm before changing model_id.
embeddings_bedrock = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1",
                                       client=bedrock_client)

In [None]:
# Read the OpenAI key from the environment; .get() yields None when the variable is unset.
# NOTE(review): load_dotenv() is only called in a later cell, so a key that lives
# solely in a .env file would not be visible yet at this point — confirm.
openai_api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
from langchain_community.embeddings import OpenAIEmbeddings
# OpenAI embedding client built with the key read from the environment.
# NOTE(review): the upsert loop embeds with embeddings_bedrock, not this object —
# confirm it is still needed.
openai_embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [None]:
import pinecone
from langchain_pinecone import PineconeVectorStore

In [None]:
from pinecone import Pinecone, ServerlessSpec

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) before reading the key.
load_dotenv()

# Stop here with a clear message when the key is missing, rather than handing a
# placeholder string to Pinecone and failing later with an opaque auth error.
pinecone_key = os.environ.get("PINECONE_KEY")
if not pinecone_key:
    raise RuntimeError(
        "PINECONE_KEY is not set; export it or add it to a .env file before running this cell."
    )

In [None]:
# Pinecone client used by the following cells to create and open the index.
pc = Pinecone(api_key=pinecone_key)

In [None]:
# Only create the index when it does not exist yet, so the cell can be re-run
# (e.g. Restart & Run All) without failing on an "already exists" error.
if "aerocivildocs" not in pc.list_indexes().names():
    pc.create_index(
        name="aerocivildocs",
        # Must equal the length of the vectors upserted into this index.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-west-2"
        )
    )

In [None]:
# Handle to the "aerocivildocs" index; the upsert loop writes through it.
index = pc.Index("aerocivildocs")

In [None]:
from tqdm.auto import tqdm
batch_size = 250

# Embed the child chunks and upload them in batches of `batch_size` rows.
# The vector is computed from the (small) child chunk, while the metadata
# carries the (larger) parent chunk it was cut from.
for start in tqdm(range(0, len(df), batch_size)):
    # iloc slicing clamps at the end of the frame, so the last batch may be shorter.
    batch = df.iloc[start:start + batch_size]

    # Blank chunks carry no text to embed, so drop them before calling the model.
    batch = batch[batch['text_child'].str.strip() != '']
    if batch.empty:
        continue

    ids = [
        f"{doc}#{chunk_id}"
        for doc, chunk_id in zip(batch['doc'], batch['chunk-id'])
    ]
    # Named child_texts (not `texts`) so it cannot clobber any other
    # module-level name used while building the DataFrame.
    child_texts = batch['text_child'].tolist()
    embeds = embeddings_bedrock.embed_documents(child_texts)
    metadata = [
        {'text': parent_text,
         'title': title,
         # Plain int keeps the metadata value serializable regardless of the column dtype.
         'page_number': int(page_number),
         'id': doc_id}
        for parent_text, title, page_number, doc_id in zip(
            batch['text_parent'], batch['title'], batch['page_number'], batch['id']
        )
    ]
    # Materialize the tuples so the client receives a concrete list, not a one-shot iterator.
    index.upsert(vectors=list(zip(ids, embeds, metadata)), namespace='aero')