# Install Required Libraries
Install the necessary libraries, including LangChain, Pinecone, and any other dependencies.

In [1]:
# # Install LangChain
# !pip install langchain

# # Install Pinecone
# !pip install pinecone-client

# # Install other dependencies
# !pip install openai
# !pip install numpy
# !pip install pandas
# !pip install scikit-learn
# !pip install pdfminer.six
!pip3 install --upgrade jupyter ipywidgets



# Import Libraries
Import the necessary libraries, including LangChain, Pinecone, and any other dependencies.

In [1]:
# Import Libraries

# Import LangChain
# from langchain import 

# Import Pinecone
# from pinecone import Pinecone, ServerlessSpec

# Import OpenAI
import openai

# Import NumPy
import numpy as np

# Import Pandas
import pandas as pd

# Import scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Import pdfminer for PDF processing
from pdfminer.high_level import extract_text

# Import dotenv
from dotenv import load_dotenv

# Import os
import os


# Load environment variables

In [54]:
load_dotenv()

True

# Setup Pinecone
Setup Pinecone by creating an account, getting the API key, and initializing the Pinecone client.

In [65]:
import time
from getpass import getpass
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
# Setup Pinecone

# Initialize Pinecone
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") or getpass("Enter Pinecone API Key: ")
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create a Pinecone index
index_name = "research-paper-index"

if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)
    
pc.create_index(
    name=index_name, 
    dimension=1024,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region='us-east-1'
    )
)
# wait for index to be initialized
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)


# Connect to the Pinecone index
index = pc.Index(index_name)

# Verify the index connection
print(f"Connected to Pinecone index: {index_name}")
print(f"Index summary:\n {index.describe_index_stats()}")

Connected to Pinecone index: research-paper-index
Index summary:
 {'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}


# Load and Preprocess Research Paper
Load the research paper from a file, preprocess the text (e.g., tokenization, cleaning), and prepare it for indexing.

In [95]:
# Load and Preprocess Research Paper
# Import nltk
import nltk
from nltk.tokenize import PunktSentenceTokenizer
nltk.download('punkt')
from pdfminer.high_level import extract_pages


# Function to load and extract text from a PDF file
def load_pdf(file_path):
    return extract_text(file_path)

# Function to preprocess the text (tokenization, cleaning)
def preprocess_text(text):
    
    tokenizer = PunktSentenceTokenizer(text.strip())  # Initialize the tokenizer
    text = text.replace('\n', ' ')  # Remove newlines
    sentences = tokenizer.tokenize(text.strip())  # Tokenize the text into sentences
    return sentences

# Load the research paper from a file
file_path = "/Users/samiulmushfik/Downloads/FAST--Fast-Architecture-Sensit-542611fc-2992-4086-a9db-0d34117f512c.pdf"
raw_text = load_pdf(file_path)

# Preprocess the text
preprocessed_text = preprocess_text(raw_text)

# Display the first few preprocessed sentences
# print(preprocessed_text[:5])
for page_layout in extract_pages(file_path):
    for element in page_layout:
        print(element)



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/samiulmushfik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<LTTextBoxHorizontal(0) 109.080,683.088,500.589,720.944 'FAST: Fast Architecture Sensitive Tree Search\non Modern CPUs and GPUs\n'>
<LTTextBoxHorizontal(1) 86.400,631.285,540.204,655.885 'Changkyu Kim†, Jatin Chhugani†, Nadathur Satish†, Eric Sedlar!, Anthony D. Nguyen†,\nTim Kaldewey!, Victor W. Lee†, Scott A. Brandt!, and Pradeep Dubey†\n'>
<LTTextBoxHorizontal(2) 257.880,611.499,368.893,621.461 'changkyu.kim@intel.com\n'>
<LTTextBoxHorizontal(3) 78.960,579.458,206.685,600.834 '†Throughput Computing Lab,\nIntel Corporation\n'>
<LTTextBoxHorizontal(4) 259.200,579.818,367.568,600.970 '!Special Projects Group,\nOracle Corporation\n'>
<LTTextBoxHorizontal(5) 430.800,578.858,536.946,599.261 '!University of California,\nSanta Cruz\n'>
<LTTextBoxHorizontal(6) 53.760,477.039,296.956,565.489 'ABSTRACT\nIn-memory tree structured index search is a fundamental database\noperation. Modern processors provide tremendous computing power\nby integrating multiple cores, each with wide vector units. Th

In [23]:
from operator import itemgetter


def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles


def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.
    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag 
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag


def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                block_string = size_tag[s['size']] + s['text']
                            else:
                                if s['size'] == previous_s['size']:

                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_string = size_tag[s['size']] + s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_string = size_tag[s['size']] + s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    header_para.append(block_string)
                                    block_string = size_tag[s['size']] + s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    # block_string += "|"

                header_para.append(block_string)

    return header_para

In [24]:
import pymupdf
from pymupdf import Page

doc = pymupdf.open("/Users/samiulmushfik/Downloads/FAST--Fast-Architecture-Sensit-542611fc-2992-4086-a9db-0d34117f512c.pdf")
print(doc.metadata)

# all_paragraphs = []
    
# for page_num in range(len(doc)):
#     page = doc[page_num]
#     blocks = page.get_text("dict")["blocks"]
    
#     for block in blocks:
#         # Extract text from block
#         block_text = ""
#         if "lines" in block:
#             for line in block["lines"]:
#                 for span in line["spans"]:
#                     block_text += span["text"]
#                 block_text += "\n"  # New line after each line within a block
            
#         # Clean and add block text as a paragraph
#         paragraph = block_text.strip()
#         if paragraph:  # Avoid empty blocks
#             all_paragraphs.append({
#                 "page": page_num + 1,
#                 "paragraph": paragraph
#             })

# for para in all_paragraphs:
#     print(f"Page {para['page']}:\n{para['paragraph']}\n")

font_counts, styles = fonts(doc)
size_tag = font_tags(font_counts, styles)
headers_para(doc, size_tag)
        


{'format': 'PDF 1.3', 'title': 'FAST: fast architecture sensitive tree search on modern CPUs and GPUs', 'author': 'Changkyu Kim, Jatin Chhugani, Nadathur Satish, Eric Sedlar, Anthony D. Nguyen, Tim Kaldewey, Victor W. Lee, Scott A. Brandt, Pradeep Dubey', 'subject': '', 'keywords': 'compression, cpu, data-level parallelism, gpu, thread-level parallelism, tree search', 'creator': '', 'producer': 'Mac OS X 10.5.8 Quartz PDFContext', 'creationDate': "D:20100617231448Z00'00'", 'modDate': "D:20100617231448Z00'00'", 'trapped': '', 'encryption': None}


['<h2>FAST: Fast Architecture Sensitive Tree Search',
 '<h2>on Modern CPUs and GPUs',
 '',
 '<h5>Changkyu Kim',
 '<s13>†',
 '<h5>, Jatin Chhugani',
 '<s13>†',
 '<h5>, Nadathur Satish',
 '<s13>†',
 '<h5>, Eric Sedlar',
 '<s13>⋆',
 '<h5>, Anthony D. Nguyen',
 '<s13>†',
 '<h5>,',
 '<h5>Tim Kaldewey',
 '<s13>⋆',
 '<h5>, Victor W. Lee',
 '<s13>†',
 '<h5>, Scott A. Brandt',
 '<s13>⋄',
 '<h5>, and Pradeep Dubey',
 '<s13>†',
 '',
 '<h7>changkyu.kim@intel.com',
 '',
 '<s13>†',
 '<h7>Throughput Computing Lab,',
 '<h7>Intel Corporation',
 '',
 '<s13>⋆',
 '<h7>Special Projects Group,',
 '<h7>Oracle Corporation',
 '',
 '<s13>⋄',
 '<h7>University of California,',
 '<h7>Santa Cruz',
 '',
 '<h5>ABSTRACT',
 '',
 '<p>In-memory tree structured index search is a fundamental database operation. Modern processors provide tremendous computing power by integrating multiple cores, each with wide vector units. There has been much work to exploit modern processor architectures for database primitives like scan, 

In [75]:
len(preprocessed_text)

40

# Indexing

In [76]:
from tqdm.auto import tqdm
import uuid

batch_size = 10

for i in tqdm(range(0, len(preprocessed_text), batch_size)):
    batch = preprocessed_text[i:i+batch_size]
    print(batch)
    embeddings = pc.inference.embed(
        model='multilingual-e5-large',
        inputs=batch,
        parameters={"input_type": "passage", "truncate": "END"}
    )
    ids = [str(uuid.uuid4()) for _ in range(i, i+len(batch))]
    records = []
    for id, b, e in zip(ids, batch, embeddings):
        records.append({
            "id": id,
            "values": e['values'],
            'metadata': {'text': b}
        })
    print(records)
    # index.upsert(vectors=records)

  0%|          | 0/1 [00:00<?, ?it/s]

['Samiul Mushfik samiulmushfik123@gmail.com  Personal Statement for Ph.D. in Computer Science  It is fascinating how much we interact with different software daily!', 'With the growing impact of software in our everyday lives, there has been a genuine concern regarding its performance, scalability, reliability, and delivery; existing techniques need to be much more efficient, while new ones need to be introduced to meet the increasing complexity of software systems.', 'Software works as a primary means of delivering stakeholder value by integrating systems.', 'Future systems will require expertise within and beyond the separate systems and software engineering disciplines.', 'Both domains are intimately intertwined in providing the whole picture.', 'Along with the software, systems need much attention to meet the growing needs for a seamless digital experience.', 'This realization motivated me to pursue a Ph.D. in Systems and Software Engineering-related topics.', 'More specifically, I

In [71]:
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)
index.describe_index_stats()
# pc.describe_index(index_name)

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 40}},
 'total_vector_count': 40}

# Create Vector Store

In [72]:
# from langchain_pinecone import PineconeVectorStore

# vectorestore = PineconeVectorStore(index)



# Create LangChain Model
Create a LangChain model for processing and understanding the research paper.

In [None]:
# Create LangChain Model

# Define a function to create a LangChain model
def create_langchain_model(preprocessed_text):
    # Initialize the LangChain model
    langchain_model = LangChain()

    # Add the preprocessed text to the LangChain model
    for sentence in preprocessed_text:
        langchain_model.add_text(sentence)

    return langchain_model

# Create the LangChain model using the preprocessed text
langchain_model = create_langchain_model(preprocessed_text)

# Display the LangChain model summary
print(langchain_model.summary())

# Query the Research Paper
Use the LangChain model to query the indexed research paper and retrieve relevant information based on user questions.

In [97]:
# Query the Research Paper

# Function to query the Pinecone index and retrieve relevant information
def query_paper(question, index):
    # Generate embedding for the question using OpenAI's GPT-3
    question_embedding = pc.inference.embed(
        model="multilingual-e5-large",
        inputs=[question],
        parameters={
            "input_type": "query"
        }
    )       
    
    # Query the Pinecone index with the question embedding
    query_response = index.query(vector=question_embedding[0].values, top_k=5, include_metadata=True)
    
    # # Extract the relevant sentences from the query response
    # relevant_sentences = [match['metadata']['text'] for match in query_response['matches']]
    
    # # Use the LangChain model to generate a response based on the relevant sentences
    # response = langchain_model.generate_response(question, relevant_sentences)
    
    return query_response

# Example usage
question = "What did the author study?"
response = query_paper(question, index)

# Display the response
print("Question:", question)
print("Response:", response)

Question: What did the author study?
Response: {'matches': [{'id': '4f0ecfc4-9faa-440d-984c-573788d42f61',
              'metadata': {'text': 'learned about Sockets,  I  My interest in '
                                   'software engineering led me to choose this '
                                   'field as my undergraduate thesis area.'},
              'score': 0.79990995,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '2ae9978b-c19b-4563-9843-186f4f785d3a',
              'metadata': {'text': 'This realization motivated me to pursue a '
                                   'Ph.D. in Systems and Software '
                                   'Engineering-related topics.'},
              'score': 0.7996379,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': 'c1094bce-acd4-4083-a3bd-0e6f9d29577a',
              'metadata': {'text': 'While working on this, I explore