In [1]:
import json
import os

import typesense
from dotenv import load_dotenv
# Load settings from the .env file; override=True asks python-dotenv to let
# .env values replace variables already present in the process environment.
load_dotenv(override=True)

# Index os.environ directly (instead of os.getenv) so that a missing variable
# raises a KeyError naming it right here, rather than handing None to the
# client and failing later with an unrelated connection error.
client = typesense.Client({
    'nodes': [{
        'host': os.environ['TYPESENSE_HOST_NAME'],
        'port': '443',
        'protocol': 'https',
    }],
    'api_key': os.environ['TYPESENSE_API_KEY'],
    'connection_timeout_seconds': 2,
})



In [2]:
# Schema for the 'books' collection: five typed fields, two of them flagged
# as facets, with 'ratings_count' named as the default sorting field.
books_schema = {
    'name': 'books',
    'fields': [
        {'name': 'title', 'type': 'string'},
        {'name': 'authors', 'type': 'string[]', 'facet': True},
        {'name': 'publication_year', 'type': 'int32', 'facet': True},
        {'name': 'ratings_count', 'type': 'int32'},
        {'name': 'average_rating', 'type': 'float'},
    ],
    'default_sorting_field': 'ratings_count',
}

In [5]:
# Make the cell re-runnable: drop any existing 'books' collection, then
# create it again from books_schema.
try:
    client.collections['books'].delete()
except typesense.exceptions.ObjectNotFound:
    # Nothing to delete (e.g. first run). Any other failure (bad API key,
    # unreachable host, ...) is left to propagate instead of being hidden.
    pass
print(client.collections.create(books_schema))

{'created_at': 1771085778, 'curation_sets': [], 'default_sorting_field': 'ratings_count', 'enable_nested_fields': False, 'fields': [{'facet': False, 'index': True, 'infix': False, 'locale': '', 'name': 'title', 'optional': False, 'sort': False, 'stem': False, 'stem_dictionary': '', 'store': True, 'truncate_len': 100, 'type': 'string'}, {'facet': True, 'index': True, 'infix': False, 'locale': '', 'name': 'authors', 'optional': False, 'sort': False, 'stem': False, 'stem_dictionary': '', 'store': True, 'truncate_len': 100, 'type': 'string[]'}, {'facet': True, 'index': True, 'infix': False, 'locale': '', 'name': 'publication_year', 'optional': False, 'sort': True, 'stem': False, 'stem_dictionary': '', 'store': True, 'truncate_len': 100, 'type': 'int32'}, {'facet': False, 'index': True, 'infix': False, 'locale': '', 'name': 'ratings_count', 'optional': False, 'sort': True, 'stem': False, 'stem_dictionary': '', 'store': True, 'truncate_len': 100, 'type': 'int32'}, {'facet': False, 'index': T

In [7]:
# Read the whole JSONL file first so the file handle is closed before the
# (potentially slow) network call.
with open('../data/json/books.jsonl', 'r', encoding='utf-8') as json1_file:
    data = json1_file.read()

import_response = client.collections['books'].documents.import_(data)

# The import call reports success per document in its response rather than
# raising, so inspect the result instead of discarding it.
# NOTE(review): for a string payload the client presumably returns the raw
# JSONL response text; the else-branch covers a list-of-dicts result - confirm
# against the installed typesense client version.
if isinstance(import_response, (str, bytes)):
    import_results = [
        json.loads(line) for line in import_response.splitlines() if line.strip()
    ]
else:
    import_results = list(import_response)

failed_imports = [result for result in import_results if not result.get('success')]
print(f"imported {len(import_results) - len(failed_imports)} of {len(import_results)} documents")
if failed_imports:
    # Show one example so a schema mismatch is easy to diagnose.
    print(f"first failure: {failed_imports[0]}")

In [8]:
# Search for "harry potter" in title and authors, restricted to books with
# publication_year below 1998, newest first.
search_parameters = dict(
    q="harry potter",
    query_by='title,authors',
    filter_by='publication_year:<1998',
    sort_by='publication_year:desc',
)

client.collections['books'].documents.search(search_parameters)

{'facet_counts': [],
 'found': 1,
 'hits': [{'document': {'authors': ['J.K. Rowling', ' Mary GrandPr√©'],
    'average_rating': 4.44,
    'id': '2',
    'image_url': 'https://images.gr-assets.com/books/1474154022m/3.jpg',
    'publication_year': 1997,
    'ratings_count': 4602479,
    'title': "Harry Potter and the Philosopher's Stone"},
   'highlight': {'title': {'matched_tokens': ['Harry', 'Potter'],
     'snippet': "<mark>Harry</mark> <mark>Potter</mark> and the Philosopher's Stone"}},
   'highlights': [{'field': 'title',
     'matched_tokens': ['Harry', 'Potter'],
     'snippet': "<mark>Harry</mark> <mark>Potter</mark> and the Philosopher's Stone"}],
   'text_match': 1157451471441102969,
   'text_match_info': {'best_field_score': '2211897868289',
    'best_field_weight': 15,
    'fields_matched': 1,
    'num_tokens_dropped': 0,
    'score': '1157451471441102969',
    'tokens_matched': 2,
    'typo_prefix_score': 0}}],
 'out_of': 9979,
 'page': 1,
 'request_params': {'collection_nam

In [9]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Typesense
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from pathlib import Path
# from langchain_groq import ChatGroq

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# llm = ChatGroq(
#     model="llama-3.3-70b-versatile",
#     api_key=os.getenv("GROQ_API_KEY"),
#     temperature=0,
#     max_tokens=None,
#     timeout=None,
#     max_retries=2,
# )

In [11]:
# Read every PDF found inside a directory (including subdirectories).

def process_all_pdfs(pdf_directory):
    """Load every ``*.pdf`` file found under ``pdf_directory`` (recursively).

    Files are visited in sorted path order so that repeated runs produce the
    documents in the same sequence. Each document returned by the loader gets
    two extra metadata entries: ``source_file`` (the PDF's file name) and
    ``file_type`` (``'pdf'``). A file that fails to load is reported on
    stdout and skipped; it does not abort the run.

    Args:
        pdf_directory: Directory to search, as a string or path-like object.

    Returns:
        A list containing the documents of all PDFs that loaded successfully
        (empty when no PDF is found or none could be loaded).
    """
    pdf_dir = Path(pdf_directory)
    # glob() yields entries in an OS-dependent order; sorting keeps the
    # document (and therefore chunk) order reproducible across runs.
    pdf_files = sorted(pdf_dir.glob("**/*.pdf"))
    print(f"found {len(pdf_files)} pdf files to process")

    all_documents = []
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            documents = PyMuPDFLoader(str(pdf_file)).load()

            # Tag each document with where it came from.
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"loaded {len(documents)} pages")

        except Exception as e:
            # Best effort: report the broken file and continue with the rest.
            print(f"error: {e}")

    print(f"\nTotal document pages loaded: {len(all_documents)}")
    return all_documents

# Load the PDFs stored under ../data/pdf.
all_pdf_documents = process_all_pdfs("../data/pdf")

found 3 pdf files to process

Processing: code generation using LLMs (compressed).pdf
loaded 70 pages

Processing: flashfill.pdf
loaded 30 pages

Processing: Systematic mapping study of template based code generation.pdf
loaded 20 pages

Total document pages loaded: 120


In [12]:
# Split the loaded text into chunks.
# Consecutive chunks overlap (sliding-window chunking).
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split ``documents`` into chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between neighbouring chunks, measured with ``len``.

    Returns:
        The list of chunk documents produced by the splitter.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Separators are tried in this order: paragraph, line, space, character.
        "separators": ["\n\n", "\n", " ", ""],
    }
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"split {len(documents)} document pages into {len(pieces)} chunks")
    return pieces

# Chunk every loaded PDF document using the default size and overlap.
chunks = split_documents(all_pdf_documents)

split 120 document pages into 644 chunks


In [13]:
# Embedding model built with the library defaults (no model name is passed).
embeddings = HuggingFaceEmbeddings()

In [14]:
# Connection settings for the vector store backing the PDF chunks; the
# timeout here (60 s) is far longer than the 2 s used by `client` above.
pdf_store_params = {
    'host': os.getenv('TYPESENSE_HOST_NAME'),
    'port': '443',
    'protocol': 'https',
    'typesense_api_key': os.getenv('TYPESENSE_API_KEY'),
    'typesense_collection_name': 'pdf',
    'connection_timeout_seconds': 60,
}

# Build the Typesense vector store from the chunks and the embedding model.
docsearch = Typesense.from_documents(
    chunks,
    embeddings,
    typesense_client_params=pdf_store_params,
)

In [15]:
# Run a similarity search against the vector store and show the first hit.
query = "how is LLM used for code generation"
found_docs = docsearch.similarity_search(query)
top_hit = found_docs[0]
print(top_hit.page_content)

researchers in establishing a comprehensive, up-to-date, and advanced understanding of LLMs for
code generation. This includes discussing various aspects of this rapidly evolving domain, such as
data curation, latest advancements, performance evaluation, ethical and environmental implications,
and real-world applications. A historical overview of the evolution of LLMs for code generation is
J. ACM, Vol. 37, No. 4, Article 1. Publication date: August 2018.


In [16]:
# Wrap the vector store as a retriever and run the same question through it.
query = "how is LLM used for code generation"
retriever = docsearch.as_retriever()
retriever.invoke(query)

[Document(metadata={'author': '', 'creationDate': 'D:20241112015859Z', 'creationdate': '2024-11-12T01:58:59+00:00', 'creator': 'LaTeX with acmart 2021/05/01 v1.78 Typesetting articles for the Association for Computing Machinery and hyperref 2023-04-22 v7.00x Hypertext links for LaTeX', 'file_path': '..\\data\\pdf\\code generation using LLMs (compressed).pdf', 'file_type': 'pdf', 'format': 'PDF 1.5', 'keywords': '', 'modDate': 'D:20241112015859Z', 'moddate': '2024-11-12T01:58:59+00:00', 'page': 8, 'producer': 'pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'source': '..\\data\\pdf\\code generation using LLMs (compressed).pdf', 'source_file': 'code generation using LLMs (compressed).pdf', 'subject': '-  General and reference  ->  Surveys and overviews.-  Software and its engineering  ->  Software development techniques.-  Computing methodologies  ->  Artificial intelligence.', 'title': 'A Survey on Large Language Models for Code Generation', 'total_pages

In [28]:
from langchain_community.chat_models import ChatOllama
load_dotenv()

# Chat model served by Ollama; temperature=0 is passed to the model.
llm = ChatOllama(
    model="llama3.1",
    temperature=0,
    # Fall back to the local Ollama endpoint when OLLAMA_URL is unset or
    # empty, instead of handing None to the client as its base URL.
    base_url=os.getenv('OLLAMA_URL') or 'http://localhost:11434',
)

In [29]:
def rag(query, retriever, llm, top_k=6):
    """Answer ``query`` using only text retrieved by ``retriever``.

    The retrieved documents' ``page_content`` values are joined into a
    context section of the prompt, which is then sent to ``llm``.

    Args:
        query: The question to answer.
        retriever: Object whose ``invoke(query)`` returns documents exposing
            a ``page_content`` attribute.
        llm: Object whose ``invoke(prompt)`` returns a response exposing a
            ``content`` attribute.
        top_k: Maximum number of retrieved passages placed in the prompt.
            ``None`` means no limit; values below zero are treated as zero.

    Returns:
        The ``content`` of the model's response, or a fixed "no context"
        message when nothing usable was retrieved (the model is not called).
    """
    # Retrieve the context, ignoring results that carry no text at all.
    results = retriever.invoke(query) or []
    passages = [
        doc.page_content
        for doc in results
        if doc.page_content and doc.page_content.strip()
    ]
    # Honour top_k, which was previously accepted but never applied.
    if top_k is not None:
        passages = passages[:max(top_k, 0)]

    context = "\n\n".join(passages)
    if not context:
        return "No relevant context found to answer the question"

    prompt_template = """You are an expert Computer Science research assistant. 
Your task is to answer the question based STRICTLY on the provided context.

Guidelines:
1. **Be Precise**: Use technical terminology found in the context.
2. **Structure**: If the answer has multiple parts, use bullet points.
3. **No Hallucination**: If the context does not contain the answer, say "I cannot find the answer in the provided documents." Do not make up information.
4. **Synthesis**: If the answer is split across multiple chunks, combine them into a coherent explanation.

----------------
Context:
{context}
----------------

Question: {query}

Answer:"""
    final_prompt = prompt_template.format(context=context, query=query)
    response = llm.invoke(final_prompt)
    return response.content

In [30]:
# Ask one question end-to-end through the retrieval + generation pipeline.
question = "What are the characteristics of template-based code generation? Explain in detail."
answer = rag(question, retriever, llm)
answer

'Based on the provided context, the characteristics of template-based code generation (TBCG) can be identified as follows:\n\n* **Synthesis technique**: TBCG is considered a synthesis technique that uses templates to produce code from high-level specifications.\n* **Use of templates**: Templates are used in TBCG to favor reuse following the principle of "write once, produce many".\n* **Components**: The literature agrees on three components in TBCG: \n\t+ Data\n\t+ Template\n\t+ Output\nHowever, another component is also present but not mentioned in some sources:\n\t+ Meta-information: This refers to the meta-information that the generation logic of the template relies on.\n* **Code generation from high-level specifications**: TBCG produces code from high-level specifications, called templates.\n* **Abstraction and automation**: TBCG emphasizes abstraction and automation, which is also a key aspect of Model-Driven Engineering (MDE).\n* **Reusability**: Templates in TBCG favor reuse, al