In [1]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SimpleDirectoryReader
from llama_parse import LlamaParse
from llama_index.core.node_parser import MarkdownElementNodeParser, MarkdownNodeParser
from llama_index.llms.openai import OpenAI
import nest_asyncio
import os

import re

nest_asyncio.apply()
#node_parser = SentenceSplitter(chunk_size=600, chunk_overlap=0)
llm = OpenAI(model="gpt-4o", api_key=os.environ.get("OPENAI_API_KEY", ""))

prompt = f"""Your task is to filter out texts from the given research paper by following the five Instruction_guidelines.
        \nInstruction_guidelines: 
            1. Remove only the texts related to references, acknowledgement and other dates and conference names that are useless to a reader.
            2. Remove the details on authors and their contacts too.
            3. DO NOT remove any other text. Keep them intact as they appear in the original format.
            4. Replace all the pictures in the research paper with their correct textual explanations pertaining to their surrounding context.
            5. Output any mathematical notation or special character in LATEX markdown (between $$)"""

parser = LlamaParse(result_type="markdown", 
                    api_key="",
                    
                    parsing_instruction=prompt)  # "markdown" and "text" are available

# use SimpleDirectoryReader to parse our file
pdf_extractor = {".pdf": parser}
#rec_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

doc_pages = await SimpleDirectoryReader(input_dir="/home/ritwik-gosh/Fine_tuning/data", file_extractor=pdf_extractor, exclude=["Multi-Objective-Process.pdf"]).aload_data()   #-> List[llama_index.core.schema.Document]
#print(doc[1].page_content)


Started parsing the file under job_id 80d4dd4c-27ec-46a9-afa9-e12b42e2b752
Started parsing the file under job_id 52d99f30-e406-41ce-b47f-123ad9ad9301
.

**Parsing Markdown into CSV with additional filtrations**

In [2]:

md_node_parser = MarkdownElementNodeParser(llm=llm, num_workers=2, show_progress=True)  #also separates tables as object elements
md_parser = MarkdownNodeParser(llm=llm) #splits simply based on markdown headers
md_nodes = md_parser.get_nodes_from_documents(documents=doc_pages)  # List[7 nodes]
#md_base_nodes = md_parser.get_nodes_from_node(node=md_nodes)

md_elem_nodes = md_node_parser.get_nodes_from_documents(documents=doc_pages)   #List[Nodes and objects]     #5
base_nodes, objects = md_node_parser.get_nodes_and_objects(md_elem_nodes)   # List[nodes],  List[table_remarks]   #3, 1

#rec_chunks=[]

1it [00:00, 26379.27it/s]
100%|██████████| 1/1 [00:09<00:00,  9.94s/it]
4it [00:00, 72005.22it/s]
100%|██████████| 4/4 [00:04<00:00,  1.06s/it]


In [24]:
'''cleansed_file = "/home/ritwik-gosh/Fine_tuning/data/Markdown/llamaparsed.md"
with open(cleansed_file, 'w') as file:
    cleansed_text_file = file.write(doc_pages[0].text)'''
import pandas as pd

df["Context"] = pd.DataFrame([itm.text for _, itm in enumerate(md_nodes)])
# List of words to remove
words_to_remove = ['Acknowledgement', 'Acknowledgment']

# Regex pattern from the list
pattern = '|'.join(words_to_remove)
filtered_contexts = df[~df["Context"].str.contains(pattern, case=True, na=False)]   #ignores all rows with word 'Acknowledgement'
#final_contexts = filtered_contexts[filtered_contexts["Context"].str.len() >= 100]

# Rows with <250 characters are merged with the following row until the limit finally reaches 250
merged_rows = []
i=0
while i < len(filtered_contexts):
    current_row = filtered_contexts.iloc[i]['Context']
    
    # Continue merging with the next rows until the length of current_row is >= 100 or no more rows to merge
    while len(current_row) < 250 and i + 1 < len(filtered_contexts):
        next_row = filtered_contexts.iloc[i + 1]['Context']
        current_row += '\n\n' + next_row
        i += 1  # Move to the next row that has been merged

    merged_rows.append(current_row)
    i += 1  # Move to the next row
if len(merged_rows[-1]) < 200:
    merged_rows[-2] = merged_rows[-2] + '\n\n' + merged_rows[-1]
    merged_rows.pop()

df_merged = pd.DataFrame(merged_rows, columns=['Context'])

context_path = "/home/ritwik-gosh/Fine_tuning/data/Markdown/filtered_contexts.csv"
df_merged.to_csv(context_path, index=False)

**Additional text cleansing --> not needed if Llamaparse_instruction is used**

In [6]:
'''from openai import OpenAI
import markdown
import os

markdown_file = "/home/ritwik-gosh/Fine_tuning/data/Markdown/context.md"
with open(markdown_file, 'r') as file:
    md_text_file = file.read()
    #paper = markdown.markdown(md_text_file)

prompt = f"""Your task is to filter out texts from the given research paper according to the Instruction_guidelines.
        Paper:
                {md_text_file}
        \n\n\nInstruction_guidelines: 
                Remove only the texts related to references, acknowledgement and other dates and conference names that are useless to a reader.
                Remove the details on authors and their contacts too.
                DO NOT remove any other text. Keep them intact as they appear in the original format.
                
                Retrieve the relevant details from the pictures in the paper and convert them into short textual explanations.
                You MUST preserve the exact mathematical as well as special characer notations that are present in the paper."""
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))
response = client.chat.completions.create(
model="gpt-4o",
temperature = 0.2,
messages=[{"role": "system", "content": prompt}],
#stream=True,
#stream_options={"include_usage": True},
)
cleansed_text = response.choices[0].message.content 


cleansed_file = "/home/ritwik-gosh/Fine_tuning/data/Markdown/cleansed_context.md"
with open(cleansed_file, 'w') as file:
    cleansed_text_file = file.write(cleansed_text)
    #cleansed_paper = markdown.markdown(cleansed_text)

print(cleansed_text)



#CHUNKING
import pandas as pd

print(f'len of chunks: {len(chunks)}')
print()
chunk_list = []
start_idx = 1
end_idx = 70
for i, context in enumerate(chunks[start_idx:end_idx], start_idx):
    if i == end_idx:
        break
    chunk_dict = {"Context": context.text}
    chunk_list.append(chunk_dict)
    print(f"\n****************************** CHUNK: {i} **************************************\n")
    print(chunks[i].text)

#df = pd.DataFrame(chunk_list)
'''

**Identifying main Findings of the paper**

In [72]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from json_utils import parse_json_response
from text_generation import EvolutionaryTextGenerator
from typing import Dict, Any
import os


gpt = ChatOpenAI(model="gpt-4o",
    temperature=0.1,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key="")

def list_findings(research: str) -> Dict[str, Any]:
    PROMPT = ChatPromptTemplate.from_messages(
        [  
        ("system",
            """You are given a research papers on Surface Mount Technology manufacturing. 
                Understand the paper thoroughly, and then list all the findings from the paper that form the crux of this particular research work.

                "Provide your response as a JSON object where the keys are the sequential indices and their values are the respective findings from the paper."
                "Only provide the JSON object with no additional explanation or text." 

               Example: 
                    Input:
                         [Research Paper]
                            
                    Output: 
                         {"1": <your finding here>,\n"2":<your finding here>}
                    """,
        ),
        ("user", "Find at least 3 and maximum 10 crucial findings from this research paper:\n{paper}"),
        ]
        )
    runnable = PROMPT | gpt

    all_findings = runnable.invoke({
        "paper": research}).content
    return parse_json_response(all_findings)

markdown_paper = "/home/ritwik-gosh/Fine_tuning/data/Markdown/llamaparsed.md"
with open(markdown_paper, 'r') as file:
    paper_text = file.read()

pr = f"""You are given a research papers on Surface Mount Technology manufacturing. 
        Understand the paper thoroughly, and then list all the findings from the paper that form the crux of this particular research.
        Also, list all the important properties and experiments performed in the paper along with their obtained results.
        Find at least 3 and maximum 15 such findings from this research paper.

        Research Paper:
                    {paper_text}

        
        Output format instructions:
                "Provide your response as a JSON object where the keys are the sequential indices and their values are the respective findings from the paper."
                "Only provide the JSON object with no additional explanation or text." 
      """
pr += """
        Example: 
            Input:
                    [Research Paper]
                    
            Output: 
                    {
                    "1": <your finding here>,\n"2":<your finding here>
                    }
        """

def openai_generate(sys_prompt : str, temperature):
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))
    response = client.chat.completions.create(
    model="gpt-4o",
    temperature = temperature,
    messages=[{"role": "system", "content": sys_prompt}],
    #stream=True
    )
    print(f"\n💰💰💰💰 Token Usage:  {response.usage}\n")

    return response.choices[0].message.content

findings = parse_json_response(openai_generate(pr, 0.1))

import json

paper_findings = "/home/ritwik-gosh/Fine_tuning/data/Markdown/new_findings.json"
with open(paper_findings, 'w') as json_file:
    json.dump(findings, json_file, indent=4)
#findings = list_findings(paper_text)
print(findings)


HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



💰💰💰💰 Token Usage:  CompletionUsage(completion_tokens=331, prompt_tokens=2753, total_tokens=3084, completion_tokens_details=CompletionTokensDetails(reasoning_tokens=0))

{'1': 'Bump height significantly affects the interface between the bumps and the pads, with higher bump heights resulting in higher strain at the interface.', '2': 'There is practically no effect of bump height on the strain variation in the bumps and in the pads.', '3': 'The maximum strain is located in the adhesive layer, particularly at the outermost joints.', '4': 'Shear strain is very high in the adhesive area and the PCB substrate, but very small in the pad and the bump.', '5': 'The strain distribution in the joint is not symmetric, with higher strain values at the outermost joints compared to inner joints.', '6': 'The effect of bump height is significant for the stress and strain distributions in the y and shear directions, but not in the bumps and pads.', '7': 'The strain is highest for the outermost joint, wit

In [73]:
import pandas as pd
findings_list = list(findings.values())
print(findings_list)
'''import json

paper_findings = "/home/ritwik-gosh/Fine_tuning/data/Markdown/findings.json"
with open(paper_findings, 'w') as json_file:
    json.dump(findings, json_file, indent=4)'''

['Bump height significantly affects the interface between the bumps and the pads, with higher bump heights resulting in higher strain at the interface.', 'There is practically no effect of bump height on the strain variation in the bumps and in the pads.', 'The maximum strain is located in the adhesive layer, particularly at the outermost joints.', 'Shear strain is very high in the adhesive area and the PCB substrate, but very small in the pad and the bump.', 'The strain distribution in the joint is not symmetric, with higher strain values at the outermost joints compared to inner joints.', 'The effect of bump height is significant for the stress and strain distributions in the y and shear directions, but not in the bumps and pads.', 'The strain is highest for the outermost joint, with dominant strain in the normal direction.', 'The effect of bump height on ACA joints is different from solder joints; higher bump heights do not necessarily improve reliability in ACA joints.', 'Excessive

'import json\n\npaper_findings = "/home/ritwik-gosh/Fine_tuning/data/Markdown/findings.json"\nwith open(paper_findings, \'w\') as json_file:\n    json.dump(findings, json_file, indent=4)'

In [74]:
import json
file_path = "/home/ritwik-gosh/Fine_tuning/data/Markdown/findings.json"
with open(file_path, 'r') as file:
    data = json.load(file)
json_string = json.dumps(str(data))
print(json_string)



"{'1': 'Bump height significantly affects the interface between the bumps and the pads, with higher bump heights resulting in higher strain at the interface.', '2': 'There is practically no effect of bump height on the strain variation in the bumps and in the pads.', '3': 'The maximum strain is located in the adhesive layer, particularly at the outermost joints.', '4': 'Shear strain is very high in the adhesive area and in the PCB substrate but very small in the pad and bump.', '5': 'The strain distribution in the joint is not symmetric, with higher strain values at the outermost joints.', '6': 'The effect of bump height is significant for stress and strain distributions in the y and shear directions but not in the bumps and pads.', '7': 'The strain at the outermost joint and the normal strain are in the same order of magnitude.', '8': 'Excessively high bump heights (e.g., 70 \u03bcm) can induce poor reliability due to the potential for a porous structure in the ACA layers.', '9': 'The

In [75]:
from json_utils import parse_json_response
from openai import OpenAI
import os

markdown_paper = "/home/ritwik-gosh/Fine_tuning/data/Markdown/llamaparsed.md"
with open(markdown_paper, 'r') as file:
    paper_text = file.read()

ctx_prompt = f"""You are a text retriever from research papers. 
"Below is a json string with {len(data)} key findings from a research paper."

    {json_string}

For each of the {len(data)} findings, use RAG on the entire paper given below and retrieve all the relevant texts respectively from all parts of the paper without modifying any original text.
Each of the retrieved original texts should be very much sufficient to provide enough background on the respective findings. 
This might require retrieving not only the texts that are the best match but also the second best matches alongwith.
    Paper:
        {paper_text}

        
Output format instruction:          
    "Provide your response as a JSON object where the keys are the sequential indices and their values are the retrieved matching texts from the paper."
    "Only provide the JSON object with no additional explanation or text."
"""
ctx_prompt += """
    Example: 
            Output: 
                    {
                    "1": <retrieved matching text>,\n"2":<retrieved matching text>
                    }
"""

def openai_generate(sys_prompt : str, temperature):
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))
    response = client.chat.completions.create(
    model="gpt-4o",
    temperature = temperature,
    messages=[{"role": "system", "content": sys_prompt}],
    #stream=True
    )
    print(f"\n💰💰💰💰 Token Usage:  {response.usage}\n")

    return response.choices[0].message.content

contexts = openai_generate(ctx_prompt, 0.1)

#findings = list_findings(paper_text)
print(contexts)



HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



💰💰💰💰 Token Usage:  CompletionUsage(completion_tokens=705, prompt_tokens=3084, total_tokens=3789, completion_tokens_details=CompletionTokensDetails(reasoning_tokens=0))

{
    "1": "The effect of bump height can be seen in the strain changes in the interface between the bumps and the pads. The volume fraction covered by high strain value (shown by black contours) is more for higher bump. The effect is significant for the stress and the strain distributions in the y and in the shear directions. The effect of the bump height could not be found in the bumps and in the pads.",
    "2": "There is practically no effect of the bump height on the strain changes in the silicon and the pads.",
    "3": "The strain distribution in the joint is not symmetric and the maximum strain $$\varepsilon_{xx}$$ is located in the adhesive layer. The value of the strain for the outmost joints at A is higher than the strain located at B (A and B are shown in Fig. 5).",
    "4": "Shear strain $$\varepsilon_{xy}

**Parsing the retrieved contexts into JSON**

In [83]:
import re
import json

match = re.search(r'\{.*\}', contexts, re.DOTALL)
parsed_contexts = match.group(0)
#print(parsed_contexts)

fixed_json_string = re.sub(r'(?<!\\)\\(?![\\ntr"])', r'\\\\', parsed_contexts)  #duplicates backslashes to avoid JSONDecodeError
parsed_dict = json.loads(fixed_json_string)
print(parsed_dict)

csv_path = "/home/ritwik-gosh/Fine_tuning/data/Markdown/findings_contexts.csv"
retrieved_contexts = list(parsed_dict.values())
df = pd.DataFrame({
    'Findings': findings_list,
    'Context': retrieved_contexts
})

df.to_csv(csv_path, index=False)


'''context_finding_dict = {key: {"Finding": data.get(key), "Retrieved_context": parsed_dict.get(key)} 
               for key in data.keys() | parsed_dict.keys()}
context_findings = "/home/ritwik-gosh/Fine_tuning/data/Markdown/context_findings.json"
with open(context_findings, 'w') as json_file:
    json.dump(context_finding_dict, json_file, indent=4)'''


{'1': 'The effect of bump height can be seen in the strain changes in the interface between the bumps and the pads. The volume fraction covered by high strain value (shown by black contours) is more for higher bump. The effect is significant for the stress and the strain distributions in the y and in the shear directions. The effect of the bump height could not be found in the bumps and in the pads.', '2': 'There is practically no effect of the bump height on the strain changes in the silicon and the pads.', '3': 'The strain distribution in the joint is not symmetric and the maximum strain $$\\varepsilon_{xx}$$ is located in the adhesive layer. The value of the strain for the outmost joints at A is higher than the strain located at B (A and B are shown in Fig. 5).', '4': 'Shear strain $$\\varepsilon_{xy}$$ is very high (more than 0.02) in the adhesive area and in the PCB substrate and is very small (less than 0.001) in the pad and in the bump.', '5': 'The strain distribution in the joi

'context_finding_dict = {key: {"Finding": data.get(key), "Retrieved_context": parsed_dict.get(key)} \n               for key in data.keys() | parsed_dict.keys()}\ncontext_findings = "/home/ritwik-gosh/Fine_tuning/data/Markdown/context_findings.json"\nwith open(context_findings, \'w\') as json_file:\n    json.dump(context_finding_dict, json_file, indent=4)'