In [None]:
!pip install ipywidgets

**Loading raw data**

In [1]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SimpleDirectoryReader
import re


# Placeholder list; nothing in this cell fills it.
rec_chunks = []

# Sentence-based splitter configured with chunk_size=400 and no overlap.
node_parser = SentenceSplitter(
    chunk_size=400,
    chunk_overlap=0,
)

# Disabled alternative splitter, kept for reference:
# rec_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

# Load everything in the data directory except the two excluded PDFs.
doc_pages = SimpleDirectoryReader(
    input_dir="/home/ritwik-gosh/Fine_tuning/data",
    exclude=["ACA.pdf", "Soldering.pdf"],
).load_data()
# print(doc[1].page_content)

# Split the loaded documents into the nodes used by the cells below.
chunks = node_parser.get_nodes_from_documents(doc_pages)

**Estimating Token count per API call**

In [28]:
import tiktoken
import pandas as pd

def num_tokens_from_string(prompt: str) -> "tuple[int, list]":
    """Encode ``prompt`` with the tiktoken encoding for 'gpt-4o' and count the tokens.

    Args:
        prompt: Text to encode.

    Returns:
        A ``(num_tokens, tokens)`` pair, where ``tokens`` is whatever
        ``encoding.encode`` returned and ``num_tokens`` is its length.
        Note that this is a tuple, not a bare int; callers unpack it.
    """
    encoding = tiktoken.encoding_for_model('gpt-4o')
    tokens = encoding.encode(prompt)
    num_tokens = len(tokens)
    return num_tokens, tokens


# Disabled alternative context source:
# pandas_frame = pd.read_csv('/home/ritwik-gosh/Fine_tuning/Evolution_QA/data/contexts.csv')

# Context for the estimate: the text of the first 40 chunks, one per line.
paper = "\n".join(node.text for node in chunks[:40])

# Sample evaluation prompt; only its token count is of interest in this cell.
prompt = f"""
You evaluate and filter out instructions based on relevance, purposefullness and specificity to a given context. 
Filter out questions that reveal the answers in them.
instruction 1 to evaluate:
Explain the role of Surface Mount Technology (SMT) in the production of electronic devices and how it differs from other methods of attaching components to Printed Circuit Boards (PCBs).
instruction 2 to evaluate:
Explain the importance of the convection-based reflow soldering process in the Surface Mount Technology (SMT) assembly and discuss the advanced technologies to reduce voids in solder joints.
instruction 3 to evaluate:
Discuss the significance of the convection-based reflow soldering process within the realm of Surface Mount Technology (SMT) assembly. Elaborate on how this process contributes to the mitigation of voids in solder joints, considering the broader context of SMT's aim for high reliability and performance in electronic assemblies.
instruction 4 to evaluate:
Explain the role and mechanisms of convection-based reflow soldering in the assembly process of Surface Mount Technology (SMT). Additionally, discuss why this method is significant in maintaining the integrity of solder joints, particularly in minimizing the formation of voids, and how it impacts the overall reliability of electronic assemblies.
instruction 5 to evaluate:
Describe the function of the convection-based reflow soldering process within the context of Surface Mount Technology (SMT) assembly. Additionally, discuss the importance of minimizing voids in solder joints, and how these voids could potentially impact the performance and reliability of electronic components.
Context to be referred to while evaluating each of the above instructions:
{paper}

Instruction:
For each instruction, evaluate if it is concise, clear, and aligned with the context. Provide your evaluation as a JSON object where the keys are the text numbers and the values are boolean True (if the text passes the quality check) or False (if the text does not pass the quality check). Only provide the JSON object with no additional explanation or text.
Example format:
  "1": True,
  "2": False
Your evaluation:


"""

# The helper returns (count, tokens); only the count is reported here.
num, _ = num_tokens_from_string(prompt)
print(f"Number of Tokens: {num}")

Number of Tokens: 11317


In [2]:
import pandas as pd

print(f'len of chunks: {len(chunks)}')
print()

# Export the text of chunks[start_idx:end_idx] as a one-column ("Context") CSV.
# NOTE(review): start_idx = 1 leaves out the very first chunk - confirm this is intended.
start_idx = 1
end_idx = 70

# The slice already stops before end_idx, so no explicit break is required.
chunk_list = [{"Context": node.text} for node in chunks[start_idx:end_idx]]

# Naming the column keeps the CSV header even when the slice is empty.
df = pd.DataFrame(chunk_list, columns=["Context"])

df.to_csv("/home/ritwik-gosh/Fine_tuning/Evolution_QA/data/Soldering_contexts.csv", index=False)

# Spot-check the second exported context, when there is one.
if len(chunk_list) > 1:
    print(chunk_list[1])


len of chunks: 269

{'Context': '1   Introduction  \n \n2 electronics production. Since energy  methods only involve the 1st law of \nthermodynamics and thus ignore the difference between work and heat, \nthese met hods lack factors which are required to adequately show a \nsystem effectiveness utilizing given energy resources. Exergy methods also \ntake the second  law of thermodynamics into account , and therefore , \nbetter suited for thermodynamic processes. As for packag ing process and \nelectronics assembly, the soldering process is the most energy intensive \nprocess, and therefore exergy analysis suits to investigate the heat transfer \ncomplexities [8] [9]. \nBeyond the importance of energy f actor, quality factor for electronic \nproducts is inevitably improving. Explicitly for modern electronics \nproducts increasingly demand the features of lighter weight, smaller size, \nand higher quality, high pin -count, and various other product \nconfigurations . This is particularly

<font size=7 color='yellow'>Generating the Q&A</font>

In [None]:
from training_data_synthesizer import TrainingDataSynthesizer
from data_synthesis import InstructionResponseConfig
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SimpleDirectoryReader
import pandas as pd
from pathlib import Path
import nest_asyncio
import asyncio
import aiohttp
import time
import re

#save Q&As for every paper separately
async def create_dataset_per_paper(session, source_csv_path, output_csv_path, paper_name) -> None:
    """Synthesize a Q&A dataset for one paper and save it as a CSV file.

    Reads the CSV at ``source_csv_path`` (its "Context" column is the input
    field of the synthesis config), runs ``TrainingDataSynthesizer.generate``
    on it and writes the resulting frame to ``output_csv_path``.

    Args:
        session: Passed through unchanged to ``TrainingDataSynthesizer.generate``
            (callers in this notebook pass an ``aiohttp.ClientSession``).
        source_csv_path: Path of the CSV holding the contexts of one paper.
        output_csv_path: Path of the CSV that receives the generated dataset.
            Missing parent directories are created.
        paper_name: Forwarded to the synthesizer as ``paper``.

    Returns:
        None. The result is only written to ``output_csv_path``.
    """
    # Local import: this cell does not import os at the top.
    import os

    sys_prompt = """
    You are a researcher in Surface Mount Technology(SMT) manufacturing.
    Use your expertise in generating balanced, context-rich questions and comprehensive answers based on given scientific contexts of SMT manufacturing. 
    
    Your goal is to create question-answer pairs that are informative, detailed when necessary, while not revealing the answer in the question.
    """

    instr_format = """
    Generate a specific and clear question directly related to a key concept in the given context assuming the contextual background is already known. 
    The question should be directed to the underlying key concepts in the given context, while being answerable using only the information provided. 
    
    DO NOT reveal the answer in the question. 
    DO NOT phrase questions with trivial background references like 'In this experiment..', 'In the context of..', 'In this study, what was achieved?' etc. Instead, include the necessary scientific entity names(or descriptions) within the question by assuming that the contextual background is already known.

    Ensure the question focuses on a deeper understanding of the subject matter and can be answered concisely if the information allows, but also accommodate for more detailed responses when appropriate.
    For large contexts with multiple important and different key concepts, generate intricate questions addressing the major key concepts. Simply concatenate the questions(if multiple) and do not incorporate newlines or indices.
    The generated question can either be in interrogative or in assertive sentence.  

    """
    instr_mutation = """
    Enhance the quality of the original question or instruction by incorporating intricate reasoning and key concepts from the given context.
    The question should be answerable using only the information provided in the context. 
    The question should not contain the entire detailed background from the context but only capture the overall concept with necessary entity names or their descriptions. 

    DO NOT reveal the answer in the question. 
    """

    instr_quality = """
    You evaluate and boolean classify(binary classification) the instructions(questions) based on their faithfulness and specificity to a given context. 
    Classify the questions that reveal the answers in them as False.
    Questions that do not capture the key points from the context should also classify as False.
    """

    resp_format = """
    From the provided context, synthesize an informative answer to the given instruction(question). 
    You should only use the information provided in the given context while generating your response. 
    The response should be concise, while fully answering all the question(s) with relevant information from given the context, and have simplified explanations where necessary. 
    
    For questions on complex and lengthy context, you should provide a more detailed response by capturing all intra-relations within the context. 
    Ensure that the response is well-informative for the understanding of manufacturing engineers in Surface Mount Technology.
    """

    resp_mutation = """
    Your goal is to enhance the quality of the answer by incorporating intricate reasoning and key concepts from the given context.
    Strive for clarity and depth in the answers, aiming to enhance the reader’s comprehension of the intricate concepts pertaining to the question.
    
    You should only use the information provided in the given context while enhancing the answer's quality. 
    """

    resp_quality = """
    You evaluate and boolean classify(binary classification) the responses that are too simple, or not well-explained with respect to a given context as False. 
    Answers that are irrelevant or unfaithful to the given context and its question, should classify as False.
    """

    # Make sure the destination exists before any generation work is done, so a
    # bad output path cannot discard a finished (and paid-for) synthesis run.
    Path(output_csv_path).parent.mkdir(parents=True, exist_ok=True)

    d_frame = pd.read_csv(source_csv_path)
    config_obj = InstructionResponseConfig(input_fields=["Context"],
                                    # Read the key from the environment instead of hard-coding it;
                                    # falls back to the previous empty string when unset.
                                    api_key=os.environ.get("OPENAI_API_KEY", ""),
                                    num_generations=2,
                                    population_size=2,
                                    mutation_rate=0.4,
                                    temperature=0.2,
                                    system_prompt=sys_prompt,
                                    instruction_format_prompt=instr_format,
                                    instruction_mutation_prompt=instr_mutation,
                                    instruction_quality_prompt=instr_quality,
                                    response_format_prompt=resp_format,
                                    response_mutation_prompt=resp_mutation,
                                    response_quality_prompt=resp_quality)
    
    dataset = await TrainingDataSynthesizer(
        df=d_frame,  #df.iloc[:]
        config=config_obj,
        output_file="",
        verbose = True,
        paper=paper_name
    ).generate(session=session)

    dataset.to_csv(output_csv_path, index=False)

'''async def iterate_dir(session, src_dir_path, dest_dir):
# Iterate through all CSV files in the directory
    for csv_file in src_dir_path.glob("*.csv"):
        if str(csv_file) == "/home/ritwik-gosh/Fine_tuning/data/Batch_OpenAI/data/cleansed_papers_csv/10-1108_SSMT-01-2017-0003.csv":
            paper_title = csv_file.stem
            src_path = f"{src_dir_path}/{paper_title}.csv"
            dest_path = f"{dest_dir}/{paper_title}.csv"
            await create_dataset_per_paper(session=session, source_csv_path=src_path, output_csv_path=dest_path)    
        else:
            continue'''

# Applied before the asyncio.run() call further down in this cell.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop, which plain asyncio.run() would refuse to nest into - confirm.
nest_asyncio.apply()
# Main function to run multiple chains in parallel
async def main(source_dir, target_dir, batch_size, max_files=20):
    """Generate one Q&A CSV per source CSV, ``batch_size`` papers at a time.

    Args:
        source_dir: ``pathlib.Path`` of the directory holding one ``.csv`` per paper.
        target_dir: Directory that receives one output CSV per paper, named
            after the source file's stem.
        batch_size: Number of papers processed concurrently; must be >= 1.
        max_files: Upper bound on the number of papers processed (default 20,
            the limit that used to be hard-coded).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Side effects:
        Writes the wall-clock start and end time of the run to ``time.txt`` in
        the current working directory.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # "*.csv" (with the dot) only matches real CSV files; sorting makes the
    # choice of the first max_files papers reproducible across runs.
    csv_files = sorted(source_dir.glob("*.csv"))[:max_files]

    # Plain (source, destination, title) records. Coroutines are created per
    # batch below, so none is left un-awaited if an earlier batch raises.
    jobs = [
        (str(csv_file), f"{target_dir}/{csv_file.stem}.csv", csv_file.stem)
        for csv_file in csv_files
    ]

    time_format = "%Y-%m-%d %H:%M:%S"
    async with aiohttp.ClientSession() as session:
        # Timestamps bracket the actual work, and exist for any number of files.
        start_time = time.strftime(time_format, time.localtime())
        for i in range(0, len(jobs), batch_size):
            # Select the next batch of tasks and run them in parallel
            current_batch_tasks = [
                create_dataset_per_paper(session=session,
                                         source_csv_path=src_path,
                                         output_csv_path=dest_path,
                                         paper_name=paper_title)
                for src_path, dest_path, paper_title in jobs[i:i + batch_size]
            ]
            await asyncio.gather(*current_batch_tasks)
        end_time = time.strftime(time_format, time.localtime())

    with open("time.txt", "w") as file:
        # Write the string to the file
        file.write(f"Start time: {start_time}\n\nEnd time: {end_time}")
        

# Source: one cleansed CSV per paper. Destination: generated Q&A CSVs.
src_dir = "/home/ritwik-gosh/Fine_tuning/data/Batch_OpenAI/data/cleansed_papers_csv"
dest_dir_path = Path("/home/ritwik-gosh/Fine_tuning/Evolution_QA/data/Q&A")
src_dir_path = Path(src_dir)

# Run main() to completion on the asyncio event loop, with batch_size=4.
asyncio.run(main(source_dir=src_dir_path, target_dir=dest_dir_path, batch_size=4))



PyTorch version 2.3.1 available.


Synthesizing from 10-1108_SSMT-01-2016-0002:   0%|          | 0/2 [00:00<?, ?it/s]


🆕 Starting the process of synthesizing a new training record for index 0.
🔍 Synthesizing diverse instructions based on the inputs.
Created prompt for instruction generation
🧬🧬🧬🧬🧬Starting evolutionary process for instruction generation🧬🧬🧬🧬🧬


Synthesizing from 10-1108_SSMT-01-2023-0002:   0%|          | 0/2 [00:00<?, ?it/s]


🆕 Starting the process of synthesizing a new training record for index 0.
🔍 Synthesizing diverse instructions based on the inputs.
Created prompt for instruction generation
🧬🧬🧬🧬🧬Starting evolutionary process for instruction generation🧬🧬🧬🧬🧬
🧬 Initial population (2 instructions):
        1. What is the relationship between the Zn content in Sn–15Bi–xZn solders and the thickness of the intermetallic compounds layer formed on the Cu substrate, and how does this relationship influence the overall wettability and spreading behavior of the solder?
        2. What is the relationship between the Zn content in Sn–15Bi–xZn solders and the thickness of the intermetallic compounds layer formed on the Cu substrate during the wetting process? How does varying the brazing temperature influence the final spreading equivalent radius and contact angle of the solder?

🔄 G E N E R A T I O N  1/2
🧬 Initial population (2 instructions):
        1. What role does molar atomic volume play in determining the s

RESPONSE:----------------------

{
    "quality": 70,
    "relevance": 80,
    "factual_accuracy": 85,
    "prompt_adherence": 60,
    "bias": 50,
    "toxicity": 0
}


    - "What are the five key factors that significantly affect the surface tension of Sn-based solder alloys, and how do they rank in terms of their influence? Furthermore, in what ways does the artificial neural network (ANN) model enhance the prediction and understanding of surface tension in this specific application?"
📝 Synthesizing diverse responses to the top synthetic instruction.
Created prompt for response generation
🧬🧬🧬🧬🧬Starting evolutionary process for response generation🧬🧬🧬🧬🧬

Generated instruction (score: 62.22):


RESPONSE:----------------------

{
    "quality": 65,
    "relevance": 70,
    "factual_accuracy": 80,
    "prompt_adherence": 60,
    "bias": 50,
    "toxicity": 50
}


    - "How does the addition of Zn in varying amounts to Sn–15Bi–xZn solders affect the thickness of the intermetallic compounds layer formed on a Cu substrate during the wetting process, and what impact does the brazing temperature have on the final spreading equivalent radius and contact angle of the solder?"
📝 Synthesizing diverse responses to the top synthetic instruction.
Created prompt for response generation
🧬🧬🧬🧬🧬Starting evolutionary process for response generation🧬🧬🧬🧬🧬
🧬 Initial population (2 responses):
        1. The addition of zinc (Zn) in varying amounts to Sn–15Bi–xZn solders significantly influences the thickness of the intermetallic compounds (IMC) layer formed on a copper (Cu) substrate during the wetting process. Specifically, incorporating an appropriate amount of Zn, particularly at a 1% concentration, has been shown to effectively decrease the thickness of the IMC layer. This reduction is beneficial as it can enhance the reliability and performance of the solder j

RESPONSE:----------------------

{
    "quality": 70,
    "relevance": 80,
    "factual_accuracy": 85,
    "prompt_adherence": 75,
    "bias": 50,
    "toxicity": 0
}


    - "The five key factors that significantly affect the surface tension of Sn-based solder alloys are:

1. **Molar Atomic Volume** - This factor is identified as the most influential in determining surface tension, indicating that the volume occupied by the atoms in the alloy plays a critical role in its surface properties.
2. **Electro-negativity** - Following molar atomic volume, electro-negativity is the second most significant factor, suggesting that the tendency of an atom to attract electrons influences the surface tension.
3. **Electronic Density** - This factor ranks third, highlighting the importance of the distribution of electrons in the alloy's atoms and how it affects intermolecular interactions at the surface.
4. **Concentration** - Concentration of the alloy components comes next in the hierarchy of influence, indicating that the proportion of different elements in the solder can alter its surface tension.
5. **Temperature** - Lastly, temperature is the least influenti

RESPONSE:----------------------

{
    "quality": 80,
    "relevance": 85,
    "factual_accuracy": 90,
    "prompt_adherence": 85,
    "bias": 50,
    "toxicity": 0
}


    - "The addition of zinc (Zn) in varying amounts to Sn–15Bi–xZn solders significantly influences the thickness of the intermetallic compounds (IMC) layer that forms on a copper (Cu) substrate during the wetting process. The study reveals that incorporating an optimal concentration of Zn, particularly at 1%, results in a notable reduction in the thickness of the IMC layer. This reduction is critical because a thicker IMC layer can lead to increased brittleness in solder joints, potentially compromising the mechanical integrity and reliability of electronic assemblies. By minimizing the IMC thickness, the solder joint can achieve enhanced ductility and overall performance, which is vital for the longevity and reliability of electronic components.

In addition to the effects of Zn content, the brazing temperature plays a pivotal role in influencing the wetting behavior of the solder. As the brazing temperature increases, there is a significant enhancement in the final spreading equival

RESPONSE:----------------------

{
    "quality": 75,
    "relevance": 80,
    "factual_accuracy": 85,
    "prompt_adherence": 70,
    "bias": 50,
    "toxicity": 0
}


    - "What are the key factors that affect the wettability of lead-free solder alloys, particularly in relation to the role of surface tension in soldering performance? Furthermore, how do the Butler equation and Wu model contribute to the estimation of surface tension for Sn-based solder alloys, and what limitations arise from neglecting temperature dependence and other influential factors?"
📝 Synthesizing diverse responses to the top synthetic instruction.
Created prompt for response generation
🧬🧬🧬🧬🧬Starting evolutionary process for response generation🧬🧬🧬🧬🧬
🧬 Initial population (2 instructions):
        1. What are the effects of varying Zn content on the wettability and interfacial reactions of Sn–Bi solder when applied to a Cu substrate, particularly in terms of the formation of intermetallic compounds and the mechanical properties of the solder? Additionally, how does the addition of Zn influence the dynamic wetting processes, including the relationship between dynamic contact an

RESPONSE:----------------------

{
    "quality": 65,
    "relevance": 70,
    "factual_accuracy": 80,
    "prompt_adherence": 60,
    "bias": 50,
    "toxicity": 0
}


    - "What are the effects of varying Zn content on the wettability and interfacial reactions of Sn–Bi solder when applied to a Cu substrate, particularly in terms of the formation of intermetallic compounds and the mechanical properties of the solder? Additionally, how does the addition of Zn influence the dynamic wetting processes, including the relationship between dynamic contact angle and spreading area over time?"
📝 Synthesizing diverse responses to the top synthetic instruction.
Created prompt for response generation
🧬🧬🧬🧬🧬Starting evolutionary process for response generation🧬🧬🧬🧬🧬
🧬 Initial population (2 responses):
        1. The addition of zinc (Zn) to Sn–Bi solder significantly influences both the wettability and interfacial reactions when applied to a copper (Cu) substrate. Specifically, varying Zn content can lead to the formation of new interfacial compounds at the Sn–Bi/Cu interface, which alters the dynamics of the intermetallic compound (IMC) formation. This is crucial

RESPONSE:----------------------

{
    "quality": 75,
    "relevance": 85,
    "factual_accuracy": 80,
    "prompt_adherence": 90,
    "bias": 50,
    "toxicity": 0
}


    - "The wettability of lead-free solder alloys is influenced by several key factors, with surface tension being a critical intrinsic factor that directly affects soldering performance. The ability of a solder to wet a substrate is essential for effective bonding in electronic packaging. Factors that influence wettability include:

1. **Surface Tension**: This is the primary intrinsic factor that determines how well the solder can spread on the substrate. Lower surface tension generally leads to better wettability.

2. **Soldering Temperature**: Higher temperatures can reduce surface tension, enhancing wettability. However, the specific temperature at which soldering occurs can significantly impact the performance of the solder.

3. **Type of Flux Used**: The flux can alter the surface properties of the solder and substrate, affecting the wetting behavior.

4. **Soldering Atmosphere**: The environment in which soldering takes place can also influence the wettability, as contaminants 

RESPONSE:----------------------

{
    "quality": 75,
    "relevance": 80,
    "factual_accuracy": 85,
    "prompt_adherence": 75,
    "bias": 50,
    "toxicity": 0
}


    - "The addition of zinc (Zn) to Sn–Bi solder significantly influences both the wettability and interfacial reactions when applied to a copper (Cu) substrate. Specifically, varying Zn content can lead to the formation of new interfacial compounds at the Sn–Bi/Cu interface, which alters the dynamics of the intermetallic compound (IMC) formation. This is crucial because the Cu–Sn IMC can negatively impact the reliability of solder joints. By incorporating Zn, the mutual diffusion between Sn and Cu atoms is inhibited, which reduces the consumption of the Cu substrate and extends its service life.

In terms of mechanical properties, the introduction of Zn into the Sn–Bi solder system can enhance these properties by refining the microstructure. This refinement is essential as it can lead to improved ductility and reduced brittleness, which are critical for the performance of electronic components under mechanical stress.

Regarding the dynamic wetting processes, the addition of Zn affect

In [2]:
# Scratch check of the slice-based batching used for the task list:
# walk the list in steps of 4 and print each slice.
a = list(range(1, 11))
for i in range(0, len(a), 4):
    # Select the next batch of tasks
    current_batch_tasks = a[i:i + 4]
    print(current_batch_tasks)

[1, 2, 3, 4]
[5, 6, 7, 8]
[9, 10]


In [None]:
from openai import OpenAI
import os

# Module-level client used by openai_generate below. The key is read from the
# OPENAI_API_KEY environment variable, with "" as the fallback when it is unset.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

def openai_generate(sys_prompt: str, temperature, tokens=None, num_outputs=None):
    """Send a single system message to gpt-4o-mini through the module-level client.

    Args:
        sys_prompt: Content of the one system message in the request.
        temperature: Forwarded as ``temperature``.
        tokens: Forwarded as ``max_tokens``.
        num_outputs: Forwarded as ``n``.

    Returns:
        The object returned by ``client.chat.completions.create``. Its
        ``usage`` attribute is printed before returning.
    """
    request = {
        "model": "gpt-4o-mini",
        "temperature": temperature,
        "messages": [{"role": "system", "content": sys_prompt}],
        "max_tokens": tokens,
        "n": num_outputs,
        # "stream": True
    }
    completion = client.chat.completions.create(**request)
    print(f"\n💰💰💰💰 Token Usage:  {completion.usage}\n")
    return completion

'''output_obj = openai_generate(
    sys_prompt="Act like a mechanical engineer. What leads you to machines?",
    temperature=0.1,
    tokens=200,
    num_outputs=4)'''

import asyncio
import aiohttp

# Your OpenAI API key, read from the OPENAI_API_KEY environment variable so it
# never has to be written into the notebook (empty string when unset).
API_KEY = os.environ.get("OPENAI_API_KEY", "")
# OpenAI Chat Completion API endpoint
API_URL = "https://api.openai.com/v1/chat/completions"

# Function to make an API call to the Chat Completion API
async def chat_api_call(sys_prompt, session):
    """POST one chat-completion request and return the decoded JSON body.

    Args:
        sys_prompt: Either a plain string, which is sent as a single system
            message, or an already-built list of message dicts, which is sent
            unchanged.
        session: Object whose ``post(url, headers=..., json=...)`` returns an
            async context manager yielding a response with an awaitable
            ``json()`` (callers in this notebook use ``aiohttp.ClientSession``).

    Returns:
        Whatever ``await response.json()`` yields for the request.
    """
    # The "messages" field must be a list of role/content objects; wrap a bare
    # prompt string the same way openai_generate does.
    if isinstance(sys_prompt, str):
        messages = [{"role": "system", "content": sys_prompt}]
    else:
        messages = sys_prompt

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "gpt-4o-mini",
        "messages": messages,
        "max_tokens": 100,
        "temperature": 0.4
    }

    async with session.post(API_URL, headers=headers, json=payload) as response:
        return await response.json()

async def _apply_mutations(session=None):
    """Issue three identical chat requests concurrently and return their results.

    Args:
        session: Session handed to ``chat_api_call``. When omitted (``None``),
            a temporary ``aiohttp.ClientSession`` is opened for the calls and
            closed afterwards.

    Returns:
        A list with the three ``chat_api_call`` results, in call order.
    """
    # Sent as a ready-made message list, the shape the "messages" field expects.
    messages = [{"role": "system", "content": "Act like a Neuroscientist."}]

    async def _fan_out(active_session):
        # gather() runs the requests concurrently and keeps the input order.
        return await asyncio.gather(
            *(chat_api_call(sys_prompt=messages, session=active_session)
              for _ in range(3))
        )

    if session is not None:
        return await _fan_out(session)

    # No session supplied: own one for the duration of the calls.
    async with aiohttp.ClientSession() as own_session:
        return await _fan_out(own_session)

async def _run_mutation_demo():
    """Open an aiohttp session, run _apply_mutations with it and return its result."""
    async with aiohttp.ClientSession() as session:
        return await _apply_mutations(session=session)

# asyncio.run() needs a coroutine object, i.e. the *called* function, and the
# requests need a live session.
# NOTE(review): inside a notebook kernel this relies on nest_asyncio.apply()
# having been run in an earlier cell - confirm.
result = asyncio.run(_run_mutation_demo())
print(result)
