## Prepare `instruction_df` in the instruction-tuning format: `instruction`, `input`, `output`

In [7]:
import pandas as pd
import re

# Load the source DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# re-runs as-is on a machine that has this exact file location.
INPUT_CSV_PATH = 'D:/Topcoder/patent_documentation/patent_data_with_cleaned_inputs_phi3_mini.csv'
df = pd.read_csv(INPUT_CSV_PATH)

# Function to remove redundancy
def remove_redundancy(text):
    """Strip the boilerplate prompt header from a single field value.

    Removes every occurrence of the literal
    "\\n    Based on the following patent information:\\n" from ``text``.

    Args:
        text: The cell value to clean. Normally a string, but cells read from
            a CSV can also be ``None`` or float NaN (empty cells) or a number.

    Returns:
        str: The text without the boilerplate header. Missing values
        (``None`` / NaN) become an empty string; any other non-string value
        is converted with ``str()`` before cleaning.
    """
    # Empty CSV cells arrive as float NaN; NaN is the only float that is not
    # equal to itself. Without this guard `.replace` raises AttributeError
    # and a single missing cell aborts the whole dataset build.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        # Numeric-looking columns are parsed as int/float rather than str.
        text = str(text)
    return text.replace("\n    Based on the following patent information:\n", "")

# Function to clean input text by removing specific keywords
def clean_input_text(text):
    """Drop hyphen-prefixed section labels and collapse whitespace.

    A label is only removed when it is introduced by a hyphen, e.g.
    " - Abstract:", " - Description:", " - Claims:" or " - Title:"; the
    whitespace around the hyphen is removed with it. A bare "Abstract:"
    without a leading hyphen is left untouched.

    Args:
        text (str): Raw input text.

    Returns:
        str: The text with those labels removed, every run of whitespace
        reduced to one space, and no leading or trailing whitespace.
    """
    without_labels = re.sub(r'\s*-\s*(Abstract|Description|Claims|Title):', '', text)
    # split() with no arguments cuts on any whitespace run and discards empty
    # edge pieces, so re-joining with single spaces normalizes and trims at once
    return ' '.join(without_labels.split())

# Function to create the instruction, input, and output format
def create_instruction_based_dataset(df):
    """Expand each patent row into four instruction/input/output examples.

    For every row of ``df`` the five context columns are passed through
    ``remove_redundancy``, labelled, joined with newlines and then cleaned by
    ``clean_input_text`` to form one shared ``input`` string. That input is
    paired with four instructions whose ``output`` is the row's ``title``,
    ``abstract``, ``claims`` and ``description`` respectively, in that order.

    Args:
        df (pd.DataFrame): Must provide the columns ``related_art``,
            ``problem_statement``, ``field``, ``drawings``,
            ``additional_details``, ``title``, ``abstract``, ``claims`` and
            ``description``.

    Returns:
        pd.DataFrame: Four rows per input row with the columns
        ``instruction``, ``input`` and ``output``.
    """
    # (label shown in the input text, source column), in presentation order
    input_sections = (
        ("Related Art", 'related_art'),
        ("Problem Statement", 'problem_statement'),
        ("Field", 'field'),
        ("Drawings", 'drawings'),
        ("Additional Details", 'additional_details'),
    )
    # (instruction text, column holding the expected output), one per task
    tasks = (
        ("Generate the patent title based on the following input.", 'title'),
        ("Generate the abstract based on the following input.", 'abstract'),
        ("Generate the claims based on the following input.", 'claims'),
        ("Generate the description based on the following input.", 'description'),
    )

    records = []
    for _, patent in df.iterrows():
        # Strip the boilerplate header from each field before labelling it
        labelled_sections = [
            f"{label}: {remove_redundancy(patent[column])}"
            for label, column in input_sections
        ]
        base_input = clean_input_text("\n".join(labelled_sections))

        # All four tasks of one patent share the same input text
        for instruction, target_column in tasks:
            records.append({
                'instruction': instruction,
                'input': base_input,
                'output': patent[target_column],
            })

    return pd.DataFrame(records)

# Build the instruction/input/output table from the loaded patents
instruction_df = create_instruction_based_dataset(df)

# Write it out without the index column; the path is relative to the
# notebook's working directory
output_csv_path = 'instruction_based_patent_dataset_cleaned.csv'
instruction_df.to_csv(output_csv_path, index=False)

print(f"Dataset formatted with instruction, input, and output has been saved as '{output_csv_path}'.")


Dataset formatted with instruction, input, and output has been saved as 'instruction_based_patent_dataset_cleaned.csv'.


In [8]:
# Preview: each source patent should appear as four consecutive rows
# (title, abstract, claims, description) sharing the same input text
instruction_df.head()

Unnamed: 0,instruction,input,output
0,Generate the patent title based on the followi...,Related Art: A dummy element includes: a semic...,Dummy element and method of examining defect o...
1,Generate the abstract based on the following i...,Related Art: A dummy element includes: a semic...,A dummy element includes: a semiconductor subs...
2,Generate the claims based on the following input.,Related Art: A dummy element includes: a semic...,What is claimed is: \n \n 1. A dummy...
3,Generate the description based on the followin...,Related Art: A dummy element includes: a semic...,CROSS-REFERENCE TO RELATED APPLICATION \n ...
4,Generate the patent title based on the followi...,Related Art: The present invention provides a ...,Polymer-type fluorescent molecule probe


In [9]:
# Inspect the untruncated text of the first patent's four examples only.
# Displaying the full array would embed every row, including complete patent
# descriptions, in the notebook output.
instruction_df.head(4).values

array([['Generate the patent title based on the following input.',
        'Related Art: A dummy element includes: a semiconductor substrate; a lower insulating film deposited on the semiconductor substrate; a first resistive layer deposited on the lower insulating film; an interlayer insulating film covering the first resistive layer; a first pad-forming electrode deposited on the interlayer insulating film so as to be connected to the first resistive layer Problem Statement: CROSS-REFERENCE TO RELATED APPLICATION This application claims benefit of priority under 35 USC 119 based on Japanese Patent Application No. 2018-150311 filed on Aug. 9, 2018, the entire contents of which are incorporated by reference herein. Field: Dummy element and method of examining defect of resistive element CROSS-REFERENCE TO RELATED APPLICATION This application claims benefit of priority under 35 USC 119 based on Japanese Patent Application No. 2018-150311 filed on Aug. 9, 20 Drawings: What is claimed is: