## This code runs patent draft inference on 10 sample patents (these patents already exist)

In [4]:
import pandas as pd
# Load the patent dataset from CSV into a DataFrame and preview its first rows
df = pd.read_csv('D:/Topcoder/patent_documentation/patent_data_with_cleaned_inputs_phi3_mini.csv')
df.head()

Unnamed: 0,publication_number,title,abstract,claims,description,related_art,problem_statement,field,drawings,additional_details,cleaned_related_art,cleaned_problem_statement,cleaned_field,cleaned_drawings,cleaned_additional_details
0,US-11114351-B2,Dummy element and method of examining defect o...,A dummy element includes: a semiconductor subs...,What is claimed is: \n \n 1. A dummy...,CROSS-REFERENCE TO RELATED APPLICATION \n ...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,A dummy element includes: a semiconductor subs...,CROSS-REFERENCE TO RELATED APPLICATION \n ...,Dummy element and method of examining defect o...,What is claimed is: \n \n 1. A dummy...,CROSS-REFERENCE TO RELATED APPLICATION \n ...
1,US-10946109-B2,Polymer-type fluorescent molecule probe,The present invention provides a fluorescent m...,The invention claimed is: \n \n 1. A...,TECHNICAL FIELD \n The present application...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,The present invention provides a fluorescent m...,TECHNICAL FIELD \n The present application...,Polymer-type fluorescent molecule probe TECHNI...,The invention claimed is: \n \n 1. A...,TECHNICAL FIELD \n The present application...
2,US-11112260-B2,Geospatial navigation methods and systems for ...,An exemplary geospatial navigation system defi...,What is claimed is: \n \n 1. A metho...,BACKGROUND INFORMATION \n Use of mobile na...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,An exemplary geospatial navigation system defi...,BACKGROUND INFORMATION \n Use of mobile na...,Geospatial navigation methods and systems for ...,What is claimed is: \n \n 1. A metho...,BACKGROUND INFORMATION \n Use of mobile na...
3,US-10940384-B2,Inciting user action for motion sensor calibra...,"In a method of motion sensor calibration, a mo...",What is claimed is: \n \n 1. A metho...,CROSS-REFERENCE TO RELATED APPLICATION—PROVISI...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,"In a method of motion sensor calibration, a mo...",CROSS-REFERENCE TO RELATED APPLICATION—PROVISI...,Inciting user action for motion sensor calibra...,What is claimed is: \n \n 1. A metho...,CROSS-REFERENCE TO RELATED APPLICATION—PROVISI...
4,US-2021298305-A1,Use of a difluoro-(2-hydroxypropyl)pyridine co...,The present disclosure is related to the field...,What is claimed is: \n \n 1 . A me...,CROSS-REFERENCE TO RELATED APPLICATION(S) \n ...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,\n Based on the following patent informatio...,The present disclosure is related to the field...,CROSS-REFERENCE TO RELATED APPLICATION(S) \n ...,Use of a difluoro-(2-hydroxypropyl)pyridine co...,What is claimed is: \n \n 1. A met...,CROSS-REFERENCE TO RELATED APPLICATION(S) \n ...


In [5]:
# List the column names available in the loaded dataset
df.columns

Index(['publication_number', 'title', 'abstract', 'claims', 'description',
       'related_art', 'problem_statement', 'field', 'drawings',
       'additional_details', 'cleaned_related_art',
       'cleaned_problem_statement', 'cleaned_field', 'cleaned_drawings',
       'cleaned_additional_details'],
      dtype='object')

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import evaluate
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import evaluate
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import time

# Load the model and tokenizer
# The active checkpoint is the first model_name; the second is an alternative left commented out.
model_name = "beunique/Llama-3.1-8B-bnb-4bit-patent"
# model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Put the model in evaluation mode; this cell only runs inference
model.eval()

# NOTE(review): section_times is never written or read in the visible code — confirm it is still needed
section_times = {}

# Wall-clock start; the total execution time printed at the end of the cell is measured from here
start_time = time.time()

def generate_patent_section(instruction, input_text, max_tokens=512):
    """Generate one patent section from an instruction/input prompt.

    Builds an "### Instruction / ### Input / ### Response" prompt, runs the
    module-level ``model`` on it and returns only the text produced after the
    prompt.

    Args:
        instruction: Task description placed under "### Instruction:".
        input_text: Invention details placed under "### Input:".
        max_tokens: Upper bound on newly generated tokens (``max_new_tokens``).

    Returns:
        The generated continuation with surrounding whitespace stripped.
    """
    prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:"
    # Send the inputs to the device the model was actually placed on instead of
    # guessing "cuda"/"cpu" (the model is loaded with device_map="auto").
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            # temperature/top_k/top_p only apply when sampling is enabled, so
            # request it explicitly rather than relying on the checkpoint's
            # generation config.
            do_sample=True,
            temperature=0.7,
            top_k=40,
            top_p=0.92,
        )

    # Decode only the newly generated tokens. Splitting the full decoded text on
    # "### Response:" returned the wrong segment whenever that marker appeared
    # again in the generated text.
    new_tokens = output[0][prompt_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return response


def prepare_input(row):
    """Render the cleaned columns of one patent row as labelled input text.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the
            ``cleaned_*`` keys read below.

    Returns:
        Five "Label: value" lines joined by newlines, in a fixed order.
    """
    labelled_columns = (
        ("Field of Invention", 'cleaned_field'),
        ("Problem to Solve", 'cleaned_problem_statement'),
        ("Related Art", 'cleaned_related_art'),
        ("Key Features", 'cleaned_additional_details'),
        ("Drawings", 'cleaned_drawings'),
    )
    rendered = [f"{label}: {row[column]}" for label, column in labelled_columns]
    return "\n".join(rendered)


def format_claims(claims_text):
    """Normalize generated claims to one numbered, non-repeated claim per line.

    Blank lines are dropped, exact (case-insensitive) repeats are removed, and
    any line that does not already start with a digit is given the next
    sequential claim number.

    Args:
        claims_text: Raw claims text with one claim per line.

    Returns:
        The cleaned claims joined by newlines ("" when nothing remains).
    """
    formatted_claims = []
    seen_claims = set()
    for raw_line in claims_text.split('\n'):
        claim = raw_line.strip()
        if not claim:
            continue
        # Key on the text as written, before a number is prepended; recording
        # only the numbered form meant repeated unnumbered lines never matched.
        original_key = claim.lower()
        if original_key in seen_claims:
            continue
        if not claim[0].isdigit():
            claim = f"{len(formatted_claims) + 1}. {claim}"
        formatted_claims.append(claim)
        seen_claims.add(original_key)
        # Also remember the numbered form so a later, already-numbered repeat
        # of this claim is still recognized.
        seen_claims.add(claim.lower())
    return '\n'.join(formatted_claims)

def remove_duplicate_claims(claims):
    """Drop claims whose text repeats an earlier claim, ignoring claim numbers.

    Lines are compared after removing a leading claim number (e.g. "3.", "3)",
    "3 .") and collapsing internal whitespace. The first occurrence is kept
    unchanged, including its original number.

    Args:
        claims: Claims text with one claim per line.

    Returns:
        The surviving lines joined by newlines.
    """
    unique_claims = []
    seen = set()
    for claim in claims.split('\n'):
        # Strip only a real leading claim number. Unconditionally dropping the
        # first word made unnumbered claims that differ only in their first
        # word look identical.
        body = re.sub(r'^\s*\d+(?:\s*[.):]\s*|\s+)', '', claim)
        claim_text = ' '.join(body.split())
        if claim_text not in seen:
            unique_claims.append(claim)
            seen.add(claim_text)
    return '\n'.join(unique_claims)


# Sentence encoders keyed by model name, so the weights are loaded once per
# kernel instead of on every call.
_SENTENCE_ENCODERS = {}


def _get_sentence_encoder(name='paraphrase-MiniLM-L6-v2'):
    """Return the cached SentenceTransformer for ``name``, loading it on first use."""
    if name not in _SENTENCE_ENCODERS:
        _SENTENCE_ENCODERS[name] = SentenceTransformer(name)
    return _SENTENCE_ENCODERS[name]


def remove_repetitive_sentences(text, similarity_threshold=0.8):
    """Remove sentences that are near-duplicates of an earlier sentence.

    The text is split after '.', '!' or '?' followed by whitespace. A sentence
    is dropped when its embedding's cosine similarity to any earlier sentence
    exceeds ``similarity_threshold``.

    Args:
        text: Text to filter.
        similarity_threshold: Similarity above which a sentence counts as a
            repeat of an earlier one.

    Returns:
        The kept sentences joined by single spaces ("" for blank input). Line
        breaks that separated sentences are not preserved.
    """
    # Blank pieces can only be the whole text or a trailing remainder; they
    # were never emitted, so filtering them up front does not change the result.
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]

    # Nothing to compare against: skip loading and running the encoder.
    if len(sentences) < 2:
        return ' '.join(sentences)

    sentence_embeddings = _get_sentence_encoder().encode(sentences)
    similarity_matrix = cosine_similarity(sentence_embeddings)

    # Keep a sentence only if no earlier sentence is too similar to it.
    unique_sentences = [
        sentence
        for i, sentence in enumerate(sentences)
        if not any(similarity_matrix[i][j] > similarity_threshold for j in range(i))
    ]

    return ' '.join(unique_sentences)




def assemble_patent(title, field, summary, claims, description, abstract):
    """Lay the generated sections out as one plain-text patent draft.

    The result starts with a newline and the upper-cased title, followed by the
    BACKGROUND (field), SUMMARY, ABSTRACT, CLAIMS and DESCRIPTION sections in
    that order, and ends with a newline.

    Args:
        title: Patent title; upper-cased in the output.
        field: Field-of-invention text.
        summary: Summary text.
        claims: Claims text.
        description: Description text.
        abstract: Abstract text.

    Returns:
        The assembled patent text.
    """
    layout = [
        "",
        f"{title.upper()}",
        "",
        "BACKGROUND",
        f"Field: {field}",
        "",
        "SUMMARY",
        f"{summary}",
        "",
        "ABSTRACT",
        "",
        f"{abstract}",
        "",
        "CLAIMS",
        "",
        f"{claims}",
        "",
        "DESCRIPTION",
        "",
        f"{description}",
        "",
    ]
    return "\n".join(layout)

# Instructions for each section
# Each *_instruction string is passed as the `instruction` argument of
# generate_patent_section() when generate_patent() builds the matching section.
title_instruction = "Generate a concise, technical title for the patent, no more than 15 words long."
# NOTE(review): the field and summary prompts refer to "the Patent Claims", but the
# input built by prepare_input() has no section with that label — confirm the intent.
field_instruction = "Generate 1 or 2 sentences describing the field of the invention based on the preamble(s) of the Patent Claims."

summary_instruction = "Generate a summary of the invention, reciting all the features of the Patent Claims."

abstract_instruction = """Generate a one-paragraph patent abstract (150-250 words) that includes:
1. The field of the invention
2. A brief summary of the problem solved
3. A high-level description of the invention
4. The primary advantage or improvement offered by the invention"""

# Prompt for the claims section: one independent claim, dependent claims, a method
# claim and dependent method claims, followed by numbered rules.
claims_instruction = """Generate a set of diverse patent claims for the invention using the following structure:

1. Independent Claim (1 claim):
   - Broadly describe the core invention
   - Include all essential elements mentioned in the input
   - Format: "A [type of invention], comprising: [list of key components]; and [key function or purpose]."

2. Dependent Claims (8-10 claims total):
   - Each claim must add a unique, non-repetitive feature or limitation
   - Focus on different aspects: components, functions, configurations, materials, etc.
   - Format: "The [invention] of claim [X], wherein/further comprising [new, unique feature]."

3. Method Claim (1 claim):
   - Describe the process of using the invention
   - Include key steps that reflect the invention's operation
   - Format: "A method of [using/operating] the [invention] of claim 1, comprising: [list of key steps]."

4. Dependent Method Claims (1-2 claims):
   - Add specific, non-repetitive details to the method claim
   - Format: "The method of claim [X], further comprising [additional unique step or detail]."

Rules:
1. Ensure each claim is a single sentence.
2. Number claims consecutively.
3. STRICTLY AVOID ANY REPETITION in claim content or wording.If you don't follow this I will be penalized.
4. Each claim must introduce a new, unique aspect of the invention.
5. Limit to a maximum of 10 claims.
6. If you can't generate a new, unique claim, stop generating claims.

Make sure all claims are relevant to the invention described in the input."""



# Prompt for the description section.
# NOTE(review): the "- 3. STRICTLY AVOID ANY REPETITION in claim content ..." bullet
# below matches rule 3 of claims_instruction (stray "3." and "claim content" wording);
# it looks copy-pasted — confirm the intended text for the description prompt.
description_instruction = """Generate a detailed patent description including the following sections:
1. FIELD OF THE INVENTION (1-2 paragraphs)
2. BACKGROUND OF THE INVENTION (2-3 paragraphs)
3. SUMMARY OF THE INVENTION (2-3 paragraphs)
4. BRIEF DESCRIPTION OF THE DRAWINGS (1-2 paragraphs)
5. DETAILED DESCRIPTION OF THE PREFERRED EMBODIMENTS (4-5 paragraphs)

Use appropriate headings for each section. Provide technical details, examples, and references to the drawings. 
For the DETAILED DESCRIPTION section:
- Describe the components of the invention in detail
- Explain how these components interact
- Provide at least one specific example of how the invention operates
- Discuss potential variations or alternative embodiments
- 3. STRICTLY AVOID ANY REPETITION in claim content or wording. If you don't follow this I will be penalized.
Aim for a total description length of at least 1000 words."""

# Token limits for different sections
# generate_patent() looks these up by section name and passes the value as
# `max_tokens` (the cap on newly generated tokens) to generate_patent_section().
token_limits = {
    'Title': 30,
    'Field': 50,
    'Summary': 500,
    'Abstract': 250,
    'Claims': 1500,
    'Description': 7500
}

def generate_patent(input_text):
    """Generate every patent section for one input and assemble the draft.

    Sections are generated in the order title, field, abstract, claims,
    description; the description and claims are then post-processed, and the
    summary is generated last.

    Args:
        input_text: Prompt input text describing the invention.

    Returns:
        The assembled patent text produced by ``assemble_patent``.
    """
    def draft(instruction, section_name):
        # One model call, capped at the token limit configured for the section.
        return generate_patent_section(instruction, input_text, token_limits[section_name])

    title = draft(title_instruction, 'Title')
    field = draft(field_instruction, 'Field')
    abstract = draft(abstract_instruction, 'Abstract')
    raw_claims = draft(claims_instruction, 'Claims')
    raw_description = draft(description_instruction, 'Description')

    # Post-process description
    description = remove_repetitive_sentences(raw_description)

    # Post-process claims: number them, drop repeated claims, then drop
    # near-duplicate sentences.
    # NOTE(review): remove_repetitive_sentences re-joins its sentences with
    # spaces, so the one-claim-per-line layout produced by the two earlier
    # steps is not kept — confirm this is intended for claims.
    claims = raw_claims
    for clean_up in (format_claims, remove_duplicate_claims, remove_repetitive_sentences):
        claims = clean_up(claims)

    summary = draft(summary_instruction, 'Summary')
    return assemble_patent(title, field, summary, claims, description, abstract)

# Load the test data
test_df = pd.read_csv('D:/Topcoder/patent_documentation/patent_data_with_cleaned_inputs_phi3_mini.csv')
# Take an explicit copy of the 10-row sample: adding a column to the bare
# head() slice can raise pandas' SettingWithCopyWarning.
test_df = test_df.head(10).copy()

# Prepare input text for each row
test_df['input_text'] = test_df.apply(prepare_input, axis=1)

# Load ROUGE metric
rouge = evaluate.load('rouge')

# Function to calculate ROUGE scores
def calculate_rouge_scores(pred, ref):
    """Score one prediction against one reference with the loaded ROUGE metric.

    Args:
        pred: Generated text.
        ref: Reference text.

    Returns:
        Whatever ``rouge.compute`` returns for this single pair, with stemming
        enabled.
    """
    single_pair = {"predictions": [pred], "references": [ref]}
    return rouge.compute(use_stemmer=True, **single_pair)


# Create a directory to store the generated patents
os.makedirs('generated_patents', exist_ok=True)


def _split_generated_sections(patent_text):
    """Recover title, abstract, claims and description from an assembled patent.

    assemble_patent() emits the sections in a fixed order, each heading on its
    own line with a blank line on both sides. The text is partitioned once per
    heading, in order, so a heading word that reappears inside a later section
    (for example a "DETAILED DESCRIPTION" heading within the description) no
    longer cuts that section short. A missing heading yields an empty section
    instead of raising IndexError.
    """
    head, _, rest = patent_text.partition('\n\nABSTRACT\n\n')
    abstract, _, rest = rest.partition('\n\nCLAIMS\n\n')
    claims, _, description = rest.partition('\n\nDESCRIPTION\n\n')
    return {
        # The title is everything before the first blank line.
        'title': head.split('\n\n')[0],
        'abstract': abstract,
        'claims': claims,
        'description': description,
    }


# Evaluate on the entire test set
all_rouge_scores = []

for index, row in test_df.iterrows():
    input_text = row['input_text']
    generated_patent = generate_patent(input_text)

    # Save the generated patent to a text file
    with open(f'generated_patents/patent_{index+1}.txt', 'w', encoding='utf-8') as f:
        f.write(generated_patent)

    # Calculate ROUGE scores for each section against the same-named reference column
    generated_sections = _split_generated_sections(generated_patent)
    combined_scores = {
        section: calculate_rouge_scores(generated_sections[section], row[section])
        for section in ('title', 'abstract', 'claims', 'description')
    }

    all_rouge_scores.append(combined_scores)

    print(f"Processed patent {index + 1}/{len(test_df)}")

# Calculate average ROUGE scores
avg_rouge_scores = {
    section: {metric: sum(score[section][metric] for score in all_rouge_scores) / len(all_rouge_scores) 
              for metric in all_rouge_scores[0][section]}
    for section in ['title', 'abstract', 'claims', 'description']
}

# Print average ROUGE scores
print("\nAverage ROUGE Scores:")
for section, scores in avg_rouge_scores.items():
    print(f"\n{section.capitalize()}:")
    for metric, value in scores.items():
        print(f"  {metric}: {value:.4f}")

# Save the results to a CSV file
# Each cell of results_df holds the per-section dict of ROUGE metrics for one patent.
print(f"\nGenerated patents saved in the 'generated_patents' directory")
results_df = pd.DataFrame(all_rouge_scores)
results_df.to_csv('patent_generation_results.csv', index=False)
print("\nDetailed results saved to 'patent_generation_results.csv'")

# Elapsed time is measured from start_time, set before the function definitions above
total_time = time.time() - start_time
print(f"Total execution time in seconds: {total_time:.2f} seconds")
total_time_minutes = total_time / 60
print(f"Total execution time: {total_time_minutes:.2f} minutes")

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Processed patent 1/10




Processed patent 2/10




Processed patent 3/10




Processed patent 4/10




Processed patent 5/10




Processed patent 6/10




Processed patent 7/10




Processed patent 8/10




Processed patent 9/10




Processed patent 10/10

Average ROUGE Scores:

Title:
  rouge1: 0.2191
  rouge2: 0.0895
  rougeL: 0.1760
  rougeLsum: 0.1814

Abstract:
  rouge1: 0.5725
  rouge2: 0.4278
  rougeL: 0.4812
  rougeLsum: 0.4767

Claims:
  rouge1: 0.2480
  rouge2: 0.1846
  rougeL: 0.2112
  rougeLsum: 0.2361

Description:
  rouge1: 0.1042
  rouge2: 0.0590
  rougeL: 0.0657
  rougeLsum: 0.0990

Generated patents saved in the 'generated_patents' directory

Detailed results saved to 'patent_generation_results.csv'
Total execution time in seconds: 10016.22 seconds
Total execution time: 166.94 minutes
