## This code is used for patent draft inference on sample unseen patents with some initial inputs

In [1]:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import evaluate
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import time

# Hugging Face Model Name
# (repository id handed to both from_pretrained calls below)
model_name = "beunique/Llama-3.1-8B-bnb-4bit-patent"

# Load the model and tokenizer from Hugging Face
# device_map="auto" leaves device placement to the library; torch_dtype requests
# float16 for the loaded weights.
# NOTE(review): the repository name suggests a bitsandbytes 4-bit checkpoint --
# confirm how torch_dtype interacts with its stored quantization config.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ensure the model is in evaluation mode for inference
model.eval()

def generate_patent_section(instruction, input_text, max_tokens=512):
    """Generate one section of a patent draft with the module-level model.

    The prompt uses the "### Instruction / ### Input / ### Response" layout and
    the text after the last "### Response:" marker of the decoded output is
    returned.

    Args:
        instruction: Task description for the section to write.
        input_text: Invention details placed under "### Input:".
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The generated section text with surrounding whitespace stripped.
    """
    prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:"
    # Send the inputs to wherever the model actually lives; with
    # device_map="auto" that is not necessarily plain "cuda" or "cpu".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            # Without do_sample=True generation is greedy and the sampling
            # settings below (temperature / top_k / top_p) are ignored.
            do_sample=True,
            temperature=0.7,
            top_k=40,
            top_p=0.92,
        )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    # The decoded text starts with the prompt; keep only what follows the
    # final response marker.
    response = generated_text.split("### Response:")[-1].strip()
    return response

def prepare_input(input_data):
    """Render the invention details as five labelled lines joined by newlines.

    ``input_data`` is indexed with the keys 'field', 'problem_statement',
    'related_art', 'additional_details' and 'drawings' (in that order); a
    missing key raises ``KeyError``.
    """
    labelled_lines = [
        f"Field of Invention: {input_data['field']}",
        f"Problem to Solve: {input_data['problem_statement']}",
        f"Related Art: {input_data['related_art']}",
        f"Key Features: {input_data['additional_details']}",
        f"Drawings: {input_data['drawings']}",
    ]
    return "\n".join(labelled_lines)

def format_claims(claims_text):
    """Number the claims in ``claims_text`` and drop repeated ones.

    Each non-blank line is treated as one claim. A line that already starts
    with a claim number ("1.", "2)", "3:" or a number followed by whitespace)
    is kept as written; any other line is prefixed with the next running
    number. Duplicates are detected case-insensitively on the claim text
    without its number, so a repeated claim is dropped whether or not either
    copy was numbered.

    Args:
        claims_text: Raw claims, one per line.

    Returns:
        The retained claims joined with newlines ('' for empty input).
    """
    # A leading claim number: digits followed by "." / ")" / ":" (but not a
    # decimal such as "2.4") or by whitespace. "3D camera" does not match.
    numbering = re.compile(r'^\d+(?:\s*[.):](?!\d)\s*|\s+)')

    formatted_claims = []
    seen_claims = set()
    for line in claims_text.split('\n'):
        claim = line.strip()
        if not claim:
            continue

        match = numbering.match(claim)
        body = claim[match.end():] if match else claim
        # Compare on the number-free, whitespace-normalised text so that the
        # prefix added below cannot hide a duplicate.
        key = ' '.join(body.lower().split())
        if key:
            if key in seen_claims:
                continue
            seen_claims.add(key)

        if match is None:
            claim = f"{len(formatted_claims) + 1}. {claim}"
        formatted_claims.append(claim)
    return '\n'.join(formatted_claims)

def remove_duplicate_claims(claims):
    """Drop claims whose text repeats an earlier claim, ignoring claim numbers.

    Lines are compared after removing a leading claim number (if there is
    one) and collapsing runs of whitespace. Only a real number prefix is
    removed; the first word of an un-numbered claim is kept, so "A device"
    and "The device" are distinct. The first occurrence of each claim is kept
    unchanged.

    Args:
        claims: Claims, one per line.

    Returns:
        The retained lines joined with newlines.
    """
    # Leading claim number: digits followed by "." / ")" / ":" (not a decimal
    # such as "2.4") or by whitespace.
    numbering = re.compile(r'^\s*\d+(?:\s*[.):](?!\d)\s*|\s+)')

    unique_claims = []
    seen = set()
    for claim in claims.split('\n'):
        claim_text = ' '.join(numbering.sub('', claim, count=1).split())
        if claim_text not in seen:
            unique_claims.append(claim)
            seen.add(claim_text)
    return '\n'.join(unique_claims)


# Sentence encoders already loaded, keyed by model name, so repeated calls do
# not reload the model from disk.
_SENTENCE_ENCODERS = {}


def remove_repetitive_sentences(text, similarity_threshold=0.8, model_name='paraphrase-MiniLM-L6-v2'):
    """Remove sentences that are near-duplicates of an earlier sentence.

    The text is split after '.', '!' or '?' followed by whitespace. Each
    sentence is embedded with a SentenceTransformer and dropped when its
    cosine similarity to any earlier sentence (kept or not) exceeds
    ``similarity_threshold``.

    Args:
        text: Text to de-duplicate.
        similarity_threshold: Similarity above which a sentence counts as a
            repeat of an earlier one.
        model_name: SentenceTransformer model used for the embeddings.

    Returns:
        The retained sentences joined with single spaces ('' when the text
        contains no non-blank sentence).
    """
    # Blank fragments (e.g. after trailing whitespace) are never returned, so
    # there is no point in encoding them.
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    if not sentences:
        return ''
    if len(sentences) == 1:
        # Nothing to compare against; skip loading the encoder.
        return sentences[0]

    encoder = _SENTENCE_ENCODERS.get(model_name)
    if encoder is None:
        encoder = _SENTENCE_ENCODERS[model_name] = SentenceTransformer(model_name)

    similarity_matrix = cosine_similarity(encoder.encode(sentences))

    # Keep a sentence only if no earlier sentence is too similar to it.
    unique_sentences = [
        sentence
        for i, sentence in enumerate(sentences)
        if not any(similarity_matrix[i][j] > similarity_threshold for j in range(i))
    ]
    return ' '.join(unique_sentences)

def format_description(description_text):
    """Upper-case the heading of every 'Title: body' paragraph.

    Paragraphs are separated by blank lines ('\\n\\n'). A paragraph containing
    a colon is rewritten as the upper-cased text before the first colon, a
    colon and a newline, followed by the stripped remainder. Paragraphs
    without a colon pass through unchanged.
    """
    def _format(paragraph):
        heading, colon, body = paragraph.partition(':')
        if not colon:
            return paragraph
        return f"{heading.upper()}:\n{body.strip()}"

    return '\n\n'.join(_format(paragraph) for paragraph in description_text.split('\n\n'))

def assemble_patent(title, field, summary, claims, description, abstract):
    """Lay the generated sections out as one plain-text patent document.

    Order of the output: upper-cased title, BACKGROUND (with the field),
    SUMMARY, ABSTRACT, CLAIMS and DESCRIPTION. The text begins and ends with a
    newline.
    """
    document_lines = [
        "",
        f"{title.upper()}",
        "",
        "BACKGROUND",
        f"Field: {field}",
        "",
        "SUMMARY",
        f"{summary}",
        "",
        "ABSTRACT",
        "",
        f"{abstract}",
        "",
        "CLAIMS",
        "",
        f"{claims}",
        "",
        "DESCRIPTION",
        "",
        f"{description}",
        "",
    ]
    return "\n".join(document_lines)

# Sample invention details used to drive the generation below.
new_input_data = {
    'related_art': "Image recognition technology is commonly used in security systems to identify faces and objects.",
    'problem_statement': "Current AI-based image recognition systems are prone to false positives and struggle in low-light conditions, compromising security.",
    'field': "Artificial intelligence and computer vision",
    'drawings': "Illustrations of the camera sensors and AI processing unit are attached.",
    'additional_details': "The system includes infrared sensors and a new AI algorithm that enhances recognition accuracy in low-light environments."
}

# Flatten the details into the text placed under "### Input:" in every prompt.
# NOTE(review): prepare_input() builds a similar block with different labels and
# ordering but is not used here -- confirm which layout the model expects.
input_text = f"Related Art: {new_input_data['related_art']}\nProblem Statement: {new_input_data['problem_statement']}\nField: {new_input_data['field']}\nDrawings: {new_input_data['drawings']}\nAdditional Details: {new_input_data['additional_details']}"

# Token limits for different sections
# Each value is passed as max_tokens to generate_patent_section, which forwards
# it to model.generate as max_new_tokens.
token_limits = {
    'Title': 30,
    'Field': 50,
    'Summary': 500,
    'Abstract': 250,
    'Claims': 1500,
    'Description': 7500
}

# Instructions for each section
# Each string becomes the "### Instruction:" part of one prompt.
title_instruction = "Generate a concise, technical title for the patent, no more than 15 words long."

# NOTE(review): the field and summary instructions refer to "the Patent Claims",
# but the input text sent with them contains no claims -- confirm this matches
# the prompt format the model was trained on.
field_instruction = "Generate 1 or 2 sentences describing the field of the invention based on the preamble(s) of the Patent Claims."

summary_instruction = "Generate a summary of the invention, reciting all the features of the Patent Claims."

abstract_instruction = """Generate a one-paragraph patent abstract (150-250 words) that includes:
1. The field of the invention
2. A brief summary of the problem solved
3. A high-level description of the invention
4. The primary advantage or improvement offered by the invention.
"""

# Asks for one independent claim, several groups of dependent claims, a method
# claim and dependent method claims, 12-15 claims in total.
claims_instruction = """Generate a set of patent claims for the invention using the following structure:

1. Independent Claim (1 claim):
   - Broadly describe the core invention
   - Include all essential elements mentioned in the input
   - Format: "A [type of invention], comprising: [list of key components]; and [key function or purpose]."

2. Dependent Claims - Components (3-4 claims):
   - Add specific details about individual components
   - Each claim should focus on a different component
   - Format: "The [invention] of claim 1, wherein [specific component] [has a specific feature or function]."

3. Dependent Claims - Functions (3-4 claims):
   - Describe specific functions or operations of the invention
   - Each claim should focus on a different function
   - Format: "The [invention] of claim 1, wherein [the invention or a component] is configured to [perform a specific function]."

4. Dependent Claims - Configurations (2-3 claims):
   - Describe different configurations or arrangements of components
   - Format: "The [invention] of claim 1, further comprising [additional component or arrangement]."

5. Method Claim (1 claim):
   - Describe the process of using the invention
   - Include key steps that reflect the invention's operation
   - Format: "A method of [using/operating] the [invention] of claim 1, comprising: [list of key steps]."

6. Dependent Method Claims (2-3 claims):
   - Add specific details to the method claim
   - Format: "The method of claim [method claim number], further comprising [additional step or detail]."

Ensure each claim is a single sentence. Number claims consecutively. Avoid any repetition in claim content or wording.
Aim for 12-15 total claims. Make sure all claims are relevant to the invention described in the input."""


# Asks for a five-section description of at least 1000 words.
# NOTE(review): the prompt text contains the misspelling "repititions"; it is left
# untouched here because it is part of the text sent to the model.
description_instruction = """Generate a detailed patent description including the following sections:
1. FIELD OF THE INVENTION (1-2 paragraphs)
2. BACKGROUND OF THE INVENTION (2-3 paragraphs)
3. SUMMARY OF THE INVENTION (2-3 paragraphs)
4. BRIEF DESCRIPTION OF THE DRAWINGS (1-2 paragraphs)
5. DETAILED DESCRIPTION OF THE PREFERRED EMBODIMENTS (4-5 paragraphs)

Use appropriate headings for each section. Provide technical details, examples, and references to the drawings. 
For the DETAILED DESCRIPTION section:
- Describe the components of the invention in detail
- Explain how these components interact
- Provide at least one specific example of how the invention operates
- Discuss potential variations or alternative embodiments
Aim for a total description length of at least 1000 words.  Avoid any repetition in content or wording. I don't want repititions in the generated text."""


# Generate each section: the same invention details are sent with a
# section-specific instruction and token budget.
title = generate_patent_section(title_instruction, input_text, token_limits['Title'])
field = generate_patent_section(field_instruction, input_text, token_limits['Field'])
summary = generate_patent_section(summary_instruction, input_text, token_limits['Summary'])
abstract = generate_patent_section(abstract_instruction, input_text, token_limits['Abstract'])
claims = generate_patent_section(claims_instruction, input_text, token_limits['Claims'])
description = generate_patent_section(description_instruction, input_text, token_limits['Description'])

# Post-process description: drop sentences that nearly repeat an earlier one.
description = remove_repetitive_sentences(description)

# Post-process claims: number them, then drop repeated claims line by line.
# remove_repetitive_sentences is deliberately NOT applied to the claims: it
# splits after every '.', which separates a claim number such as "1." from its
# claim, and it re-joins the pieces with spaces, which would collapse the
# one-claim-per-line layout produced here.
claims = format_claims(claims)
claims = remove_duplicate_claims(claims)

# Assemble the final patent
generated_patent = assemble_patent(title, field, summary, claims, description, abstract)

# Print the results
print(generated_patent)

# Optionally, save the generated patent to a file
# An explicit encoding keeps the output independent of the platform default,
# which may not be able to represent every generated character.
with open("generated_patent.txt", "w", encoding="utf-8") as f:
    f.write(generated_patent)


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
  attn_output = torch.nn.functional.scaled_dot_product_attention(



IMAGE RECOGNITION TECHNOLOGY IS COMMONLY USED IN SECURITY SYSTEMS TO IDENTIFY FACES AND OBJECTS. CURRENT AI-BASED IMAGE RECOGNITION SYSTEMS ARE PRONE TO FALSE POSITIVES AND STRUGGLE IN LOW

BACKGROUND
Field: Image recognition technology is commonly used in security systems to identify faces and objects. Current AI-based image recognition systems are prone to false positives and struggle in low-light conditions, compromising security. This system includes infrared sensors and a new AI algorithm that enhances recognition accuracy

SUMMARY
Image recognition technology is commonly used in security systems to identify faces and objects. However, current AI-based image recognition systems are prone to false positives and struggle in low-light conditions, compromising security. The system includes infrared sensors and a new AI algorithm that enhances recognition accuracy in low-light environments.

ABSTRACT

Image recognition technology is commonly used in security systems to identify faces 