#### Landscape Analysis

In [17]:
# from collections import Counter
# import re

# def get_top_words(text, n=30):
#     # Basic cleaning: remove punctuation and make lowercase
#     text = re.sub(r'[^\w\s]', '', text.lower())
    
#     # Tokenize and count
#     words = text.split()
#     word_counts = Counter(words)
    
#     return word_counts.most_common(n)

# # Usage
# raw_combined_text = ' '.join(df['combined'].dropna())
# top_words = get_top_words(raw_combined_text, n=20)

# # Display
# print("Top 20 words (before cleaning):")
# for word, count in top_words:
#     print(f"{word}: {count}")

In [1]:
import pandas as pd
import numpy as np
import nltk
import spacy
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import io

# Ensure necessary downloads
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Initialize NLP tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Load BERT
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

[nltk_data] Downloading package punkt to C:\Users\Akash
[nltk_data]     Mittal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Akash
[nltk_data]     Mittal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Akash
[nltk_data]     Mittal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Adding the Custom Stop words
# Wordcloud
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt

# def generate_wordcloud(text, title="WordCloud (Before Cleaning)"):
#     wordcloud = WordCloud(
#         width=800,
#         height=400,
#         background_color='white',
#         max_words=200
#     ).generate(text)

#     plt.figure(figsize=(10, 5))
#     plt.imshow(wordcloud, interpolation='bilinear')
#     plt.axis('off')
#     plt.title(title, fontsize=16)
#     plt.show()

#     return wordcloud

# wcloud = generate_wordcloud(' '.join(df['combined'].dropna()))


# top_words = sorted(wcloud.words_.items(), key=lambda x: x[1], reverse=True)[:40]
# for word, weight in top_words:
#     print(f"{word}: {weight:.3f}")


# Start with default stopwords
from wordcloud import STOPWORDS

custom_stopwords = set(STOPWORDS)

# Add patent-specific stopwords & generic domain noise words
patent_stopwords = {
    'method', 'claim', 'comprising', 'wherein', 'described', 'used', 'first',
    'second', 'one', 'plurality', 'step', 'module', 'position',
    'provided', 'based', 'corresponding',
    'comprise', 'obtain', 'connected', 'including', 'comprises',
    'said', 'system', 'device', 'process', 'data', 'forest', 'fire',
    'warning'
    }

custom_stopwords.update(patent_stopwords)


In [3]:
# -------------------------
# 📌 Utility Functions
# -------------------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in custom_stopwords and w.isalpha()]
    return ' '.join(tokens)

def preprocess_dataframe(df):
    df['combined'] = df['Title'].fillna('') + ' ' + df['Abstract'].fillna('') + ' ' + df['Claims'].fillna('')
    df['clean_text'] = df['combined'].apply(clean_text)
    return df

def get_tfidf_features(texts):
    vectorizer = TfidfVectorizer(max_features=300)
    tfidf_matrix = vectorizer.fit_transform(texts)
    return tfidf_matrix

def get_bert_embeddings(texts):
    return bert_model.encode(texts, show_progress_bar=True)

def ensemble_vectors(tfidf_matrix, bert_matrix):
    tfidf_matrix = normalize(tfidf_matrix)
    bert_matrix = normalize(bert_matrix)
    combined = np.hstack((tfidf_matrix.toarray(), bert_matrix))
    return combined

def compute_probabilities(patent_vecs, taxonomy_vecs, taxonomy_labels):
    similarity = cosine_similarity(patent_vecs, taxonomy_vecs)
    prob_df = pd.DataFrame(similarity, columns=taxonomy_labels)
    return prob_df


def build_taxonomy():
    max_depth = int(input("Enter the depth of levels in taxonomy (e.g., 3, 4, 5): "))
    paths = []

    def recursive_input(current_path, level):
        indent = "  " * level
        while True:
            label = input(f"{indent}Enter label for Level {level + 1} under path {current_path or 'ROOT'} (or type 'N' to stop): ").strip()
            if label.lower() == 'n':
                break
            new_path = current_path + [label]
            if level + 1 < max_depth:
                recursive_input(new_path, level + 1)
            else:
                paths.append(new_path)

    recursive_input([], 0)
    return paths

In [4]:
def build_taxonomy_dict():
    max_depth = int(input("Enter the depth of levels in taxonomy (e.g., 3, 4, 5): "))

    def recursive_input(level=0, path='ROOT'):
        node = {}
        indent = "  " * level
        while True:
            label = input(f"{indent}Enter label for Level {level + 1} under path {path} (or type 'N' to stop): ").strip()
            if label.lower() == 'n':
                break
            if level + 1 < max_depth:
                # Recursive call to get children of this label
                children = recursive_input(level + 1, path + ' > ' + label)
                node[label] = children
            else:
                # Leaf node: no further children
                node[label] = {}
        return node

    taxonomy_dict = recursive_input()
    return taxonomy_dict


In [6]:
# taxonomy_hierarchy = build_taxonomy_dict()
print(taxonomy_hierarchy)

{'Thermal OR Heat Based Sensor Type': {'Microbolometers': {}, 'Thermopile Sensors': {}, 'Pyroelectric Sensors': {}, 'Thermal Imaging Camera': {}, 'Temperature Sensor': {}, 'Others include Multi-Spectral Thermal Sensor (NIR, SWIR, MWIR, LWIR)': {}}, 'Wildfire Detection Functions': {'Early-Stage Fire Detection': {}, 'Active Flame OR  Smoke Density Detection': {}, 'Thermal or Hotspot OR Heat Map or Signature Identification': {}, 'Fire Spread OR Wind Monitoring': {}, 'AI OR ML OR Artificial Intelligence OR Machine Learning Based Detection': {}, 'Alarm and Notification Systems': {}, 'Sensor Deployment': {}, 'Ground OR Fixed OR Tree Trunk OR Soi': {}, 'Tower OR Pole OR Tree-Mounted': {}, 'Drone OR UAV OR Unmanned aerial vehicle Integrated': {}, 'Satellite-Based': {}, 'Others (Vehicle, Mobile, Robot, Wearables etc)': {}, 'Data Processing': {}, 'Edge OR On-Device Computing (Single-Chip)': {}, 'Cloud OR Central Computing': {}, 'Gas Sensor Type': {}, 'Carbon Monoxide (CO)': {}, 'Carbon Dioxide (

In [5]:
taxonomy_hierarchy = {'Thermal OR Heat Based Sensor Type': {'Microbolometers': {}, 'Thermopile Sensors': {}, 'Pyroelectric Sensors': {}, 'Thermal Imaging Camera': {}, 'Temperature Sensor': {}, 'Others include Multi-Spectral Thermal Sensor (NIR, SWIR, MWIR, LWIR)': {}}, 'Wildfire Detection Functions': {'Early-Stage Fire Detection': {}, 'Active Flame OR  Smoke Density Detection': {}, 'Thermal or Hotspot OR Heat Map or Signature Identification': {}, 'Fire Spread OR Wind Monitoring': {}, 'AI OR ML OR Artificial Intelligence OR Machine Learning Based Detection': {}, 'Alarm and Notification Systems': {}, 'Sensor Deployment': {}, 'Ground OR Fixed OR Tree Trunk OR Soi': {}, 'Tower OR Pole OR Tree-Mounted': {}, 'Drone OR UAV OR Unmanned aerial vehicle Integrated': {}, 'Satellite-Based': {}, 'Others (Vehicle, Mobile, Robot, Wearables etc)': {}, 'Data Processing': {}, 'Edge OR On-Device Computing (Single-Chip)': {}, 'Cloud OR Central Computing': {}, 'Gas Sensor Type': {}, 'Carbon Monoxide (CO)': {}, 'Carbon Dioxide (CO₂)': {}, 'Methane (CH₄)': {}}}

In [40]:
{'Thermal OR Heat Based Sensor Type': {'Microbolometers': {}, 'Thermopile Sensors': {}, 'Pyroelectric Sensors': {}, 'Thermal Imaging Camera': {}, 'Temperature Sensor': {}, 'Others include Multi-Spectral Thermal Sensor (NIR, SWIR, MWIR, LWIR)': {}}, 'Wildfire Detection Functions': {'Early-Stage Fire Detection': {}, 'Active Flame OR  Smoke Density Detection': {}, 'Thermal or Hotspot OR Heat Map or Signature Identification': {}, 'Fire Spread OR Wind Monitoring': {}, 'AI OR ML OR Artificial Intelligence OR Machine Learning Based Detection': {}, 'Alarm and Notification Systems': {}, 'Sensor Deployment': {}, 'Ground OR Fixed OR Tree Trunk OR Soi': {}, 'Tower OR Pole OR Tree-Mounted': {}, 'Drone OR UAV OR Unmanned aerial vehicle Integrated': {}, 'Satellite-Based': {}, 'Others (Vehicle, Mobile, Robot, Wearables etc)': {}, 'Data Processing': {}, 'Edge OR On-Device Computing (Single-Chip)': {}, 'Cloud OR Central Computing': {}, 'Gas Sensor Type': {}, 'Carbon Monoxide (CO)': {}, 'Carbon Dioxide (CO₂)': {}, 'Methane (CH₄)': {}}}

{'Thermal OR Heat Based Sensor Type': {'Microbolometers': {},
  'Thermopile Sensors': {},
  'Pyroelectric Sensors': {},
  'Thermal Imaging Camera': {},
  'Temperature Sensor': {},
  'Others include Multi-Spectral Thermal Sensor (NIR, SWIR, MWIR, LWIR)': {}},
 'Wildfire Detection Functions': {'Early-Stage Fire Detection': {},
  'Active Flame OR  Smoke Density Detection': {},
  'Thermal or Hotspot OR Heat Map or Signature Identification': {},
  'Fire Spread OR Wind Monitoring': {},
  'AI OR ML OR Artificial Intelligence OR Machine Learning Based Detection': {},
  'Alarm and Notification Systems': {},
  'Sensor Deployment': {},
  'Ground OR Fixed OR Tree Trunk OR Soi': {},
  'Tower OR Pole OR Tree-Mounted': {},
  'Drone OR UAV OR Unmanned aerial vehicle Integrated': {},
  'Satellite-Based': {},
  'Others (Vehicle, Mobile, Robot, Wearables etc)': {},
  'Data Processing': {},
  'Edge OR On-Device Computing (Single-Chip)': {},
  'Cloud OR Central Computing': {},
  'Gas Sensor Type': {},
  'C

Data Loading and Testing

In [7]:
# Step 1: Load your data
df = pd.read_excel("data\\patent_data_smallset.xlsx")

In [8]:
df.head()

Unnamed: 0,Patent Number,Title,Abstract,Claims
0,KR102090170B1,FOREST FIRE MONITORING DEVICE AND FOREST FIRE ...,The present invention relates to a forest fire...,1. As a forest fire monitoring device using a ...
1,KR20240059036A,IoT Smart forest fire monitoring system and me...,A smart forest fire monitoring system and meth...,1. A plurality of forest fire notification dev...
2,CN117831251A,Forest fire collection monitoring and early wa...,The embodiment of the invention provides a for...,1. The forest fire acquisition monitoring and ...
3,CN117994925A,Intelligent forestry forest fire monitoring de...,The invention relates to the technical field o...,1. An wisdom forestry forest fire monitored co...
4,DE102023132010A1,Method for evaluating the cause of a forest fi...,The invention relates to a method for evaluati...,1. Method for evaluating the cause of a forest...


In [9]:
# Preprocessing

df = preprocess_dataframe(df)

df.head()

# gives combined data from TAC in smallcase also performing the cleaning using stopwords

Unnamed: 0,Patent Number,Title,Abstract,Claims,combined,clean_text
0,KR102090170B1,FOREST FIRE MONITORING DEVICE AND FOREST FIRE ...,The present invention relates to a forest fire...,1. As a forest fire monitoring device using a ...,FOREST FIRE MONITORING DEVICE AND FOREST FIRE ...,monitoring monitoring using drone present inve...
1,KR20240059036A,IoT Smart forest fire monitoring system and me...,A smart forest fire monitoring system and meth...,1. A plurality of forest fire notification dev...,IoT Smart forest fire monitoring system and me...,iot smart monitoring linked iot alarm smart mo...
2,CN117831251A,Forest fire collection monitoring and early wa...,The embodiment of the invention provides a for...,1. The forest fire acquisition monitoring and ...,Forest fire collection monitoring and early wa...,collection monitoring early storage medium emb...
3,CN117994925A,Intelligent forestry forest fire monitoring de...,The invention relates to the technical field o...,1. An wisdom forestry forest fire monitored co...,Intelligent forestry forest fire monitoring de...,intelligent forestry monitoring invention rela...
4,DE102023132010A1,Method for evaluating the cause of a forest fi...,The invention relates to a method for evaluati...,1. Method for evaluating the cause of a forest...,Method for evaluating the cause of a forest fi...,evaluating cause cause evaluation invention re...


In [32]:
# Step 3: Taxonomy creation
taxonomy_paths = build_taxonomy()
taxonomy_labels = [' > '.join(path) for path in taxonomy_paths]

In [10]:
# taxonomy_labels
# taxonomy_hierarchy = build_taxonomy_dict()
print(taxonomy_hierarchy)

{'Thermal OR Heat Based Sensor Type': {'Microbolometers': {}, 'Thermopile Sensors': {}, 'Pyroelectric Sensors': {}, 'Thermal Imaging Camera': {}, 'Temperature Sensor': {}, 'Others include Multi-Spectral Thermal Sensor (NIR, SWIR, MWIR, LWIR)': {}}, 'Wildfire Detection Functions': {'Early-Stage Fire Detection': {}, 'Active Flame OR  Smoke Density Detection': {}, 'Thermal or Hotspot OR Heat Map or Signature Identification': {}, 'Fire Spread OR Wind Monitoring': {}, 'AI OR ML OR Artificial Intelligence OR Machine Learning Based Detection': {}, 'Alarm and Notification Systems': {}, 'Sensor Deployment': {}, 'Ground OR Fixed OR Tree Trunk OR Soi': {}, 'Tower OR Pole OR Tree-Mounted': {}, 'Drone OR UAV OR Unmanned aerial vehicle Integrated': {}, 'Satellite-Based': {}, 'Others (Vehicle, Mobile, Robot, Wearables etc)': {}, 'Data Processing': {}, 'Edge OR On-Device Computing (Single-Chip)': {}, 'Cloud OR Central Computing': {}, 'Gas Sensor Type': {}, 'Carbon Monoxide (CO)': {}, 'Carbon Dioxide (

In [11]:
# Traverse Taxonomy and Prepare Prompts

def extract_grouped_categories(taxonomy_dict):
    grouped = []

    def recurse(node, parent_label=None):
        if isinstance(node, dict):
            children = list(node.keys())
            if parent_label and children:
                grouped.append((parent_label, children))
            for child in children:
                recurse(node[child], child)

    recurse(taxonomy_dict)
    return grouped


# Build Prompts for LLM

def create_prompts_from_groups(grouped_categories):
    prompts = []
    for parent, children in grouped_categories:
        child_list = ', '.join(children)
        prompt = (
            f"The following are categories under '{parent}': {child_list}.\n"
            "Please describe each category in 1-2 sentences, including any commonly used synonyms or alternative terms. We need this to categorize patents as per technologies implemented."
        )
        prompts.append((parent, children, prompt))
    return prompts

In [12]:
from huggingface_hub import login
login("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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\Akash Mittal\.cache\huggingface\token
Login successful


In [13]:
pip install accelerate

Note: you may need to restart the kernel to use updated packages.


In [15]:
# # Model from Hugging face

# from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# # Load the model (use a quantized version if RAM is limited)
# model_name = "mistralai/Mistral-7B-Instruct-v0.2"                   # or other open models
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")  # uses GPU if available

# text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=300)

In [20]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_name = "google/flan-t5-base"                                           # "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

text2text = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Example prompt
prompt = (
    "The following are types of Thermal Sensors: Microbolometers, Thermopile Sensors, Pyroelectric Sensors.\n"
    "Describe each sensor type in 1-2 sentences and list any common synonyms."
)

result = text2text(prompt, max_new_tokens=256)
print(result[0]["generated_text"])

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [17]:
for parent, children, prompt in create_prompts_from_groups(extract_grouped_categories(taxonomy_hierarchy)):
    print(f"\nPrompt for group: {parent}\n")
    print(prompt)
    response = text_gen(prompt)[0]['generated_text']
    print("\nLLM Response:\n", response)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Prompt for group: Thermal OR Heat Based Sensor Type

The following are categories under 'Thermal OR Heat Based Sensor Type': Microbolometers, Thermopile Sensors, Pyroelectric Sensors, Thermal Imaging Camera, Temperature Sensor, Others include Multi-Spectral Thermal Sensor (NIR, SWIR, MWIR, LWIR).
Please describe each category in 1-2 sentences, including any commonly used synonyms or alternative terms. We need this to categorize patents as per technologies implemented.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



LLM Response:
 The following are categories under 'Thermal OR Heat Based Sensor Type': Microbolometers, Thermopile Sensors, Pyroelectric Sensors, Thermal Imaging Camera, Temperature Sensor, Others include Multi-Spectral Thermal Sensor (NIR, SWIR, MWIR, LWIR).
Please describe each category in 1-2 sentences, including any commonly used synonyms or alternative terms. We need this to categorize patents as per technologies implemented.

Patents for Thermo-Riskin Thermal Sensor Types
Other commonly used technologies include thermophores, thermal sensors, thermal sensors, thermal sensors, Thermal Imaging Camera, temperature sensor
Thermos is a thermophore containing an array of sensors with very little or no electrical conductivity, but is capable of operating as thermal temperature sensors. Thermal sensors may cause serious damage to the entire sensor and may cause the cooling of the whole sensor.
Thermal/Heat Racks The energy

Prompt for group: Wildfire Detection Functions

The following a