In [1]:
#load the knowledge base of extracted facts

import pandas as pd

#NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source
facts_path = "data/fan-systems-facts.pkl"
facts = pd.read_pickle(facts_path)
facts

Unnamed: 0,patent_id,sentence_id,head,relation,tail
0,10001020,10001020_0,present invention,generally relates to,fans
1,10001020,10001020_0,present invention,generally relates to,fan assemblies
2,10001020,10001020_0,present invention,generally relates to,fan wheels
3,10001020,10001020_0,present invention,generally relates to,composite fan blades
4,10001020,10001020_0,present invention,generally relates,attachment
...,...,...,...,...,...
2927524,9995315,9995315_37,wire winding space,is increased for,convenient assembling
2927525,9995315,9995315_37,stator,in,motor housing
2927526,9995315,9995315_37,stator,in,cover
2927527,9995315,9995315_37,cover,of,ceiling fan motor


In [2]:
#collapse facts repeated across sentences of the same patent, then build a
#single "head :: relation :: tail" string per fact

facts = (
    facts
    .drop(columns=["sentence_id"])
    .drop_duplicates()
    .assign(facts=lambda df: df["head"] + " :: " + df["relation"] + " :: " + df["tail"])
)

facts

Unnamed: 0,patent_id,head,relation,tail,facts
0,10001020,present invention,generally relates to,fans,present invention :: generally relates to :: fans
1,10001020,present invention,generally relates to,fan assemblies,present invention :: generally relates to :: f...
2,10001020,present invention,generally relates to,fan wheels,present invention :: generally relates to :: f...
3,10001020,present invention,generally relates to,composite fan blades,present invention :: generally relates to :: c...
4,10001020,present invention,generally relates,attachment,present invention :: generally relates :: atta...
...,...,...,...,...,...
2927524,9995315,wire winding space,is increased for,convenient assembling,wire winding space :: is increased for :: conv...
2927525,9995315,stator,in,motor housing,stator :: in :: motor housing
2927526,9995315,stator,in,cover,stator :: in :: cover
2927527,9995315,cover,of,ceiling fan motor,cover :: of :: ceiling fan motor


In [3]:
#count how often each entity occurs as head or tail, most frequent first

all_entities = pd.concat([facts["head"], facts["tail"]])
ent_counts = (
    all_entities
    .value_counts()
    .rename_axis('unique_entities')
    .reset_index(name='counts')
)
ent_counts

Unnamed: 0,unique_entities,counts
0,plurality,52465
1,fan,44387
2,air,38567
3,each,27595
4,motor,25475
...,...,...
261346,chiller system,1
261347,recirculation feature,1
261348,stage high pressure compressor,1
261349,above points,1


In [4]:
#keep entities made of 2 or more words (i.e., containing a space)

has_space = ent_counts["unique_entities"].str.contains(" ")
ent_counts = ent_counts.loc[has_space]

#drop generic patent phrases that do not name a domain entity

terms = ["present invention", "present disclosure", "preferred embodiment"]
is_generic = ent_counts["unique_entities"].isin(terms)
ent_counts = ent_counts.loc[~is_generic]

#show the 30 most frequent remaining entities
ent_counts.head(30)

Unnamed: 0,unique_entities,counts
17,fan blades,10530
28,fan blade,7990
29,air flow,7970
30,fan assembly,7834
45,ceiling fan,6518
52,axial direction,5715
55,leading edge,5326
62,radial direction,4916
64,air outlet,4823
74,electric motor,4369


In [5]:
#sort facts based on patent frequency

freq_ents = ent_counts.head(30)["unique_entities"].tolist()

#keep only facts that link two *different* frequent entities
links_frequent = (
    facts["head"].isin(freq_ents)
    & facts["tail"].isin(freq_ents)
    & (facts["head"] != facts["tail"])
)
facts = facts.loc[links_frequent]

#Rows per fact string. Facts were de-duplicated per patent earlier, so this is
#the number of patents stating the fact. The counts are turned into an explicit
#two-column frame ("facts", "count") so the merge does not depend on how
#value_counts() names its result (that naming changed in pandas 2.0).
fact_counts = (
    facts["facts"]
    .value_counts()
    .rename_axis("facts")
    .reset_index(name="count")
)
facts = pd.merge(facts, fact_counts, on="facts").sort_values(by="count", ascending=False)
facts

Unnamed: 0,patent_id,head,relation,tail,facts,count
100,9033674,leading edge,to,trailing edge,leading edge :: to :: trailing edge,80
71,5577888,leading edge,to,trailing edge,leading edge :: to :: trailing edge,80
82,6755615,leading edge,to,trailing edge,leading edge :: to :: trailing edge,80
81,6726451,leading edge,to,trailing edge,leading edge :: to :: trailing edge,80
80,6634855,leading edge,to,trailing edge,leading edge :: to :: trailing edge,80
...,...,...,...,...,...,...
1827,11236762,centrifugal fan,has,fan wheel,centrifugal fan :: has :: fan wheel,1
1822,11231041,fan housing,with,fan blades,fan housing :: with :: fan blades,1
1821,11231041,inner surface,of,fan housing,inner surface :: of :: fan housing,1
1820,11231040,fan wheel,of,fan assembly,fan wheel :: of :: fan assembly,1


In [6]:
#selecting the most frequent relation between each pair of frequent entities

from itertools import product

#every ordered pair of distinct frequent entities
pairs = [(h, t) for (h, t) in product(freq_ents, freq_ents) if h != t]

#pairs that still lack a representative fact; a set gives O(1) lookup/removal
unresolved = set(pairs)

fact_columns = ["patent_id", "head", "relation", "tail", "facts", "count"]

#`facts` is sorted by descending count, so the first row met for a pair carries
#its most frequent relation. Facts stated in a single patent are skipped.
freq_facts = []
for index, row in facts.iterrows():
    if not unresolved:
        #every pair already has its fact; nothing left to select
        break
    pair = (row["head"], row["tail"])
    if row["count"] > 1 and pair in unresolved:
        freq_facts.append({column: row[column] for column in fact_columns})
        #one fact per unordered pair: retire both directions
        unresolved.discard(pair)
        unresolved.discard((pair[1], pair[0]))

#leave `pairs` holding the pairs without a selected fact, in their original order
pairs = [pair for pair in pairs if pair in unresolved]

#explicit columns keep the frame usable downstream even when nothing is selected
selected_facts = pd.DataFrame(freq_facts, columns=fact_columns)
selected_facts

Unnamed: 0,patent_id,head,relation,tail,facts,count
0,9033674,leading edge,to,trailing edge,leading edge :: to :: trailing edge,80
1,4995787,leading edge,of,each blade,leading edge :: of :: each blade,31
2,8070447,leading edge,of,fan blade,leading edge :: of :: fan blade,30
3,9239062,fan blade,for,gas turbine engine,fan blade :: for :: gas turbine engine,26
4,8727700,outer periphery,of,fan wheel,outer periphery :: of :: fan wheel,23
...,...,...,...,...,...,...
172,9587645,gas turbine engine,comprising,trailing edge,gas turbine engine :: comprising :: trailing edge,2
173,10767658,axial fan,wherein,inner surface,axial fan :: wherein :: inner surface,2
174,10816005,fan blades,mechanically coupled to,circuit board,fan blades :: mechanically coupled to :: circu...,2
175,9194398,centrifugal fan,improve,air flow,centrifugal fan :: improve :: air flow,2


In [7]:
#turn the selected facts into two lookups keyed by (head, tail):
#  edges  -> relation text
#  titles -> hover text naming the patent count and one example patent

import pickle

fact_records = selected_facts.to_dict("records")

edges = {
    (record["head"], record["tail"]): record["relation"]
    for record in fact_records
}
titles = {
    (record["head"], record["tail"]): f"This fact has been mentioned in {record['count']} patents, e.g., in US{record['patent_id']}."
    for record in fact_records
}

#persist the (head, tail) -> relation mapping
with open("data/generalisable-facts.pkl", "wb") as f:
    pickle.dump(edges, f)

In [8]:
#visualising the selected facts
from pyvis.network import Network
import random

#fixed seed so the same subset of edges is drawn on every run
EDGE_SAMPLE_SEED = 42
#probability of drawing each selected fact as an edge
EDGE_KEEP_PROBABILITY = 0.3

g = Network(width="100%", height="1600px", directed=True)

#one node per frequent entity, sized by how often the entity occurs
top_entities = ent_counts.head(30)
nodes = top_entities["unique_entities"].tolist()
entity_counts = top_entities["counts"].tolist()
sizes = [item/1000 for item in entity_counts]
#hover text of each node (passed to pyvis as `title`)
labels = [f"This entity has been mentioned in {item} facts." for item in entity_counts]
colors = ["#A9A9A9" for item in nodes]

g.add_nodes(nodes, size=sizes, color=colors, title=labels)

#a dedicated generator keeps the edge sampling reproducible without
#re-seeding the global `random` state used elsewhere
edge_rng = random.Random(EDGE_SAMPLE_SEED)

for (h, t), relation in edges.items():
    #including an edge by chance to ensure the visualisation is readable
    if edge_rng.random() < EDGE_KEEP_PROBABILITY:
        g.add_edge(source=h, to=t, label=relation, color="#D3D3D3", title=titles[h, t])

g.show("generalisable-facts.html", notebook=False)

generalisable-facts.html


In [9]:
#shuffling the selected facts

#fixed seed so the exported file has the same line order on every run
FACT_SHUFFLE_SEED = 42

facts_to_text = selected_facts["facts"].tolist()
#a dedicated generator leaves the global `random` state untouched
random.Random(FACT_SHUFFLE_SEED).shuffle(facts_to_text)

#one "head :: relation :: tail" fact per line; the file is only written, so "w" suffices
with open("data/generalisable-facts.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(facts_to_text))

In [10]:
#load the patent table (patent_id, patent_date, description)

#NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source
patents_path = "data/patents.pkl"
patents = pd.read_pickle(patents_path)
patents

Unnamed: 0,patent_id,patent_date,description
1838344,11853544,2023-12-26,"Electronic device, method for driving electron..."
1836877,11852059,2023-12-26,Noise muffler for an air moving device. A nois...
1836866,11852048,2023-12-26,Gas admission valve (GAV) assembly and system ...
1836867,11852049,2023-12-26,Electric actuator. A differential device (5) o...
1836868,11852050,2023-12-26,Support arrangement for an actuator of a cam p...
...,...,...,...
4735925,6838900,2005-01-04,Middle pull-up point-to-point transceiving bus...
4735924,6838899,2005-01-04,Apparatus and method of error detection and co...
4735923,6838898,2005-01-04,Apparatus and method for testing high current ...
4735922,6838897,2005-01-04,Integrated circuit test system and method. The...


In [11]:
#select 10 random patents related to fan systems

PATENT_SAMPLE_SIZE = 10
#fixed seed so the same patents are drawn on every run
PATENT_SAMPLE_SEED = 42

#NOTE(review): `facts` was narrowed earlier to facts linking two frequent
#entities, so only patents containing such a fact are candidates here --
#confirm this is the intended pool.
fan_patent_ids = facts["patent_id"].unique()
patents = patents.loc[patents["patent_id"].isin(fan_patent_ids)]

#cap the sample size so the cell does not fail when fewer patents match
patents = patents.sample(
    min(PATENT_SAMPLE_SIZE, len(patents)),
    random_state=PATENT_SAMPLE_SEED,
)

#prefix each description with its patent number
#(re-running this cell without reloading `patents` would add the prefix again)
patents["description"] = "Patent: " + patents["patent_id"] + ". " + patents["description"]

patents

Unnamed: 0,patent_id,patent_date,description
1353749,11364772,2022-06-21,Patent: 11364772. Air conditioning unit. An ai...
1232866,11242862,2022-02-08,Patent: 11242862. Blower device. A blower devi...
324966,10327392,2019-06-25,Patent: 10327392. Battery-powered debris blowe...
1090193,11098953,2021-08-24,Patent: 11098953. Integrated fan heat exchange...
476909,10480520,2019-11-19,Patent: 10480520. Motor-driven fan with an ass...
7103171,9217440,2015-12-22,Patent: 9217440. Ceiling fan. A ceiling fan in...
974947,10982681,2021-04-20,Patent: 10982681. Fan blade structure and cent...
6983757,9097261,2015-08-04,Patent: 9097261. Axial fan with flow guide bod...
491367,10495114,2019-12-03,Patent: 10495114. Blower. A blower fan include...
885622,10892606,2021-01-12,Patent: 10892606. Ventilation unit for electri...


In [12]:
#write the sampled patent descriptions to a text file, separated by blank lines

abstracts_text = "\n\n".join(patents["description"].tolist())

with open("data/domain-knowledge-text.txt", "w+", encoding="utf-8") as f:
    f.write(abstracts_text)