In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import pandas as pd

# Load the extracted facts (one row per head / relation / tail triple).
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
facts_path = "data/fan-systems-facts.pkl"
facts = pd.read_pickle(facts_path)
facts

Unnamed: 0,patent_id,sentence_id,head,relation,tail
0,10001020,10001020_0,present invention,generally relates to,fans
1,10001020,10001020_0,present invention,generally relates to,fan assemblies
2,10001020,10001020_0,present invention,generally relates to,fan wheels
3,10001020,10001020_0,present invention,generally relates to,composite fan blades
4,10001020,10001020_0,present invention,generally relates,attachment
...,...,...,...,...,...
2927524,9995315,9995315_37,wire winding space,is increased for,convenient assembling
2927525,9995315,9995315_37,stator,in,motor housing
2927526,9995315,9995315_37,stator,in,cover
2927527,9995315,9995315_37,cover,of,ceiling fan motor


In [3]:
#Finding facts that mention every term of the issue ('airflow' and 'noise')
#in either of the entities
import re

issue = "airflow noise"

# One lookahead per issue term, so a match requires all terms in any order.
# For "airflow noise" this yields r'^(?=.*airflow)(?=.*noise)'.
issue_pattern = "^" + "".join(f"(?=.*{re.escape(term)})" for term in issue.split())

# Search head and tail together so the terms may be split across both entities.
entity_text = facts["head"] + " " + facts["tail"]

# na=False: rows with a missing head or tail count as non-matching instead of
# putting NaN into the boolean mask (which .loc refuses to index with).
level_0 = facts.loc[entity_text.str.contains(issue_pattern, na=False)]
level_0

Unnamed: 0,patent_id,sentence_id,head,relation,tail
24390,10036400,10036400_62,noise,generated by,airflow
52431,10072670,10072670_48,noises,are increased,swirling airflow
58343,10079525,10079525_194,heated airflows,the,noise sources
60152,10079526,10079526_194,heated airflows,the,noise sources
76943,10100846,10100846_43,airflow,to,noise production
...,...,...,...,...,...
2768809,9745998,9745998_0,turbulent flow noise,caused by,airflow turbulence
2820812,9850907,9850907_16,airflow,generates,noise
2854495,9909485,9909485_28,recirculating airflow,to reduce,noise
2854753,9909485,9909485_93,noise,in,certain airflow condition


In [4]:
# Keep every fact that comes from a sentence which produced a level-0 fact.
level_0_sentence_ids = level_0["sentence_id"]
in_level_0_sentence = facts["sentence_id"].isin(level_0_sentence_ids)

facts = facts.loc[in_level_0_sentence]
facts

Unnamed: 0,patent_id,sentence_id,head,relation,tail
24388,10036400,10036400_62,flow channel,facilitates reducing,noise
24389,10036400,10036400_62,flow channel,facilitates reducing,airflow disruption
24390,10036400,10036400_62,noise,generated by,airflow
24391,10036400,10036400_62,noise,impacting,cabinet
24392,10036400,10036400_62,airflow,impacting,cabinet
...,...,...,...,...,...
2881802,9945390,9945390_58,mounting arm,is contoured to reduce,noise levels
2881803,9945390,9945390_58,mounting arm,is contoured to reduce,airflow restrictions
2881804,9945390,9945390_58,noise levels,generated by,blower
2881805,9945390,9945390_58,noise levels,to reduce,airflow restrictions


In [5]:
#filtering the facts from the shortlist that only extend from level_0

# "<patent_id>: <entity>" markers for every head and tail that occurs in
# level_0; the patent prefix keeps entities from different patents apart.
head_markers = level_0["patent_id"] + ": " + level_0["head"]
tail_markers = level_0["patent_id"] + ": " + level_0["tail"]
entity_markers = head_markers.tolist() + tail_markers.tolist()

# Keep a fact only when BOTH of its entities are level-0 entities of its patent.
head_is_known = (facts["patent_id"] + ": " + facts["head"]).isin(entity_markers)
tail_is_known = (facts["patent_id"] + ": " + facts["tail"]).isin(entity_markers)
facts = facts.loc[head_is_known & tail_is_known]

# errors="ignore" keeps the cell re-runnable: on a second execution
# "sentence_id" has already been dropped and a plain drop() would raise KeyError.
facts = facts.drop(columns=["sentence_id"], errors="ignore").reset_index(drop=True)

# Without sentence_id the same triple can repeat within a patent; keep one copy.
facts = facts.drop_duplicates()
facts.sample(frac=1)

Unnamed: 0,patent_id,head,relation,tail
90,6148954,apparatus,for reducing,airflow noise
11,10316862,lower pressure airflow,generates,less noise
112,7214033,frequency range,of,airflow noise
63,11397013,airflow,reduce,noise
96,6193011,apparatus,for reducing,airflow noise
...,...,...,...,...
157,8608439,airflow,at,lowest operational noise levels
45,11286953,noise,caused by,airflow
76,5577888,noise,generated by,airflow generator
38,11125238,airflow noise,of,wake


In [6]:
import random

# Fixed seed so the exported fact order is identical on every run.
SHUFFLE_SEED = 42

#shuffling the facts
facts["facts"] = facts["head"] + " :: " + facts["relation"] + " :: " + facts["tail"]
fact_list = facts["facts"].tolist()
# A dedicated Random instance gives a reproducible shuffle without touching
# the global random state used elsewhere in the notebook.
random.Random(SHUFFLE_SEED).shuffle(fact_list)

#getting the list of facts as text file
# Plain "w" is enough: the file is only written, never read back.
with open("data/airflow-facts.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(fact_list))

In [7]:
#converting data frame to a dictionary

import pickle

# edges maps (head, tail) -> relation. When several rows share the same
# (head, tail) pair, the relation of the last row wins.
edges = {}
# Patent labels per (head, tail) pair, in first-seen order and without repeats.
patents_by_edge = {}

fact_columns = zip(facts["head"], facts["relation"], facts["tail"], facts["patent_id"])
for head, relation, tail, patent_id in fact_columns:
    key = (head, tail)
    edges[key] = relation

    patent_label = f"US{patent_id}"
    recorded = patents_by_edge.setdefault(key, [])
    # One patent can state several relations for the same pair; list it once
    # instead of repeating it in the tooltip.
    if patent_label not in recorded:
        recorded.append(patent_label)

# titles maps (head, tail) -> tooltip text naming the patents behind the edge.
titles = {
    key: "This fact has been recorded in the following patents: " + ", ".join(patent_labels)
    for key, patent_labels in patents_by_edge.items()
}

with open("data/airflow-facts.pkl", "wb") as f:
    pickle.dump(edges, f)

In [8]:
# Count how often each entity occurs as the head or the tail of a fact.

from collections import Counter

entities = facts["head"].tolist()
entities.extend(facts["tail"].tolist())
counts = Counter(entities)

# Frequency of the single most common entity.
_top_entity, max_freq = counts.most_common(1)[0]

counts.most_common(10)

[('noise', 55),
 ('airflow', 48),
 ('airflow noise', 21),
 ('airflow path', 7),
 ('motor fan assembly', 6),
 ('fan noise', 6),
 ('serpentine airflow route', 4),
 ('noise level', 4),
 ('noise reduction plate', 4),
 ('motor cooling airflow properties', 3)]

In [9]:
#visualising the selected facts
from pyvis.network import Network
import random

# Probability with which each edge is drawn. 1.0 draws every edge; lower it to
# thin out a graph that is too dense to read.
EDGE_KEEP_PROBABILITY = 1.0

g = Network(width="100%", height="1600px", directed=True)

# Counter keys are the unique entities in first-seen order, which gives the
# same node order on every run (iterating a set of strings does not).
nodes = list(counts)

# Node size grows linearly from 3 (mentioned once) to 13 (mentioned max_freq
# times). The max(..., 1) guard avoids dividing by zero when every entity
# occurs exactly once.
freq_span = max(max_freq - 1, 1)
sizes = [((counts[item] - 1) / freq_span) * 10 + 3 for item in nodes]
# Passed as the node "title" argument below.
labels = [f"This entity has been mentioned in {counts[item]} facts." for item in nodes]
# The two noise entities get a darker grey than all other nodes.
colors = ["#708090" if item in ["noise", "airflow noise"] else "#A9A9A9" for item in nodes]

g.add_nodes(nodes, size=sizes, color=colors, title=labels)

for (h, t), relation in edges.items():
    #including an edge by chance to ensure the visualisation is readable
    if random.random() < EDGE_KEEP_PROBABILITY:
        g.add_edge(source=h, to=t, label=relation, color="#D3D3D3", title=titles[(h, t)])

g.show("airflow-facts.html", notebook=False)

airflow-facts.html


In [10]:
# Load the patent sentences (one row per sentence, keyed by sentence_id).
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
sentences_path = "data/fan-systems-sentences.pkl"
sentences = pd.read_pickle(sentences_path)
sentences

Unnamed: 0,patent_id,sentence_id,sentence
0,10001020,10001020_0,The present invention generally relates to fan...
1,10001020,10001020_1,The improved fan blade is characterized by a f...
2,10001020,10001020_2,The fan blade body includes opposingly paired ...
3,10001020,10001020_3,The fan blade/fan blade body may be a single t...
4,10001020,10001020_4,"As to the latter, it is advantageously contemp..."
...,...,...,...
603179,9995315,9995315_33,The ceiling fan motor housing and cover side f...
603180,9995315,9995315_34,A ceiling fan motor housing and cover side fix...
603181,9995315,9995315_35,The motor housing includes a rotor and plural ...
603182,9995315,9995315_36,The cover includes plural fixing holes formed ...


In [11]:
#Find sentences that include every term of the issue
import re

# One lookahead per term of `issue`, so a sentence matches only when it
# contains all terms, in any order. For "airflow noise" this yields
# r'^(?=.*airflow)(?=.*noise)'.
issue_pattern = "^" + "".join(f"(?=.*{re.escape(term)})" for term in issue.split())

# na=False: a missing sentence counts as non-matching instead of putting NaN
# into the boolean mask (which .loc refuses to index with).
sentences = sentences.loc[sentences["sentence"].str.contains(issue_pattern, na=False)]
sentences.sample(frac=1)

Unnamed: 0,patent_id,sentence_id,sentence
232477,11286953,11286953_208,"Accordingly, it is possible to further suppres..."
329514,6217281,6217281_97,The low noise fan filter unit as recited in cl...
15626,10100846,10100846_96,The fan assembly of claim 2 in which the noise...
101547,10537041,10537041_25,"Accordingly, by means of the heat dissipation ..."
248526,11371525,11371525_51,After the first blade 121 at the inlet of the ...
...,...,...,...
71311,10400785,10400785_50,By being able to produce the same airflow at a...
505558,9011092,9011092_9,"As a result, an airflow in a region downstream..."
602942,9995311,9995311_149,The concave portion CRC may guide airflow conc...
329532,6217281,6217281_115,A low noise fan filter unit for providing filt...


In [12]:
# Write the matching sentences to a text file, separated by blank lines.

with open("data/airflow-sentences.txt", "w+", encoding="utf-8") as out_file:
    sentence_list = sentences["sentence"].tolist()
    out_file.write("\n\n".join(sentence_list))