# Description
We want to obtain a keyword influence graph. Its nodes are keyphrases of AI and Renewable Energy patents and its edges are given by the Katz influence (directed path aggregates) in the patent citation network. Node weights are given by keyword count and edge weights by Katz influence.   

In [183]:
import pandas as pd
from keyphrase_vectorizers import KeyphraseCountVectorizer
from keybert import KeyBERT
import pickle
import networkx as nx





# Gathering the data

In [3]:

AI_patents=pd.read_csv('../databases/AI_patents.csv' , dtype={'patent_number': object})
AI_patents=AI_patents[AI_patents.patent_number.astype(str).str.isnumeric()]
AI_patents.head()

Unnamed: 0.1,Unnamed: 0,meanSPNPcited_1year_before,filing_year,patent_number,RPbyYear,group_id,subgroup_id,abstract,title
0,0,-2.589326,2015,10000124,0.516084,Y02T,"['Y02T10/645', 'B60W10/08', 'B60W2710/18', 'B6...","Systems, apparatus and methods to multiple lev...","Independent steering, power, torque control an..."
1,1,-1.441175,2016,10001422,0.68859,G01L,"['G01L1/16', 'G06N3/008', 'A61B2562/0247', 'A6...",A pain sensing device includes a sensor array ...,Method and device for sensing pain
2,2,-1.728041,2015,10001763,0.641872,G05B,"['G05B15/02', 'G05B2219/2642', 'G06N5/025']",A method for controlling intelligent device ba...,Control device and method for controlling inte...
3,3,-3.980396,2016,10001778,0.347794,G05D,"['G05D1/0011', 'G05D13/00', 'G05D1/0088', 'G08...",Systems and methods for controlling an unmanne...,Velocity control for an unmanned aerial vehicle
4,4,-3.207635,2016,10001780,0.430521,G01C,"['G01C21/3415', 'G05D1/0276', 'G05D1/0214', 'G...",Systems and methods for dynamic route planning...,Systems and methods for dynamic route planning...


In [4]:
lcet=pd.read_csv('output/LCET_patents_with_influencers.csv')  
lcet.head()

Unnamed: 0.1,Unnamed: 0,Patent number,Patent year,Technology type,CPC class,Patent date,Patent title,patent_number_x,AI_influencers,patent_number_y,influencing_AI_subdomains,patent_number,influencing_AI_keywords
0,0,94.0,1836,hydro,Y02E10/226,18361128,Reacting sotaxy steak-engine,,,,,,
1,1,479.0,1837,wind,Y02E10/723,18371123,Improvement in windmills,,,,,,
2,2,512.0,1837,hydro,Y02E10/223,18371215,Water-wheel,,,,,,
3,3,518.0,1837,hydro,Y02E10/223,18371215,Improvement in horizontal water-wheels,,,,,,
4,4,695.0,1838,hydro,Y02E10/223,18380416,Improved water-wheel,,,,,,


In [5]:
AI_patents['Technology type']=['AI']* AI_patents.shape[0]
lcet['patent_number']=lcet['Patent number']


lcet['patent_number']=lcet['patent_number'].apply(lambda x: int(x))
lcet=lcet[lcet.patent_number.astype(str).str.isdigit()]
lcet.patent_number=lcet.patent_number.astype(int)

In [6]:
AI_patents=AI_patents[['patent_number', 'Technology type']]
lcet=lcet[['patent_number', 'Technology type', "AI_influencers"]]

In [7]:
lcet=pd.DataFrame(lcet.groupby('patent_number').first())

In [8]:
lcet['patent_number']=lcet.index
lcet

Unnamed: 0_level_0,Technology type,AI_influencers,patent_number
patent_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
94,hydro,,94
479,wind,,479
512,hydro,,512
518,hydro,,518
695,hydro,,695
...,...,...,...
10218298,wind,"[('4523331', 0.00028211099074559984), ('532177...",10218298
10218304,thermal,"[('4748674', 0.0013060694015999993), ('4105998...",10218304
10218306,thermal,"[('4736447', 0.0013060694015999993), ('4651297...",10218306
10218307,pv,"[('5138928', 0.002176782335999999), ('5187592'...",10218307


In [9]:
DF=pd.concat([AI_patents,lcet])
DF.patent_number=DF.patent_number.astype('int')

In [10]:
DF

Unnamed: 0,patent_number,Technology type,AI_influencers
0,10000124,AI,
1,10001422,AI,
2,10001763,AI,
3,10001778,AI,
4,10001780,AI,
...,...,...,...
10218298,10218298,wind,"[('4523331', 0.00028211099074559984), ('532177..."
10218304,10218304,thermal,"[('4748674', 0.0013060694015999993), ('4105998..."
10218306,10218306,thermal,"[('4736447', 0.0013060694015999993), ('4651297..."
10218307,10218307,pv,"[('5138928', 0.002176782335999999), ('5187592'..."


In [11]:
DF=DF[~DF["Technology type"].isin(['fission', 'fusion'])]

In [12]:
pats=pd.read_csv("../databases/patent.tsv", sep='\t', usecols=['id', 'abstract']) 

pats=pats[pats.id.astype(str).str.isdigit()]
pats.id=pats.id.astype(int)

pats

  pats=pd.read_csv("../databases/patent.tsv", sep='\t', usecols=['id', 'abstract'])


Unnamed: 0,id,abstract
0,10000000,A frequency modulated (coherent) laser detecti...
1,10000001,The injection molding machine includes a fixed...
2,10000002,The present invention relates to: a method for...
3,10000003,The invention relates to a method for producin...
4,10000004,The present invention relates to provides a do...
...,...,...
7430868,T998013,
7430869,T998014,
7430870,T999001,
7430871,T999002,


In [14]:
pats=pats.groupby('id').first()

In [16]:
DF=DF.merge(pats, how='inner', left_on='patent_number', right_on='id')

In [17]:
DF=DF.reindex()
DF

Unnamed: 0,patent_number,Technology type,AI_influencers,abstract
0,10000124,AI,,"Systems, apparatus and methods to multiple lev..."
1,10001422,AI,,A pain sensing device includes a sensor array ...
2,10001763,AI,,A method for controlling intelligent device ba...
3,10001778,AI,,Systems and methods for controlling an unmanne...
4,10001780,AI,,Systems and methods for dynamic route planning...
...,...,...,...,...
67390,10218298,wind,"[('4523331', 0.00028211099074559984), ('532177...",Systems and methods for controlling reactive c...
67391,10218304,thermal,"[('4748674', 0.0013060694015999993), ('4105998...",A roof mount assembly mounts a structure to a ...
67392,10218306,thermal,"[('4736447', 0.0013060694015999993), ('4651297...","In various representative aspects, an assembly..."
67393,10218307,pv,"[('5138928', 0.002176782335999999), ('5187592'...",A connection box for solar panels to enable th...


In [18]:
DF=DF.groupby('patent_number').first()
DF

Unnamed: 0_level_0,Technology type,AI_influencers,abstract
patent_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3930750,wind,,A power plant of the windmill type is provide...
3933628,thermal,,This invention relates to an improved anaerob...
3934323,thermal,,A panel having multiple tubular passages exte...
3934573,thermal,,A spherical system for the concentration and ...
3934643,thermal,,A heat pipe comprising a hermetically sealed ...
...,...,...,...
10701321,AI,,A system and method for distributed analysis o...
10701353,AI,,"A system is disclosed, comprising a data proce..."
10701434,AI,,A seek content extraction system analyzes fram...
10701489,AI,,A wearable device for binaural audio is descri...


# Obtaining patent keywords

In [184]:
kw_model = KeyBERT()

def get_keyphrases(docs):
    keyphrases=kw_model.extract_keywords(docs=docs, keyphrase_ngram_range = (1, 3),  stop_words='english' ,
                                          use_mmr=True, diversity=0.6) #, use_maxsum=True, top_n=5, nr_candidates=20) vectorizer=KeyphraseCountVectorizer()

    #keyphrases= [sorted(kws,key=lambda x: x[0], reverse=True) for kws in keyphrases]
    return   keyphrases 




In [None]:
DF['keywords']=get_keyphrases(list(DF.abstract.astype(str).values))


In [20]:
DF.keywords[7728715]

[('remote monitoring', 0.6483),
 ('digital image containing', 0.5042),
 ('signal containing information', 0.4366),
 ('cmos', 0.3909),
 ('parking spaces items', 0.153)]

In [21]:
DF.to_csv("output/AI_LCET_keywords_keybert_final.tsv", sep='\t')

In [163]:
import pandas as pd
from ast import literal_eval

DF=pd.read_csv('output/AI_LCET_keywords_keybert.tsv', sep='\t')
DF.set_index('patent_number', drop=True, inplace=True)
DF.keywords=DF.astype(str).keywords.apply(lambda x : x.split(',')) #literal_eval)
DF

Unnamed: 0_level_0,Unnamed: 0,Technology type,AI_influencers,abstract,keywords
patent_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3930750,20006,wind,,A power plant of the windmill type is provide...,"[windmill type, wind tracking mechanisms, prop..."
3933628,20007,thermal,,This invention relates to an improved anaerob...,"[anaerobic digestion apparatus, pond therebene..."
3934323,20008,thermal,,A panel having multiple tubular passages exte...,"[panels, flows, fluid tight hollow header, mel..."
3934573,20009,thermal,,A spherical system for the concentration and ...,"[boiler, solar energy, spherical system, appar..."
3934643,20010,thermal,,A heat pipe comprising a hermetically sealed ...,"[heat pipe, saturated vapor, communicates, hol..."
...,...,...,...,...,...
10701321,6195,AI,,A system and method for distributed analysis o...,"[video surveillance system, only image data, c..."
10701353,6196,AI,,"A system is disclosed, comprising a data proce...","[imaging system, processing, exemplary pattern..."
10701434,6197,AI,,A seek content extraction system analyzes fram...,"[seek content extraction system, video content..."
10701489,6198,AI,,A wearable device for binaural audio is descri...,"[binaural audio, wearable device, ambient nois..."


In [164]:
bad_keywords=['photovoltaic', 'disclosed', 'semiconductor substrate', 'solar', 'wind', 'apparatus', 'present invention', 'turbine',
'methods', 'neural', 'wind turbine comprising', 'processing', 'compounds', 'method', "detecting", 'electronic device']

# Computing graph weights

In [165]:

num_keywords=3
df=DF


edge_weights={}
node_weights={}

for i, X in df.iterrows():
    Type=X["Technology type"]
    for kw1 in X.keywords[:num_keywords]: #,s
        #if len(kw1.split())==1:
        #    continue
        if kw1 in bad_keywords:
                continue
        if kw1=='solar cells':
             kw1='solar cell'
        if kw1=='photovoltaic cells':
             kw1='photovoltaic cell'  
        if kw1=='photovoltaic devices':
             kw1='photovoltaic device'
        if kw1=='wind turbines':
             kw1='wind turbine' 
        if kw1 in ['classification', 'classifier', 'classifying']:
             kw1='classifier'

        if kw1 in ['digital image', 'image data', 'image processing','image processing apparatus']:
             kw1='image processing'
        if kw1== 'machine learning model':
             kw1= 'machine learning'
        if Type=="thermal" and kw1=='solar panel' or kw1=='solar panels' or kw1=="panels":
             continue     
        if kw1=='system' or kw1=="solar energy" or kw1==  'second electrode':
             continue 
        if kw1=='solar collectors':
            kw1='solar collector'
        if kw1=='electrodes':
            kw1='electrode' 
        if kw1== 'electronic devices':
            kw1= 'electronic device'
        if kw1=='fabricating':
             kw1=  'manufacturing'
        if kw1=='polymers':
             kw1='polymer'  
        if kw1=='wind power':
             kw1="wind energy"
        if kw1=='solar energy collector':
             kw1= 'solar heat collector',           
       
        
 
 
        
        Type=X["Technology type"]
        if (kw1, Type) in node_weights:
            node_weights[(kw1, Type)]+=1#s
        else:
             node_weights[(kw1, Type)]=1#s

        if X.AI_influencers==None or type(X.AI_influencers)==float:
            continue  # Nans
        if Type=="AI":
             continue #AI influence on AI is excluded
        for AIpat, s_AI in literal_eval(X.AI_influencers):

                for kw2 in df.keywords[int(AIpat)][:num_keywords]: #,s_RET
                    #if len(kw2.split())==1:
                    #    continue
                    if kw2 in bad_keywords:
                         continue

                    if ((kw2, "AI"),(kw1, Type))  in edge_weights.keys():
                        edge_weights[((kw2, "AI"),(kw1, Type))  ]+=s_AI#*s_RET
                    else:
                        edge_weights[(( kw2,"AI"),( kw1, Type))  ]=s_AI#*s_RET



In [166]:
#sort node_weights by value

def sort_weights(node_weights):
    return sorted(node_weights.items(), key=lambda x: x[1], reverse=True)

sorted_node_weights=sort_weights(node_weights)
sorted_edge_weights=sort_weights(edge_weights)

sorted_node_weights


[(('solar cell', 'pv'), 2196),
 (('wind turbine', 'wind'), 1655),
 (('substrate', 'pv'), 823),
 (('photovoltaic device', 'pv'), 607),
 (('manufacturing', 'pv'), 501),
 (('image processing', 'AI'), 492),
 (('classifier', 'AI'), 469),
 (('solar collector', 'thermal'), 443),
 (('photovoltaic cell', 'pv'), 410),
 (('neural network', 'AI'), 332),
 (('thin film', 'pv'), 296),
 (('fermentation', 'biofuel'), 275),
 (('semiconductor device', 'pv'), 261),
 (('wind turbine blade', 'wind'), 259),
 (('semiconductor', 'pv'), 247),
 (('biomass', 'biofuel'), 244),
 (('solar cell module', 'pv'), 242),
 (('ethanol', 'biofuel'), 238),
 (('autonomous vehicle', 'AI'), 234),
 ((('solar heat collector',), 'thermal'), 216),
 (('machine learning', 'AI'), 204),
 (('electrode', 'pv'), 198),
 (('polymer', 'pv'), 189),
 (('etching', 'pv'), 186),
 (('wind energy', 'wind'), 184),
 (('dye sensitized solar cell', 'pv'), 184),
 (('generator', 'wind'), 181),
 (('wind turbine generator', 'wind'), 165),
 (('biodiesel', 'b

In [167]:
sorted_edge_weights

[((('expert system', 'AI'), ('wind turbine', 'wind')), 252.07767096779605),
 ((('adaptive control', 'AI'), ('wind turbine', 'wind')), 165.7536164683794),
 ((('wind turbine', 'AI'), ('wind turbine', 'wind')), 164.23044556800096),
 ((('neural network', 'AI'), ('wind turbine', 'wind')), 150.90384552665128),
 ((('power generator speed', 'AI'), ('wind turbine', 'wind')),
  144.9066055679999),
 ((('automatic transmission', 'AI'), ('wind turbine', 'wind')),
  135.2134188393425),
 ((('fuzzy reasoning', 'AI'), ('wind turbine', 'wind')), 118.00058296319894),
 ((('fuzzy set theory', 'AI'), ('wind turbine', 'wind')), 110.29885443587976),
 ((('pattern recognition', 'AI'), ('wind turbine', 'wind')),
  106.95644114734088),
 ((('expert system', 'AI'), ('solar cell', 'pv')), 99.61504094896097),
 ((('pattern recognition apparatus', 'AI'), ('wind turbine', 'wind')),
  97.54663162075477),
 ((('probabilistic learning element', 'AI'), ('wind turbine', 'wind')),
  96.02192405938088),
 ((('fuzzy production ru

In [168]:
top_nodes= sorted_node_weights[:70] 
print(top_nodes)

top_nodes_=[t[0] for t in top_nodes]
top_nodes_


[(('solar cell', 'pv'), 2196), (('wind turbine', 'wind'), 1655), (('substrate', 'pv'), 823), (('photovoltaic device', 'pv'), 607), (('manufacturing', 'pv'), 501), (('image processing', 'AI'), 492), (('classifier', 'AI'), 469), (('solar collector', 'thermal'), 443), (('photovoltaic cell', 'pv'), 410), (('neural network', 'AI'), 332), (('thin film', 'pv'), 296), (('fermentation', 'biofuel'), 275), (('semiconductor device', 'pv'), 261), (('wind turbine blade', 'wind'), 259), (('semiconductor', 'pv'), 247), (('biomass', 'biofuel'), 244), (('solar cell module', 'pv'), 242), (('ethanol', 'biofuel'), 238), (('autonomous vehicle', 'AI'), 234), ((('solar heat collector',), 'thermal'), 216), (('machine learning', 'AI'), 204), (('electrode', 'pv'), 198), (('polymer', 'pv'), 189), (('etching', 'pv'), 186), (('wind energy', 'wind'), 184), (('dye sensitized solar cell', 'pv'), 184), (('generator', 'wind'), 181), (('wind turbine generator', 'wind'), 165), (('biodiesel', 'biofuel'), 164), (('rotor', '

[('solar cell', 'pv'),
 ('wind turbine', 'wind'),
 ('substrate', 'pv'),
 ('photovoltaic device', 'pv'),
 ('manufacturing', 'pv'),
 ('image processing', 'AI'),
 ('classifier', 'AI'),
 ('solar collector', 'thermal'),
 ('photovoltaic cell', 'pv'),
 ('neural network', 'AI'),
 ('thin film', 'pv'),
 ('fermentation', 'biofuel'),
 ('semiconductor device', 'pv'),
 ('wind turbine blade', 'wind'),
 ('semiconductor', 'pv'),
 ('biomass', 'biofuel'),
 ('solar cell module', 'pv'),
 ('ethanol', 'biofuel'),
 ('autonomous vehicle', 'AI'),
 (('solar heat collector',), 'thermal'),
 ('machine learning', 'AI'),
 ('electrode', 'pv'),
 ('polymer', 'pv'),
 ('etching', 'pv'),
 ('wind energy', 'wind'),
 ('dye sensitized solar cell', 'pv'),
 ('generator', 'wind'),
 ('wind turbine generator', 'wind'),
 ('biodiesel', 'biofuel'),
 ('rotor', 'wind'),
 ('photoelectric conversion device', 'pv'),
 ('display device', 'pv'),
 ('process', 'biofuel'),
 ('semiconductor layer', 'pv'),
 ('rotor blade', 'wind'),
 ('mounting', '

In [169]:
#sorted([x  for ((x, l), s) in top_nodes if l=='pv'])

['amorphous silicon',
 'device',
 'display device',
 'dye sensitized solar cell',
 'electrode',
 'electronic device',
 'emitting device',
 'etching',
 'flexible substrate',
 'inverter',
 'manufacturing',
 'manufacturing method',
 'optoelectronic device',
 'organic electronic device',
 'organic light',
 'photodetector',
 'photoelectric conversion device',
 'photovoltaic cell',
 'photovoltaic device',
 'photovoltaic module',
 'polymer',
 'semiconductor',
 'semiconductor device',
 'semiconductor layer',
 'silicon solar cell',
 'silicon substrate',
 'solar battery',
 'solar cell',
 'solar cell comprising',
 'solar cell module',
 'substrate',
 'thin film']

In [28]:
f=open('output/edge-node-weights_keybert_final', "wb")
pickle.dump([sorted_node_weights, sorted_edge_weights], f)
f.close()

# Constructing the graph

In [175]:
badnodes=['solar cell comprising','systems','artificial neural network','ethanol production']

In [170]:
G=nx.DiGraph()

G.add_nodes_from (   [  (kw, {"weight": s, "type": t}) for  ((kw,t), s) in    top_nodes  if 
                   kw not in badnodes ] )

G.nodes['solar cell']



{'weight': 2196, 'type': 'pv'}

In [179]:
G.add_weighted_edges_from( [(kw[0][0],kw[1][0],s) for  (kw,s) in  sorted_edge_weights if  kw[0] in set(top_nodes_) and  kw[1] in set(top_nodes_)]) 

In [180]:
G.remove_nodes_from(badnodes)

G.remove_nodes_from(list(nx.isolates(G)))


In [181]:
# Saving for Gephi and Cytoscape
nx.write_gexf(G, "output/keyword_graph_keybert_nodeweight2.gexf")
nx.write_graphml(G, "output/keyword_graph_keybert_nodeweight2.graphml")