Code for the analysis and visualizations for the paper "A similarity search approach to patent classification" by Reza Rezazadegan and Zahra Bagheri

www.github.com/rezareza007
www.dreamintelligent.com



In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None

import gensim
import pickle
import gc
import numpy as np
from ast import literal_eval

from gensim import models
from gensim import corpora
from gensim.models import Word2Vec


# Number of top words requested per LDA topic (passed as `topn` to show_topic).
TOP_WORDS_NUM=10

# Similarity method label handed to the accuracy runs further down.
Method='weightedsum'
# Number of LDA topics; it is part of the data file names loaded and written below.
number_topic=100

In [2]:
# Load the final LDA model (built from the total text) for the configured number of topics.
final_ldamodel = models.ldamodel.LdaModel.load("data/lda_full_"+str(number_topic)+"_nltk_model_pickle")    # ("C:/Users/bagheri/Desktop/finallda_model_pickle")

In [3]:
# Load the id2word dictionary; its file name shares the prefix of the LDA model file.
#dictionary = corpora.Dictionary.load("data/dictionary_claim+brief")  # ("C:/Users/bagheri/Desktop/finallda_model_pickle.id2word")
# NOTE: `corpora` is already imported in the first cell; this import is redundant.
from gensim import corpora
dictionary = corpora.Dictionary.load("data/lda_full_" +str(number_topic) + "_nltk_model_"+"pickle.id2word")

In [4]:
dictionary.doc2bow(['edg'])

[(164, 1)]

In [5]:
final_ldamodel.num_topics

100

In [6]:
final_ldamodel.show_topics(-1)

[(0,
  '0.064*"mesh" + 0.062*"teeth" + 0.062*"applianc" + 0.058*"tooth" + 0.033*"dental" + 0.031*"restor" + 0.030*"sleep" + 0.029*"bridg" + 0.024*"tongu" + 0.022*"mouth"'),
 (1,
  '0.206*"frame" + 0.170*"block" + 0.055*"encod" + 0.044*"window" + 0.043*"decod" + 0.038*"motion" + 0.038*"cod" + 0.032*"predict" + 0.025*"pictur" + 0.020*"transform"'),
 (2,
  '0.027*"enzym" + 0.027*"strain" + 0.021*"cultur" + 0.016*"bacteria" + 0.015*"growth" + 0.015*"microorgan" + 0.015*"produc" + 0.014*"infect" + 0.014*"glucos" + 0.013*"biolog"'),
 (3,
  '0.059*"cell" + 0.031*"protein" + 0.026*"amino" + 0.024*"acid" + 0.023*"antibodi" + 0.021*"bind" + 0.018*"express" + 0.016*"polypeptid" + 0.014*"cancer" + 0.013*"peptid"'),
 (4,
  '0.138*"metal" + 0.037*"oxid" + 0.024*"alloy" + 0.019*"ceram" + 0.018*"aluminum" + 0.016*"copper" + 0.016*"steel" + 0.015*"melt" + 0.012*"temperatur" + 0.011*"titanium"'),
 (5,
  '0.128*"sheet" + 0.107*"cover" + 0.068*"adhes" + 0.052*"glass" + 0.042*"flexibl" + 0.032*"protect" + 

# word2vec similarity

In [7]:
word2vec_model=Word2Vec.load("data/word2vec_lemmatized_300.model") 


In [8]:
word2vec_model.wv.key_to_index

{'invent': 0,
 'provid': 1,
 'includ': 2,
 'devic': 3,
 'present': 4,
 'second': 5,
 'method': 6,
 'form': 7,
 'compris': 8,
 'embodi': 9,
 'data': 10,
 'control': 11,
 'signal': 12,
 'gener': 13,
 'process': 14,
 'exampl': 15,
 'prefer': 16,
 'surfac': 17,
 'layer': 18,
 'accord': 19,
 'have': 20,
 'oper': 21,
 'materi': 22,
 'portion': 23,
 'posit': 24,
 'group': 25,
 'imag': 26,
 'base': 27,
 'time': 28,
 'mean': 29,
 'connect': 30,
 'relat': 31,
 'object': 32,
 'acid': 33,
 'applic': 34,
 'contain': 35,
 'light': 36,
 'unit': 37,
 'element': 38,
 'compound': 39,
 'direct': 40,
 'inform': 41,
 'cell': 42,
 'receiv': 43,
 'member': 44,
 'high': 45,
 'select': 46,
 'refer': 47,
 'say': 48,
 'plural': 49,
 'circuit': 50,
 'apparatu': 51,
 'structur': 52,
 'addit': 53,
 'configur': 54,
 'power': 55,
 'aspect': 56,
 'type': 57,
 'describ': 58,
 'user': 59,
 'differ': 60,
 'compon': 61,
 'product': 62,
 'field': 63,
 'result': 64,
 'electr': 65,
 'determin': 66,
 'requir': 67,
 'temperatu

In [9]:
word2vec_model.wv.similarity('edg','best')

-0.005109138

In [10]:


def text_preprocess(text):
    """Tokenise a text and return its filtered, lemmatised and stemmed tokens.

    Tokens come from gensim.utils.simple_preprocess. A token is kept only if it
    is not in gensim's STOPWORDS and is longer than 3 characters. Each kept
    token is lemmatised as a verb with WordNetLemmatizer and then stemmed with
    PorterStemmer.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    list of str
        Processed tokens in their original order; [] for an empty string.
    """
    # Function-scope imports are kept from the original cell.
    from nltk.stem  import PorterStemmer 
    from nltk.stem import WordNetLemmatizer

    if text=="":
        return []

    ps = PorterStemmer()
    # Build the lemmatizer once instead of once per token.
    lemmatizer = WordNetLemmatizer()
    stopwords = gensim.parsing.preprocessing.STOPWORDS

    return [
        ps.stem(lemmatizer.lemmatize(token, pos='v'))
        for token in gensim.utils.simple_preprocess(text)
        if token not in stopwords and len(token) > 3
    ]

def prepare_text_for_lda(text):
    """Return what the LDA model yields for a raw text.

    The text goes through text_preprocess, is turned into a bag-of-words with
    the global `dictionary`, and that bag-of-words is used to index the global
    `final_ldamodel`.
    """
    tokens = text_preprocess(text)
    bag_of_words = dictionary.doc2bow(tokens)
    return final_ldamodel[bag_of_words]


In [11]:
def similarity_list(l1,l2):
    """Return (w1, w2, similarity) for every pair of words from l1 x l2.

    Pairs are listed with l1 as the outer loop. Similarities come from
    word2vec_model.wv.similarity.
    """
    return [
        (first, second, word2vec_model.wv.similarity(first, second))
        for first in l1
        for second in l2
    ]

In [12]:
#word2vec_model.wv.similarity("good", "bad")

In [14]:
l=['secur',
 'authent',
 'encrypt',
 'author',
 'signatur',
 'ident',
 'public',
 'verif',
 'verifi',
 'privat']
l2=['plural',
 'array',
 'row',
 'arrang',
 'locat',
 'individu',
 'multipl',
 'respect',
 'subset',
 'correspond']

In [16]:
from numpy.linalg import norm

def _weighted_centroid(topic):
    """Score-weighted mean of the word2vec vectors of one topic's words."""
    centroid = np.zeros(word2vec_model.wv.vector_size)
    weights = 0
    for word, score in topic:
        centroid = centroid + word2vec_model.wv.get_vector(word)*score
        weights += score
    return centroid / weights


def WeightedSumSimilarity(topic1, topic2):
    """Cosine similarity between the weighted centroids of two topics.

    Parameters
    ----------
    topic1, topic2 : list of (word, score) tuples
        Every word is looked up with word2vec_model.wv.get_vector.

    Returns
    -------
    float
        Cosine of the angle between the two score-weighted centroids.
    """
    centroid1 = _weighted_centroid(topic1)
    centroid2 = _weighted_centroid(topic2)

    # compute cosine similarity
    return np.dot(centroid1,centroid2)/(norm(centroid1)*norm(centroid2))

In [17]:
topic1=[('secur', 0.22477894), ('authent', 0.08872543), ('encrypt', 0.065337606), ('author', 0.040510204), ('signatur', 0.03953595), ('ident', 0.036186848), ('public', 0.029587945), ('verif', 0.0246569), ('verifi', 0.023375357), ('privat', 0.022541929)]
topic2=[('plural', 0.43420598), ('array', 0.15278189), ('row', 0.018694116), ('arrang', 0.018617006), ('locat', 0.015721645), ('individu', 0.014942128), ('multipl', 0.014155169), ('respect', 0.013648521), ('subset', 0.012087191), ('correspond', 0.01197267)]

#WeightedSumSimilarity(topic1,topic2 )

In [18]:
topic4= [('secur', 0.22477894), ('authent', 0.08872543), ('encrypt', 0.065337606), ('author', 0.040510204), ('signatur', 0.03953595), ('ident', 0.036186848), ('public', 0.029587945), ('verif', 0.0246569), ('verifi', 0.023375357), ('privat', 0.022541929)]
topic3=[('secur', 0.22477894), ('authent', 0.08872543), ('encrypt', 0.065337606), ('author', 0.040510204), ('signatur', 0.03953595), ('ident', 0.036186848), ('public', 0.029587945), ('verif', 0.0246569), ('verifi', 0.023375357), ('privat', 0.022541929)]
#WeightedSumSimilarity(topic3,topic4 )

In [19]:


def top_words_topic(topic_number):
    """Return the top TOP_WORDS_NUM words of one LDA topic.

    The words keep the order in which final_ldamodel.show_topic returns them.
    """
    top_terms = final_ldamodel.show_topic(topic_number, topn=TOP_WORDS_NUM)
    return [term[0] for term in top_terms]
    
    
    


In [20]:
top_words_topic(1)

['frame',
 'block',
 'encod',
 'window',
 'decod',
 'motion',
 'cod',
 'predict',
 'pictur',
 'transform']

In [21]:
def top_score_topic(topic_number):
    """Return the scores of the top TOP_WORDS_NUM words of one LDA topic.

    The scores keep the order in which final_ldamodel.show_topic returns them.
    """
    top_terms = final_ldamodel.show_topic(topic_number,  topn=TOP_WORDS_NUM)
    return [term[1] for term in top_terms]

In [22]:
top_score_topic(1)

[0.20559832,
 0.16979916,
 0.05485317,
 0.044131763,
 0.042668343,
 0.038275987,
 0.03765017,
 0.031801883,
 0.02507376,
 0.020173665]

In [23]:
listt=[(120,0.25), (200, 0.25), (273, 0.25),(476,0.25)]
listt2=[(50, 0.25), (120, 0.25), (325, 0.25),(150,0.25)]

In [24]:

def find_words_topics(list1):
    """Return the top-word list of every topic referenced in list1.

    Only the first item of each list1 entry (the topic number) is used.
    """
    return [top_words_topic(entry[0]) for entry in list1]

In [25]:
#find_words_topics(listt)

In [26]:
def find_score_of_words_topics(list1):
    """Return the top-word score list of every topic referenced in list1.

    Only the first item of each list1 entry (the topic number) is used.
    """
    return [top_score_topic(entry[0]) for entry in list1]
        
                           

In [27]:
#find_score_of_words_topics(listt)

In [28]:
# computing similarity between topics

simil_matrix_ws={}


In [None]:

# Top words and their scores for every topic, in topic-number order.
# The zeros are placeholders: the helpers only read the topic number of each pair.
listtopics1=   find_words_topics( list(zip(range(final_ldamodel.num_topics) , [0]*final_ldamodel.num_topics) ))
listscores1=   find_score_of_words_topics(list(zip(range(final_ldamodel.num_topics), [0] *final_ldamodel.num_topics  )))

# One list of (word, score) tuples per topic.
list_topics=[list(zip(words, scores)) for words, scores in zip(listtopics1, listscores1)]

print(list_topics)

# Fill the full topic-by-topic similarity table, keyed by (i, j).
for i,topic1 in enumerate(list_topics):
    print(i)
    for j,topic2 in enumerate(list_topics):
        simil_matrix_ws[(i,j)]=WeightedSumSimilarity(topic1, topic2)

# `with` closes the file even if pickling fails part-way.
with open("data/simil_matrices_"+str(number_topic), "wb") as f:
    pickle.dump([simil_matrix_ws], f)

In [29]:
# Reload the cached topic-similarity table; it was pickled as a one-element list.
# NOTE: pickle.load runs arbitrary code from the file -- only load files produced by this project.
with open("data/simil_matrices_"+str(number_topic), "rb") as f:
    [ simil_matrix_ws]=pickle.load(f)

In [31]:
simil_matrix_ws

{(0, 0): 1.0000000000000002,
 (0, 1): 0.180233542322208,
 (0, 2): -0.12163077934138654,
 (0, 3): -0.14881095889727208,
 (0, 4): -0.009975972219713912,
 (0, 5): 0.242668853773514,
 (0, 6): -0.07668328440047696,
 (0, 7): -0.0679971721422142,
 (0, 8): 0.27911005363721075,
 (0, 9): 0.07276321772081076,
 (0, 10): -0.062195638065469114,
 (0, 11): 0.34053938956773827,
 (0, 12): 0.09320079478358904,
 (0, 13): 0.38736339342160775,
 (0, 14): 0.39693245989528936,
 (0, 15): 0.23924966310082615,
 (0, 16): 0.31635423172908034,
 (0, 17): 0.10467986722058212,
 (0, 18): 0.1338361457053275,
 (0, 19): 0.06287091438713999,
 (0, 20): -0.041859569868474236,
 (0, 21): 0.06080008591148363,
 (0, 22): -0.006816038896267741,
 (0, 23): 0.3004187530754664,
 (0, 24): -0.126449448124976,
 (0, 25): -0.1371079602583713,
 (0, 26): 0.020334655872910695,
 (0, 27): 0.006465133640408737,
 (0, 28): -0.05049033656886694,
 (0, 29): -0.020304999284170407,
 (0, 30): -0.01761764110114274,
 (0, 31): -0.08390915401879975,
 (0, 32)

In [33]:

def text_similarity_weightedsum1(list1,list2):
    """Probability-weighted sum of topic similarities between two documents.

    list1 and list2 hold (topic_number, probability) entries. For every pair
    of entries, the product of the two probabilities and the
    simil_matrix_ws value for the two topic numbers is added to the total.
    Returns 0 when either list is empty.
    """
    total = 0
    for entry1 in list1:
        for entry2 in list2:
            pair_similarity = simil_matrix_ws[(entry1[0], entry2[0])]
            total += entry2[1]*entry1[1]*pair_similarity
    return total

In [34]:
#text_similarity_weightedsum1(listt,listt2)

In [36]:
def document_similarity(list1,list2,method):
    """Similarity of two topic distributions.

    `method` is accepted but not consulted: the result always comes from
    text_similarity_weightedsum1.
    """
    score = text_similarity_weightedsum1(list1, list2)
    return score
    

In [37]:
#document_similarity(listt,listt2,'weightedsum')

In [39]:
# Load the pickled mapping patent id -> list of (topic number, probability).
# NOTE: pickle.load runs arbitrary code from the file -- only load files produced by this project.
with open('data/patent_topics_dict_full_'+str(number_topic), 'rb') as f:
    pp=pickle.load(f) 

In [40]:
pp

{4000024: [(4, 0.09828475),
  (40, 0.04111879),
  (41, 0.011398201),
  (51, 0.045610446),
  (56, 0.13706556),
  (62, 0.017377084),
  (63, 0.0110401735),
  (66, 0.0292527),
  (72, 0.028781475),
  (73, 0.10113948),
  (76, 0.11652438),
  (77, 0.04556094),
  (84, 0.016034232),
  (86, 0.15513715),
  (99, 0.10724975)],
 3944441: [(4, 0.11748408),
  (5, 0.023765754),
  (9, 0.022683777),
  (14, 0.027219841),
  (16, 0.061248176),
  (24, 0.01323811),
  (40, 0.020755023),
  (46, 0.15748012),
  (50, 0.06273302),
  (55, 0.09928371),
  (60, 0.01141443),
  (69, 0.117805876),
  (73, 0.034363408),
  (85, 0.033761505),
  (86, 0.05927193),
  (91, 0.084923804),
  (98, 0.020378066)],
 3953613: [(5, 0.012166092),
  (7, 0.06836463),
  (11, 0.08507905),
  (14, 0.032640472),
  (15, 0.17246434),
  (16, 0.019359957),
  (23, 0.023759682),
  (27, 0.012373694),
  (34, 0.08824107),
  (35, 0.23374134),
  (50, 0.07132162),
  (55, 0.022627989),
  (62, 0.03655896),
  (78, 0.010519015),
  (86, 0.028382441),
  (90, 0.0216

# Similarity Search

In [44]:
CPCs=['section24', 'subsection', 'group', 'subgroup']


In [45]:
# Name suffixes of the CPC tables; each one is read from data/cpc_<suffix>.csv.
CPCs=['section24', 'subsection', 'group', 'subgroup']

# One DataFrame per entry of CPCs, in the same order.
CPC_dfs=[]

for cpc in CPCs:
    cpc_df=pd.read_csv('data/cpc_'+cpc+'.csv')
    #print(cpc, cpc_df)
    # For the subgroup table the `title` column is used as the text.
    if cpc=='subgroup':
        cpc_df['full_text']=cpc_df.title
    
    # Attach the LDA output of each row's text.
    cpc_df['topics']=cpc_df['full_text'].apply(prepare_text_for_lda)
    CPC_dfs.append(cpc_df)


In [46]:
CPC_dfs

[  id                                          full_text  \
 0  A  AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...   
 1  B  PHYSICAL OR CHEMICAL PROCESSES OR APPARATUS IN...   
 2  C  INORGANIC CHEMISTRY TREATMENT OF WATER, WASTE ...   
 3  D                                           Textiles   
 4  E  CONSTRUCTION OF ROADS, RAILWAYS, OR BRIDGES Ba...   
 5  F  MACHINES OR ENGINES IN GENERAL; ENGINE PLANTS ...   
 6  G  MEASURING; TESTING OPTICS PHOTOGRAPHY; CINEMAT...   
 7  H  BASIC ELECTRIC ELEMENTS GENERATION; CONVERSION...   
 8  Y  TECHNOLOGIES OR APPLICATIONS FOR MITIGATION OR...   
 
                                               topics  
 0  [(0, 0.038410485), (2, 0.027287414), (5, 0.034...  
 1  [(4, 0.05592099), (5, 0.040813982), (7, 0.0208...  
 2  [(2, 0.051608805), (4, 0.2139333), (5, 0.02820...  
 3                                  [(44, 0.5049941)]  
 4  [(1, 0.025343418), (5, 0.02950565), (8, 0.0369...  
 5  [(8, 0.011524327), (14, 0.0543117), (21, 0.029...  
 6  [(

In [47]:
#cp['topics'] = cp['topics'].apply(literal_eval)

In [50]:
# Number of candidates kept per level; position 0 belongs to level 1.
number_to_keep=[9,20,30,40]
def most_similar(patent_number,method,level,CPC):
    """Rank all CPC entries of one level by similarity to a patent.

    Parameters
    ----------
    patent_number : int
        Key into the global `pp` mapping.
    method : str
        Passed through to document_similarity.
    level : int
        1-based level; CPC[level-1] is searched.
    CPC : list of DataFrame
        Each frame needs `id` and `topics` columns. The frames are not modified.

    Returns
    -------
    list of (similarity, id)
        Sorted by decreasing similarity, cut to number_to_keep[level-1] entries.
    """
    level=level-1 # the lists are 0-indexed
    lpatent=pp[patent_number]
    cpc=CPC[level]

    # Score into a standalone Series; copying the whole table just to attach
    # a temporary column is unnecessary.
    simils=cpc.topics.apply(lambda T:  document_similarity(T,lpatent,method) )

    data= list(zip( list(simils.values) , list(cpc.id.values)))
    data.sort(reverse = True, key=lambda X: X[0])
    return data[0: number_to_keep[level]  ]

In [51]:
pp[7011726]

[(4, 0.19387619),
 (5, 0.019311374),
 (12, 0.026457569),
 (29, 0.051519066),
 (43, 0.34756526),
 (76, 0.010997065),
 (84, 0.022817545),
 (86, 0.29096642)]

In [52]:
am=most_similar(7011726,'weightedsum',1, CPC_dfs)

am

[(0.1505501378163901, 'C'),
 (0.09210157934840917, 'B'),
 (0.09190937420463394, 'H'),
 (0.05820695975022365, 'D'),
 (0.053673150069288245, 'Y'),
 (0.046021096213060025, 'E'),
 (0.00773201393856068, 'F'),
 (-0.0197764404921723, 'A'),
 (-0.05565911079148371, 'G')]

# find real cpc section,class,subclass

In [53]:

real_cpc=pd.read_csv(r'data/real_cpc.tsv', sep='\t', dtype={'patent_id': int})
real_cpc


Unnamed: 0.1,Unnamed: 0,patent_id,subgroup_id
0,0,3930271,A63B71/146
1,1,3930272,"A47D7/02,Y10T403/32451"
2,2,3930273,"A61G7/0509,A61G7/0507"
3,3,3930274,"B63H9/04,B63B35/34,B63B7/085,B63B34/00"
4,4,3930275,"B29L2031/50,B29K2105/04,B29C65/04,B29C66/112,B..."
...,...,...,...
7228673,7228673,11212947,"H01G4/40,H01L2224/29139,H01G4/008,C04B35/493,H..."
7228674,7228674,11212948,"B64D2221/00,H05K7/20881,H05K7/20936,H05K7/20945"
7228675,7228675,11212949,"H05K7/20481,H05K9/0015,H05K7/20472,H05K5/026,H..."
7228676,7228676,11212950,"H05K13/0069,Y10T29/53174,H05K13/0061,G06F30/00..."


In [54]:
patents=pd.DataFrame({'patent_id':pp.keys()})
patents


Unnamed: 0,patent_id
0,4000024
1,3944441
2,3953613
3,3945518
4,3932608
...,...
7236572,11212947
7236573,11212948
7236574,11212949
7236575,11212950


In [55]:
# if we do not remove y and a
real_cpc=patents.merge(real_cpc, how='inner', on='patent_id')
#print(patents)

patents=real_cpc[['patent_id']]

real_cpc=real_cpc.set_index(['patent_id'])
print(real_cpc)

           Unnamed: 0                                        subgroup_id
patent_id                                                               
4000024         69523                                         C06B23/007
3944441         14136                                           B23K7/10
3953613         23282                                          A21C9/088
3945518         15209                      E02F9/2203,E02F9/18,B66C23/76
3932608          2324  A23L33/18,A23L29/05,A21D2/245,A23L7/101,A61Q11...
...               ...                                                ...
11212947      7228673  H01G4/40,H01L2224/29139,H01G4/008,C04B35/493,H...
11212948      7228674    B64D2221/00,H05K7/20881,H05K7/20936,H05K7/20945
11212949      7228675  H05K7/20481,H05K9/0015,H05K7/20472,H05K5/026,H...
11212950      7228676  H05K13/0069,Y10T29/53174,H05K13/0061,G06F30/00...
11212951      7228677     H05K13/0882,H05K13/0452,H05K13/086,H04B10/1141

[7223148 rows x 2 columns]


In [56]:
real_cpc.shape # we use real_cpcnew for searching similarity

(7223148, 2)

In [57]:
patents

Unnamed: 0,patent_id
0,4000024
1,3944441
2,3953613
3,3945518
4,3932608
...,...
7223143,11212947
7223144,11212948
7223145,11212949
7223146,11212950


In [64]:
def find_cpc_group(number_patent,level):
    """Return the real CPC codes of a patent, truncated to the requested level.

    The comma-separated `subgroup_id` entry of the global `real_cpc` is split
    into codes, then:
      level 1 -> first character of each code
      level 2 -> first 3 characters
      level 3 -> first 4 characters
      level 4 -> the full codes
    Any other level prints "Wrong level!" and returns None.
    """
    codes = real_cpc['subgroup_id'][number_patent].split(',')

    if level == 4:
        return codes
    if level == 1:
        return [code[0] for code in codes]
    if level == 2 or level == 3:
        width = 3 if level == 2 else 4
        return [code[:width] for code in codes]

    print("Wrong level!")
    



In [65]:
pp

{4000024: [(4, 0.09828475),
  (40, 0.04111879),
  (41, 0.011398201),
  (51, 0.045610446),
  (56, 0.13706556),
  (62, 0.017377084),
  (63, 0.0110401735),
  (66, 0.0292527),
  (72, 0.028781475),
  (73, 0.10113948),
  (76, 0.11652438),
  (77, 0.04556094),
  (84, 0.016034232),
  (86, 0.15513715),
  (99, 0.10724975)],
 3944441: [(4, 0.11748408),
  (5, 0.023765754),
  (9, 0.022683777),
  (14, 0.027219841),
  (16, 0.061248176),
  (24, 0.01323811),
  (40, 0.020755023),
  (46, 0.15748012),
  (50, 0.06273302),
  (55, 0.09928371),
  (60, 0.01141443),
  (69, 0.117805876),
  (73, 0.034363408),
  (85, 0.033761505),
  (86, 0.05927193),
  (91, 0.084923804),
  (98, 0.020378066)],
 3953613: [(5, 0.012166092),
  (7, 0.06836463),
  (11, 0.08507905),
  (14, 0.032640472),
  (15, 0.17246434),
  (16, 0.019359957),
  (23, 0.023759682),
  (27, 0.012373694),
  (34, 0.08824107),
  (35, 0.23374134),
  (50, 0.07132162),
  (55, 0.022627989),
  (62, 0.03655896),
  (78, 0.010519015),
  (86, 0.028382441),
  (90, 0.0216

In [66]:
# Hierarchical search
# Currently unused: funnel_search returns its results untruncated.
NUM_FUNNEL=4

def funnel_search(patent_number,method,string, CPC):
    """Rank the children of one CPC code by similarity to a patent.

    Parameters
    ----------
    patent_number : int or str
        Converted with int() and used as a key into the global `pp`.
    method : str
        Passed through to document_similarity.
    string : str
        Parent CPC code. Its length picks the table to search:
        1 -> CPC[1], 3 -> CPC[2], 4 -> CPC[3]. Any other length raises KeyError.
    CPC : list of DataFrame
        Each frame needs `id` and `topics` columns. The frames are not modified.

    Returns
    -------
    list of (similarity, id)
        All entries whose id starts with `string`, by decreasing similarity.
    """
    index={ 1:1, 3:2, 4:3 }
    lpatent=pp[int(patent_number)]

    cpc=CPC[ index[len(string)] ]
    # We search only CPC classes that start with string
    cpc_filter=cpc.loc[cpc['id'].str.startswith(string)]
    # Score into a standalone Series: this avoids copying the whole table on
    # every call and assigning a column onto a filtered slice.
    simils=cpc_filter.topics.apply(lambda T:  document_similarity(T,lpatent,method) )

    data= list(zip( list(simils.values) , list(cpc_filter.id.values)))
    data.sort(reverse = True, key=lambda X: X[0])
    return data

def correct_score(a,b):
    """Scale the score of every (score, id) entry in `a` by the parent score b[0].

    Returns a new list of (score * b[0], id) tuples in the same order.
    """
    parent_score = b[0]
    return [(entry[0]*parent_score, entry[1]) for entry in a]




def funnel_search_final(patent_number,method, level, CPC):
    """Hierarchical (funnel) CPC search for one patent down to `level`.

    Sections are ranked with most_similar. At each further level, the best
    parents (4 sections, then 20, then 40) are expanded with funnel_search,
    the child scores are multiplied by the parent score via correct_score,
    and the pooled children are sorted by decreasing score.

    Returns the section ranking for level 1, otherwise the pooled ranking of
    the requested level cut to number_to_keep[level-1]. Any level other than
    1, 2 or 3 returns the subgroup ranking.
    """
    def expand(parents):
        # Pool the rescored children of every parent, best first.
        children = []
        for parent in parents:
            hits = funnel_search(patent_number, method, parent[1], CPC)
            children.extend(correct_score(hits, parent))
        children.sort(reverse = True, key=lambda X: X[0])
        return children

    section = most_similar(patent_number, method, 1, CPC)
    if level == 1:
        return section

    list_subsection = expand(section[:4])
    if level == 2:
        return list_subsection[:number_to_keep[1]]

    list_group = expand(list_subsection[:20])
    if level == 3:
        return list_group[:number_to_keep[2]]

    list_subgroup = expand(list_group[:40])
    return list_subgroup[:number_to_keep[3]]


In [67]:
find_cpc_group( 4796795,3)

['B08B', 'H05K', 'H05K', 'H05K', 'B23K', 'H05K', 'B08B']

In [68]:
real_cpc['subgroup_id'][3930271]

'A63B71/146'

# Measuring accuracy

In [69]:
def accuracy_method_tot(number_patent,method,level,types, CPC):     
    """Return the ranked CPC predictions for one patent.

    Despite its name, this function computes no accuracy figure; it only
    returns the prediction list.

    Parameters
    ----------
    number_patent : int
        Patent id.
    method : str
        Passed through to the search function.
    level : int
        CPC level to predict (1-4).
    types : str
        'direct' uses most_similar; any other value uses funnel_search_final.
    CPC : list of DataFrame
        CPC tables, one per level.

    Returns
    -------
    list of (score, id)
        Ranked predictions from the chosen search function.
    """
    # The code that used to follow the return was unreachable and referenced
    # an undefined name, so it has been removed together with the unused
    # real-CPC lookup.
    if types=='direct':
        return most_similar(number_patent,method,level, CPC)
    return funnel_search_final(number_patent,method,level, CPC)
             


In [70]:


def accuracy_test1(number_samples,method,level,types, CPC, noAY=False):  
    """Predict CPC codes for a reproducible random sample of patents.

    Draws `number_samples` ids from the global `patents` frame with
    random_state=1 and stores the output of accuracy_method_tot for each id in
    a `predicted_cpc` column. `noAY` is accepted but not used.

    NOTE: a later cell defines accuracy_test1 again, which replaces this one.

    Returns a DataFrame with columns `patent_id` and `predicted_cpc` and a
    fresh 0..n-1 index.
    """
    sampled_ids = patents['patent_id'].sample(n=number_samples,random_state=1)
    df_samples = pd.DataFrame(sampled_ids).reset_index(drop=True)

    df_samples['predicted_cpc'] = df_samples['patent_id'].apply(
        lambda t: accuracy_method_tot(t,method,level,types, CPC))

    return df_samples




# run level 4

In [71]:
def accuracy_test1(number_samples,method,level,types, CPC, noAY=False, out_path=None, chunk_size=1000):  
    """Predict CPC codes for a random sample of patents, saving chunk by chunk.

    Parameters
    ----------
    number_samples : int
        Number of ids drawn from the global `patents` frame (random_state=1).
    method, level, types, CPC
        Passed through to accuracy_method_tot for every sampled id.
    noAY : bool
        Accepted for signature compatibility; not used.
    out_path : str, optional
        CSV file to write. Defaults to the original
        'data/result_<level>_<number_topic>_funnel_AY_1M_.csv' name.
    chunk_size : int
        Rows processed and written per step.

    Returns
    -------
    DataFrame
        Columns `patent_id` and `predicted_cpc` for all samples. This holds
        every prediction in memory; the CSV is still written incrementally.
    """
    if out_path is None:
        # NOTE(review): the default name says "funnel" whatever `types` is.
        out_path='data/result_'+str(level) + '_'+str(number_topic)+ '_funnel_AY_1M_.csv'

    samples=patents['patent_id'].sample(n=number_samples,random_state=1)    
    df_samples=pd.DataFrame(samples).reset_index(drop=True)

    chunks=[]
    for i in range(0,len(df_samples),chunk_size):
        print(i, end='  ')

        # Work on a copy so the column is not assigned onto a slice.
        chunk=df_samples.iloc[i:i+chunk_size].copy()
        chunk['predicted_cpc']=chunk['patent_id'].apply(lambda t:accuracy_method_tot(t,method,level,types, CPC))

        # The first chunk overwrites the file so a re-run does not append
        # duplicate rows; later chunks append without a header.
        first=(i==0)
        chunk.to_csv(out_path,mode='w' if first else 'a',index=False,header=first)
        chunks.append(chunk)

    if not chunks:
        return df_samples.assign(predicted_cpc=pd.Series(dtype=object))
    return pd.concat(chunks, ignore_index=True)
        
  

In [72]:

for index in [1,2,3,4]:
    print(index)
    accuracy_test1(1000000,Method,index,'funnel', CPC_dfs)

1


133000  134000  135000  136000  137000  138000  139000  140000  141000  142000  143000  144000  145000  146000  147000  148000  149000  150000  151000  152000  153000  154000  155000  156000  157000  158000  159000  160000  161000  162000  163000  164000  165000  166000  167000  168000  169000  170000  171000  172000  173000  174000  175000  176000  177000  178000  179000  180000  181000  182000  183000  184000  185000  186000  187000  188000  189000  190000  191000  192000  193000  194000  195000  196000  197000  198000  199000  200000  201000  202000  203000  204000  205000  206000  207000  208000  209000  210000  211000  212000  213000  214000  215000  216000  217000  218000  219000  220000  221000  222000  223000  224000  225000  226000  227000  228000  229000  230000  231000  232000  233000  234000  235000  236000  237000  238000  239000  240000  241000  242000  243000  244000  245000  246000  247000  248000  249000  250000  251000  252000  253000  254000  255000  256000  257000  

In [None]:
from multiprocessing import Pool
def process_chunk(chunk): 
    """Add a `predicted_cpc` column to one chunk of sampled patents.

    Reads the globals `method`, `level`, `types` and `CPC` set below.
    """
    # Copy so the column is not assigned onto a slice of df_samples.
    chunk = chunk.copy()
    chunk['predicted_cpc'] = chunk['patent_id'].apply(lambda t: accuracy_method_tot(t, method, level, types,CPC)) 
    return chunk
number_samples = 1000000
method = 'weightedsum'
level = 4
types = "funnel" 
CPC=CPC_dfs
samples=patents['patent_id'].sample(n=number_samples,random_state=1) 

    #else:
    #    samples=patents_noAY['patent_id'].sample(n=number_samples,random_state=1)

df_samples=pd.DataFrame(samples)
df_samples.reset_index(drop=True, inplace=True)
# Number of processes to run in parallel
num_processes = 20
# Rows per task: one chunk per worker (ceiling division). The step used to be
# number_samples, which produced a single chunk and therefore no parallelism.
chunk_size = max(1, -(-len(df_samples) // num_processes))
# Pass the DataFrame chunks themselves: imap hands each list item to
# process_chunk as its only argument, so they must not be wrapped in tuples.
parameter_list = [df_samples.iloc[i:i+chunk_size]
                  for i in range(0, len(df_samples), chunk_size)]
# Create a Pool with the specified number of processes 
with Pool(processes=num_processes) as pool: 
    # Use pool.imap to apply the function to each chunk of the DataFrame in parallel
    results = pool.imap(process_chunk, parameter_list)
    # Concatenate the results from each process (imap keeps the chunk order)
    df_result = pd.concat(results, ignore_index=True)
# Save the combined DataFrame to a CSV file.
# NOTE(review): the name says "result_4" and "DIRECT" while `level` and `types`
# are set above (types is "funnel") -- confirm the intended file name.
df_result.to_csv('data/result_4' + '_'+str(number_topic)+ '_DIRECT_AY_1M_p.csv', mode='a', index=False, header=False)

# Similarity search 

In [None]:
# Run the direct similarity search at every level and save the predictions.

# NOTE: `datapath` is not used in this cell.
datapath="../databases/PatentsView/"
index=[1,2,3,4]


# Sample size per level.
numsamples={1:1000000, 2:1000000, 3:100000, 4:100000  }

for i in index:
    # Relies on accuracy_test1 returning a DataFrame.
    s=accuracy_test1(numsamples[i],Method,i,'direct', CPC_dfs)
    # to_csv returns None when given a path, so `ss` carries no information.
    ss=s.to_csv('data/result_' +str(i)+ '_'+str(number_topic)+ '_AY.csv')

In [None]:
# Run the funnel (hierarchical) similarity search at every level and save the predictions.
index1=[1,2,3,4]
#index1=[3,4]
for i in index1:

    # Relies on accuracy_test1 returning a DataFrame.
    s=accuracy_test1(1000000,Method,i,'funnel', CPC_dfs)
    # to_csv returns None when given a path, so `ss` carries no information.
    ss=s.to_csv('data/result_' +str(i)+ '_'+str(number_topic)+ '_funnel_AY_1M_.csv')

In [None]:
del CPC_dfs
gc.collect()

# find accuracy in each section,subsection,...

In [None]:

def filter_str(string,string1,string2,string3):
    """Describe df_samples after removing rows whose real_cpc starts with any given prefix.

    Parameters
    ----------
    string, string1, string2, string3 : str
        Prefixes tested against the `real_cpc` column of the global
        `df_samples`.

    Returns
    -------
    DataFrame
        `describe()` of the rows that match none of the prefixes. Rows with a
        missing or non-string `real_cpc` are kept.
    """
    real_cpc_col=df_samples['real_cpc']

    # One combined mask instead of four successive drops: dropping the same
    # row twice (overlapping or repeated prefixes) used to raise KeyError.
    excluded=pd.Series(False, index=df_samples.index)
    for prefix in (string,string1,string2,string3):
        excluded |= real_cpc_col.str.startswith(prefix, na=False)

    return df_samples.loc[~excluded].describe()

# Precision and recall for section C

In [None]:

df1=pd.read_csv("data/result_1_1.csv")
df1

In [None]:
real_c=df1.loc[df1['real_cpc'].str.startswith("['C'")]
real_not_c=df1.drop(real_c.index)

In [None]:
TP=real_c.loc[real_c['accuracy']!=0]
TP

In [None]:
# Count every 'C' entry predicted for patents whose real section is not C.
FP=0
for i in real_not_c.index:
    predictions=real_not_c['predicted_cpc'][i]
    # read_csv returns this column as text; parse it back into the list of
    # (score, id) tuples, otherwise the loop walks single characters.
    if isinstance(predictions, str):
        predictions=literal_eval(predictions)
    for j in predictions:
        if j[1]=='C':
            FP+=1
print(FP)      

In [None]:
precision=len(TP)/(FP+len(TP))
precision

In [None]:
FN=real_c.loc[real_c['accuracy']==0]
len(FN)

In [None]:
recall=len(TP)/(len(TP)+len(FN))
recall

In [None]:
samples_real_not_c=real_not_c.sample(9450,random_state=1000)
samples_real_not_c

In [None]:
# Same count as FP, on the random subsample of non-C patents.
FP1=0
for i in samples_real_not_c.index:
    predictions=samples_real_not_c['predicted_cpc'][i]
    # read_csv returns this column as text; parse it back into the list of
    # (score, id) tuples, otherwise the loop walks single characters.
    if isinstance(predictions, str):
        predictions=literal_eval(predictions)
    for j in predictions:
        if j[1]=='C':
            FP1+=1
print(FP1)  

In [None]:
precision1=len(TP)/(FP1+len(TP))
precision1

# results

In [None]:
level=1
types='direct'
number_topic=500

In [None]:

df=pd.read_csv('data/result_' +str(level)+ '_'+str(number_topic)+ '_AY.csv')


In [None]:
# find real_cpc_rank
df['real_cpc']=df['patent_id'].apply(lambda t:find_cpc_group(t,level))
df

In [None]:
def set_real_cpc(real_cpc):
    """Return the distinct values of real_cpc as a list (order not guaranteed)."""
    return list(set(real_cpc))

In [None]:
# find real-cpc_intersection
df['real_cpc_intersection']=df['real_cpc'].apply(set_real_cpc)   


In [None]:

df['predicted_cpc']=df['predicted_cpc'].apply(literal_eval)

In [None]:
def find_predict_cpc_intersection(predicted_cpc_rank,level,types):
    """Cut a ranked prediction list to the size used for the intersection metric.

    The cut-off depends on the 1-based `level` and on `types`:
    'direct' uses [3, 10, 50, 200]; any other value uses [3, 8, 16, 32].
    """
    cutoffs = [3,10,50,200] if types=='direct' else [3,8,16,32]
    keep = cutoffs[level-1]   # the lists are 0-indexed
    return predicted_cpc_rank[:keep]

In [None]:
df['predicted_cpc_intersection']=df['predicted_cpc'].apply(lambda t:find_predict_cpc_intersection(t,level,types))

In [None]:
def accuracy_method_intersection(real_cpc,predict_cpc):
    """Percentage overlap between real and predicted CPC codes.

    Parameters
    ----------
    real_cpc : list of str
        Real CPC codes.
    predict_cpc : list of (score, id)
        Predictions; only the id (second item) is used.

    Returns
    -------
    float
        100 * (number of distinct codes present in both) /
        min(len(real_cpc), len(predict_cpc)); 0.0 when either list is empty.
    """
    smaller=min(len(real_cpc),len(predict_cpc))
    # An empty list on either side used to raise ZeroDivisionError.
    if smaller==0:
        return 0.0

    predicted_ids={entry[1] for entry in predict_cpc}
    count=len(set(real_cpc)&predicted_ids)

    return (count/smaller)*100

In [None]:

df['accuracy_intersection'] = df.apply(lambda x: accuracy_method_intersection(x['real_cpc_intersection'], x['predicted_cpc_intersection']), 
                        axis=1)

In [None]:
def accuracy_method_rank(real_cpc,predict_cpc):
    """Reciprocal-rank score (in percent) of the first real CPC code.

    Only real_cpc[0] is looked for. If it first appears at 1-based position r
    of predict_cpc (a list of (score, id) tuples), the result is (1/r)*100;
    if it never appears, the result is 0.
    """
    for rank, prediction in enumerate(predict_cpc, start=1):
        if prediction[1]==real_cpc[0]:
            # The first hit is the smallest rank, so we can stop here.
            return (1/(rank))*100
    return 0

In [None]:

df['accuracy_rank'] = df.apply(lambda x: accuracy_method_rank(x['real_cpc'], x['predicted_cpc']), 
                        axis=1)

In [None]:
df.describe()