Code for the analysis and visualizations for the paper "A similarity search approach to patent classification" by Reza Rezazadegan and Zahra Bagheri

www.github.com/rezareza007
www.dreamintelligent.com



In [13]:
import pandas as pd

import gc
import os.path
from tqdm import tqdm

import dask.dataframe as dd
import nltk
#nltk.download('wordnet')

fulltext_path="data/fulltext.tsv"
preprocessed_path="data/fulltext_preprocessed.tsv"



In [3]:
# Text preprocessing function

from nltk.stem  import PorterStemmer 
from nltk.stem import WordNetLemmatizer

from gensim.parsing.preprocessing import STOPWORDS
from  gensim.utils import simple_preprocess


ps = PorterStemmer()


def text_preprocess(text):

   
    
    if text=="":
        return []
    
    
    result = []
    for token in simple_preprocess(text):
        
        
        if token not in STOPWORDS and len(token) > 3:
            result.append(ps.stem(WordNetLemmatizer().lemmatize(token, pos='v') ))

    return result

In [5]:
# The path for brieftext and claims data files

rpath=r"../databases/PatentsView/"

def path(year, type):
    if type=='brief':
        pathh=rpath+'BriefText/'+'brf_sum_text_'+str(year)+'.tsv.zip'
        if year>=2004:
            pathh=pathh[:-4]
        return pathh
    else:
        return rpath+'Claims/'+'claims_'+str(year)+'.tsv.zip' 


In [6]:
from pandarallel import pandarallel
pandarallel.initialize()


for year in tqdm(range(1976, 2022)):
    

    print(year)


    textcol='text' if year not in [2004, 2005, 2013, 2014, 2011, 2017, 2018, 2019] else 'summary_text'


    # patent Brief texts
    df1=pd.read_csv(path(year, 'brief'),sep='\t', usecols=['patent_id', textcol]) #,chunksize=1000)
    df1=df1.rename(columns={textcol:'brief_text'})
    df1=df1.dropna()

    df1.brief_text=df1.brief_text.parallel_apply(lambda x: str(x).replace( '\n',' '  ).replace('\t', ' ')).replace(r"\r", "").replace("\"", "" ) 


    df1=df1[df1.patent_id.astype(str).str.isnumeric()]

    #patent claims
    df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
    df3=df3.dropna()

    df3=df3.rename(columns={'text':'claim_text'})
    df3=df3[df3.patent_id.astype(str).str.isnumeric()]

    df3.claim_text=df3.claim_text.parallel_apply(lambda x: str(x).replace( '\n',' '  ).replace('\t', ' ').replace(r"\r", "").replace("\"", "" ) )


    df_g=df3.groupby('patent_id', sort=False)['claim_text'].apply(' '.join)
    df_claim=pd.DataFrame(df_g)


    del df3
    del df_g
    gc.collect()

    #merging the two
    df_merge=df1.merge(df_claim,on='patent_id', how='left')

    # Here fulltext means merger of brieftext and claims 

    df_merge['full_text']=df_merge.parallel_apply(lambda X: str(X.brief_text)+" "+str(X.claim_text), axis=1)
    df_merge=df_merge[['patent_id','full_text']]

    df_merge=df_merge.rename(columns={'patent_id': 'patent_number'})

    
    print("Writing merged texts to disk")
    df_merge.to_csv(fulltext_path,index=False, header=not os.path.exists(fulltext_path), mode='a', sep='\t')

    #Applying preprocess function
    df_merge['text_preprocessed']=df_merge.full_text.apply(lambda x: " ".join(text_preprocess))
    df_merge[['patent_number', 'text_preprocessed']].to_csv(preprocessed_path,index=False, header=not os.path.exists(preprocessed_path), mode='a', sep='\t')



    #it's not needed anymore
    del df_merge
    gc.collect()





INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


  0%|          | 0/46 [00:00<?, ?it/s]

1976


  2%|▏         | 1/46 [18:22<13:46:56, 1102.60s/it]

1977


  4%|▍         | 2/46 [35:45<13:02:41, 1067.30s/it]

1978


  7%|▋         | 3/46 [53:50<12:50:39, 1075.34s/it]

1979


  9%|▊         | 4/46 [1:07:10<11:16:46, 966.83s/it]

1980


 11%|█         | 5/46 [1:24:16<11:15:08, 988.01s/it]

1981


 13%|█▎        | 6/46 [1:42:46<11:26:18, 1029.46s/it]

1982


 15%|█▌        | 7/46 [1:58:59<10:57:16, 1011.19s/it]

1983


 17%|█▋        | 8/46 [2:15:01<10:30:28, 995.49s/it] 

1984


 20%|█▉        | 9/46 [2:34:05<10:42:27, 1041.83s/it]

1985


 22%|██▏       | 10/46 [2:54:52<11:03:04, 1105.12s/it]

1986


 24%|██▍       | 11/46 [3:16:49<11:22:32, 1170.08s/it]

1987


 26%|██▌       | 12/46 [3:43:49<12:20:40, 1307.08s/it]

1988


 28%|██▊       | 13/46 [4:09:49<12:40:58, 1383.58s/it]

1989


 30%|███       | 14/46 [4:42:03<13:46:39, 1549.98s/it]

1990


 33%|███▎      | 15/46 [5:12:20<14:02:22, 1630.41s/it]

1991


 35%|███▍      | 16/46 [5:41:22<13:52:00, 1664.01s/it]

1992


 37%|███▋      | 17/46 [6:10:54<13:39:56, 1696.43s/it]

1993


 39%|███▉      | 18/46 [6:41:43<13:33:06, 1742.36s/it]

1994


 41%|████▏     | 19/46 [7:14:50<13:37:00, 1815.58s/it]

1995


 43%|████▎     | 20/46 [7:49:03<13:37:39, 1886.91s/it]

1996


 46%|████▌     | 21/46 [8:27:49<14:01:10, 2018.83s/it]

1997


 48%|████▊     | 22/46 [9:09:07<14:22:35, 2156.47s/it]

1998


 50%|█████     | 23/46 [10:04:17<15:59:24, 2502.82s/it]

1999


 52%|█████▏    | 24/46 [11:02:35<17:07:12, 2801.48s/it]

2000


 54%|█████▍    | 25/46 [12:02:50<17:45:54, 3045.47s/it]

2001


  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 57%|█████▋    | 26/46 [12:43:32<15:54:45, 2864.29s/it]

2002


  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 59%|█████▊    | 27/46 [13:24:50<14:30:23, 2748.60s/it]

2003


  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 61%|██████    | 28/46 [14:07:12<13:25:59, 2686.66s/it]

2004


  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 63%|██████▎   | 29/46 [14:46:36<12:13:43, 2589.64s/it]

2005


  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 65%|██████▌   | 30/46 [15:36:17<12:01:53, 2707.11s/it]

2006


  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 67%|██████▋   | 31/46 [16:30:57<11:59:43, 2878.90s/it]

2007


  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 70%|██████▉   | 32/46 [17:22:20<11:26:03, 2940.24s/it]

2008


  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 72%|███████▏  | 33/46 [18:05:07<10:12:46, 2828.17s/it]

2009


  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 74%|███████▍  | 34/46 [18:55:58<9:39:02, 2895.17s/it] 

2010


  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 76%|███████▌  | 35/46 [20:07:12<10:06:36, 3308.79s/it]

2011


  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 78%|███████▊  | 36/46 [21:13:19<9:44:22, 3506.24s/it] 

2012


  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 80%|████████  | 37/46 [22:31:39<9:39:40, 3864.48s/it]

2013


  df1=pd.read_csv(path(year, 'brief'),sep='\t', usecols=['patent_id', textcol]) #,chunksize=1000)
  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 83%|████████▎ | 38/46 [23:56:20<9:23:54, 4229.28s/it]

2014


  df1=pd.read_csv(path(year, 'brief'),sep='\t', usecols=['patent_id', textcol]) #,chunksize=1000)
  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 85%|████████▍ | 39/46 [25:29:09<9:00:18, 4631.16s/it]

2015


  df1=pd.read_csv(path(year, 'brief'),sep='\t', usecols=['patent_id', textcol]) #,chunksize=1000)
  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 87%|████████▋ | 40/46 [26:55:50<8:00:12, 4802.11s/it]

2016


  df1=pd.read_csv(path(year, 'brief'),sep='\t', usecols=['patent_id', textcol]) #,chunksize=1000)
  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 89%|████████▉ | 41/46 [28:23:36<6:51:46, 4941.23s/it]

2017


  df1=pd.read_csv(path(year, 'brief'),sep='\t', usecols=['patent_id', textcol]) #,chunksize=1000)
  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 91%|█████████▏| 42/46 [29:57:52<5:43:42, 5155.67s/it]

2018


  df1=pd.read_csv(path(year, 'brief'),sep='\t', usecols=['patent_id', textcol]) #,chunksize=1000)
  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 93%|█████████▎| 43/46 [31:31:58<4:25:08, 5302.83s/it]

2019


  df1=pd.read_csv(path(year, 'brief'),sep='\t', usecols=['patent_id', textcol]) #,chunksize=1000)
  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 96%|█████████▌| 44/46 [33:19:54<3:08:29, 5654.90s/it]

2020


  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
 98%|█████████▊| 45/46 [35:19:08<1:41:44, 6104.68s/it]

2021


  df3=pd.read_csv(path(year, 'claim'),sep='\t',usecols=['patent_id','text']) #chunksize=5000,
100%|██████████| 46/46 [37:20:36<00:00, 2922.53s/it]  


# word2vec model

In [6]:
import pickle
f=open("data/dictionary_full", "rb")
dictionary=pickle.load(f)
f.close()

In [1]:

class SentenceIterator: 
    def __init__(self, filepath): 
        self.filepath = filepath 

    def __iter__(self): 
        for line in open(self.filepath): 
            #print(line.split()[1:])
            yield line.split() [1:] #dictionary.doc2bow(line.split()[1:])   

sentences = SentenceIterator("data/fulltext_preprocessed.tsv") 





In [3]:
from gensim.models import Word2Vec
word2vec_model=Word2Vec(sentences, workers=20)

In [None]:
word2vec_model.save('data/word2vec_full.model')