# Patent topic modeling

Code for the analysis and visualizations in the paper "A similarity search approach to patent classification" by Reza Rezazadegan and Zahra Bagheri.

www.github.com/rezareza007
www.dreamintelligent.com



In [None]:
import pandas as pd
import gensim
import pickle
import gc
from tqdm import tqdm
import matplotlib.pyplot as plt
import logging


from gensim.models.coherencemodel import CoherenceModel
from gensim.models.callbacks import ConvergenceMetric, CoherenceMetric


# Size hint handed to file.readlines() when the corpus is read in batches.
S = 5_000_000_000


In [3]:
def text_preprocess(text):
    """Tokenize, lemmatize and stem a raw text into a list of terms.

    The text is tokenized with ``gensim.utils.simple_preprocess``. Tokens that
    are gensim stopwords, or that are three characters or shorter, are dropped.
    Every remaining token is lemmatized as a verb with NLTK's
    ``WordNetLemmatizer`` and then stemmed with NLTK's ``PorterStemmer``.

    Args:
        text: Raw document text. ``None`` and the empty string are accepted.

    Returns:
        The processed tokens, in their original order. ``[]`` when ``text``
        is empty or ``None``.
    """
    # Guard first so that empty input returns without importing NLTK/gensim.
    if not text:
        return []

    import gensim
    from nltk.stem import PorterStemmer
    from nltk.stem import WordNetLemmatizer

    # Build the stemmer and lemmatizer once instead of once per token.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stopwords = gensim.parsing.preprocessing.STOPWORDS

    return [
        stemmer.stem(lemmatizer.lemmatize(token, pos='v'))
        for token in gensim.utils.simple_preprocess(text)
        if token not in stopwords and len(token) > 3
    ]


In [5]:
# Obtaining the corpus dictionary by streaming the preprocessed TSV in batches.
dictionary = gensim.corpora.Dictionary([])

j = 0  # index of the batch about to be read; printed as a progress indicator
# NOTE(review): no explicit encoding is passed, so the platform default is used.
with open("data/fulltext_preprocessed.tsv", "r") as F:
    F.readline()  # skip the first line so it is not added as a document
    while True:
        print(j)
        # readlines(S) returns a batch of whole lines bounded by the size hint S,
        # and an empty list once the end of the file has been reached.
        Z = F.readlines(S)
        if not Z:
            break

        # The first whitespace-separated field of each line is dropped; the
        # remaining fields are the document's tokens. A generator is passed so
        # that no second, batch-sized list of token lists is held in memory.
        dictionary.add_documents(l.split()[1:] for l in Z)
        j += 1
# The with-statement has already closed F, so no explicit close() is needed.

0
1
2
3
4
5
6
7
8
9
10
11


In [None]:
# Prune the vocabulary: keep tokens found in at least 10 documents and in no
# more than 40% of all documents, capped at 200,000 tokens.
dictionary.filter_extremes(no_below=10, no_above=0.4, keep_n=200000)

# Persist the pruned dictionary. The context manager closes the file even if
# pickling raises part-way through.
with open("data/dictionary_full", "wb") as f:
    pickle.dump(dictionary, f)


In [4]:

# Reload the dictionary pickled at data/dictionary_full.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("data/dictionary_full", "rb") as f:
    dictionary = pickle.load(f)

In [5]:
# Show the dictionary's instance attributes (token2id mapping etc.).
vars(dictionary)

{'token2id': {'accomplish': 0,
  'accordingli': 1,
  'add': 2,
  'alter': 3,
  'aluminum': 4,
  'ammonium': 5,
  'amount': 6,
  'appar': 7,
  'area': 8,
  'aspect': 9,
  'attempt': 10,
  'binder': 11,
  'burn': 12,
  'call': 13,
  'chamber': 14,
  'chang': 15,
  'characterist': 16,
  'class': 17,
  'combust': 18,
  'compon': 19,
  'compos': 20,
  'composit': 21,
  'concern': 22,
  'concurr': 23,
  'condit': 24,
  'consid': 25,
  'consider': 26,
  'consist': 27,
  'constant': 28,
  'contact': 29,
  'creat': 30,
  'cupric': 31,
  'decreas': 32,
  'depend': 33,
  'design': 34,
  'desir': 35,
  'detail': 36,
  'dimens': 37,
  'discov': 38,
  'droplet': 39,
  'employ': 40,
  'especi': 41,
  'evolv': 42,
  'expon': 43,
  'factor': 44,
  'ferric': 45,
  'flow': 46,
  'fluorid': 47,
  'format': 48,
  'formul': 49,
  'gaseou': 50,
  'grain': 51,
  'greatest': 52,
  'group': 53,
  'guid': 54,
  'heretofor': 55,
  'higher': 56,
  'hydroxi': 57,
  'impart': 58,
  'import': 59,
  'influenc': 60,
  

In [None]:
# Obtaining bag of words: convert every document to its bag-of-words form, batch by batch.

S = 500000000  # readlines() size hint used for each batch
W = []  # accumulates one doc2bow result per document
with open("data/fulltext_preprocessed.tsv") as F:  # ,  encoding='utf8'
    F.readline()  # header!
    j = 0
    while True:
        print(j, end=' ')

        Z = F.readlines(S)
        if not Z:
            break

        # Drop the first field of each line; the rest are the document's tokens.
        # A generator avoids building a temporary batch-sized list before extending W.
        W.extend(dictionary.doc2bow(l.split()[1:]) for l in Z)

        # Release the raw lines before the next batch is read to limit peak memory.
        del Z
        gc.collect()
        j += 1
# The with-statement has already closed F, so no explicit close() is needed.

# W now contains the bag of words of texts

#W now contains the bag of words of texts  


In [None]:

# Persist the bag-of-words corpus. The context manager closes the file even if
# pickling raises part-way through.
with open("data/bagofwords", "wb") as f:
    pickle.dump(W, f)
        



In [None]:

# Reload the bag-of-words corpus pickled at data/bagofwords.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("data/bagofwords", "rb") as f:
    W = pickle.load(f)
        



In [6]:

class SentenceIterator:
    """Re-iterable stream of bag-of-words vectors read from a text file.

    Each line is split on whitespace, its first field is discarded, and the
    remaining tokens are converted with the module-level ``dictionary``'s
    ``doc2bow``. Every ``iter()`` call starts a fresh pass over the file, so
    the object can be consumed more than once.

    Args:
        filepath: Path of the file to read.
        skip_header: When true, the first line of the file is skipped instead
            of being yielded as a document. Defaults to ``False``.
    """

    def __init__(self, filepath, skip_header=False):
        self.filepath = filepath
        self.skip_header = skip_header

    def __iter__(self):
        # The context manager closes the handle when the pass ends or the
        # generator is closed, instead of leaving it open.
        with open(self.filepath) as handle:
            if self.skip_header:
                next(handle, None)
            for line in handle:
                yield dictionary.doc2bow(line.split()[1:])


# The TSV begins with a header row (the other readers in this notebook skip it
# too), so it must not be fed to the model as a document.
sentences = SentenceIterator("data/fulltext_preprocessed.tsv", skip_header=True)





In [7]:
# Train a 100-topic LDA model on the streamed corpus: 10 passes, 24 workers.
ldamodel = gensim.models.LdaMulticore(
    sentences,
    id2word=dictionary,
    num_topics=100,
    passes=10,
    workers=24,
)



In [None]:
# Persist the trained model through its own save() method.
# NOTE(review): the load cell in this notebook reads "data/lda_full_model_pickle",
# which is a different path from the one written here — confirm which saved
# model is intended.
ldamodel.save("data/lda_full_100_nltk_model_pickle")


In [4]:

# Load a previously saved LDA model from disk.
# NOTE(review): this path differs from the one passed to ldamodel.save(...) in
# this notebook ("data/lda_full_100_nltk_model_pickle") — confirm the intended file.
ldamodel=gensim.models.LdaModel.load("data/lda_full_model_pickle")

In [5]:
# Show the loaded model's instance attributes (hyper-parameters etc.).
vars(ldamodel)

{'workers': 24,
 'batch': False,
 'dtype': dtype('float32'),
 'num_terms': 200000,
 'distributed': False,
 'num_topics': 500,
 'chunksize': 2000,
 'decay': 0.5,
 'offset': 1.0,
 'minimum_probability': 0.01,
 'num_updates': 7236596,
 'passes': 10,
 'update_every': 1,
 'eval_every': 10,
 'minimum_phi_value': 0.01,
 'per_word_topics': False,
 'callbacks': None,
 'alpha': array([0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002,
        0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002,
        0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002,
        0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002,
        0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002,
        0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002,
        0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002,
        0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002,
        0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.002

In [None]:
# Display a sample of topics as (topic_id, '<weight>*"<term>" + ...') pairs.
ldamodel.show_topics()  # the number of terms per topic can be set, e.g. show_topics(num_words=10)

[(184,
  '0.495*"carbon" + 0.368*"atom" + 0.042*"dioxid" + 0.020*"contain" + 0.008*"produc" + 0.007*"monoxid" + 0.004*"chemic" + 0.003*"oxygen" + 0.002*"hydrocarbon" + 0.002*"bond"'),
 (142,
  '0.500*"environ" + 0.192*"run" + 0.121*"cloud" + 0.081*"environment" + 0.023*"runtim" + 0.012*"virtual" + 0.006*"surround" + 0.005*"chang" + 0.005*"system" + 0.005*"harsh"'),
 (353,
  '0.231*"back" + 0.216*"underli" + 0.100*"undersid" + 0.069*"backsid" + 0.065*"nylon" + 0.052*"carpet" + 0.029*"tuft" + 0.026*"tack" + 0.011*"overli" + 0.011*"undersurfac"'),
 (51,
  '0.258*"dope" + 0.091*"dopant" + 0.088*"highest" + 0.065*"lowest" + 0.036*"concentr" + 0.026*"carrier" + 0.026*"effici" + 0.020*"higher" + 0.018*"phosphoresc" + 0.018*"casino"'),
 (225,
  '0.844*"target" + 0.085*"lubric" + 0.032*"capsul" + 0.006*"facsimil" + 0.004*"stake" + 0.003*"radiotherapi" + 0.002*"aspect" + 0.001*"determin" + 0.001*"disclosur" + 0.001*"acquir"'),
 (171,
  '0.273*"set" + 0.214*"camera" + 0.202*"motion" + 0.108*"pict

# Obtaining patent topics

In [None]:

# Map every patent number to the topics the LDA model assigns to its text.
topic_dict = {}


def _topics_for(text):
    """Return the model's (topic_id, weight) list for one preprocessed text."""
    return ldamodel[dictionary.doc2bow(str(text).split())]


# Stream the TSV in chunks of 100,000 rows.
df_patents = pd.read_csv("data/fulltext_preprocessed.tsv", sep='\t', chunksize=100000)
i = 0
for data in df_patents:
    print(i, end='   ')  # progress: index of the chunk being processed
    data["topics"] = data["text_preprocessed"].apply(_topics_for)
    ids = list(data["patent_number"].values)
    topics = list(data["topics"].values)
    topic_dict.update(zip(ids, topics))
    i += 1

topic_dict

0   1   2   3   4   5   6   7   8   9   10   11   12   13   14   15   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32   33   34   35   36   37   38   39   40   41   42   43   44   45   46   47   48   49   50   51   52   53   54   55   56   57   58   59   60   61   62   63   64   65   66   67   68   69   70   71   72   

{4000024: [(7, 0.010013518),
  (13, 0.045969673),
  (44, 0.015058209),
  (50, 0.028039228),
  (81, 0.0148919495),
  (176, 0.025581844),
  (212, 0.059604574),
  (246, 0.04072427),
  (353, 0.15146242),
  (367, 0.018755825),
  (385, 0.042595882),
  (401, 0.021731272),
  (425, 0.09143508),
  (465, 0.010727183),
  (535, 0.01954834),
  (683, 0.08845657),
  (684, 0.011660446),
  (797, 0.014647306),
  (816, 0.010239715),
  (859, 0.015068526),
  (861, 0.031063205),
  (877, 0.028507678),
  (960, 0.013692259)],
 3944441: [(3, 0.020723514),
  (17, 0.022398923),
  (61, 0.057826456),
  (109, 0.028298024),
  (124, 0.016524026),
  (236, 0.01981063),
  (385, 0.01609272),
  (401, 0.044854034),
  (412, 0.018161885),
  (422, 0.010024312),
  (432, 0.052980762),
  (488, 0.033102553),
  (508, 0.07072867),
  (517, 0.0391065),
  (566, 0.11570502),
  (595, 0.015105162),
  (627, 0.0107053425),
  (651, 0.02897991),
  (683, 0.07876699),
  (686, 0.016909223),
  (716, 0.017326256),
  (825, 0.0342195),
  (945, 0.0113

In [None]:


# Persist the patent -> topics mapping. The context manager closes the file
# even if pickling raises part-way through.
with open('data/patent_topics_dict_full_100', 'wb') as f:
    pickle.dump(topic_dict, f)

# Topic visualization

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models

In [20]:
# Prepare the pyLDAvis data from a model, the bag-of-words corpus W and the dictionary.
# NOTE(review): `ldabow` is not defined in the cells shown here — presumably it
# should be the trained LDA model (`ldamodel`); confirm before running.
vis=pyLDAvis.gensim_models.prepare(ldabow,W,dictionary)
pyLDAvis.enable_notebook()  # turn on notebook display of pyLDAvis visualizations
pyLDAvis.display(vis)

  default_term_info = default_term_info.sort_values(
