In [1]:
import os
import pandas as pd
import re
from os.path import expanduser as ospath

## Load the PATSTAT dataset to retrieve the list of patents to parse from the bulk data

In [2]:
# Read the PATSTAT spreadsheet (its first column is the index) and order the
# rows by publication number.
db_EP = pd.read_excel(
    ospath('~/final_project/A_Data_Collection/EPO/PATSTAT_EPO_patents/ALL_EPO_PAT_NUM.xlsx'),
    index_col=0,
).sort_values(by='publn_nr')
# Publication numbers become text, left-padded with zeros to 7 characters, so
# they can be compared as strings with the numbers sliced from the bulk files.
db_EP['publn_nr'] = db_EP['publn_nr'].astype(str).str.zfill(7)


In [3]:
def get_patents(dataset):

    """Parse every bulk full-text file and export the selected patents.

    For each file of the bulk-data directory, the publication numbers of
    ``dataset`` that fall inside the number range encoded in the file name
    are searched in the file; their titles, abstracts and claims are
    extracted and the resulting table is written to an Excel file named
    after the bulk file.

    Parameters
    ----------
    dataset : dataset with a 'publn_nr' column holding the publication
        numbers as strings (they are compared as text)

    Returns
    -------
    None : one Excel file per bulk file is written in ``Data_parsed``

    """

    path = ospath('~/final_project/A_Data_Collection/EPO/Sample_Bulk_full_text_EPO/')
    out_path = ospath('~/final_project/A_Data_Collection/EPO/Data_parsed/')

    # The wanted publication numbers do not depend on the file being read,
    # so they are sorted once instead of once per file.
    publication_nr = sorted(dataset['publn_nr'])

    # opens all the files in the directory
    for file_name in os.listdir(path):
        with open(os.path.join(path, file_name), encoding="utf8") as f:
            patent = sorted(f.readlines())

        # The file name gives the range of numbers stored in the file:
        # characters [2:-4] are the lower bound and the first two digits
        # followed by '99999' the upper bound.
        # NOTE(review): presumably names look like 'EP1100000.txt' - confirm.
        first_nr = file_name[2:-4]
        last_nr = file_name[2:4] + '99999'
        pub_nr = [nr for nr in publication_nr if first_nr <= nr <= last_nr]

        list_claim = select_pat(pub_nr, patent)
        titles, claims, abstrs = split_types(list_claim)
        db = create_db(titles, claims, abstrs)
        db.to_excel(os.path.join(out_path, file_name[:-4] + '.xlsx'), header=True)


In [4]:
def select_pat(pub_nr,data):

    """Select the English records of the wanted patents from the text file
    and group them by patent.

    Every line is read at fixed character positions: [3:10] publication
    number, [11:13] kind, [25:27] language, [28:33] text type and [34:] the
    text itself. Only lines whose language is 'en' and whose publication
    number is in ``pub_nr`` are kept.

    Parameters
    ----------
    pub_nr : list of publication numbers (strings) to select
    data : list of lines of the text file; the lines of one patent are
        expected to be consecutive (e.g. sorted lines)

    Returns
    -------
    pat_found : list with one entry per patent, each entry being the list
        of [pat_num, kind, language, type, text] records of that patent in
        the order they appear in ``data``. Empty list when nothing matches.

    """

    # A set gives constant-time lookups and makes repeated numbers in pub_nr
    # harmless (each line is selected at most once).
    wanted = set(pub_nr)

    list_patent_selected = []
    for patent in data:
        if patent[3:10] in wanted and patent[25:27] == 'en':
            # strip the text from tab and new line symbols
            patent = re.sub(r'\s+', ' ', patent)
            # pat_num, kind, language, type and text
            list_patent_selected.append([patent[3:10], patent[11:13], patent[25:27],
                                         patent[28:33], patent[34:]])

    # Group consecutive records sharing the same publication number. Every
    # group, including the last one, ends up in the result and no record is
    # duplicated.
    pat_found = []
    for record in list_patent_selected:
        if pat_found and pat_found[-1][0][0] == record[0]:
            pat_found[-1].append(record)
        else:
            pat_found.append([record])

    return pat_found

In [5]:
def split_types(pat_found):

    """Split the records of every patent and put together titles, claims
    and abstracts in their respective lists.

    For each patent the first title and the first abstract are kept. For
    the claims, the record of a kind 'B' publication is preferred; when the
    patent has no kind 'B' claims the last claim record found is used.
    Missing elements are replaced by 'NA'.

    Parameters
    ----------
    pat_found : list of patents found; each patent is a list of
        [pat_num, kind, language, type, text] records

    Returns
    -------
    titles : list of title records (whole record, or 'NA')
    claims : list of claims split, as returned by split_claims
    abstrs : list of cleaned abstract texts (or 'NA')

    """

    titles, claims, abstrs = [], [], []
    for records in pat_found:
        tit = 'NA'
        cl = 'NA'
        ab = 'NA'
        cl_from_b = False  # True once a kind 'B' claim record has been kept
        for record in records:
            text_type = record[3]

            if text_type == 'TITLE':
                if tit == 'NA':
                    tit = record

            elif text_type == 'ABSTR':
                if ab == 'NA':
                    # subscripts and superscripts are dropped with their content
                    ab = re.sub(r'<sub>(.*?)</sub>', "", str(record[-1]))
                    ab = re.sub(r'<sup>(.*?)</sup>', "", ab)
                    ab = re.sub(r'(<.*?>)|(-->)', " ", ab)  # clean the text

            elif text_type == 'CLAIM':
                # The kind is tested explicitly, so a kind 'B' record is never
                # replaced by a non-'B' one whatever the order of the records.
                is_b = record[1].startswith('B')
                if is_b or not cl_from_b:
                    cl = record
                    cl_from_b = is_b

        titles.append(tit)
        abstrs.append(ab)
        claims.append(cl)

    # The whole claim records (or 'NA') are handed over to split_claims.
    claims = split_claims(claims)

    return titles, claims, abstrs

In [6]:
def split_claims(claims):

    """Split claims and clean them by all the unnecessary
    symbols or tags in the text.

    Each element is converted with str() before being searched, so an
    element can be a plain text or a whole record (list); the quotes and
    square brackets that the conversion of a list adds are removed during
    the cleaning. Claim number k is located through its
    <claim id="c-en-01-0..0k" ...> tag and runs up to the tag of claim k+1
    (or up to the end of the text for the last claim).

    Parameters
    ----------
    claims : list of claims not yet split

    Returns
    -------

    claims : list of claims split; one list of cleaned claim texts per
        input element (empty list when no claim tag is found)

    """
    claim_list = []
    for cl in claims:
        text = str(cl)
        # number of claim tags found = number of claims to extract
        n_claims = len(re.findall(r'<claim\s+id="c-en-01-[0-9]+"\s+num=".*?">', text))

        claim_split = []
        for num in range(1, n_claims + 1):
            # All the pattern pieces are raw strings so that \s is passed
            # untouched to the regex engine.
            start = r'<claim\s+id="c-en-01-[0]+' + str(num) + r'"\s+num=".*?"><claim-text>.*?'
            if num < n_claims:
                # stop right before the opening tag of the next claim
                stop = r'<claim\s+id="c-en-01-[0]+' + str(num + 1) + r'"\s+num=".*?">'
                claim_split.extend(re.findall('(' + start + ')' + stop, text))
            else:
                # last claim: take everything up to the end of the text
                claim_split.extend(re.findall(start + '$', text))

        claim_sep = []
        for raw_claim in claim_split:
            claim = re.sub(r'(<sub>)|(</sub>)', "", str(raw_claim))  # subscript tags
            claim = re.sub(r'(<.*?>)|(-->)|(\\)', " ", claim)  # other tags and backslashes
            # leading claim number followed by one character (e.g. "1.")
            claim = re.sub(r'^\s*?[0-9]+.\s*', "", claim)
            claim = re.sub(r'(\')|(\[)|(\])', "", claim)  # quotes and square brackets
            claim_sep.append(claim)
        claim_list.append(claim_sep)

    return claim_list

In [7]:
def create_db(titles, claims , abstrs, tech_db=None):


    """Creates a dataset with the patents found.

    One 'Abstract' row and one 'Claim k' row per claim are produced for
    every patent; the rows are repeated for every technology type attached
    to the publication number in the reference table.

    Parameters
    ----------
    titles : list of title records ([pat_num, ..., title text]) or 'NA'
    claims : list of claims split (one list of texts per patent)
    abstrs : list of abstrs
    tech_db : optional dataset with the columns 'publn_nr' and 'Type_Techn';
        when omitted the module-level db_EP is used

    Returns
    -------
    db : dataset with the columns 'Publn_nr', 'Title', 'Type', 'Text' and
        'Tech'. A patent without title record gets 'NA' as number and title;
        a number absent from the reference table gets 'NA' as technology.

    """

    if tech_db is None:
        tech_db = db_EP

    # Read the reference table once: publication number -> technology types,
    # in the order of the rows of the table.
    tech_by_pat = {}
    for nr, tech in zip(tech_db['publn_nr'], tech_db['Type_Techn']):
        tech_by_pat.setdefault(nr, []).append(tech)

    list_pat = []
    for title, patent_claims, abstract in zip(titles, claims, abstrs):
        if isinstance(title, str):
            # 'NA' placeholder: the patent has no title record, hence no
            # publication number to report or to look up
            raw_nr, pat, title_text = None, 'NA', 'NA'
        else:
            raw_nr, title_text = title[0], title[-1]
            pat = raw_nr.zfill(7)  # add leading zeros

        # 'NA' avoids an IndexError when the number is not in the table
        type_tech = tech_by_pat.get(raw_nr) or ['NA']

        for tech in type_tech:
            list_pat.append([pat, title_text, 'Abstract', abstract, tech])
            for j, claim in enumerate(patent_claims):
                list_pat.append([pat, title_text, 'Claim ' + str(j + 1), claim, tech])

    db = pd.DataFrame(list_pat, columns = ['Publn_nr', 'Title', 'Type', 'Text','Tech'], index = None)

    print(db.head())

    return db

In [8]:
# Run the whole pipeline on the publications listed in db_EP: every bulk file
# is parsed and exported to its own Excel file; the head of each parsed table
# is printed below.
get_patents(db_EP)

  Publn_nr                                              Title      Type  \
0  1100097  SOLID ELECTROLYTIC CAPACITOR AND PROCESS FOR P...  Abstract   
1  1100097  SOLID ELECTROLYTIC CAPACITOR AND PROCESS FOR P...   Claim 1   
2  1100097  SOLID ELECTROLYTIC CAPACITOR AND PROCESS FOR P...   Claim 2   
3  1100097  SOLID ELECTROLYTIC CAPACITOR AND PROCESS FOR P...   Claim 3   
4  1100097  SOLID ELECTROLYTIC CAPACITOR AND PROCESS FOR P...   Claim 4   

                                                Text Tech  
0   A solid electrolytic capacitor includes an ox...   SC  
1    A method of making a solid electrolytic capa...   SC  
2    The method of making a solid electrolytic ca...   SC  
3    The method of making a solid electrolytic ca...   SC  
4    The method of making a solid electrolytic ca...   SC  
  Publn_nr                        Title      Type  \
0  1201006  RECHARGEABLE BATTERY PACKS   Abstract   
1  1201006  RECHARGEABLE BATTERY PACKS    Claim 1   
2  1201006  RECHARGEABLE BATTE