In [None]:
from bs4 import BeautifulSoup as bs
import zipfile
import requests
import pandas as pd
import re
import pathlib
from os.path import expanduser as ospath

# Patterns used by open_file; compiled once because they are applied to every patent.
_XML_DECLARATION_RE = re.compile(r'<\?xml version="1\.0" encoding="UTF-8"\?>')
_PUBLN_NR_RE = re.compile(r'file="(US\w\w\d{6})-\d{8}\.XML"')
_KIND_RE = re.compile(r'<kind>([A-Z]\d)</kind>')
_TITLE_RE = re.compile(r'<invention-title id="\w{5,6}">(.*?)</invention-title>')
_CPC_RE = re.compile(r'<classification-cpc-text>(.*)</classification-cpc-text>')
# Greedy on purpose: one match running from the first <claim-text> to the last </claim-text>.
_CLAIMS_SPAN_RE = re.compile(r'<claim-text>[\s\S]*</claim-text>')
_ABSTRACT_RE = re.compile(
    r'<abstract id="abstract">\n<p id="p-0001" num="0000">(.*?)</p>\n</abstract>'
)
_CLAIM_OPEN_RE = re.compile(r'<claim id="CLM-\d*" num="(\d+)">')


def _first_or_na(matches):
    """Return the first regex match, or the 'NA' placeholder when nothing matched."""
    return matches[0] if matches else 'NA'


def _clean_claims(raw_claims):
    """Strip XML tags from a claims span and turn its line breaks into commas."""
    # DOTALL so a tag broken across lines is still removed.
    text = re.sub(r'<.*?>', '', raw_claims, flags=re.DOTALL)
    # The order of the replacements matters: newlines become commas first,
    # then the resulting comma runs are collapsed.
    text = text.replace('\n', ',')
    text = text.replace(',,,', ',')
    text = text.replace('.,,', '.,')
    text = text.replace(',,', ',')
    text = text.replace(';,', '; ')
    return text


def open_file(file):
    """Parse a multi-document patent XML file into parallel per-patent lists.

    ``file + '.xml'`` is read as UTF-8 and split on the XML declaration, so
    each fragment is one patent document. A fragment without a publication
    number is skipped; every other fragment contributes exactly one entry to
    each returned list, so index ``i`` always refers to the same patent.

    Parameters
    ----------
    file : str
        Path of the XML file without the ``.xml`` extension.

    Returns
    -------
    tuple of seven lists, in this order:
        publn_nr_list       -- publication number (str)
        title_list          -- invention title (str) or 'NA'
        type_list           -- list of every ``<kind>`` code found in the document
        num_claims_list     -- number of claims (int); 1 when no claims were found
        claim_list          -- cleaned claim text wrapped as "['...']", or 'NA'
        abstract_list       -- abstract text (str) or 'NA'
        classification_list -- list of CPC classification strings, or 'NA'
    """
    with open(file + '.xml', mode='r', encoding='utf-8') as file_xml:
        file_text_raw = file_xml.read()

    # Splitting on the declaration leaves empty strings (e.g. before the first one).
    file_text = [doc for doc in _XML_DECLARATION_RE.split(file_text_raw) if doc]
    print("Number of patents :", len(file_text))

    publn_nr_list, title_list, type_list, num_claims_list = [], [], [], []
    claim_list, abstract_list, classification_list = [], [], []

    for text in file_text:
        publn_nr = _PUBLN_NR_RE.findall(text)
        if not publn_nr:
            continue

        claims_span = _CLAIMS_SPAN_RE.search(text)
        if claims_span is None:
            num_claims = 1
            claims = 'NA'
        else:
            raw_claims = claims_span.group(0)
            # The span starts inside the first claim, so its own <claim ...> opening
            # tag is not part of it: count the remaining openings and add one.
            num_claims = len(_CLAIM_OPEN_RE.findall(raw_claims)) + 1
            # The "['...']" wrapper is kept because dataset() trims two
            # characters from each end of the claims text.
            claims = "['" + _clean_claims(raw_claims) + "']"

        classif = _CPC_RE.findall(text) or 'NA'

        # One append per list per patent keeps all seven lists aligned.
        publn_nr_list.append(publn_nr[0])
        type_list.append(_KIND_RE.findall(text))
        num_claims_list.append(num_claims)
        claim_list.append(claims)
        abstract_list.append(_first_or_na(_ABSTRACT_RE.findall(text)))
        classification_list.append(classif)
        title_list.append(_first_or_na(_TITLE_RE.findall(text)))

    #reference - adapted from - https://github.com/imoisharma/U.S.-Patents-Claims/blob/master/U.S.%20Patents.ipynb

    return publn_nr_list, title_list, type_list, num_claims_list, claim_list, abstract_list, classification_list

In [None]:
def dataset(publn_nr_list, title_list, type_list, num_claims_list, claim_list, abstract_list, classification_list):
    """Combine the parallel per-patent lists into a single DataFrame.

    One row is produced per entry of ``publn_nr_list``; the other lists are
    read at the same index, so a shorter list raises ``IndexError`` and extra
    trailing entries are ignored.

    Parameters
    ----------
    publn_nr_list, title_list, type_list, num_claims_list, claim_list,
    abstract_list, classification_list : list
        Per-patent values, index-aligned with each other.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: publn_nr, patent_title, classification, type,
        number_of_claims, claims_text, abstract. Empty input gives an empty
        frame that still has these columns.
    """
    columns = [
        'publn_nr',
        'patent_title',
        'classification',
        'type',
        'number_of_claims',
        'claims_text',
        'abstract',
    ]

    records = [
        {
            'publn_nr': publn_nr_list[i],
            'patent_title': title_list[i],
            # NOTE(review): when this value is a list (as produced by open_file) the
            # slice drops its first two and last entries rather than trimming
            # characters -- confirm this is intended.
            'classification': classification_list[i][2:-1],
            'type': type_list[i],
            'number_of_claims': num_claims_list[i],
            # Drops the two leading and two trailing characters of the claims string.
            'claims_text': claim_list[i][2:-2],
            'abstract': abstract_list[i],
        }
        for i in range(len(publn_nr_list))
    ]

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every row.
    return pd.DataFrame(records, columns=columns)

In [None]:
# Year whose archive listing is processed (years go from 2001 to 2020).
year = '2005'

# Listing page for that year; archive names are appended to this address.
URL = "https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/{}/".format(year)

In [None]:


def get_file(URL):
    """Download, parse and export every zip archive linked from a listing page.

    For each link at ``URL`` whose href ends in ``.zip`` the function:
      1. downloads the archive into the current working directory,
      2. extracts it and passes the XML to ``open_file`` and ``dataset``,
      3. writes the resulting table to an Excel workbook named
         ``db_<archive name>.xlsx`` in the hard-coded USPTO output folder,
      4. deletes the downloaded archive and the extracted XML, even when the
         archive was unreadable or a later step failed.

    Parameters
    ----------
    URL : str
        Address of the listing page; each archive href is appended to it.

    Returns
    -------
    list of str
        One entry per archive that could not be opened as a zip file: the
        href without its first three characters and without ``.zip``.
    """
    zip_file_not_correctly_downl = []

    listing = bs(requests.get(URL).text, 'html.parser')
    # Escaped and anchored so only hrefs that really end in ".zip" are selected.
    for link in listing.find_all("a", attrs={'href': re.compile(r"\.zip$")}):
        file_link = link.get('href')
        print(file_link)

        # Unchanged filter: hrefs with fewer than three characters before ".zip" are skipped.
        if not file_link[2:-4]:
            continue

        archive_stem = file_link[:-4]
        xml_stem = archive_stem
        # Some archives carry a revision suffix (example 'ipg200317_r1.zip').
        # NOTE(review): per the original note the XML inside is named without
        # that suffix -- confirm against a real revised archive.
        if xml_stem[-3] == '_':
            xml_stem = xml_stem[:-3]

        try:
            # Save under the href so the ZipFile call below opens the file just written.
            with open(file_link, 'wb') as zip_out:
                response = requests.get(URL + file_link)
                zip_out.write(response.content)

            try:
                with zipfile.ZipFile(file_link, 'r') as archive:
                    archive.printdir()  # prints the listing itself and returns None

                    print('Extracting files...')
                    archive.extractall()
                    print('Complete!')
            except zipfile.BadZipFile:
                print('Error: Zip file cannot be open')
                zip_file_not_correctly_downl.append(file_link[3:-4])
                continue

            parsed_lists = open_file(xml_stem)
            data_frame = dataset(*parsed_lists)

            # Named after the archive so a revised archive does not overwrite
            # the workbook of the one it revises.
            data_frame.to_excel(ospath('~/code_final_project/C_Feature_extraction/Data_for_w2v/USPTO/db_' + archive_stem + '.xlsx'), index = None, header = True)
        finally:
            # Runs on success, on a bad archive (before the continue) and on errors.
            for leftover in (file_link, xml_stem + '.xml'):
                leftover_path = pathlib.Path(leftover)
                if leftover_path.exists():
                    leftover_path.unlink()

    return zip_file_not_correctly_downl
        
    
            
            
# Run the pipeline for the listing page at URL: this downloads archives over
# the network and writes Excel files to disk.
get_file(URL)