In [1]:
# Script to calculate generality and application data
# Generality: how diverse the impact of a patent is
# This is done by calculating the Herfindahl index of the citing patents

# Nov 12th, 2020
# The script has been rewritten completely to introduce parallelism 
# 

# Feb 7th, 2020
# While the previous version is already up, I want to improve the script
# I want to make calculations with only one script
# Also, I want to compare different class systems
# this version took 10 minutes - i am moving the old version away

# Jan 16th, 2020
# Due to performance problems in the HPC, this script was divided in two, the script following this is generality_2
# generality > 1 is not an error, but a consequence of adopting WIPO
# it seems that the original calculation had only one class per patent
# WIPO provides multiple classes - so when you divide by the total number of citations, 
#  you do not have the proportion of classes cited anymore
# to correct this issue, I can calculate Generality and Originality based only on the first WIPO class

# Jan 13th, 2020
# Script is running but there are two major issues
# - generality > 1 should not exist, so there is an error in the calculation
# - too many NaNs (about 400k), but I will tackle this issue in 'too_many_nans.ipynb'

# to tackle the first problem, I'll begin by creating a subset of the database
# to do that, I'll use USPTO classification system

In [2]:
import pandas as pd
import numpy as np
import re

import dask.dataframe as dd
from dask.delayed import delayed
from fastparquet import ParquetFile

import glob
import graphviz

In [3]:
# Create a dask.distributed client with default settings; the delayed graph
# built below is executed through it.
from dask.distributed import Client
client = Client()
# Bare expression so the notebook renders the client summary
# (scheduler address, dashboard link, workers/cores/memory).
client

0,1
Client  Scheduler: tcp://127.0.0.1:45561  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 12  Memory: 16.71 GB


In [4]:
# Input and output locations used by the rest of the notebook.
classification = 'data/wipo.parquet.gz'
dst = 'data/generality.parquet.gz'

# Collect every path found under data/citation/.
file_list = glob.glob("data/citation/*")
# file_list=file_list[1:5]

# The classification table is read eagerly, keeping only the sector column.
class_df = pd.read_parquet(classification, columns=['wipo_sector_id'])

# One lazy read per citation file (only the patent_id column);
# nothing is loaded from disk until the graph is computed.
dfs = [
    delayed(pd.read_parquet)(path, columns=['patent_id'])
    for path in file_list
]

In [5]:
def join_class(df, class_df):
    """Attach the classification columns to ``df`` by matching index values.

    This is an inner join on the two indexes: a row survives only when its
    index value is present in both ``df`` and ``class_df``.

    The merge is invoked as a method of ``df`` because the notebook also passes
    dask Delayed objects here, in which case the call is merely recorded.
    """
    merged = df.merge(
        class_df,
        left_index=True,
        right_index=True,
        how='inner',
    )
    return merged

In [6]:
def wipo_categories():
    """Build the aggregation spec used when grouping the class columns.

    Reads 'data/classes.csv.gz', keeps the rows whose ``system`` equals
    'wipo_field_id' and takes the distinct ``sector_title`` values in order of
    first appearance.

    Returns:
        dict mapping each title to its aggregation: the first title gets
        ['count', 'sum'] (the count doubles as the total number of rows per
        group), every other title gets 'sum'.
    """
    class_table = pd.read_csv('data/classes.csv.gz', compression='gzip')
    is_wipo = class_table['system'] == 'wipo_field_id'
    titles = class_table[is_wipo].sector_title.unique().tolist()

    # Start with a plain sum for every class ...
    aggregation = {title: 'sum' for title in titles}
    # ... then let the first class additionally carry the row count.
    if titles:
        aggregation[titles[0]] = ['count', 'sum']
    return aggregation

In [7]:
def prepare_df(df):
    """Compute the per-index terms needed for originality/generality.

    Rows are grouped by index value and aggregated with the spec returned by
    wipo_categories(): a row count plus one sum per class column.

    Returns:
        DataFrame with one row per distinct index value and flat column names
        of the form '<class>_<aggregation>'.
    """
    spec = wipo_categories()
    grouped = df.groupby(df.index).agg(spec)

    # The aggregation yields two-level column labels; join each pair with '_'
    # (https://www.roelpeters.be/how-to-flatten-a-multiindex-pandas-dataframe/)
    flat_names = []
    for label in grouped.columns.values:
        flat_names.append('_'.join(label).strip())
    grouped.columns = flat_names
    return grouped

In [8]:
def gen_orig(df):
    """Return 1 - Herfindahl index for every row of a prepare_df() table.

    The first column is taken as the total and all remaining columns as the
    per-class sums. The Herfindahl index, a measure of concentration, is the
    sum of the squared per-class values divided by the squared total.

    Kept separate from prepare_df() to make testing easier.
    """
    total = df.iloc[:, 0]
    per_class = df.iloc[:, 1:]
    herfindahl = np.square(per_class).sum(axis='columns') / np.square(total)
    return 1 - herfindahl


In [9]:
# the script is designed to be used for both generality (this case) and originality (later)
# the only difference is the index, which is changed right at the beginning of the script
# patent_id is the citing patent
# citation_id (the index) is the cited patent

In [10]:
# The script starts here: attach the classes to every citation chunk and
# combine all chunks into one lazy object.
#
# Every element of `dfs` is a dask Delayed, so join_class() only records the
# merge; nothing is read or merged until .compute() is called.
joined = [join_class(dfx, class_df) for dfx in dfs]
if not joined:
    # Without this guard `df` would stay undefined (or keep a stale value from
    # an earlier run) and the failure would only surface in a later cell.
    raise FileNotFoundError("file_list is empty: no citation files were found")

# A single concat over the whole list, instead of wrapping the running result
# in a new delayed concat for every file (a chain of N-1 dependent tasks).
# With exactly one file the merged chunk is used as is, as before.
# NOTE(review): dd.concat still runs inside a delayed task, as it did before —
# confirm whether pd.concat or dd.from_delayed was actually intended.
df = joined[0] if len(joined) == 1 else delayed(dd.concat)(joined)

In [11]:
# df = dd.from_delayed([load_chunk(f) for f in file_list]).repartition(partition_size='500MB')
# df = dd.read_parquet('data/citation/*')

In [12]:
# df = dd.from_delayed([join_class(f, class_df) for f in dfs]).repartition(partition_size='500MB')
# df = delayed(join_class)(dfs, class_df)

In [13]:
# Turn the wipo_sector_id strings into indicator columns via str.get_dummies().
# NOTE(review): `df` is a dask Delayed here, so .repartition(...) is only recorded
# as a deferred method call on whatever get_dummies() returns. If that is a plain
# pandas DataFrame the call fails at compute time (pandas has no .repartition) —
# confirm what type actually reaches this point.
df = delayed(df.wipo_sector_id.str.get_dummies)().repartition(partition_size='500MB')

In [14]:
# Lazily aggregate the indicator columns per index value (see prepare_df);
# this only extends the task graph, nothing is executed yet.
df = delayed(prepare_df)(df)

In [15]:
# Execute the graph built so far and display the result; the result is not stored.
# NOTE(review): the graph is executed again by the later .compute() that writes
# the output file, so this cell repeats the full computation.
df.compute()



KilledWorker: ('read_parquet-2e8ca03d-f377-4a8d-b079-181ae69d4580', <Worker 'tcp://127.0.0.1:39763', name: 3, memory: 0, processing: 3>)

In [None]:
# (Removed a stray bare expression `a`: the name is not defined in any visible
# cell, so this cell raised NameError and stopped "Run All" before the
# remaining cells could execute.)

In [None]:
# Lazily convert the aggregated table into the 1 - Herfindahl score (see gen_orig).
df = delayed(gen_orig)(df)

In [None]:
# Draw the task graph that will be executed for `df`.
# NOTE(review): rendering presumably relies on the graphviz package imported at the top — confirm.
df.visualize()

In [None]:
%%time
df.compute().to_parquet(dst, compression='gzip')

In [None]:
def test_prepare_df():
    """Smoke-test prepare_df() on a synthetic table and return its output.

    Builds a 100-row frame of random integers in [0, 100) with one column per
    class name used by wipo_categories(), runs prepare_df() on it and checks
    the shape and column layout of the result.

    Returns:
        The aggregated DataFrame produced by prepare_df(), so it can be handed
        to gen_orig() for further manual checks.

    Raises:
        AssertionError: if the aggregated table does not have the expected
            shape or does not start with the count column.
    """
    # Same class list, in the same order, that prepare_df() aggregates over.
    classes = list(wipo_categories())
    # Fixed seed so the synthetic input is identical on every run.
    rng = np.random.RandomState(0)
    df = pd.DataFrame(rng.randint(0, 100, size=(100, len(classes))), columns=classes)

    result = prepare_df(df)

    # The default index is unique, so every row forms its own group. The first
    # class yields a count and a sum column, every other class one sum column.
    assert result.shape == (len(df), len(classes) + 1), result.shape
    assert result.columns[0].endswith('_count'), result.columns[0]
    return result

