In [1]:
# Script to calculate generality and application data
# Generality: how diverse the impact of a patent is across technology classes
# It is computed as 1 minus the Herfindahl index of the classes of the citing patents

In [2]:
# Feb 9th, 2020
# It's working and I'm keeping as v3
# Now I begin in v4 
# This can be used both for generality and originality
# also, I can use for multiple classes

# Feb 7th, 2020
# transforming this in a function

# Feb 7th, 2020
# While the previous version is already up, I want to improve the script
# I want to make calculations with only one script
# Also, I want to compare different class systems
# this version took 10 minutes - i am moving the old version away

# Jan 16th, 2020
# Due to performance problems on the HPC, this script was split in two; the script that follows this one is generality_2
# generality > 1 is not an error, but a consequence of adopting WIPO
# it seems that the original calculation had only one class per patent
# WIPO provides multiple classes - so when you divide by the total number of citations,
#  you no longer get the proportion of classes cited
# to correct this issue, I can calculate generality and originality based only on the first WIPO class

# Jan 13th, 2020
# The script runs, but there are two major issues:
# - generality > 1 should not exist, so there is an error in the calculation
# - too many NaNs (about 400k); I will tackle this issue in 'too_many_nans.ipynb'

# to tackle the first problem, I'll begin by creating a subset of the database
# to do that, I'll use USPTO classification system

In [3]:
# Trying to save memory is leading to a small nightmare 
# I am postponing the use of dask modules

In [4]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

%matplotlib inline

In [5]:
def preprocessing(class_df, citation_df, generality=True):
    """Attach patent classifications to citation pairs.

    Neither input frame is modified, so the same ``class_df`` can be passed
    again (e.g. once for generality and once for originality).

    Parameters
    ----------
    class_df : pandas.DataFrame
        Classification table with an ``id`` column holding the patent id.
        Every other column is carried into the result.
    citation_df : pandas.DataFrame
        Citation pairs with ``patent_id`` and ``citation_id`` columns.
    generality : bool, default True
        If True the classes are matched on ``patent_id`` (``citation_id``
        stays as a column); if False they are matched on ``citation_id``
        (``patent_id`` stays as a column).

    Returns
    -------
    pandas.DataFrame
        Inner join at citation level: one row per citation pair and matching
        classification row, indexed by the id used for the match.
    """
    join_key = 'patent_id' if generality else 'citation_id'

    # read_csv does not always keep the ids as objects, so both sides are cast
    # to str here to guarantee the join keys have the same format.
    # assign()/set_index() return new frames, leaving the caller's data intact.
    classes = class_df.assign(id=class_df['id'].astype(str)).set_index('id')
    citations = citation_df.astype(str).set_index(join_key)

    # joining on the index is faster than merging on columns
    return citations.join(classes, how='inner')  # citations with classifications

In [6]:
def index_processing(df, generality=True):
    """Compute generality or originality as 1 - Herfindahl index.

    For every focal patent the share of its citation rows falling in each
    class is squared and summed (the Herfindahl concentration index); the
    returned measure is one minus that sum, as defined in Hall et al. (2001).

    Parameters
    ----------
    df : pandas.DataFrame
        Citation-level frame as returned by ``preprocessing``. It must contain
        the grouping column (``citation_id`` for generality, ``patent_id`` for
        originality) and at least three columns; the third column
        (``df.columns[2]``) is used as the class of each citation row.
    generality : bool, default True
        True groups by ``citation_id`` and names the result ``generality``;
        False groups by ``patent_id`` and names it ``originality``.

    Returns
    -------
    pandas.Series
        The measure indexed by the grouping id. Ids whose rows all have a
        missing class get NaN.
    """
    if generality:
        key, col_name = 'citation_id', 'generality'
    else:
        key, col_name = 'patent_id', 'originality'

    # considering the second level of the class system
    subclass = df.columns[2]

    # Total citation rows per focal patent. size() counts rows, so the result
    # does not depend on missing values in some unrelated column.
    total_citation = df.groupby(key).size()

    # Citation rows per (focal patent, class); rows with a missing class are
    # left out by groupby.
    class_counts = df.groupby([key, subclass]).size()

    # Sum of squared class counts per focal patent. Working on the counts
    # Series keeps the (string) class labels out of the sum, which matters on
    # pandas >= 2.0 where groupby().sum() no longer drops non-numeric columns.
    sum_squares = np.square(class_counts).groupby(level=0).sum()

    # Herfindahl index: a measure of concentration
    herfindahl = sum_squares.reindex(total_citation.index) / np.square(total_citation)

    return (1 - herfindahl).rename(col_name)  # as defined in Hall et al, 2001

In [7]:
# Load the citation pairs; only the two id columns are needed.
citation = 'data/cleanuspatentcitation.csv'
citation_df = pd.read_csv(
    citation,
    usecols=['patent_id', 'citation_id'],
    sep=',',
)

In [8]:
# Classification systems to compare; each one is read from data/<name>.csv
class_systems = [
    'wipo',
    'ipcr',
    'cpc',
    'nber',
]

In [9]:
def plot_hist(df, class_system, generality=True):
    """Plot and save the histogram of a generality/originality measure.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Either a frame holding a ``generality`` / ``originality`` column, or
        the Series itself (which is what ``index_processing`` returns).
    class_system : str
        Name of the classification system; shown upper-cased in the title and
        used in the output file name.
    generality : bool, default True
        True plots ``generality`` and saves ``./img/gen_histogram_<SYSTEM>.png``;
        False plots ``originality`` and saves ``./img/orig_histogram_<SYSTEM>.png``.
    """
    import os  # local import: only needed to make sure the output folder exists

    if generality:
        measure, prefix = 'generality', 'gen'
    else:
        measure, prefix = 'originality', 'orig'

    # Accept the Series returned by index_processing directly, as well as a
    # DataFrame that contains the measure as a column.
    values = df if isinstance(df, pd.Series) else df[measure]

    label = class_system.upper()
    filename = './img/' + prefix + '_histogram_' + label + '.png'

    fig, ax = plt.subplots(figsize=(10, 8))
    # draw on the axes created above instead of relying on the current axes
    values.hist(ax=ax)
    ax.set_title(measure.capitalize() + ' Distribution\n' + label + '\n')

    # savefig fails if the target folder is missing, so create it on demand
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    fig.savefig(filename)

In [10]:
# Citation-level frames that receive one measure column per class system.
gener_df=citation_df.set_index('citation_id')
orig_df=citation_df.set_index('patent_id')

# preprocessing() casts every id to str, so the computed measures come back
# indexed by str ids. If read_csv parsed the ids as integers, the
# index-on-index merges would compare int ids with str ids and never match,
# so the indexes are cast to str here as well.
gener_df.index=gener_df.index.astype(str)
orig_df.index=orig_df.index.astype(str)

In [11]:
# #generality

# for class_system in class_systems:
#     classification = 'data/'+class_system+'.csv'
# #     class_df=pd.read_csv(classification, dtype=object, usecols=[0,2])
#     class_df=pd.read_csv(classification, dtype=object)
#     df=preprocessing(class_df, citation_df)
#     df=index_processing(df)
# #     print(df.to_frame().head())
#     gener_df=gener_df.merge(df.to_frame(), how='outer', left_index=True, right_index=True)
# #     dst= 'data/'+class_system+'_generality_v4.csv'
# #     df.to_csv(dst)
# #     plot_hist(df, class_system)

In [12]:
# gener_df=gener_df.iloc[:,1:]
# gener_df.columns = class_systems
# gener_df.to_csv('data/generality_classes.csv')
# gener_df.head()

In [13]:
#originality

for class_system in class_systems:
    classification = 'data/'+class_system+'.csv'
#     class_df=pd.read_csv(classification, dtype=object, usecols=[0,2])
    class_df=pd.read_csv(classification, dtype=object)
    df=preprocessing(class_df, citation_df, generality=False)
    df=index_processing(df, generality=False)
#     print(df.to_frame().head())
    # index_processing always returns a Series called 'originality'. Merging
    # several columns with that same name makes pandas add _x/_y suffixes that
    # collide after a few class systems, so each column is named after its
    # class system instead. Dropping a stale column of the same name first
    # keeps this cell safe to re-run.
    orig_df=orig_df.drop(columns=class_system, errors='ignore')
    orig_df=orig_df.merge(df.rename(class_system).to_frame(), how='outer', left_index=True, right_index=True)

#     orig_df=orig_df.merge(df.to_frame(), how='outer', left_index=True, right_index=True)
#     dst= 'data/'+class_system+'_originality_v4.csv'
#     df.to_csv(dst)
#     plot_hist(df, class_system, generality=False)

# orig_df=orig_df.iloc[:,1:]
# orig_df.columns = class_systems
# orig_df.to_csv('data/originality_classes.csv')
# orig_df.head()

           originality
patent_id             
3931349            0.0
3935991            0.0
3943789            0.0
3944004            0.0
3945191            0.0
           originality
patent_id             
3931349            0.0
3935991            0.0
3943789            0.0
3944004            0.0
3945191            0.0
           originality
patent_id             
3931349            0.0
3935991            0.0
3943789            0.0
3944004            0.0
3945191            0.0
           originality
patent_id             
3931349            0.0
3935991            0.0
3943789            0.0
3944004            0.0
3945191            0.0
