In [1]:
#Script to evaluate the determinants of patent citation 
#Renato Kogeyama

# August 19, 2020
# Updated all files to latest data available in PatentsView (Jun/2020)
# Excluded self citations from the dataset

# July 13, 2020
# I am changing the script to test Nemet and Johnson 2012, but with centrality measures as DV

# Apr 27, 2020
# Separate the most central patents, classify them as disruptive and calculate DVs from the extant literature

# Mar 16, 2020
# Introducing centrality measures as dv

# Feb 04, 2020
# to set labels in heatmap keyword:xticklabels
# for ex.
# sns.heatmap(globalWarming_df, xticklabels = np.arange(0,15))
# to be implemented later
# another alternative is to substitute the values in the dataset and convert columns to categories
# to understand the impact, i should run some test
# however i am focusing now in calculate Corredoira's 2015 and Nemet & Johnson 2012

# Feb 03, 2020
# version backed up as _old

# Feb 02, 2020
# the best way to deal with the classification names is to use a dictionary
# this avoid charging memory with the strings
# However, WIPO is organized differently than the other systems
# I'll update the wipo code to uniformize the behavior in this script
# I am creating a code that reflects the first level of classification 

# Feb 01, 2020
# Introduction of categorical graphs: barplot and heatmap
# heatmap is not the real deal, its a simplification
# the real deal would be the correlation table - there is a suggestion based on cramer, 
    # but implementation was not ready
# graphs exported and google docs updated
# next step: update cit_tree to reflect Corredoira's 2015 Influence measure
# plot a network graph: https://plot.ly/python/network-graphs/
# reproduce 2012 Nemet and Johnson with other class systems
# correct bias in generality and originality (multiply for N/N-1)

# Jan 21, 2020
# Classifications added
# Code reorganized - much faster now
# Still missing the update of applications to the grant number
# I should provide now descriptive statistics on all variables

# Jan 21, 2020
# The current data does not have Class
# I should go back and get this info - but there are too many scripts now and
#   I should reorganize them before moving forward
# I should also include the patent publication date - to control for the policy changes
# In the citation file, I should change application number for grant when possible 
#   This will improve the reliability of all measures related to citation
# Introduce classifications

# Jan 18, 2020
# Variables calculated
# Generality, average delay, forward and backward citations, cumulative citation (cit_tree)
# Still missing originality
# the file with variables that are used in this script should get a name independent from the date


#Miami, December 24th, 2019
# Prof. Rafael Corredoira suggested:
# - Inclusion of a tree of citations
#   To track back the source of citations. This information is not given by a direct count of citations.
# - Consider policy changes in the way patents are cited
#   Policy changes in 2000 changed the time frame of citation, and 2010 partially moved citation to applications
# - Track classification changes 
#   The original classification system in USPTO changed from a technical based to a market based classification system
#   See if there is an impact
# - Consider a text analysis of the claims
#   Classification is based on the claims but it is not clear how many claims are related to each classification category
# - Include moderation effect from classification
#   Citations patterns may change across industries, so some effects may disappear if industry is not accounted for.

# In summary, his ideas help increase structure of the current work


#Syracuse, December 3rd, 2019

#The original script is getting too complex
#There were many tentative scripts to play with data
#Here I am writing a script to show the relevance of variables to patent citation

#11-12-2019
#Introducing normalization

#10-11-2019
#I introduced log backward citation, which corrects for very dispersed results
#but the major problem is that few patents receive citations
#bring back binary output

#10-10-2019
#Added graphics and new distributions

#10-03-2019
#I rewrote the citation data to clean the strings

#09-15-2019
#Naive Bayes has some problem with unbalanced distributions
#scikit-learn has a module that corrects count distributions with many zeros, ComplementNB
#however it is not available in the version currently provided on FIU's HPC

#09-10-2019
#the work calls for a more systematic and careful approach
#I am grouping the old code (commented out) and will start a new code

#09-27-2019
#I am renaming citation as forward citation and backward citation

#09-17-2018

#High memory usage - run on Amazon AWS 



In [2]:
import pandas as pd
import numpy as np
import IPython.display as display
import seaborn as sns
          
import itertools

from sklearn import preprocessing
from sklearn import linear_model, datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn import naive_bayes
from sklearn.metrics import roc_curve, auc
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LinearRegression

import scipy.stats as ss
import datetime
import matplotlib.pyplot as plt

from math import sqrt

import sys
sys.path.append('/home/rkogeyam/scripts/')
sys.path.append('scripts/')

#from determinants_scripts import classes

# from plotbar import plotbar
# from plot_heat import heatmap


from best_num_attr import best_num_attr
from xattrSelect import xattrSelect
from sampler import sampler
from normalize import normalize
from nbayes import nbayes

import gzip
import statsmodels.api as sm

import os


In [3]:
# Path of the LaTeX results file.
latex='data/results.tex'
# dataset='data/dataset.csv'
# Keep the *path* to the gzip-compressed CSV rather than an open gzip handle:
# pandas infers gzip compression from the '.gz' extension, no file handle is
# left open for the lifetime of the kernel, and the load cell can be re-run
# (an already-consumed handle would yield no data the second time).
dataset='data/dataset.csv.gz'

In [4]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Activate seaborn's default styling, then switch the colour palette to an
# 8-colour cubehelix palette.
sns.set()
sns.set_palette(sns.cubehelix_palette(8))
# pd.options.display.float_format = '{:,.2f}'.format

In [5]:
# Column -> dtype mapping handed to pd.read_csv.
# Identifiers and classification codes are read as `object` (strings);
# counts, delays, indices and centrality scores are read as `float`.
dtypes = {
    'id': object,
    'type': object,
    'kind': object,
    'num_claims': float,
    'cit_received': float,
    'cit_made': float,
    'cit_received_delay': float,
    'cit_made_delay': float,
    'parent_citation': float,
    'originality': float,
    'generality': float,
    'wipo_sector_id': object,
    'ipcr_section': object,
    'ipcr_ipc_class': object,
    'ipcr_subclass': object,
    'cpc_section_id': object,
    'cpc_subsection_id': object,
    'cpc_group_id': object,
    'nber_category_id': object,
    'nber_subcategory_id': object,
    'uspc_mainclass_id': object,
    'uspc_subclass_id': object,
    'eigen': float,
    'pagerank': float,
    'katz': float,
}

In [6]:
# Columns actually loaded from the dataset: the patent id and grant date,
# the citation-based measures, the WIPO sector as the only classification
# system (no USPC/IPC/CPC/NBER columns), and pagerank as the centrality score.

usecols = [
    'id',
    'date',
    'num_claims',
    'cit_received',
    'cit_made',
    'cit_received_delay',
    'cit_made_delay',
    'originality',
    'generality',
    'wipo_sector_id',
    'pagerank',
]

In [7]:
# only WIPO class system, exclude type and kind

# usecols=['id', 'date', 'num_claims', 'cit_received', 'cit_made',
#         'cit_received_delay', 'cit_made_delay', 'parent_citation',
#         'originality', 'generality', 'wipo_sector_id', 'eigen', 'pagerank', 'katz']

In [8]:
# Load the selected columns, indexed by patent id, with 'date' parsed as a
# datetime and every other column typed according to `dtypes`.
df = pd.read_csv(
    dataset,
    index_col='id',
    parse_dates=['date'],
    dtype=dtypes,
    usecols=usecols,
)

# Column overview: dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9570679 entries, 0000000 to re25727
Data columns (total 10 columns):
date                  datetime64[ns]
num_claims            float64
cit_received          float64
cit_made              float64
cit_received_delay    float64
cit_made_delay        float64
originality           float64
generality            float64
wipo_sector_id        object
pagerank              float64
dtypes: datetime64[ns](1), float64(8), object(1)
memory usage: 803.2+ MB


In [9]:
# Calendar year of each patent's date (NaN where the date is missing).
df['year']=df.date.dt.year

# Decade = year floored to a multiple of 10. Missing years stay NaN, so the
# column is float whenever any date is missing. The former per-row
# `apply(lambda x: int(x) if ... else np.nan)` was dropped: on a column that
# contains NaN the result is cast straight back to float, so it changed
# nothing while running a Python-level loop over every row.
df['decade']=df['year']//10*10

# Distinct decades present in the data (includes NaN if any date is missing).
decades=list(df.decade.unique())
# decades = [int(x) for x in decades if str(x) != 'nan']

In [10]:
# Names of the string-typed (categorical) columns.
obj_cols = df.select_dtypes(include=[object]).columns.tolist()
obj_cols

['wipo_sector_id']

In [11]:
# Names of the numeric columns.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
num_cols

['num_claims',
 'cit_received',
 'cit_made',
 'cit_received_delay',
 'cit_made_delay',
 'originality',
 'generality',
 'pagerank',
 'year',
 'decade']

## Data Analysis

### Descriptive 

In [12]:
# Summary statistics of the numeric columns plus one extra row, 'isnull',
# holding the number of missing values per column.
# pd.concat is used instead of DataFrame.append, which was removed in
# pandas 2.0; the null counts are turned into a one-row frame so they stack
# under the describe() rows.
descriptive = pd.concat([
    df.describe(include=[np.number]).loc[['count','mean','std','min','max']],
    df[num_cols].isnull().sum().rename('isnull').to_frame().T,
])

  interpolation=interpolation)


In [13]:
# One row per variable, one column per statistic, every value rendered with
# thousands separators and two decimals.
descriptive.transpose().apply(lambda stat: stat.map('{:,.2f}'.format))

Unnamed: 0,count,mean,std,min,max,isnull
num_claims,7330226.0,14.23,11.68,1.0,887.0,2240453.0
cit_received,9570679.0,11.38,33.38,0.0,4609.0,0.0
cit_made,6188752.0,14.78,41.95,0.0,5811.0,3381927.0
cit_received_delay,7632624.0,20.21,24.31,-13.37,215.0,1938055.0
cit_made_delay,6907058.0,13.87,10.71,-195.99,182.74,2663621.0
originality,6297168.0,0.15,0.21,0.0,0.8,3273511.0
generality,6552760.0,0.15,0.21,0.0,0.8,3017919.0
pagerank,8524799.0,0.0,0.0,0.0,0.0,1045880.0
year,7330226.0,2004.55,11.8,1976.0,2020.0,2240453.0
decade,7330226.0,1999.7,11.79,1970.0,2020.0,2240453.0


In [14]:
# Count / unique / top / freq for the string-typed columns.
# The builtin `object` is used because the `np.object` alias was removed in
# NumPy 1.24 (and it matches the select_dtypes call used for obj_cols).
df.describe(include=[object])#.append(df[obj_cols].isnull().sum().rename('isnull')).transpose()

Unnamed: 0,wipo_sector_id
count,6621121
unique,5
top,1
freq,2506493


### Barplots and Heatmaps

In [15]:
# # barplot
# # as of 02.03.20, working

# for i in obj_cols:
#     plotbar(i, df, classes)

# # barplot with decades
# for i in obj_cols:
#     plotbar(i, df, classes,decade=True)

# # barplot with decades and inverted axis
# for i in obj_cols:
#     plotbar(i, df, classes,decade=True, decade_x=True)

# # heatmaps all periods
# for double in list(itertools.combinations(obj_cols, 2)):
#     heatmap(df[double[0]], df[double[1]]) 

# # print heatmaps per decade
# for decade in decades:
#     df_dec=df[df['decade']==decade]
#     for double in list(itertools.combinations(obj_cols, 2)):
#         heatmap(df_dec[double[0]], df_dec[double[1]], decade) 


### Histograms

In [16]:
# #histograms
# #could improve cutting off outliers
# for variable in num_cols:
#     ax=df[variable].hist()
#     ax.set_title('Histogram '+ variable.title()+'\n')
#     plt.show()

### Trends and Boxplots

In [17]:
# Variables to iterate over in the trend/box plots: every numeric column
# except the two time keys. Rebuilding the list (instead of calling
# list.remove) keeps the cell re-runnable; .remove raises ValueError the
# second time because the names are already gone.

num_cols = [col for col in num_cols if col not in ('decade', 'year')]

In [18]:
# for variable in num_cols:
    
#     title=variable.replace('_', ' ')
#     fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

#     axes[0] = df.groupby('year').mean().plot(y=variable, ax=axes[0])
#     evl_title='Evolution of '+ title +'\n'
#     axes[0].set_title(evl_title)
#     axes[0].set_ylim(bottom=0)
    
#     axes[1] = sns.boxplot(x='decade', y=variable, data=df)

#     box_title='Dispersion of '+ title +'\n'
#     axes[1].set_title(box_title)
#     axes[1].set_ylim(bottom=0)
#     axes[1].set_ylabel("")
    
#     filename='./img/evol_dispersion_'+variable.lower()+'.png'  
#     plt.savefig(filename) 
#     plt.show()


In [19]:
# the generality data on the 2010's is too concentrated around 0
# to check, I draw this hist to understand what is happening
# it could be an effect of truncation - generality increases with forward citation

# df[df['decade']==2010]['generality'].hist()

In [20]:
# Re-inspect the frame now that the derived 'year' and 'decade' columns exist.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9570679 entries, 0000000 to re25727
Data columns (total 12 columns):
date                  datetime64[ns]
num_claims            float64
cit_received          float64
cit_made              float64
cit_received_delay    float64
cit_made_delay        float64
originality           float64
generality            float64
wipo_sector_id        object
pagerank              float64
year                  float64
decade                float64
dtypes: datetime64[ns](1), float64(10), object(1)
memory usage: 949.2+ MB


### Models

In [21]:
# normalization
# Drop every row that has a missing value in any column, hand the result to
# the project helper `normalize` (imported from the local `normalize`
# module) and rebind `df` to whatever it returns.
# NOTE(review): the warning text printed by this cell suggests normalize()
# applies a min-max scaler to the numeric columns of the frame it receives,
# which would include the dependent variable and year/decade — confirm in
# normalize.py.
df=normalize(df.dropna())

Total number of observations and attributes
(3891146, 12)
Number of numerical attributes: 10
Number of non-numerical attributes: 10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df[num_cols]=min_max_scaler.fit_transform(df[num_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [22]:
# Inspect the frame returned by normalize(): remaining rows, dtypes, memory.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3891146 entries, 3931349 to 9854083
Data columns (total 12 columns):
date                  datetime64[ns]
num_claims            float64
cit_received          float64
cit_made              float64
cit_received_delay    float64
cit_made_delay        float64
originality           float64
generality            float64
wipo_sector_id        object
pagerank              float64
year                  float64
decade                float64
dtypes: datetime64[ns](1), float64(10), object(1)
memory usage: 385.9+ MB


In [23]:
# Preview the first rows of the frame returned by normalize().
df.head()

Unnamed: 0_level_0,date,num_claims,cit_received,cit_made,cit_received_delay,cit_made_delay,originality,generality,wipo_sector_id,pagerank,year,decade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3931349,1976-01-06,0.009029,0.022682,0.001033,0.353835,0.641222,0.0,0.0,0,0.011216,0.0,0.0
3935991,1976-02-03,0.002257,0.004736,0.001205,0.27067,0.642102,0.0,0.138408,3,0.001409,0.0,0.0
3943789,1976-03-16,0.005643,0.001496,0.001205,0.316234,0.683412,0.0,0.0,3,0.000505,0.0,0.0
3944004,1976-03-16,0.012415,0.001246,0.001377,0.277968,0.639338,0.0,0.625,2,0.000662,0.0,0.0
3945191,1976-03-23,0.003386,0.004736,0.000688,0.131093,0.63065,0.0,0.0,2,0.010436,0.0,0.0


In [24]:
# #maybe nb fit does not accept normalized data, so I am using the data without normalization
# #but in that case, i have to transform the categorical variables

# obj_cols=list(df.select_dtypes(include=[object]).columns.values)

# for col in obj_cols:
#     df[col] = df[col].astype('category')

# df=pd.get_dummies(df, columns=obj_cols, prefix=obj_cols)

In [25]:
# Candidate independent variables: start from every column of the frame;
# the non-regressor columns are filtered out in the next cell.
chosenColumns = list(df.columns)
len(chosenColumns)

12

In [26]:
# Exclude the dependent variable (pagerank), the raw date and the sector
# label from the regressors. The list is rebuilt rather than mutated with
# list.remove so the cell can be re-run without raising ValueError.
chosenColumns = [
    col for col in chosenColumns
    if col not in ('pagerank', 'date', 'wipo_sector_id')
]
len(chosenColumns)

9

In [27]:
# Distinct WIPO sector codes present in the data, in order of first appearance.
wipo_sectors = df['wipo_sector_id'].unique()

In [28]:
# Preview the first rows whose WIPO sector code is '1'.
df.loc[df['wipo_sector_id'] == '1'].head()

Unnamed: 0_level_0,date,num_claims,cit_received,cit_made,cit_received_delay,cit_made_delay,originality,generality,wipo_sector_id,pagerank,year,decade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3967052,1976-06-29,0.034989,0.002991,0.000688,0.219422,0.664514,0.0,0.0,1,0.008602,0.0,0.0
3967295,1976-06-29,0.002257,0.009721,0.001377,0.171463,0.636298,0.0,0.0,1,0.099932,0.0,0.0
3970590,1976-07-20,0.003386,0.001994,0.000861,0.180507,0.634416,0.0,0.462963,1,0.004822,0.0,0.0
3970799,1976-07-20,0.011287,0.004487,0.000861,0.14789,0.637869,0.0,0.0,1,0.038665,0.0,0.0
3970838,1976-07-20,0.010158,0.00349,0.000516,0.36238,0.640615,0.0,0.0,1,0.003883,0.0,0.0


In [29]:
#function to write results to a latex file 
def export_table(content, name):
    basename='output/'+ name
    i=1
    while os.path.exists(basename+"_"+"{:03d}".format(i)+'.out'):
        i += 1
    with open(basename+str(i),'w') as fh:
        fh.write( content.as_latex() )
 

In [30]:
# Fit one OLS regression of pagerank on the chosen regressors per WIPO
# sector and export each summary as LaTeX via export_table.
for wipo_sector in wipo_sectors:
    print(wipo_sector)
#     print(df[df.wipo_sector_id==wipo_sector].head())

    print("\n")

    # Select the sector's rows once and reuse them for both X and y.
    sector_df = df[df.wipo_sector_id==wipo_sector]

    # Plain column selection replaces DataFrame.as_matrix, which was removed
    # in pandas 1.0. Keeping the regressors as a DataFrame lets statsmodels
    # take the coefficient labels from the column names, including the
    # 'const' column that add_constant prepends. The previous
    # xname=chosenColumns was one name short once the intercept was added,
    # so labels could not line up with the coefficients.
    myX = sector_df[chosenColumns]
    myY = sector_df['pagerank']

    x = sm.add_constant(myX)
    model = sm.OLS(myY, x)
    results = model.fit()
    export_table(results.summary(yname="PageRank", title="OLS of WIPO: "+wipo_sector ), "wipo_"+wipo_sector)
#     results.summary()

0


3


2


1


4




### DV: Parent citation (rename the variable)

In [31]:
# myX = df.as_matrix(columns=chosenColumns)

# myY = df.as_matrix(columns=['parent_citation'])

# xTrain, xTest, yTrain, yTest = train_test_split(myX, myY, train_size=0.7, random_state=3) 
# testSize = yTest.shape[0]
# trainSize = yTrain.shape[0]
# namesList, errorList = best_num_attr(myX, xTrain, xTest, yTrain, yTest, chosenColumns, regtype='linear')

### DV: pagerank (centrality)

In [32]:
# myX = df.as_matrix(columns=chosenColumns)
# myY = df.as_matrix(columns=['pagerank'])

# xTrain, xTest, yTrain, yTest = train_test_split(myX, myY, train_size=0.7, random_state=3) 
# testSize = yTest.shape[0]
# trainSize = yTrain.shape[0]

In [33]:
# namesList, errorList = best_num_attr(myX, xTrain, xTest, yTrain, yTest, chosenColumns, regtype='linear')

### DV: katz (centrality)

In [34]:
# myX = df.as_matrix(columns=chosenColumns)
# myY = df.as_matrix(columns=['katz'])

# xTrain, xTest, yTrain, yTest = train_test_split(myX, myY, train_size=0.7, random_state=3) 
# testSize = yTest.shape[0]
# trainSize = yTrain.shape[0]

In [35]:
# namesList, errorList = best_num_attr(myX, xTrain, xTest, yTrain, yTest, chosenColumns, regtype='linear')

### DV: eigen (centrality)

In [36]:
# myX = df.as_matrix(columns=chosenColumns)
# myY = df.as_matrix(columns=['eigen'])

# xTrain, xTest, yTrain, yTest = train_test_split(myX, myY, train_size=0.7, random_state=3) 
# testSize = yTest.shape[0]
# trainSize = yTrain.shape[0]

In [37]:
# namesList, errorList = best_num_attr(myX, xTrain, xTest, yTrain, yTest, chosenColumns, regtype='linear')

In [38]:
# This selector does not work because almost every attribute is p-value significant

# selector = SelectKBest(f_classif, k=4) #initialize 
# selector.fit(myX, myY) #fit
# scores = -np.log10(selector.pvalues_) #transform pvalues (why?)
# scores /= scores.max() #normalize 
# plt.bar(myX - .45, scores, width=.2,
#         label=r'Univariate score ($-Log(p_{value})$)', color='darkorange',
#         edgecolor='black')

In [39]:
# nbayes(xTrain, yTrain, xTest, yTest)

In [40]:
# df.dropna()

In [41]:
# #Let's do something else
# #Change the DV 

# myX = df.as_matrix(columns=chosenColumns)
# myY = df.as_matrix(columns=['parent_back_citation'])

# xTrain, xTest, yTrain, yTest = train_test_split(myX, myY, train_size=0.7, random_state=3) 

In [42]:
# nbayes(xTrain, yTrain, xTest, yTest)

In [43]:
# df.parent_back_citation.boxplot()

In [44]:
#and graphs of back citation in time

In [45]:
# for i in classifications:
#     rank=df.groupby(i).count().iloc[:,2].sort_values(ascending=False).reset_index().set_index(i)
#     description=df_class[df_class['class']==i].set_index('id')
#     display(rank.join(description))

In [46]:
# for i in obj_cols:
#     if i.isin(classifications):
#         df.join(df.groupby(i).count().iloc[:,2].sort_values(ascending=False)
# #     display.display(df.pivot_table(values=df.reset_index().id, index=i, columns='decade', aggfunc='count', fill_value=0, margins=False, dropna=True))
#     print(i)
#     display.display(df.groupby(i).count().iloc[:,2].sort_values(ascending=False))

In [47]:
# def cramers_v(x, y):
#     confusion_matrix = pd.crosstab(x,y)
#     chi2 = ss.chi2_contingency(confusion_matrix)[0]
#     n = confusion_matrix.sum().sum()
#     phi2 = chi2/n
#     r,k = confusion_matrix.shape
#     phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
#     rcorr = r-((r-1)**2)/(n-1)
#     kcorr = k-((k-1)**2)/(n-1)
#     return np.sqrt(phi2corr/min((kcorr-1),(rcorr-1)))

In [48]:
# all variables
# dtypes={'id':object, 'type':object, 'kind':object, 'num_claims':float, 'cit_received':float, 'cit_made':float,
#        'cit_received_delay':float, 'cit_made_delay':float, 'parent_citation':float,
#        'originality':float, 'generality':float, 'wipo_field_id':object, 'ipcr_section':object,
#        'cpc_section_id':object,'nber_category_id':object,'uspc_mainclass_id':object}