In [1]:
#Script to evaluate the determinants of patent citation 
#Renato Kogeyama


#Miami, December 24th, 2019
# Prof. Rafael Corredoira suggested:
# - Inclusion of a tree of citations
#   To track back the source of citations. This is information is not given by direct count of citations.
# - Consider policy changes in the way patents are cited
#   Policy changes in 2000 changed the time frame of citation, and 2010 partially moved citation to applications
# - Track classification changes 
#   The original classification system in USPTO changed from a technical based to a market based classification system
#   See if there is an impact
# - Consider a text analysis of the claims
#   Classification is based on the claims but it is not clear how many claims are related to each classification category
# - Include moderation effect from classification
#   Citations patterns may change across industries, so some effects may disappear if industry is not accounted for.

# In summary, his ideas help increase structure of the current work


#Syracuse, December 3rd, 2019

#The original script is getting too complex
#There was many tentative scripts to play with data
#Here I am writing a script to show the relevance of variables to patent citation

#11-12-2019
#Introducing normalization

#10-11-2019
#I introduced log backward citation, what corrects for very dispersed results
#but the major problem is that few patents receive citations
#bring back binary output

#10-10-2019
#Added graphics and new distributions

#10-03-2019
#I rewrote the citation data to clean the strings

#09-15-2019
#O naive bayes tem algum problema com distribuicoes desbalanceadas
#o scikit learn tem um modulo que corrige count distributions com muitos zeros, o complementNB
#porem este nao esta disponivel na atual versao disponibilizada no HPC da FIU

#09-10-2019
#o trabalho pede uma abordagem mais sistematica e cuidadosa
#estou agrupando o codigo antigo comentado e vou comecar um novo codigo

#09-27-2019
#I am renaming citation as forward citation and backward citation

#09-17-2018

#Alto uso de memoria - rodar no Amazon AWS 



In [2]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn import linear_model, datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif

from scipy import stats
import datetime
import matplotlib.pyplot as plot
import pylab as pl

from math import sqrt


import sys
sys.path.append('/home/rkogeyam/scripts/')

from best_num_attr import best_num_attr
from xattrSelect import xattrSelect
from sampler import sampler
from normalize import normalize




In [3]:
%matplotlib inline

In [4]:
fname='/home/rkogeyam/PATENT_CITATION/df_w_pat_cit_191229.csv'

df=pd.read_csv(fname)

# sample_size=1000
# df=sampler(fname, sample_size)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
df.drop(['Unnamed: 0'], axis=1, inplace=True) #it is an automated generated column

In [6]:
df.set_index('id', inplace=True)

In [7]:
df.drop(['number', 'filename', 'abstract', 'title'], axis=1, inplace=True)

In [8]:
df.sample(n=5)

Unnamed: 0_level_0,type,country,date,kind,num_claims,back_citation,parent_back_citation,year,month,day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6410894,utility,US,2002-06-25,B1,7.0,6.0,9.0,2002.0,6.0,25.0
7940058,utility,US,2011-05-10,B2,20.0,3.0,0.0,2011.0,5.0,10.0
6338715,utility,US,2002-01-15,B1,10.0,23.0,246.0,2002.0,1.0,15.0
4736919,utility,US,1988-04-12,A,5.0,0.0,0.0,1988.0,4.0,12.0
6516206,utility,US,2003-02-04,B2,16.0,9.0,51.0,2003.0,2.0,4.0


In [9]:
df.drop('date', axis=1, inplace=True)

In [10]:
df.shape

(6488261, 9)

In [11]:
df=df.dropna()

In [12]:
df.shape

(6488250, 9)

In [13]:
# defensive publication and statutory registration are not wrong entries
# I will just rename them and leave them in the dataset
df.rename(columns={'defensive publication': 'defensive', 'statutory invention registration': 'statutory'}, inplace=True)

In [14]:
df.groupby('type').count()

Unnamed: 0_level_0,country,kind,num_claims,back_citation,parent_back_citation,year,month,day
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TVPP,3,3,3,3,3,3,3,3
defensive publication,509,509,509,509,509,509,509,509
design,563003,563003,563003,563003,563003,563003,563003,563003
plant,24790,24790,24790,24790,24790,24790,24790,24790
reissue,17812,17812,17812,17812,17812,17812,17812,17812
statutory invention registration,2254,2254,2254,2254,2254,2254,2254,2254
utility,5879879,5879879,5879879,5879879,5879879,5879879,5879879,5879879


In [15]:
# defensive publication and statutory registration are not wrong entries
# I will just rename them and leave them in the dataset
df.groupby('kind').count()

Unnamed: 0_level_0,type,country,num_claims,back_citation,parent_back_citation,year,month,day
kind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,2391379,2391379,2391379,2391379,2391379,2391379,2391379,2391379
B1,568083,568083,568083,568083,568083,568083,568083,568083
B2,2920417,2920417,2920417,2920417,2920417,2920417,2920417,2920417
E,8692,8692,8692,8692,8692,8692,8692,8692
E1,9120,9120,9120,9120,9120,9120,9120,9120
H,1982,1982,1982,1982,1982,1982,1982,1982
H1,272,272,272,272,272,272,272,272
I4,509,509,509,509,509,509,509,509
I5,3,3,3,3,3,3,3,3
P,8282,8282,8282,8282,8282,8282,8282,8282


In [16]:
df.groupby('country').count()

Unnamed: 0_level_0,type,kind,num_claims,back_citation,parent_back_citation,year,month,day
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
US,6488250,6488250,6488250,6488250,6488250,6488250,6488250,6488250


In [17]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_claims,6488250.0,14.143219,12.004228,0.0,6.0,12.0,20.0,887.0
back_citation,6488250.0,7.257143,25.692059,0.0,0.0,0.0,5.0,3433.0
parent_back_citation,6488250.0,100.546808,1066.788244,0.0,0.0,0.0,13.0,156715.0
year,6488250.0,2002.72344,11.322681,1976.0,1995.0,2005.0,2013.0,2017.0
month,6488250.0,6.590932,3.402092,1.0,4.0,7.0,10.0,12.0
day,6488250.0,15.642284,8.827,1.0,8.0,16.0,23.0,31.0


In [18]:
df.dtypes

type                     object
country                  object
kind                     object
num_claims              float64
back_citation           float64
parent_back_citation    float64
year                    float64
month                   float64
day                     float64
dtype: object

In [19]:
#normalization
df=normalize(df)

Total number of observations and attributes
(6488250, 9)
Number of numerical attributes: 6
Number of non-numerical attributes: 3


In [20]:
# List of IVs
chosenColumns=df.columns.values.tolist()
len(chosenColumns)

chosenColumns.remove('back_citation')
len(chosenColumns)

# chosenColumns.remove(['parent_back_citation'])
# len(chosenColumns)

# chosenColumns.rmove('citation_bi')
# chosenColumns.remove('log_back_cit')

myX = df.as_matrix(columns=chosenColumns)
myY = df.as_matrix(columns=['back_citation'])

xTrain, xTest, yTrain, yTest = train_test_split(myX, myY, train_size=0.7, random_state=3) 
testSize = yTest.shape[0]
trainSize = yTrain.shape[0]



In [21]:
# best_num_attr(myX, xTrain, xTest, yTrain, yTest, chosenColumns, regtype='linear')

In [22]:
df.var()

num_claims                               1.831560e-04
back_citation                            5.600802e-05
parent_back_citation                     4.633779e-05
year                                     7.626597e-02
month                                    9.565482e-02
day                                      8.657325e-02
type_TVPP                                4.623741e-07
type_defensive publication               7.844336e-05
type_design                              7.924321e-02
type_plant                               3.806155e-03
type_reissue                             2.737734e-03
type_statutory invention registration    3.472766e-04
type_utility                             8.497317e-02
country_US                               0.000000e+00
kind_A                                   2.327264e-01
kind_B1                                  7.988968e-02
kind_B2                                  2.475109e-01
kind_E                                   1.337858e-03
kind_E1                     

In [23]:
# This selector do not work because almost every attribute is p-value significant

# selector = SelectKBest(f_classif, k=4) #initialize 
# selector.fit(myX, myY) #fit
# scores = -np.log10(selector.pvalues_) #transform pvalues (why?)
# scores /= scores.max() #normalize 
# plt.bar(myX - .45, scores, width=.2,
#         label=r'Univariate score ($-Log(p_{value})$)', color='darkorange',
#         edgecolor='black')