In [1]:
#!/usr/bin/env python
# coding: utf-8
# Script to evaluate citation delay
# citation_id - patent making a citation 
# patent_id - patent receiving a citation 

# Renato Kogeyama


# Oct 22, 2020
# The original script requires more than 32 GB RAM
# Changing from pd to dd (dask dataframe)

# Aug 19, 2020
# Included gzip
# Run with latest database


# Feb 07, 2020
# The main offensor of performance in this script is the transformation to timedelta
# the solution is to change to numpy
# https://stackoverflow.com/questions/52274356/conversion-of-a-timedelta-to-int-very-slow-in-python

# Jan 17 2020
# Join cit_delay with var_builder
# The only thing var_builder was doing was including kind and type 


# Jan 03 2020
# Miami
# I am using this script to calculate the average delay in citation - to follow Hall et al, 2001
# patent.csv has the following columns
# id 	type 	number 	country 	date 	abstract 	title 	kind 	num_claims 	filename
# interest on id, type, date, kind, num_claims

# I use two sources, uspatentcitation.tsv and patent.csv
# The first is a citation-level dataset with information about the citing patent
# The second is a patent-level dataset with information about the patent

# Cleaning
# I tested in other scripts the quality of the patent identifier
# It does not require cleaning - only 4 erros from 6 million patents
# The cleaning script is there anyway

# Merging
# I merge on the citation level (df)


# --

# First U.S. Patent Issued Today in 1790


# July 31, 2001
# Press Release
# #01-33

# On July 31, 1790 Samuel Hopkins was issued the first patent for a process 
# of making potash, an ingredient used in fertilizer. The patent was signed by 
# President George Washington. Hopkins was born in Vermont, but was living in 
# Philadelphia, Pa. when the patent was granted.

# The first patent, as well as the more than 6 million patents issued since then, 
# can be seen on the Department of Commerce's United States Patent and Trademark 
# Office website at www.uspto.gov. The original document is in the collections of 
# the Chicago Historical Society.


In [2]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
import gzip
import dask.dataframe as dd
from dask.delayed import delayed
import datetime
import glob


In [3]:
def convert_and_subtract_dates(df):
    #conversao de string para data
    df["citation_date"] = df["citation_date"].astype(int)
    df["patent_date"] = df["patent_date"].astype(int)

    # delay is the time interval between grant and citation
    # following https://stackoverflow.com/questions/55395387/converting-a-dask-column-to-a-date-and-applying-a-lambda-function?rq=1
    df=df.assign(cit_delay=df["citation_date"] - df["patent_date"])
    return df

In [4]:
def write_report(df, report, report_dst):
    # if I do not drop nans, the script raises an error later when converting day interval into years
    # I could substitute with average instead of dropping, this way I do not lose the citation info
    # however, not always it will be possible to average - cases where there is ony one citation, for example
    # For this reason, at this point, I'll keep the NAN and circumvent the issues as they arise

    # df=df.dropna()

    report.append("largest citation delays\n")
    report.append(df.nlargest(15, 'cit_delay').to_latex())
    report.append("smallest citation delays \n")
    report.append(df.nsmallest(15, 'cit_delay').to_latex())

    report.append("describe\n")
    report.append(df.describe().to_latex())

    report.append("head\n")
    report.append(df.head().to_latex())

    #get_ipython().run_cell_magic('time', '', 'df.hist()')

    #Check outliers
    #report.append("Check cit delay outliers - 0.15 quantile")
    #report.append(df[df["cit_delay"]>df["cit_delay"].quantile(0.15)].sort_values(by=['cit_delay'], ascending=True))

    #report.append("Check cit delay outliers -0.85 quantile")
    #report.append(df[df["cit_delay"]<df["cit_delay"].quantile(0.85)].sort_values(by=['cit_delay'], ascending=False))
    with open(report_dst, 'a') as f:
        f.write("VAR BUILDER\n"+str(datetime.datetime.now()) + "\n")
        f.writelines([str(x) + "\n" for x in report])

In [5]:
patent= 'data/cleanpatent.parquet.gz'
dst='data/var_builder'
report_dst='var_builder_report.tex'

report=[] #file to export report

file_list=glob.glob("data/citation/*")

dfs = [delayed(pd.read_parquet)(f) for f in file_list]

pt_df = dd.read_parquet(patent)

report.append("VAR BUILDER \n")
report.append("patent file head \n")
report.append(pt_df.head().to_latex())


In [6]:
file_list

['data/citation/clean_0.parquet.gz',
 'data/citation/clean_8.parquet.gz',
 'data/citation/clean_4.parquet.gz',
 'data/citation/clean_3.parquet.gz',
 'data/citation/clean_10.parquet.gz',
 'data/citation/clean_1.parquet.gz',
 'data/citation/clean_2.parquet.gz',
 'data/citation/clean_6.parquet.gz',
 'data/citation/clean_9.parquet.gz',
 'data/citation/clean_5.parquet.gz',
 'data/citation/clean_7.parquet.gz']

In [9]:
for i,df in enumerate(dfs):
    
    #df = dd.read_parquet(df)

    report.append("file "+ str(i) +" citation head \n")
    report.append(df.head().to_latex())

    df=df.rename(columns = {'date':'patent_date'})
    pt_df=pt_df.rename(columns = {'date':'citation_date'})

    # merge between patent data and citations on patent_id (citing)
    # merging on the citation dataset drops patents without citing
    # later i could standardize to make patent_id index and use join instead of merge
    df=df.merge(pt_df, how='inner', left_index=True, right_index=True)

    # report.append("Info after merging\n")
    # report.append(df.info().to_latex())

    df=delayed(convert_and_subtract_dates)(df)
    df=df.compute()
    filename=dst+str(i)+".parquet.gz"
    df.to_parquet(filename, compression="gzip")
    write_report(df, report, report_dst)