In [1]:
"""
Sao Paulo, October 30th, 2020
This script reads the uspatentcitation parquet files and drops rows with errors in the patent and date fields.
This cleaning aims to avoid later processing problems.

This script should offer a report on the dropped rows.
Alternatively, it could generate a flag indicating rows with errors.

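# citation_id - patent making a citation
# patent_id - patent receiving a citation
# NOTE(review): a comment near the end of this notebook says "patent_id cites citation_id",
# i.e. the opposite direction - confirm which of the two is correct.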


"""
import csv
import glob
import os
import re

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.delayed import delayed
from fastparquet import ParquetFile
# import graphviz

def clean_field(df):
    """Drop incomplete rows and strip non-alphanumeric characters from the ids.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least the columns ``patent_id`` and ``citation_id``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every row that had a missing value in any column
        is removed and both id columns keep only the characters ``a-z``,
        ``A-Z`` and ``0-9``. The frame passed in is left unchanged.
    """
    # dropna() without inplace returns a new frame, so the caller's data is
    # never modified behind its back.
    cleaned = df.dropna()
    stripped = {}
    for column in ('patent_id', 'citation_id'):
        # astype(str) lets ids that were stored as numbers go through the same
        # path; no missing values are left at this point, so no "nan" strings
        # can be produced. The replacement itself is vectorised.
        stripped[column] = (
            cleaned[column].astype(str).str.replace('[^a-zA-Z0-9]+', '', regex=True)
        )
    return cleaned.assign(**stripped)

def date_within_boundaries(df, min_year=1677, max_year=2021):
    """Replace the ``date`` column by its year, blanking implausible values.

    Only the first four characters of each ``date`` value are used, which
    avoids building ``pd.Timestamp`` objects and their range limitations:
    https://stackoverflow.com/questions/50265288/how-to-work-around-python-pandas-dataframes-out-of-bounds-nanosecond-timestamp

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column whose values start with a four-digit year.
    min_year : int, optional
        Exclusive lower bound. The default, 1677, is the year of
        ``pd.Timestamp.min`` (1677-09-21).
    max_year : int, optional
        Exclusive upper bound. The default is 2021; ``pd.Timestamp.max`` lies
        in 2262.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``date`` column holds the year as a number, or NaN
        when the year cannot be parsed or is not strictly between the two
        bounds. The frame passed in is left unchanged.
    """
    # errors='coerce' turns a malformed year into NaN instead of raising, so a
    # single bad row is flagged rather than aborting the whole run.
    years = pd.to_numeric(df['date'].astype(str).str[:4], errors='coerce')
    # Comparisons against NaN are False, so unparseable years stay NaN here.
    plausible = (years > min_year) & (years < max_year)
    return df.assign(date=years.where(plausible))

In [2]:
# glob returns matches in arbitrary, filesystem-dependent order. Sorting makes
# the list - and any slice taken from it later - the same on every run.
file_list=sorted(glob.glob("parquet/uspatentcitation*"))
# Path the cleaned table is written to.
dst='data/cleanuspatentcitation.parquet.gz'

In [3]:
# Historical note: the first patent ever was granted on 1790-07-31, so an
# earlier grant date should be wrong. An earlier draft used
#   first_patent = pd.to_datetime('1790-06-30', format="%Y-%m-%d")
# (slightly before the real date, because one of the citations for n1 seems to
# be right). That cut-off is not applied in this cell.

# Number of input files to process; set to None to process every file found.
MAX_FILES = 4
selected_files = file_list[:MAX_FILES]
if not selected_files:
    raise FileNotFoundError("file_list is empty: no uspatentcitation parquet files were found")

# Columns the input files are expected to carry (kept for reference).
myTypes={'patent_id':str, 'citation_id':str, 'date':str}

# One lazy read -> clean -> year-check chain per file. Handing a whole dask
# DataFrame to a delayed function would first gather every partition into a
# single pandas frame inside one task; chaining per file keeps the work
# split by file and lets the files be processed independently.
parts = [
    delayed(date_within_boundaries)(delayed(clean_field)(delayed(pd.read_parquet)(f)))
    for f in selected_files
]

# Still lazy: nothing is read until df.compute() is called.
df = delayed(pd.concat)(parts)

In [None]:
# Run the lazy pipeline and keep the resulting pandas DataFrame under the same
# name. The guard makes the cell safe to re-run: once df has been materialised
# it no longer has a .compute() method and the bare call would raise.
if hasattr(df, 'compute'):
    df=df.compute()

In [None]:
# Print a summary of the computed frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Rows whose year was rejected by date_within_boundaries carry NaN in 'date'.
# Remove them and report how many were lost, then show the summary again.
rows_before = len(df)
df = df.dropna(subset=['date'])
print("Dropped {} of {} rows with an invalid date".format(rows_before - len(df), rows_before))
df.info()


In [None]:
# Final sweep for rows with a missing value in any column, then show the summary.
df = df.dropna()
df.info()
# No missing value is left in 'date' after the sweep, so the year column can be
# stored as integers.
df = df.assign(date=df['date'].astype(int))

In [None]:
# date is the year where patent_id cites citation_id
# NOTE(review): the docstring at the top of the notebook describes citation_id
# as the patent making the citation - confirm which direction is right.

# Create the output folder first, so the write cannot fail on a missing
# directory after all the cleaning work has already been done.
out_dir = os.path.dirname(dst)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df.set_index('citation_id').to_parquet(dst, compression='gzip')

In [None]:
# Preview the first rows, indexed by citation_id; taking head() first means
# only those few rows have to be re-indexed.
df.head().set_index('citation_id')