## This scripts created the files necessary to run the breakthrough algorithms

### Patent/Year-index

In [2]:
# !pip install tqdm
import os
import pandas as pd
from tqdm import tqdm

In [3]:
# Folder path containing the .tsv files
folder_path = 'data/USPTO/brief_text'

# List to store unique values of "patent_id" and corresponding years
result_data = []

# Loop over all files in the folder
for filename in tqdm(os.listdir(folder_path)):
    if filename.endswith('.tsv'):
        # Extract year from the filename
        year = filename.split('_')[-1].split('.')[0]
        
        # Read the TSV file into a DataFrame
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path, sep='\t')
        
        # Filter out entries with non-numeric patent_id values
        df['patent_id'] = pd.to_numeric(df['patent_id'], errors='coerce', downcast='integer')
        df = df.dropna(subset=['patent_id'])
        
        # Extract unique values of "patent_id"
        unique_patents = df['patent_id'].unique()
        
        # Append data to the result list
        result_data.extend([(int(float(patent)), year) for patent in unique_patents])
        
        # Delete the DataFrame to free up memory
        del df

# Create a DataFrame from the result list
result_df = pd.DataFrame(result_data, columns=['pat', 'year'])

# Save the concatenated data to a new file
result_df.to_csv('data/USPTO/input_files/year.csv', index=False, sep="\t")


100%|█████████████████████████████████████████████| 7/7 [00:19<00:00,  2.75s/it]


In [42]:
year = pd.read_csv("data/USPTO/input_files/year.csv", delimiter="\t")

In [43]:
year.shape

(376178, 2)

In [44]:
year["pat"]

0         4000024
1         3944441
2         3953613
3         3945518
4         3932608
           ...   
376173    4070754
376174    4084606
376175    4066237
376176    4124819
376177    4124698
Name: pat, Length: 376178, dtype: int64

In [27]:
# Delete to free up memory
del year

### CPC-file

In [40]:
# cpc = pd.read_csv("data/USPTO/g_cpc_current.tsv", usecols=["patent_id","cpc_sequence","cpc_group"], chunksize=1000, delimiter="\t")
# cpc = next(cpc)
# cpc

Unnamed: 0,patent_id,cpc_sequence,cpc_group
0,3950000,0,A63C9/001
1,3950000,1,A63C9/00
2,3950000,2,A63C9/002
3,3950000,3,A63C9/081
4,3950001,0,A63C9/086
...,...,...,...
995,3950238,1,C08F8/00
996,3950238,2,C08F279/02
997,3950238,3,C08F285/00
998,3950238,4,C08F291/02


In [4]:
import pandas as pd

# Load file
cpc = pd.read_csv("data/USPTO/g_cpc_current.tsv", usecols=["patent_id","cpc_sequence","cpc_group"], delimiter="\t")

# Filter out entries with non-numeric patent_id values
cpc['patent_id'] = pd.to_numeric(cpc['patent_id'], errors='coerce', downcast='integer')
cpc = cpc.dropna(subset=['patent_id'])

# Change column names
column_mapping = {"patent_id": "pat", "cpc_sequence": "progr", "cpc_group": "CPC"}
cpc = cpc.rename(columns=column_mapping)
# cpc['CPC'] = cpc['CPC'].str[:4]
cpc = cpc.drop_duplicates(subset=['pat', 'CPC'])

# Save the concatenated data to a new file
cpc.to_csv('data/USPTO/input_files/GPCPCs.txt', index=False, sep="\t")

# Free memory
del cpc


### Patents brief text
Create mock files with 1000 patents only

In [46]:
# !pip install tqdm
import pandas as pd

# Folder path containing the .tsv files
folder_path = 'data/USPTO/brief_text'

# Loop over all files in the folder
for filename in tqdm(os.listdir(folder_path)):
    if filename.endswith('.tsv'):
        
        # Read the TSV file into a DataFrame
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path, sep='\t')
        df.iloc[0:1000,:].to_csv(f"data/USPTO/mock/{filename}", index=False, sep="\t")

100%|█████████████████████████████████████████████| 7/7 [00:19<00:00,  2.83s/it]


In [1]:
import pandas as pd
pat = pd.read_csv("data/USPTO/brief_text/g_brf_sum_text_1976.tsv", sep = "\t")

In [2]:
# pat[pat["patent_id"].str.startswith("RE")]
print(pat.iloc[0:5,1])

0    BACKGROUND OF THE INVENTION \nThis invention r...
1    The present invention relates to the methods o...
2    BACKGROUND OF THE INVENTION \nThe present inve...
3    BACKGROUND OF THE INVENTION \n1. Field of the ...
4    The present invention relates to a food compos...
Name: summary_text, dtype: object
