## This scripts created the files necessary to run the breakthrough algorithms

### CPC-file

In [40]:
# cpc = pd.read_csv("data/USPTO/g_cpc_current.tsv", usecols=["patent_id","cpc_sequence","cpc_group"], chunksize=1000, delimiter="\t")
# cpc = next(cpc)
# cpc

Unnamed: 0,patent_id,cpc_sequence,cpc_group
0,3950000,0,A63C9/001
1,3950000,1,A63C9/00
2,3950000,2,A63C9/002
3,3950000,3,A63C9/081
4,3950001,0,A63C9/086
...,...,...,...
995,3950238,1,C08F8/00
996,3950238,2,C08F279/02
997,3950238,3,C08F285/00
998,3950238,4,C08F291/02


In [4]:
import pandas as pd

# Load file
cpc = pd.read_csv("data/USPTO/g_cpc_current.tsv", usecols=["patent_id","cpc_sequence","cpc_group"], delimiter="\t")

# Filter out entries with non-numeric patent_id values
cpc['patent_id'] = pd.to_numeric(cpc['patent_id'], errors='coerce', downcast='integer')
cpc = cpc.dropna(subset=['patent_id'])

# Change column names
column_mapping = {"patent_id": "pat", "cpc_sequence": "progr", "cpc_group": "CPC"}
cpc = cpc.rename(columns=column_mapping)
# cpc['CPC'] = cpc['CPC'].str[:4]
cpc = cpc.drop_duplicates(subset=['pat', 'CPC'])

# Save the concatenated data to a new file
cpc.to_csv('data/USPTO/input_files/GPCPCs.txt', index=False, sep="\t")

# Free memory
del cpc


### Patent/Year-index

In [1]:
import os
from multiprocessing import Pool, cpu_count
import pandas as pd
from tqdm import tqdm

def process_tsv_file(file_path: str) -> list:
    """
    Process a TSV file and extract unique patent IDs along with the corresponding year.

    Parameters:
    -----------
    file_path : str
        Path to the TSV file.

    Returns:
    --------
    list
        List of tuples containing unique patent IDs and the corresponding year.
    """
    result_data = []
    year = file_path.split('_')[-1].split('.')[0]

    df = pd.read_csv(file_path, sep='\t')
    df['patent_id'] = pd.to_numeric(df['patent_id'], errors='coerce', downcast='integer')
    df = df.dropna(subset=['patent_id'])

    unique_patents = df['patent_id'].unique()
    result_data.extend([(int(float(patent)), year) for patent in unique_patents])

    del df
    return result_data

def create_year_csv(folder_path: str, output_path: str, cpc_path: str, num_cores: int) -> None:
    """
    Create a CSV file with unique patent IDs and corresponding years from TSV files in the specified folder.

    Parameters:
    -----------
    folder_path : str
        Path to the folder containing TSV files.
    output_path : str
        Path to save the resulting CSV file.
    cpc_path : pd.DataFrame
        DataFrame containing data from cpc.csv.
    num_cores : int
        Number of CPU cores to use for parallel processing.
    """

    # Load cpc.csv into a DataFrame
    cpc_df = pd.read_csv(cpc_path, delimiter="\t")
    
    tsv_files = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.endswith('.tsv')]

    result_data = []

    with Pool(processes=num_cores or cpu_count()) as pool:
        for file_data in tqdm(pool.imap(process_tsv_file, tsv_files), total=len(tsv_files), desc="Processing TSV Files"):
            result_data.extend(file_data)

    result_df = pd.DataFrame(result_data, columns=['pat', 'year'])
    
    # Filter only patents with integer 'pat' and present in filtered_cpc_df
    result_df = result_df[result_df['pat'].astype(int).isin(cpc_df['pat'])]
    
    result_df.to_csv(output_path, index=False, sep="\t")

# Example usage:
create_year_csv('/home/smildinerm/data/volume_2/data/USPTO/brief_summary',
                '/home/smildinerm/data/volume_2/data/USPTO/input_files/year.csv',
                '/home/smildinerm/data/volume_2/data/USPTO/input_files/GPCPCs.txt', 20)


  df = pd.read_csv(file_path, sep='\t')
  df = pd.read_csv(file_path, sep='\t')
  df = pd.read_csv(file_path, sep='\t')
  df = pd.read_csv(file_path, sep='\t')
  df = pd.read_csv(file_path, sep='\t')
  df = pd.read_csv(file_path, sep='\t')
  df = pd.read_csv(file_path, sep='\t')
  df = pd.read_csv(file_path, sep='\t')
Processing TSV Files: 100%|██████████| 47/47 [00:39<00:00,  1.18it/s]


In [5]:
year = pd.read_csv("/home/smildinerm/data/volume_2/data/USPTO/input_files/year.csv", delimiter="\t")

In [6]:
year.shape

(7561282, 2)

In [2]:
year[year["pat"] == 8621736]

NameError: name 'year' is not defined

In [27]:
# Delete to free up memory
del year

### Patents brief text
Create mock files with 1000 patents only

In [46]:
# !pip install tqdm
import pandas as pd

# Folder path containing the .tsv files
folder_path = 'data/USPTO/brief_text'

# Loop over all files in the folder
for filename in tqdm(os.listdir(folder_path)):
    if filename.endswith('.tsv'):
        
        # Read the TSV file into a DataFrame
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path, sep='\t')
        df.iloc[0:1000,:].to_csv(f"data/USPTO/mock/{filename}", index=False, sep="\t")

100%|█████████████████████████████████████████████| 7/7 [00:19<00:00,  2.83s/it]


In [1]:
import pandas as pd
pat = pd.read_csv("data/USPTO/brief_text/g_brf_sum_text_1976.tsv", sep = "\t")

In [2]:
# pat[pat["patent_id"].str.startswith("RE")]
print(pat.iloc[0:5,1])

0    BACKGROUND OF THE INVENTION \nThis invention r...
1    The present invention relates to the methods o...
2    BACKGROUND OF THE INVENTION \nThe present inve...
3    BACKGROUND OF THE INVENTION \n1. Field of the ...
4    The present invention relates to a food compos...
Name: summary_text, dtype: object
