In [3]:
import pandas as pd
import numpy as np
import os, shutil

In [4]:
# Move the kernel's working directory to the cmuMine folder, resolved relative
# to whatever the current working directory is at the time this cell runs.
# NOTE(review): because the path is relative, re-running this cell in the same
# kernel resolves it from the *new* location, so the cell is not idempotent.
os.chdir('../../cmuMine/')
# List the entries of the (new) current working directory.
os.listdir()

['.DS_Store', 'test', 'indepn_depn', 'Helper doc.docx', 'train', 'val']

In [5]:
# Show the absolute path of the current working directory to confirm the chdir above.
os.getcwd()

'/Users/sachin.murthy/Desktop/cmuMine'

In [8]:
def process_patent_data(data_path, num_patents=None):
    """
    Load the per-feature text files of a patent dataset split into a DataFrame.

    The patents to load are discovered from the file names in
    ``<data_path>/abstract``: the text before the first underscore of each
    file name is taken as the patent application number. For every patent,
    each feature is then read from ``<data_path>/<feature>/<number>_<feature>``.

    Parameters:
    - data_path: str, the path to the dataset directory.
    - num_patents: int or None, the maximum number of patents to process
      (process all if None).

    Returns:
    - df: DataFrame with one row per patent and the columns ``patent_number``,
      ``abstract``, ``summary``, ``detailed_description``, ``firstclaim`` and
      ``claims``. A feature whose file is missing is stored as None. The
      columns are present even when no patent is found.

    Raises:
    - FileNotFoundError: if ``<data_path>/abstract`` is not an existing directory.
    - ValueError: if ``num_patents`` is negative.
    """
    # Feature names double as sub-directory names and as file-name suffixes.
    features = ["abstract", "summary", "detailed_description", "firstclaim", "claims"]

    if num_patents is not None and num_patents < 0:
        raise ValueError(f"num_patents must be non-negative or None, got {num_patents}")

    # The 'abstract' folder is the source of truth for which patents exist.
    abstract_path = os.path.join(data_path, "abstract")
    if not os.path.isdir(abstract_path):
        raise FileNotFoundError(
            f"The abstract directory does not exist at the provided path: {abstract_path}"
        )

    data_list = []

    # Patent numbers already emitted, so a repeated prefix never yields two rows.
    processed_patents = set()

    for file_name in os.listdir(abstract_path):
        # Stop as soon as the requested number of patents has been collected.
        if num_patents is not None and len(data_list) >= num_patents:
            break

        # Skip hidden housekeeping files (e.g. macOS '.DS_Store'), which would
        # otherwise be parsed as a bogus patent number such as '.DS'.
        if file_name.startswith("."):
            continue

        patent_application_no = file_name.split("_")[0]
        if patent_application_no in processed_patents:
            continue
        processed_patents.add(patent_application_no)

        data_row = {"patent_number": patent_application_no}
        for feature in features:
            # Files carry no extension: '<number>_<feature>'.
            feature_file_path = os.path.join(
                data_path, feature, f"{patent_application_no}_{feature}"
            )

            if os.path.isfile(feature_file_path):
                with open(feature_file_path, 'r', encoding='utf-8') as f:
                    data_row[feature] = f.read()
            else:
                # The feature is not available for this patent.
                data_row[feature] = None

        data_list.append(data_row)

    # Fix the column set explicitly so an empty split still has the expected schema.
    return pd.DataFrame(data_list, columns=["patent_number"] + features)



# Root directory of the dataset. Defaults to the original hard-coded location;
# set the CMU_MINE_DIR environment variable to point at another copy of the
# data without editing the notebook.
base_path = os.environ.get("CMU_MINE_DIR", r"/Users/sachin.murthy/Desktop/cmuMine")

# Directories of the 2006 test / train / validation splits.
test_path = os.path.join(base_path, "test/test_2006")
train_path = os.path.join(base_path, "train/train_2006")
val_path = os.path.join(base_path, "val/val_2006")

In [9]:
# Process the test dataset: read the split stored under
# <base_path>/test/test_2006 into a DataFrame.
test_path = os.path.join(base_path, "test/test_2006")
test_df = process_patent_data(test_path)
# Report (rows, columns) as a quick sanity check on the load.
print(test_df.shape)

/Users/sachin.murthy/Desktop/cmuMine/test/test_2006/detailed_description/US20060217621A1_detailed_description
/Users/sachin.murthy/Desktop/cmuMine/test/test_2006/detailed_description/US20060092454A1_detailed_description
/Users/sachin.murthy/Desktop/cmuMine/test/test_2006/detailed_description/US20060258483A1_detailed_description
/Users/sachin.murthy/Desktop/cmuMine/test/test_2006/detailed_description/US20060009871A1_detailed_description
/Users/sachin.murthy/Desktop/cmuMine/test/test_2006/detailed_description/US20060055991A1_detailed_description
/Users/sachin.murthy/Desktop/cmuMine/test/test_2006/detailed_description/US20060160576A1_detailed_description
/Users/sachin.murthy/Desktop/cmuMine/test/test_2006/detailed_description/US20060006746A1_detailed_description
/Users/sachin.murthy/Desktop/cmuMine/test/test_2006/detailed_description/US20060100537A1_detailed_description
/Users/sachin.murthy/Desktop/cmuMine/test/test_2006/detailed_description/US20060002545A1_detailed_description
/Users/sac

In [10]:
# Display the test-split DataFrame to eyeball the loaded text columns.
test_df

Unnamed: 0,patent_number,abstract,summary,detailed_description,firstclaim,claims
0,US20060200002A1,A tubular access sleeve and suction tool for a...,One aspect of the present invention provides ...,FIG. 1 is an illustration of the preparation ...,1. A method of accessing an anatomic space of ...,1. A method of accessing an anatomic space of ...
1,US20060221139A1,The present invention is intended to alleviate...,An object of the present invention is to prov...,"Now, embodiments of this invention will be de...",1. An ink jet print head comprising: \na heate...,1. An ink jet print head comprising: \na heate...
2,US20060237988A1,While a corner part 35 of a skin 7 formed betw...,The present invention has been made to solve ...,The present invention will be described herei...,1. An integral skin interior trim comprising: ...,1. An integral skin interior trim comprising: ...
3,US20060217621A1,Systems and methods are described for classify...,The present invention is directed to a method...,,"1. A method for classifying a cardiac rhythm, ...","1. A method for classifying a cardiac rhythm, ..."
4,US20060027128A1,A projectile for small munitions is provided h...,"Accordingly, the small munitions firearm proj...","Turning now to the FIGS., and particularly to...",1. A small munitions projectile for a firearm ...,1. A small munitions projectile for a firearm ...
...,...,...,...,...,...,...
16032,US20060131479A1,A laser beam receiver with non-interdigitated ...,These needs are met by a long laser beam rece...,FIG. 1 is a schematic illustration of a laser...,1. A laser beam receiver for detecting the pos...,1. A laser beam receiver for detecting the pos...
16033,US20060180464A1,An apparatus for the production of hydrogen is...,It is a general object of the disclosed inven...,Most metals can be produced in a colloidal st...,1. An apparatus for the production of hydrogen...,1. An apparatus for the production of hydrogen...
16034,US20060254670A1,A service unit for resource replenishment in a...,This invention relates to a service unit for ...,FIG. 1 shows a service unit 1 for replenishme...,1. A service unit for resource replenishment i...,1. A service unit for resource replenishment i...
16035,US20060277800A1,The invention relates to a system for securing...,"Therefore, it is an object of the invention t...",FIGS. 1a and 1b depict the system 10 for secu...,"1. A system for securing a reversible cleat, c...","1. A system for securing a reversible cleat, c..."


In [16]:
# Process the val dataset: read the split stored under
# <base_path>/val/val_2006 into a DataFrame.
val_path = os.path.join(base_path, "val/val_2006")
val_df = process_patent_data(val_path)
# Report (rows, columns) as a quick sanity check on the load.
print(val_df.shape)

(16109, 7)


In [15]:
# Process the train dataset: read the split stored under
# <base_path>/train/train_2006 into a DataFrame.
train_path = os.path.join(base_path, "train/train_2006")
train_df = process_patent_data(train_path)
# Report (rows, columns) as a quick sanity check on the load.
print(train_df.shape)

(127971, 7)
