In [None]:
## Kaggle Dataset is downloaded from here : 
# https://www.kaggle.com/c/uspto-explainable-ai/data
# USPTO - Explainable AI for Patent Professionals
# Help patent professionals understand AI results through a familiar query language
# patent_data has all .parquet files downloaded

## Check sample file 

In [4]:
import pandas as pd

# Path to your .parquet file
parquet_file_path = "D:\\Topcoder\\patent_documentation\\patent_data\\1837_12.parquet"

# Read the .parquet file
df = pd.read_parquet(parquet_file_path)

# Display the contents of the .parquet file
print(df)

# If you want to display the first few rows
print(df.head())


   publication_number                                              title  \
0            US-492-A                                        Machine foe   
1            US-493-A                                      Cooking-stove   
2            US-494-A                           Steering wheel for ships   
3            US-495-A                           Machine for sizing paper   
4            US-496-A  Mode of com-stbuctiire sawmills fob sawing timber   
5            US-497-A  Improvement in the machine for preparing ice f...   
6            US-498-A  Improvement in mode of constructing saw-cylind...   
7            US-499-A                   Machine fob steaming and mashing   
8            US-500-A                        Improved excavating-machine   
9            US-501-A                               Improvement in plows   
10           US-502-A                                      Through trees   
11           US-503-A  Improvement in the manufacture of india-rubber...   
12          

In [5]:
df.columns

Index(['publication_number', 'title', 'abstract', 'claims', 'description'], dtype='object')

In [3]:
df[df['publication_number'] == 'US-496-A']

Unnamed: 0,publication_number,title,abstract,claims,description
4,US-496-A,Mode of com-stbuctiire sawmills fob sawing timber,,,"UNTTan sTaTns PATE T OFFICE. \n JOHN AMBLER, ..."


## Check sample file - for non-abstract columns files

In [12]:
import os
import random
import pandas as pd
import pyarrow.parquet as pq

def load_random_parquet_files(folder_path, num_files=3):
    try:
        print(f"Searching for parquet files in: {folder_path}")
        
        # Ensure the folder path exists
        if not os.path.exists(folder_path):
            raise FileNotFoundError(f"The folder path does not exist: {folder_path}")
        
        # List all .parquet files in the directory
        parquet_files = [file for file in os.listdir(folder_path) if file.endswith('.parquet')]
        
        if not parquet_files:
            print(f"No parquet files found in {folder_path}")
            return None
        
        print(f"Found {len(parquet_files)} parquet files")
        
        # Select random files
        random_files = random.sample(parquet_files, min(num_files, len(parquet_files)))
        print(f"Randomly selected {len(random_files)} files: {random_files}")

        dfs = []
        for file_name in random_files:
            file_path = os.path.join(folder_path, file_name)
            print(f"Loading file: {file_path}")
            parquet_file = pq.ParquetFile(file_path)
            table = parquet_file.read()
            df = table.to_pandas()
            dfs.append(df)
        
        if len(dfs) > 1:
            result_df = pd.concat(dfs, ignore_index=True)
        elif len(dfs) == 1:
            result_df = dfs[0]
        else:
            result_df = None
        
        print(f"Total rows in combined DataFrame: {len(result_df) if result_df is not None else 0}")
        return result_df
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Usage
folder_path = r"D:\Topcoder\patent_documentation\patent_data"  # Note the 'r' prefix for raw string
random_df = load_random_parquet_files(folder_path)

if random_df is not None:
    print(random_df.describe())
else:
    print("Failed to load parquet files.")

Searching for parquet files in: D:\Topcoder\patent_documentation\patent_data
Found 2251 parquet files
Randomly selected 3 files: ['1975_3.parquet', '1889_5.parquet', '1891_9.parquet']
Loading file: D:\Topcoder\patent_documentation\patent_data\1975_3.parquet
Loading file: D:\Topcoder\patent_documentation\patent_data\1889_5.parquet
Loading file: D:\Topcoder\patent_documentation\patent_data\1891_9.parquet
Total rows in combined DataFrame: 9063
       publication_number   title abstract claims description
count                9063    9063     9063   9063        9063
unique               9063    8274     5245   5228        9059
top          US-3868726-A  Island                            
freq                    1      49     3807   3836           5


In [25]:
import pandas as pd

# Assuming you've already created the DataFrame named unique_patents_df

# View the first 5 rows (default for head())
print(unique_patents_df.head())

# If you want to see more or fewer rows, you can specify a number:
# For example, to see the first 3 rows:
print(unique_patents_df.head(3))

# To see all columns clearly, you might want to display them vertically:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Use full width of the notebook/console
pd.set_option('display.max_colwidth', None)  # Show full content of each column

# Now display the first row vertically
print(unique_patents_df.iloc[0].to_string())

# If you want to see specific columns, you can do:
columns_to_show = ['publication_number', 'title', 'abstract']
print(unique_patents_df[columns_to_show].head())

  publication_number                                    title abstract claims  \
0        US-618818-A                                William t                   
1        US-618819-A                   Portable fence-machine                   
2        US-618820-A                                   Qtjier                   
3        US-618821-A                            Document-file                   
4        US-618822-A  Window-screen and controlling apparatus                   

                                         description  
0  N0. 618,818. Patentedfeb. 7, |899.  \n W. T. A...  
1  Patented Feb. 7, I899.  \n  T. J. ARMSTRONG. P...  
2  No. 6|8,820. Patented Feb. 7, 1899.  \n  F. E....  
3  No. s|8,s2|. Patented Feb. 7, I899. A. c. BARL...  
4  No. 6I8,822. Patented Feb. 7, I899.  \n  T. E....  
  publication_number                   title abstract claims  \
0        US-618818-A               William t                   
1        US-618819-A  Portable fence-machine           

# Main File generated Here for selecting Finetuned dataset

### The idea is to select all the patents with abstract not null values . This will ensure I have all the columns information.
### Main idea is to have the important columns like title, abstract, claims, description from 50 files of .parquet files. Please remember that each .parquet file have many patent publications.

In [5]:
import pandas as pd
import pyarrow.parquet as pq
import os
import random
from tqdm import tqdm
import time

# Timer start
start_time = time.time()
def load_patents_with_abstracts_from_random_files(folder_path, num_files=50):
    # Get all parquet files in the directory
    all_parquet_files = [f for f in os.listdir(folder_path) if f.endswith('.parquet')]
    
    # Randomly select num_files
    selected_files = random.sample(all_parquet_files, min(num_files, len(all_parquet_files)))
    
    all_patents = []
    
    print(f"Processing {len(selected_files)} randomly selected files")
    
    # Use tqdm for the progress bar
    for file in tqdm(selected_files, desc="Processing files"):
        file_path = os.path.join(folder_path, file)
        
        try:
            # Read the parquet file
            df = pd.read_parquet(file_path)
            
            # Filter for non-null abstracts
            df_with_abstract = df[df['abstract'].notna()]
            
            all_patents.append(df_with_abstract)
        except Exception as e:
            print(f"Error processing file {file}: {str(e)}")
    
    # Combine all dataframes
    combined_df = pd.concat(all_patents, ignore_index=True)
    
    # Get unique patents based on abstract
    unique_patents = combined_df.drop_duplicates(subset='abstract')
    
    # Sample up to 1000 patents
    sample_1000 = unique_patents.sample(n=min(1000, len(unique_patents)))
    
    # Save the sample to CSV
    sample_1000.to_csv('random_1000_abstractnotnull_1.csv', index=False)
    # unique_patents.to_csv('unique_patents_abstractnotnull_200files.csv', index=False)
    
    # print(f"Sample of 5 patents saved to 'abstractnotnull.csv'")
    print(f"Total patents processed: {len(combined_df)}")
    print(f"Total unique patents with non-null abstracts: {len(unique_patents)}")
    
    return unique_patents

# Usage
folder_path = r"D:\Topcoder\patent_documentation\patent_data"
result = load_patents_with_abstracts_from_random_files(folder_path, num_files=50)

# Timer end
end_time = time.time()
execution_time = end_time - start_time

print(f"Execution time: {execution_time} seconds")


Processing 50 randomly selected files


Processing files: 100%|████████████████████████████████████████████████████████████████| 50/50 [00:52<00:00,  1.06s/it]


Total patents processed: 376878
Total unique patents with non-null abstracts: 255805
Execution time: 125.4606397151947 seconds


In [8]:
import pandas as pd
df = pd.read_csv("random_1000_abstractnotnull_1.csv")
df.head()

Unnamed: 0,publication_number,title,abstract,claims,description
0,US-11114351-B2,Dummy element and method of examining defect o...,A dummy element includes: a semiconductor subs...,What is claimed is: \n \n 1. A dummy...,CROSS-REFERENCE TO RELATED APPLICATION \n ...
1,US-10946109-B2,Polymer-type fluorescent molecule probe,The present invention provides a fluorescent m...,The invention claimed is: \n \n 1. A...,TECHNICAL FIELD \n The present application...
2,US-11112260-B2,Geospatial navigation methods and systems for ...,An exemplary geospatial navigation system defi...,What is claimed is: \n \n 1. A metho...,BACKGROUND INFORMATION \n Use of mobile na...
3,US-10940384-B2,Inciting user action for motion sensor calibra...,"In a method of motion sensor calibration, a mo...",What is claimed is: \n \n 1. A metho...,CROSS-REFERENCE TO RELATED APPLICATION—PROVISI...
4,US-2021298305-A1,Use of a difluoro-(2-hydroxypropyl)pyridine co...,The present disclosure is related to the field...,What is claimed is: \n \n 1 . A me...,CROSS-REFERENCE TO RELATED APPLICATION(S) \n ...


In [9]:
df.shape

(1000, 5)