# Synthetic USPTO Data Generator (fake data, not real patent records)

First, we import the libraries we need to generate and handle the data.


In [9]:
import pandas as pd  # For creating and handling structured data (like spreadsheets)
import numpy as np  # For generating random numbers and statistical patterns
from datetime import datetime, timedelta  # For working with realistic dates

Next, we set a random seed so that the results are reproducible (the same data is generated every time the notebook is run from the top).

In [10]:
# Pin NumPy's global random state so every draw made later in the notebook
# (volumes, categories, patent numbers, dates) repeats on a fresh run.
np.random.seed(seed=42)

After that, we define the start date of our 6-month synthetic timeline; 1 January 2022 is used as a sample starting point.

In [11]:
# First day of the synthetic timeline (naive datetime at midnight)
start_date = datetime(year=2022, month=1, day=1)

Then we create a list of 180 consecutive dates (6 months ≈ 180 days).

In [12]:
# One datetime per day, starting at start_date: offsets of 0, 1, ..., 179 days.
# timedelta's first positional argument is a number of days.
dates = [start_date + day_offset for day_offset in map(timedelta, range(180))]

Then we define realistic technology areas that startups commonly research.
These mimic actual trends in patent filings.

In [13]:
# Technology categories assigned to synthetic patents; one entry is drawn per
# patent with np.random.choice (uniform, since no weights are passed).
# Keep the order stable: the seeded draws select by position, so reordering
# or inserting entries changes which category each generated row receives.
technology_areas = [
    "blockchain healthcare",
    "AI medical diagnosis",
    "quantum computing",
    "renewable energy storage",
    "autonomous vehicles",
    "biotech therapeutics",
    "cybersecurity encryption",
    "3D printing medical devices",
]

Similarly, we define the types of organizations that file patents, to simulate real-world diversity.

In [14]:
# Filing-entity types assigned to synthetic patents; one entry is drawn per
# patent with np.random.choice (uniform, since no weights are passed).
# Keep the order stable: the seeded draws select by position, so reordering
# or inserting entries changes which entity type each generated row receives.
company_types = [
    "startup",
    "big_tech",
    "pharma_giant",
    "automotive_corp",
    "energy_company",
]

Finally, we loop over the dates to generate a realistic but entirely synthetic dataset of US patent records—mimicking daily filing patterns, technology categories, and applicant types—and then save it as a clean CSV file for privacy-safe analysis.

In [None]:
# Build the synthetic patent table one record at a time, then save it as CSV.
#
# Every draw below consumes NumPy's global random state, so the exact rows are
# only reproducible when the seed cell is re-run first (i.e. Restart & Run All).
# The order of the random calls inside the loop is therefore significant.

# One dictionary per synthetic patent record; converted to a DataFrame below
synthetic_patents = []

# Loop through each day in the 6-month period
for date in dates:
    # Daily patent volume drawn from a Poisson distribution with mean 15
    # (Poisson is a common model for event counts per time interval)
    num_patents = np.random.poisson(15)

    # For each patent granted on this day...
    for _ in range(num_patents):
        # Randomly pick a technology area (e.g., "AI medical diagnosis")
        tech = np.random.choice(technology_areas)

        # Randomly pick a filing entity type (e.g., "startup")
        company = np.random.choice(company_types)

        # Fake patent number such as US2022123456. randint excludes its upper
        # bound, and numbers are drawn independently, so duplicates are possible.
        patent_number = f"US{np.random.randint(2022000000, 2023999999)}"

        # Application date falls 30 to 364 days BEFORE the grant date
        # (randint's upper bound of 365 is exclusive)
        application_date = date - timedelta(days=np.random.randint(30, 365))

        # Grant date = the current day in our loop (when the patent is "issued")
        grant_date = date

        # Templated abstract (description) built from the technology area
        abstract = f"Advanced method for {tech} using novel algorithms and systems"

        # Store all this info as a dictionary (one row in the future CSV)
        synthetic_patents.append(
            {
                "patent_number": patent_number,
                "application_date": application_date,
                "grant_date": grant_date,
                "technology_category": tech,
                "filing_entity": company,
                "abstract": abstract,
            }
        )

# Convert the list of dictionaries into a pandas DataFrame (like a spreadsheet in memory)
patent_df = pd.DataFrame(synthetic_patents)

# Save the DataFrame to a CSV file named exactly "synthetic_uspto_data.raw.csv".
# index=False ensures row numbers (0, 1, 2...) are NOT saved—only the data columns.
# (DataFrame.to_csv has no `preview` option; passing one raises TypeError before
# anything is written, so it has been removed.)
patent_df.to_csv("synthetic_uspto_data.raw.csv", index=False)

Our dataset is now ready. As a quick check, let us preview the first few rows of the DataFrame that we just saved to the CSV file.

In [16]:
# Quick sanity check: display the first five rows (head's default count)
patent_df.head(n=5)

Unnamed: 0,patent_number,application_date,grant_date,technology_category,filing_entity,abstract
0,US2022999890,2021-05-02,2022-01-01,cybersecurity encryption,big_tech,Advanced method for cybersecurity encryption u...
1,US2023947735,2021-08-25,2022-01-01,quantum computing,pharma_giant,Advanced method for quantum computing using no...
2,US2022329365,2021-01-28,2022-01-01,3D printing medical devices,pharma_giant,Advanced method for 3D printing medical device...
3,US2022327069,2021-02-12,2022-01-01,AI medical diagnosis,automotive_corp,Advanced method for AI medical diagnosis using...
4,US2023284372,2021-06-25,2022-01-01,AI medical diagnosis,automotive_corp,Advanced method for AI medical diagnosis using...
