In [None]:
# IMPORTING DATA

import sys
sys.path.append('../')

from src.data_loader import PatentDataLoader
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Build the loader for the small sample directory, read every patent it
# finds, and tabulate the result. `loader`, `patents` and `df` are reused by
# the cells further down, so their names are part of this cell's contract.
loader = PatentDataLoader("../data/patent_data_small")
patents = loader.load_all_patents()
df = loader.patents_to_dataframe(patents)

# Two-line summary: how many rows were loaded and which columns exist.
print(
    f"Loaded {len(df)} patents",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)

In [None]:
# ANALYSIS

# Nothing to preview or analyse when the loader produced no rows.
if df.shape[0] > 0:
    # Rich preview of the first few patents.
    display(df.head())

    # Data-quality summary produced by the loader; `analysis` is kept as a
    # notebook-level name because the coverage plot cell reads it.
    analysis = loader.analyze_data_structure(patents)

    print("\nField Coverage:")
    # Values are rendered with the percent format spec, i.e. they are treated
    # as fractions here (presumably in 0..1 — confirm against the loader).
    field_coverage = analysis['field_coverage']
    for field, coverage in field_coverage.items():
        print(f"{field}: {coverage:.1%}")

In [None]:
# DATA COVERAGE

# Draws a bar chart of per-field coverage and, when a 'Claims' column exists,
# a histogram of claims text length next to it. Skipped when no rows loaded.
if len(df) > 0:
    # `analysis` is produced by the ANALYSIS cell under the same
    # `len(df) > 0` guard, so that cell must have been run first.
    coverage_data = analysis['field_coverage']
    field_names = list(coverage_data.keys())

    # The ANALYSIS cell formats these values with `.1%`, i.e. it treats them
    # as fractions. Scale by 100 so the bars agree with the 'Coverage %' label
    # instead of topping out at 1.0 on a percent axis.
    coverage_pct = [value * 100 for value in coverage_data.values()]

    # Only allocate the second panel when there is something to draw in it;
    # otherwise the figure would be left half empty.
    # NOTE(review): this cell looks up 'Claims' (capitalised) while the
    # samples cell looks up 'title' (lower case) — confirm the column names.
    has_claims = 'Claims' in df.columns
    fig, axes = plt.subplots(
        1, 2 if has_claims else 1, figsize=(12, 6), squeeze=False
    )

    coverage_ax = axes[0, 0]
    positions = list(range(len(field_names)))
    coverage_ax.bar(positions, coverage_pct)
    coverage_ax.set_xticks(positions)
    # Right-anchor the rotated labels so each one ends under its own bar.
    coverage_ax.set_xticklabels(field_names, rotation=45, ha='right')
    coverage_ax.set_ylim(0, 100)
    coverage_ax.set_title('Field Coverage Across Patents')
    coverage_ax.set_ylabel('Coverage %')

    # Sample text lengths
    if has_claims:
        claims_ax = axes[0, 1]
        # Missing claims count as empty text (length 0).
        claim_lengths = df['Claims'].fillna('').str.len()
        claims_ax.hist(claim_lengths, bins=50, edgecolor='black')
        claims_ax.set_title('Distribution of Claims Text Length')
        claims_ax.set_xlabel('Character Count')
        claims_ax.set_ylabel('Frequency')

    fig.tight_layout()
    plt.show()

In [None]:
# PATENT SAMPLES

# List up to ten non-null titles, numbered from 1. Skipped when no rows were
# loaded or the frame has no 'title' column.
if len(df) > 0 and 'title' in df.columns:
    print("Sample Patent Titles:")
    non_null_titles = df['title'].dropna()
    sample_titles = non_null_titles.iloc[:10]
    for i, title in enumerate(sample_titles, start=1):
        print(f"{i}. {title}")