In [1]:
import os
import random
from dotenv import load_dotenv
from datasets import load_from_disk, DatasetDict
import random
from collections import defaultdict
import cv2
import numpy as np
from PIL import Image
import io

In [2]:
# Load variables from a .env file into the process environment (python-dotenv),
# then read the token later passed to `push_to_hub`; None when HF_TOKEN is unset.
load_dotenv()
token = os.environ.get("HF_TOKEN")

In [3]:
dataset_path = "/mnt/storage/fincolqwen_data/datasets/pdf_dataset"

# Read the page-level dataset from its on-disk directory.
print("Loading dataset from disk...")
orig_dataset = load_from_disk(dataset_path)

# Summarise what was loaded: row count first, then the column schema.
num_examples = len(orig_dataset)
print(f"Dataset loaded successfully with {num_examples} examples")
print("\nDataset features:", orig_dataset.features, sep="\n")

Loading dataset from disk...
Dataset loaded successfully with 2972 examples

Dataset features:
{'document_id': Value(dtype='string', id=None), 'page': Value(dtype='int64', id=None), 'image_filename': Value(dtype='string', id=None), 'image': Image(decode=True, id=None), 'query': Value(dtype='string', id=None), 'answer': Value(dtype='string', id=None), 'source': Value(dtype='string', id=None), 'model': Value(dtype='string', id=None), 'prompt': Value(dtype='string', id=None)}


  table = cls._concat_blocks(blocks, axis=0)


In [4]:
# Show the column schema as this cell's displayed value (last expression).
orig_dataset.features

{'document_id': Value(dtype='string', id=None),
 'page': Value(dtype='int64', id=None),
 'image_filename': Value(dtype='string', id=None),
 'image': Image(decode=True, id=None),
 'query': Value(dtype='string', id=None),
 'answer': Value(dtype='string', id=None),
 'source': Value(dtype='string', id=None),
 'model': Value(dtype='string', id=None),
 'prompt': Value(dtype='string', id=None)}

In [5]:
def is_mostly_blank(image, threshold_percent=99, debug=False):
    """
    Determine if an image is mostly blank space (more strict version)

    The image is converted to single-channel grayscale and binarised with a
    Gaussian adaptive threshold (11x11 neighbourhood, offset 2). A pixel comes
    out "white" (255) when it is brighter than its local weighted mean minus 2,
    so the measure reflects the absence of local detail rather than absolute
    brightness: a uniformly coloured region counts as blank even if it is dark.

    Args:
        image: PIL Image, or array-like of shape (H, W), (H, W, 1), (H, W, 3)
               or (H, W, 4). Colour arrays are interpreted as RGB / RGBA.
               Array input must be 8-bit (uint8), as required by
               cv2.adaptiveThreshold.
        threshold_percent: Percentage of white pixels above which the image is
                           considered "mostly blank" (default: 99). The
                           comparison is strict (>).
        debug: If True, also return the measured white percentage

    Returns:
        bool or tuple: True if the image is mostly blank, False otherwise.
                      If debug=True, returns (is_blank, white_percentage)
    """
    if isinstance(image, Image.Image):
        # PIL handles every mode (RGB, RGBA, P, ...) when converting to 'L'.
        img_np = np.array(image.convert('L'))
    else:
        img_np = np.asarray(image)
        if img_np.ndim == 3:
            channels = img_np.shape[2]
            if channels == 1:
                # Already grayscale, just drop the trailing channel axis.
                img_np = np.ascontiguousarray(img_np[:, :, 0])
            elif channels == 4:
                # RGB2GRAY rejects 4-channel input; use the RGBA variant.
                img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2GRAY)
            else:
                img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)

    # Use adaptive thresholding to better handle different lighting/contrast
    binary = cv2.adaptiveThreshold(
        img_np, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2
    )

    # Share of pixels the threshold left white, as a percentage.
    white_pixel_count = np.count_nonzero(binary == 255)
    white_percentage = float(white_pixel_count) / binary.size * 100

    is_blank = white_percentage > threshold_percent

    if debug:
        return is_blank, white_percentage
    return is_blank

def filter_non_blank_pages(dataset, threshold=99.2):
    """
    Partition the rows of a dataset into mostly-blank and non-blank pages

    Each example's 'image' entry is classified with is_mostly_blank. The
    dataset itself is not modified; only row positions are returned. A summary
    line with the number of blank pages is printed.

    Args:
        dataset: HuggingFace dataset (any sized iterable of mappings with an
                 'image' key works)
        threshold: Percentage threshold for blankness, forwarded to
                   is_mostly_blank as threshold_percent

    Returns:
        tuple: (blank_indices, non_blank_indices), two lists of row positions
               in iteration order. Together they cover every row exactly once.
    """
    blank_indices = []
    non_blank_indices = []

    for i, example in enumerate(dataset):
        # Only the verdict is needed here, so the debug percentage is not requested.
        if is_mostly_blank(example['image'], threshold):
            blank_indices.append(i)
        else:
            non_blank_indices.append(i)

    print(f"Found {len(blank_indices)} blank pages out of {len(dataset)} total pages")
    return blank_indices, non_blank_indices

# Classify every page with a 99% cut-off (overriding the function's 99.2
# default) and keep both index lists for the filtering step that follows.
page_partition = filter_non_blank_pages(orig_dataset, threshold=99)
blank_indices, non_blank_indices = page_partition
print(f"Blank page indices: {blank_indices}")

Found 142 blank pages out of 2972 total pages
Blank page indices: [17, 21, 34, 70, 74, 97, 99, 103, 104, 105, 109, 164, 169, 170, 178, 182, 183, 184, 193, 194, 222, 234, 250, 251, 252, 451, 477, 500, 504, 521, 546, 547, 563, 585, 620, 636, 646, 654, 664, 672, 678, 682, 686, 726, 756, 758, 761, 770, 774, 780, 784, 826, 843, 925, 991, 992, 1021, 1023, 1055, 1061, 1073, 1076, 1084, 1134, 1175, 1215, 1219, 1233, 1335, 1336, 1337, 1339, 1341, 1345, 1395, 1401, 1410, 1411, 1414, 1416, 1422, 1472, 1494, 1566, 1590, 1673, 1685, 1723, 1751, 1777, 1874, 1876, 1919, 1925, 2015, 2058, 2062, 2076, 2078, 2086, 2092, 2112, 2168, 2177, 2278, 2282, 2320, 2346, 2348, 2367, 2443, 2455, 2463, 2477, 2487, 2499, 2529, 2531, 2533, 2535, 2543, 2555, 2590, 2593, 2638, 2647, 2678, 2696, 2700, 2716, 2729, 2731, 2735, 2741, 2759, 2763, 2782, 2871, 2908, 2916, 2925, 2961]


In [6]:
# List the file name of every page flagged as blank, one quoted name per line.
# The 'image_filename' column is read directly: indexing whole rows
# (orig_dataset[i]) would also decode each page image just to obtain a string.
blank_filenames = orig_dataset.select(blank_indices)['image_filename']
for filename in blank_filenames:
    print(f"'{filename}',")

'The Presentation Materials_3Q24_ADA_page_18.jpg',
'The Presentation Materials_3Q24_ADA_page_22.jpg',
'The Presentation Materials_3Q24_ADA_page_35.jpg',
'CBO-HouseholdIncome_page_36.jpg',
'CBO-HouseholdIncome_page_40.jpg',
'financial-stability-report-20241122_page_2.jpg',
'financial-stability-report-20241122_page_4.jpg',
'financial-stability-report-20241122_page_8.jpg',
'financial-stability-report-20241122_page_9.jpg',
'financial-stability-report-20241122_page_10.jpg',
'financial-stability-report-20241122_page_14.jpg',
'financial-stability-report-20241122_page_69.jpg',
'An_Introduction_to_Quantitative_Finance_page_4.jpg',
'An_Introduction_to_Quantitative_Finance_page_5.jpg',
'An_Introduction_to_Quantitative_Finance_page_13.jpg',
'An_Introduction_to_Quantitative_Finance_page_17.jpg',
'An_Introduction_to_Quantitative_Finance_page_18.jpg',
'An_Introduction_to_Quantitative_Finance_page_19.jpg',
'An_Introduction_to_Quantitative_Finance_page_28.jpg',
'An_Introduction_to_Quantitative_Finance_

In [7]:
# Keep only the rows classified as non-blank; `orig_dataset` is left untouched.
dataset = orig_dataset.select(non_blank_indices)
kept_examples = len(dataset)
print(f"Filtered dataset now has {kept_examples} examples")

Filtered dataset now has 2830 examples


In [8]:
# Draw 100 row positions from index 1000 onwards. Draws are independent, so the
# same position may appear more than once.
# randrange excludes its upper bound. randint(1000, len(dataset)) includes it
# and could return len(dataset), one past the last valid row, which makes
# dataset[i] (and a later dataset.select) raise IndexError.
testing_sample = [random.randrange(1000, len(dataset)) for _ in range(100)] # [0,1,2,3,4,17,34,70,74,97]
for i in testing_sample:
    print(dataset[i])

{'document_id': '3fcdd092-bf6c-4465-867a-17025920429f', 'page': 30, 'image_filename': 'Macroeconomics2e-OP_08uAIKN_page_30.jpg', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=2550x3300 at 0x73434FCADB80>, 'query': '', 'answer': '', 'source': 'Macroeconomics2e-OP_08uAIKN.pdf', 'model': '', 'prompt': ''}
{'document_id': '5f0d9f0a-0c74-42a9-8a72-6ef172407d6a', 'page': 44, 'image_filename': 'PrinciplesofFinance-WEB_page_44.jpg', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=2550x3300 at 0x73434FCAE180>, 'query': '', 'answer': '', 'source': 'PrinciplesofFinance-WEB.pdf', 'model': '', 'prompt': ''}
{'document_id': 'c08c4404-93f9-4a49-84be-704284967388', 'page': 72, 'image_filename': 'a4T9b5_corporate finance_10_ed_page_72.jpg', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=2682x3300 at 0x73434FCADE50>, 'query': '', 'answer': '', 'source': 'a4T9b5_corporate finance_10_ed.pdf', 'model': '', 'prompt': ''}
{'document_id': '3c71d018-d37a-49e1-

In [9]:
# Materialise the sampled rows as a small standalone dataset and persist it.
experiment_path = "/mnt/storage/fincolqwen_data/datasets/experiment_dataset"
testing_dataset = dataset.select(testing_sample)
testing_dataset.save_to_disk(experiment_path)

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

In [11]:
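# NOTE: superseded by the document-grouped split in the next cell. This naive
# row-level shuffle is kept for reference only; it can place pages of the same
# document in different splits.
# random.seed(42)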

# all_indices = list(range(len(dataset)))
# random.shuffle(all_indices)

# val_size = 250
# test_size = 500
# val_indices = all_indices[:val_size]
# test_indices = all_indices[val_size:val_size+test_size]
# train_indices = all_indices[val_size+test_size:]

# val_dataset = dataset.select(val_indices)
# test_dataset = dataset.select(test_indices)
# train_dataset = dataset.select(train_indices)

# dataset_with_splits = DatasetDict({
#     'train': train_dataset,
#     'validation': val_dataset,
#     'test': test_dataset
# })

# dataset_with_splits.save_to_disk("/mnt/storage/fincolqwen_data/datasets/fincolqwen-main")

In [12]:
# Document-level train / validation / test split.
#
# Groups examples by document_id and calculates how many examples each document contains
# Randomly selects documents for validation until we reach approximately the target size (250)
# Includes a margin of error (10), and checks if skipping the last document would get us closer to the target
# Repeats the process for the test set (target 500)
# Assigns all remaining documents to the training set
# (The no-overlap verification between splits happens in the following cell.)

random.seed(42)


def _select_documents(candidate_doc_ids, counts, target_size, margin):
    """Greedily take documents, in the given order, until `target_size` examples are reached.

    Documents are appended while the running total is below `target_size`.
    If the document that crosses the target overshoots by more than `margin`,
    it is dropped again when the total without it is closer to the target
    than the total with it. Selection stops either way once the target has
    been crossed.

    Args:
        candidate_doc_ids: document ids in the order they should be considered
        counts: mapping of document id -> number of examples in that document
        target_size: desired number of examples in the split
        margin: overshoot tolerated before dropping the last document is considered

    Returns:
        tuple: (selected_doc_ids, selected_example_count)
    """
    selected = []
    total = 0
    for candidate in candidate_doc_ids:
        if total >= target_size:
            break
        selected.append(candidate)
        total += counts[candidate]
        if total > target_size + margin:
            total_without_last = total - counts[candidate]
            # Undershoot without the document vs. overshoot with it.
            if target_size - total_without_last < total - target_size:
                selected.pop()
                total = total_without_last
                break
    return selected, total


# Group indices by document_id. Only the 'document_id' column is read, so the
# page images are not decoded (iterating whole rows would decode every image).
doc_id_to_indices = defaultdict(list)
for idx, doc_id in enumerate(dataset['document_id']):
    doc_id_to_indices[doc_id].append(idx)

# Get document_ids with their counts
doc_id_counts = {doc_id: len(indices) for doc_id, indices in doc_id_to_indices.items()}
doc_ids = list(doc_id_counts)
random.shuffle(doc_ids)

# Target sizes
target_val_size = 250
target_test_size = 500
margin_error = 10  # Acceptable margin of error

# Select documents for validation set
val_doc_ids, val_count = _select_documents(
    doc_ids, doc_id_counts, target_val_size, margin_error
)

# Select documents for test set from remaining docs (shuffled order preserved)
_val_lookup = set(val_doc_ids)
remaining_docs = [d for d in doc_ids if d not in _val_lookup]
test_doc_ids, test_count = _select_documents(
    remaining_docs, doc_id_counts, target_test_size, margin_error
)

# All remaining documents go to train
_held_out_lookup = _val_lookup | set(test_doc_ids)
train_doc_ids = [d for d in doc_ids if d not in _held_out_lookup]

# Create indices for each split
val_indices = [idx for doc_id in val_doc_ids for idx in doc_id_to_indices[doc_id]]
test_indices = [idx for doc_id in test_doc_ids for idx in doc_id_to_indices[doc_id]]
train_indices = [idx for doc_id in train_doc_ids for idx in doc_id_to_indices[doc_id]]

In [13]:
# Print split information
print(f"Total examples: {len(dataset)}")
print(f"Training: {len(train_indices)} examples from {len(train_doc_ids)} documents")
print(f"Validation: {len(val_indices)} examples from {len(val_doc_ids)} documents (target: {target_val_size})")
print(f"Testing: {len(test_indices)} examples from {len(test_doc_ids)} documents (target: {target_test_size})")

# Create the splits
val_dataset = dataset.select(val_indices)
test_dataset = dataset.select(test_indices)
train_dataset = dataset.select(train_indices)

# Verify no document_id overlap between splits
train_doc_ids_set = set(train_dataset['document_id'])
val_doc_ids_set = set(val_dataset['document_id'])
test_doc_ids_set = set(test_dataset['document_id'])

print(f"Document overlap between train and val: {len(train_doc_ids_set.intersection(val_doc_ids_set))}")
print(f"Document overlap between train and test: {len(train_doc_ids_set.intersection(test_doc_ids_set))}")
print(f"Document overlap between val and test: {len(val_doc_ids_set.intersection(test_doc_ids_set))}")

Total examples: 2830
Training: 2088 examples from 46 documents
Validation: 233 examples from 5 documents (target: 250)
Testing: 509 examples from 9 documents (target: 500)
Document overlap between train and val: 0
Document overlap between train and test: 0
Document overlap between val and test: 0


In [14]:
# Bundle the three splits under named keys and write the bundle to disk.
split_by_name = {
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset,
}
dataset_with_splits = DatasetDict(split_by_name)

dataset_with_splits.save_to_disk("/mnt/storage/fincolqwen_data/datasets/fincolqwen-main")

Saving the dataset (0/3 shards):   0%|          | 0/2088 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/233 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/509 [00:00<?, ? examples/s]

In [16]:
repo_id = "smith-nathanh/fincolqwen-main"

# Upload every split to the Hub as a public repository (private=False),
# authenticating with the token read from HF_TOKEN.
push_options = dict(
    private=False,
    token=token,
    embed_external_files=True,  # Important to include images!
)
dataset_with_splits.push_to_hub(repo_id, **push_options)

Map:   0%|          | 0/696 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/696 [00:00<?, ? examples/s]

Map:   0%|          | 0/696 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/233 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/509 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/991 [00:00<?, ?B/s]

In [9]:
# Reload a saved dataset directory to check what was written to disk.
# NOTE(review): this rebinds `dataset_path` (earlier: the raw pdf_dataset path)
# and `testing_dataset` (earlier: the sampled experiment dataset); here the
# loaded object is indexed by split name in the following cell.
# NOTE(review): this cell's execution count is lower than the cells above it,
# so it was presumably run in a different order or session; confirm it still
# works under Restart & Run All.
#dataset_path = "file:///mnt/storage/fincolqwen_data/datasets/experiment_dataset2"
dataset_path = "file:///mnt/storage/fincolqwen_data/datasets/fincolqwen-main"

# Load the dataset from the file:// URI above
print("Loading dataset from disk...")
testing_dataset = load_from_disk(dataset_path)

Loading dataset from disk...


  table = cls._concat_blocks(blocks, axis=0)


In [6]:
# Display the 'train' split summary (column names and row count).
# NOTE(review): the recorded output lists 10 rows and a 'qa_data' column, which
# does not match the 2088-row train split reported earlier in this notebook;
# the output is presumably stale from another run. Re-execute to confirm.
testing_dataset['train']

Dataset({
    features: ['document_id', 'page', 'image_filename', 'image', 'source', 'prompt', 'qa_data', 'query', 'answer', 'model'],
    num_rows: 10
})