<a href="https://colab.research.google.com/github/saicharan2804/encryptcon/blob/main/dataloader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import DonutProcessor, VisionEncoderDecoderModel
from pdf2image import convert_from_path
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd

In [None]:
def load_and_preprocess_csv(csv_path):
    """Read the project CSV and collapse its per-year columns into list columns.

    Args:
        csv_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        A DataFrame without the bookkeeping columns and without the
        individual year columns ('2009' .. '2023'), but with two new
        columns, ``vintage_issue`` and ``retired_credits``, each holding
        one list of the year values per row.

    Raises:
        KeyError: If any of the year columns is absent from the CSV.
    """
    frame = pd.read_csv(csv_path)

    # Columns that are not needed downstream; missing ones are skipped.
    unwanted = [
        'Unnamed: 0',
        'ARB Project',
        'State',
        'Project Site Location',
        'Reversals Covered by Buffer Pool',
        'Reversals Not Covered by Buffer',
    ]
    frame = frame.drop(columns=unwanted, errors='ignore')

    # Year columns are named by their four-digit year as a string.
    issue_years = [str(year) for year in range(2009, 2024)]
    # TODO: change this line after renaming the columns. For now the
    # retired-credit columns are the same year columns as the issuance ones.
    retired_years = list(issue_years)

    # One list per row; built separately so the two columns never share
    # the same list objects.
    frame['vintage_issue'] = frame[issue_years].values.tolist()
    frame['retired_credits'] = frame[retired_years].values.tolist()

    # The per-year columns are now redundant.
    frame = frame.drop(columns=issue_years + retired_years, errors='ignore')

    return frame

In [None]:
# Function to convert a PDF file to images
def convert_pdf_to_images(pdf_path):
    """Render the PDF at ``pdf_path`` with pdf2image's ``convert_from_path``.

    Args:
        pdf_path: Location of the PDF file to render.

    Returns:
        Whatever ``convert_from_path`` returns for that file (callers in
        this file iterate over it as a sequence of page images).
    """
    page_images = convert_from_path(pdf_path)
    return page_images

# Function to get concatenated representation of a document
def get_concatenated_representation(pdf_path, processor, model):
    """Encode every page of a PDF and concatenate the pooled encoder outputs.

    Args:
        pdf_path: Path of the PDF document to embed.
        processor: Image processor; called as
            ``processor(image, return_tensors="pt")`` and expected to expose
            ``.pixel_values`` on the result.
        model: Model exposing ``.device`` and an ``.encoder`` whose output
            has a ``.pooler_output`` tensor.

    Returns:
        A tensor made of each page's ``pooler_output`` concatenated along
        dimension 1, so its width grows with the number of pages. The tensor
        carries no autograd history.

    Raises:
        Whatever ``torch.cat`` raises for an empty list if the PDF yields
        no pages.
    """
    image_array = convert_pdf_to_images(pdf_path)
    concatenated_outputs = []
    # This is pure feature extraction: without no_grad every page would keep
    # its autograd graph alive, and the returned non-leaf tensor could not be
    # passed between DataLoader worker processes.
    with torch.no_grad():
        for image in image_array:
            pixel_values = processor(image, return_tensors="pt").pixel_values.to(model.device)
            outputs = model.encoder(pixel_values)
            concatenated_outputs.append(outputs.pooler_output)
    return torch.cat(concatenated_outputs, dim=1)

# Custom Dataset
class PDFDocumentDataset(Dataset):
    """Dataset that pairs each CSV row with embeddings of its project PDF.

    Each item is looked up by position in the preprocessed CSV; the matching
    PDF is expected at ``<pdf_folder_path>/<Project ID>.pdf`` and is embedded
    on the fly every time the item is requested.
    """

    # Columns that are returned separately and therefore kept out of the
    # free-text description.
    _NON_DESCRIPTION_COLUMNS = ['Project ID', 'vintage_issue', 'retired_credits']

    def __init__(self, csv_path, pdf_folder_path, processor, model):
        """Load the CSV and remember where PDFs live and how to embed them.

        Args:
            csv_path: Path of the project CSV, passed to
                ``load_and_preprocess_csv``.
            pdf_folder_path: Folder containing one ``<Project ID>.pdf`` per row.
            processor: Image processor handed to
                ``get_concatenated_representation``.
            model: Model handed to ``get_concatenated_representation``.
        """
        self.dataframe = load_and_preprocess_csv(csv_path)
        self.pdf_folder_path = pdf_folder_path
        self.processor = processor
        self.model = model

    def __len__(self):
        """Return the number of rows in the preprocessed CSV."""
        return len(self.dataframe)

    @staticmethod
    def _describe_row(row):
        """Return the row's remaining fields as one comma-separated CSV line.

        ``Series.to_csv`` writes one value per line, so the row is first
        turned into a single-row frame; that yields a real comma-separated
        record with standard CSV quoting for values that contain commas.
        """
        fields = row.drop(PDFDocumentDataset._NON_DESCRIPTION_COLUMNS)
        record = fields.to_frame().T.to_csv(header=False, index=False)
        # Drop only the trailing line terminator (LF or CRLF).
        return record.rstrip('\r\n')

    def __getitem__(self, idx):
        """Build the sample for row ``idx``.

        Returns:
            A dict with the keys ``project_id``, ``description`` (comma-separated
            text of the remaining columns), ``vintage_issue``,
            ``retired_credits`` and ``embeddings`` (the concatenated page
            representation of the row's PDF).
        """
        row = self.dataframe.iloc[idx]
        pdf_path = f"{self.pdf_folder_path}/{row['Project ID']}.pdf"
        embeddings = get_concatenated_representation(pdf_path, self.processor, self.model)

        # Create a comma-separated text from the row, excluding embeddings-related data
        text_data = self._describe_row(row)

        # Prepare the output dictionary
        return {
            "project_id": row['Project ID'],
            "description": text_data,
            "vintage_issue": row['vintage_issue'],
            "retired_credits": row['retired_credits'],
            "embeddings": embeddings
        }

In [None]:
# Initialize the processor and model
# Both are loaded from the same pretrained checkpoint,
# "naver-clova-ix/donut-base-finetuned-cord-v2".
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
# Use the GPU when CUDA is available, otherwise stay on the CPU. The result of
# .to() is not reassigned; torch modules are moved in place.
model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Input locations (placeholders, replace with real paths before running):
# the project CSV file and the folder that holds the PDF documents.
csv_path = 'path_to_csv_file.csv'
pdd_folder_path = 'path_to_pdf_folder'

In [None]:
# Create Dataset and DataLoader
# Samples are served one at a time and in CSV order.
dataset = PDFDocumentDataset(
    csv_path=csv_path,
    pdf_folder_path=pdd_folder_path,
    processor=processor,
    model=model,
)
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=False)

In [None]:
# Iterate over DataLoader
# Print every field of each batch, one field per line.
for data in dataloader:
    print(
        f"Project ID: {data['project_id']}",
        f"Description: {data['description']}",
        f"Vintage Issue: {data['vintage_issue']}",
        f"Retired Credits: {data['retired_credits']}",
        f"Document Embeddings Shape: {data['embeddings'].shape}",
        sep="\n",
    )