In [1]:
# Load all required Libraries
import os, json, re, random, warnings
from typing import List, Dict, Any, Tuple
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import transformers, torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset

# Reproducibility: seed each RNG in turn (stdlib random, NumPy, PyTorch)
# with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Report whether PyTorch can see a CUDA device in this environment.
print(
    "Torch device:",
    "cuda" if torch.cuda.is_available() else "cpu",
)

Torch device: cpu


In [2]:
# Load the passages split of the rag-mini-wikipedia dataset straight from the
# Hugging Face Hub. Nothing is filtered in this cell; it only reads the parquet
# file and previews the result.
PASSAGES_URI = "hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet"
passages = pd.read_parquet(PASSAGES_URI)

# Sanity check: dimensions first, then the leading rows.
print(passages.shape)
display(passages.head())

(3200, 1)


Unnamed: 0_level_0,passage
id,Unnamed: 1_level_1
0,"Uruguay (official full name in ; pron. , Eas..."
1,"It is bordered by Brazil to the north, by Arge..."
2,Montevideo was founded by the Spanish in the e...
3,The economy is largely based in agriculture (m...
4,"According to Transparency International, Urugu..."


In [3]:
# Clean: drop missing passages, normalise to stripped strings, drop blanks.
#
# Missing values must be removed *before* the string cast: astype(str) can turn
# NaN/None into the literal text "nan"/"None", which would then survive the
# length filter below and end up in the corpus as a bogus passage.
#
# NOTE: reset_index(drop=True) discards the original index (shown as `id` in the
# preview above). Keep it as a column instead if passage ids are needed later.
passages = (
    passages
    .dropna(subset=["passage"])
    .assign(passage=lambda df: df["passage"].astype(str).str.strip())
    .loc[lambda df: df["passage"].str.len() > 0]
    .reset_index(drop=True)
)
print("After clean:", passages.shape)

After clean: (3200, 1)


In [4]:
# EDA: per-passage length features plus a small JSON summary of them.
# char_len counts characters; tok_len counts whitespace-separated words
# (str.split() with no arguments), not model-tokenizer tokens.
passages["char_len"] = passages["passage"].str.len()
passages["tok_len"] = passages["passage"].str.split().apply(len)

# Build min/max/mean for both length columns in one pass. Keys come out in the
# order char_len_{min,max,mean} then tok_len_{min,max,mean}; min and max are
# cast to int, mean to float, so the dict is JSON-serialisable.
eda = {
    "n_rows": int(len(passages)),
    **{
        f"{col}_{stat}": to_py(getattr(passages[col], stat)())
        for col in ("char_len", "tok_len")
        for stat, to_py in (("min", int), ("max", int), ("mean", float))
    },
    "tok_len_p95": float(passages["tok_len"].quantile(0.95)),
}
print(json.dumps(eda, indent=2))

{
  "n_rows": 3200,
  "char_len_min": 1,
  "char_len_max": 2515,
  "char_len_mean": 389.7125,
  "tok_len_min": 1,
  "tok_len_max": 425,
  "tok_len_mean": 62.10375,
  "tok_len_p95": 169.0
}
