# 01. Environment & Repo Setup (GothiRead)

Run each cell in order. If a step fails, re-run the cell after fixing the issue.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Check GPU
import torch, platform, sys
print("Python:", sys.version)
print("PyTorch version (pre):", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

Python: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
PyTorch version (pre): 2.8.0+cu126
CUDA available: False


In [7]:
# Install core packages (HF + metrics). Re-run if Colab restarts.
!pip -q install -U pip
!pip -q install -U transformers accelerate datasets evaluate jiwer Pillow regex editdistance sentencepiece timm

In [8]:
# (Optional) Install a stable PyTorch if missing. On Colab this is usually preinstalled.
# You can uncomment and run if needed:
# !pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
import torch
print("PyTorch version (post):", torch.__version__)

PyTorch version (post): 2.8.0+cu126


In [9]:
# Verify imports
import transformers, datasets, jiwer, PIL, regex, editdistance, sentencepiece, timm
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
print("Transformers:", transformers.__version__)
print("Ready.")

Transformers: 4.57.0
Ready.


In [10]:
# Create folders for data and experiments (idempotent)
import os
base = "/content/drive/MyDrive/GothiRead"
subdirs = ["src/data", "src/eval", "src/models", "scripts", "data/train", "data/val", "data/test_public", "exp"]
for sd in subdirs:
    os.makedirs(os.path.join(base, sd), exist_ok=True)
print("Project folders ready at", base)

Project folders ready at /content/drive/MyDrive/GothiRead


In [None]:
# Copy scaffold from the uploaded zip (if you uploaded it) or from Drive.
# If you downloaded the provided zip, upload it to Colab and run:
# from google.colab import files
# files.upload()  # then select 'icdar24-multifont.zip'
# import zipfile, os
# with zipfile.ZipFile('icdar24-multifont.zip', 'r') as z:
#     z.extractall('/content')

In [None]:
# (Optional) Mount Google Drive to persist data/experiments
# from google.colab import drive
# drive.mount('/content/drive')

## Optional: Install PaddleOCR (CPU) — safer on Day 1
GPU wheels can be tricky; switch to GPU later.

In [11]:
# CPU PaddleOCR install (safe default)
!pip -q install paddlepaddle==3.0.0 paddleocr
import paddle, paddleocr
print("Paddle:", paddle.__version__)



Paddle: 3.0.0


## Quick zero-shot sanity check for TrOCR

In [12]:
# Download a tiny demo image or place your line image at /content/line.jpg
# For now, we'll generate a blank image as a placeholder.
from PIL import Image, ImageDraw, ImageFont
img = Image.new('RGB', (640, 96), color='white')
draw = ImageDraw.Draw(img)
draw.text((10, 30), "Demo Line", fill='black')
img.save('/content/line.jpg')
print("Saved /content/line.jpg")

Saved /content/line.jpg


In [15]:
# Zero-shot TrOCR test (printed)
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
proc = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").eval()
img = Image.open("/content/line.jpg").convert("RGB")
inputs = proc(images=img, return_tensors="pt")
out_ids = model.generate(**inputs, max_length=64)
text = proc.batch_decode(out_ids, skip_special_tokens=True)[0]
print("Prediction:", text)

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Prediction: BANDAR AND ONCLUSIVE ONCLUSIVE OF RECEIPT FOR
