In [None]:
# run in a notebook cell with a leading ! or in terminal
!pip install --upgrade pip
!pip install datasets ultralytics paddlepaddle paddleocr easyocr opencv-python-headless matplotlib pillow tqdm

In [None]:
pip install nbstripout


In [None]:
import os, json
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


Load and Parse CORD Dataset

In [None]:
# Load the CORD v2 dataset by its hub identifier and print the returned object.
# Later cells index `ds` by split name ('train', 'test').
CORD_DATASET_ID = "naver-clova-ix/cord-v2"
ds = load_dataset(CORD_DATASET_ID)
print(ds)

In [None]:
def extract_text_labels_from_sample(sample, include_key=True):
    """Turn one CORD sample into parallel lists of text lines and labels.

    The sample's ``'ground_truth'`` field is parsed as JSON and its
    ``'gt_parse'`` object is read as follows:

    * ``menu`` -- every dict entry becomes one text ``"<cnt> <nm> <price>"``
      (stripped) labelled ``'menu_item'``. ``menu`` may be a list of entries
      or a single entry stored as a bare dict; both forms are handled.
    * ``sub_total``, ``total``, ``service_price``, ``tax_price``, ``etc`` --
      when the value is a dict, each ``field: value`` pair becomes one text
      labelled with the field name; any other truthy value becomes one text
      labelled with the top-level key.

    Args:
        sample: Mapping whose ``'ground_truth'`` entry is a JSON string.
        include_key: When True (default, the historical behaviour) summary
            texts are rendered as ``"<field>: <value>"``. When False only
            ``"<value>"`` is emitted.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists.

    Raises:
        KeyError: if ``sample`` has no ``'ground_truth'`` entry.
        json.JSONDecodeError: if ``'ground_truth'`` is not valid JSON.
    """
    gt = json.loads(sample['ground_truth'])
    gt_parse = gt.get('gt_parse', {})
    texts, labels = [], []

    # Menu items. A lone menu entry can be stored as a dict instead of a
    # one-element list; iterating that dict would yield its keys (strings)
    # and the entry would be dropped, so normalise to a list first.
    menu = gt_parse.get('menu')
    if isinstance(menu, dict):
        menu = [menu]
    elif not isinstance(menu, list):
        menu = []

    for item in menu:
        if not isinstance(item, dict):
            continue
        cnt = item.get('cnt', '')
        nm = item.get('nm', '')
        price = item.get('price', '')
        combined = f"{cnt} {nm} {price}".strip()
        if combined:
            texts.append(combined)
            labels.append('menu_item')

    # Subtotal / Total / Service / Tax / etc
    # NOTE(review): with include_key=True the text contains the very field
    # name that is used as its label, so a classifier trained on these texts
    # can read the answer off the input. Pass include_key=False to train on
    # the value alone.
    for key in ['sub_total', 'total', 'service_price', 'tax_price', 'etc']:
        val = gt_parse.get(key)
        if not val:
            continue
        # A dict contributes one line per field; anything else is one line
        # labelled with the top-level key.
        fields = val.items() if isinstance(val, dict) else [(key, val)]
        for name, value in fields:
            texts.append(f"{name}: {value}" if include_key else f"{value}")
            labels.append(name)

    return texts, labels


In [None]:
# Smoke-test the extractor on the first training sample: show how many lines
# it produced and preview the first five texts and their labels.
sample = ds['train'][0]
texts, labels = extract_text_labels_from_sample(sample)
head = slice(0, 5)
print(len(texts), texts[head])
print(labels[head])

Build Dataset Lists

In [None]:
def build_text_label_lists(dataset_split):
    """Flatten a dataset split into one list of texts and one list of labels.

    Each sample is passed through ``extract_text_labels_from_sample`` and the
    per-sample results are concatenated in iteration order. Progress is shown
    with ``tqdm``.

    Args:
        dataset_split: Iterable of samples accepted by
            ``extract_text_labels_from_sample``.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists.
    """
    collected_texts = []
    collected_labels = []
    for record in tqdm(dataset_split):
        record_texts, record_labels = extract_text_labels_from_sample(record)
        collected_texts += record_texts
        collected_labels += record_labels
    return collected_texts, collected_labels

# Flatten the train split, then the test split, into text/label lists and
# report how many lines each one produced.
_split_lists = {
    split_name: build_text_label_lists(ds[split_name])
    for split_name in ('train', 'test')
}
train_texts, train_labels = _split_lists['train']
test_texts, test_labels = _split_lists['test']
print("Train samples:", len(train_texts))
print("Test samples:", len(test_texts))

Clean & Merge Rare Classes

In [None]:
# Ultra-rare labels and the label each one is merged into. A single mapping
# is shared by both splits so train and test are always relabelled the same way.
LABEL_MERGES = {
    'othersvc_price': 'etc',
    'sub_total': 'subtotal_price',
}


def _to_labelled_frame(texts, labels):
    """Build a two-column ('text', 'label') frame with rare labels merged."""
    frame = pd.DataFrame({'text': texts, 'label': labels})
    frame['label'] = frame['label'].replace(LABEL_MERGES)
    return frame


train_df = _to_labelled_frame(train_texts, train_labels)
test_df = _to_labelled_frame(test_texts, test_labels)

# Class distribution of the training split after merging.
print(train_df['label'].value_counts())


Encode Labels & Vectorize Text

In [None]:
# Integer-encode the labels. The encoder learns its classes from the training
# labels only; the test labels are then mapped with that same fitted encoder.
label_encoder = LabelEncoder()
label_encoder.fit(train_df['label'])
y_train = label_encoder.transform(train_df['label'])
y_test = label_encoder.transform(test_df['label'])

# TF-IDF over unigrams and bigrams, vocabulary capped at the configured size.
# The vocabulary is fitted on the training texts and reused for the test texts.
TFIDF_MAX_FEATURES = 3000
TFIDF_NGRAM_RANGE = (1, 2)
vectorizer = TfidfVectorizer(
    max_features=TFIDF_MAX_FEATURES,
    ngram_range=TFIDF_NGRAM_RANGE,
)
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Classes:", label_encoder.classes_)


In [None]:
# Per-class weights from scikit-learn's 'balanced' heuristic, computed over
# the distinct encoded labels found in y_train.
present_classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=present_classes,
    y=y_train,
)
# Keys are positions in `class_weights`; they coincide with the encoded class
# ids as long as every id 0..n-1 occurs in y_train.
class_weight_dict = {position: weight for position, weight in enumerate(class_weights)}
print("Class Weights:\n", class_weight_dict)


Build a Simple Classifier (using Keras)

In [None]:
# Feed-forward classifier over the TF-IDF features: two L2-regularised ReLU
# layers, each followed by dropout, then a softmax over the label classes.
n_features = X_train.shape[1]
n_classes = len(label_encoder.classes_)

# (units, dropout rate) for each hidden stage, in order.
hidden_stages = [(512, 0.5), (256, 0.4)]

layer_stack = [layers.Input(shape=(n_features,))]
for hidden_units, dropout_rate in hidden_stages:
    layer_stack.append(
        layers.Dense(
            hidden_units,
            activation='relu',
            kernel_regularizer=regularizers.l2(0.001),
        )
    )
    layer_stack.append(layers.Dropout(dropout_rate))
layer_stack.append(layers.Dense(n_classes, activation='softmax'))

model = models.Sequential(layer_stack)

# categorical_crossentropy expects one-hot targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Prepare Labels & Train Model

In [None]:
# One-hot encode the integer labels to match the categorical_crossentropy loss.
num_label_classes = len(label_encoder.classes_)
y_train_encoded = to_categorical(y_train, num_classes=num_label_classes)
# Still computed so later cells can use it, but deliberately NOT passed to
# fit(): the test split must stay unseen until the final evaluation.
y_test_encoded = to_categorical(y_test, num_classes=num_label_classes)

# Per-epoch validation uses a hold-out taken from the training data instead of
# the test split. Watching test metrics while training (and choosing epochs or
# architecture from them) makes the final test scores optimistic.
# Keras takes the hold-out from the END of the arrays, before shuffling.
VALIDATION_FRACTION = 0.1

history = model.fit(
    X_train.toarray(), y_train_encoded,  # the sparse TF-IDF matrix is densified for Keras
    validation_split=VALIDATION_FRACTION,
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict,
    verbose=1
)

Evaluate Model

In [None]:
# Predicted class id = index of the largest softmax output for each test row.
y_pred = np.argmax(model.predict(X_test.toarray()), axis=1)

# Report over EVERY class the encoder knows, not only those that happen to
# occur in y_test / y_pred. Without `labels=`, classification_report raises
# when a class is absent from both arrays (its length no longer matches
# target_names), and confusion_matrix returns a smaller matrix whose rows and
# columns no longer line up with the tick labels below.
class_ids = np.arange(len(label_encoder.classes_))

print("\nClassification Report:\n")
print(classification_report(
    y_test, y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,  # classes with no support/predictions score 0 without a warning per class
))

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_,
            ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.show()


In [None]:
import joblib

# Persist everything needed for inference: the trained network plus the fitted
# TF-IDF vectorizer and label encoder (new text must be transformed with the
# same vocabulary, and predictions decoded with the same class order).
# NOTE: the .pkl files are pickles -- only load them from a trusted source.
model.save("invoice_classifier.h5")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")

# The original message held mojibake ("âœ…"): the UTF-8 check-mark emoji
# decoded as cp1252. Restored to the intended character.
print("✅ Model and artifacts saved!")
