In [None]:
# Remove checkpoints left over from a previous run; -f keeps a fresh checkout
# (no .pth files yet) from printing a "No such file" error.
!rm -f *.pth

In [None]:
# Cell 1: Imports and Setup
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torch.cuda.amp import autocast, GradScaler
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from PIL import Image
import os
import timm
from torchvision.transforms import RandAugment, RandomErasing
import re
import itertools

# Device setup: use the GPU when one is visible to torch, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Cell 2: Load and Preprocess Dataset
image_dir = '../grade_comparisons/'  # Update this path as needed

# List all .jpg images in the directory.
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore the seeded train/validation split below) reproducible.
file_names = sorted(
    f for f in os.listdir(image_dir)
    if os.path.isfile(os.path.join(image_dir, f)) and f.lower().endswith('.jpg')
)

def extract_grade(filename):
    """Parse the integer grade embedded in a file name.

    The grade is the run of digits between ``cropped_`` and ``_cert``.
    When the name does not contain that pattern a message is printed and
    ``None`` is returned.
    """
    found = re.search(r'cropped_(\d+)_cert', filename)
    if found is None:
        print(f"no match for {filename}")
        return None
    return int(found.group(1))

# Create DataFrame
df = pd.DataFrame({'filename': file_names})
df['grade'] = df['filename'].apply(extract_grade)
df['full_path'] = df['filename'].apply(lambda x: os.path.join(image_dir, x))

# extract_grade returns None for names that do not match the pattern; such rows
# would turn the column into floats with NaN and leak a bogus class into the
# label encoder, so drop them and restore an integer grade column.
unlabeled = df['grade'].isna()
if unlabeled.any():
    print(f"Dropping {int(unlabeled.sum())} files without a parsable grade")
df = df[~unlabeled].copy()
df['grade'] = df['grade'].astype(int)

# Filter out missing images
df = df[df['full_path'].apply(os.path.exists)].copy()
print(f"Dataset size after filtering: {len(df)}")

# Check class distribution
print("Unique grades in full dataset:", df['grade'].unique())
print("Number of unique grades in full dataset:", df['grade'].nunique())
grade_counts = df['grade'].value_counts().reset_index()
grade_counts.columns = ['grade', 'count']
grade_counts['percent'] = grade_counts['count'] / len(df) * 100
print(grade_counts)

# Split into train and validation sets (seeded, so the split is repeatable
# for a given row order)
SPLIT_FRAC = 0.8
train_df = df.sample(frac=SPLIT_FRAC, random_state=42)
val_df = df.drop(train_df.index)
print(f"Training samples: {len(train_df)}, Validation samples: {len(val_df)}")

# Encode labels: the encoder is fitted on every grade in the full dataset so
# that train and validation share one grade -> index mapping.
le = LabelEncoder()
le.fit(df['grade'])
train_df = train_df.assign(label=le.transform(train_df['grade']))
val_df = val_df.assign(label=le.transform(val_df['grade']))
num_classes = len(le.classes_)
print(f"Number of classes: {num_classes}")

# Cell 3: Define Transforms
def pad_to_square(image, fill=0, padding_mode="constant"):
    """Pad ``image`` on its shorter axis so width equals height.

    The padding is split evenly between the two sides; when the difference
    is odd the extra pixel goes to the right / bottom edge. ``fill`` and
    ``padding_mode`` are forwarded to ``transforms.Pad``.
    """
    width, height = image.size
    side = max(width, height)
    left = (side - width) // 2
    top = (side - height) // 2
    right = side - width - left
    bottom = side - height - top
    padder = transforms.Pad((left, top, right, bottom), fill=fill, padding_mode=padding_mode)
    return padder(image)

# Settings shared by the training and validation pipelines.
IMAGE_SIZE = 224
# Standard ImageNet per-channel statistics.
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# pad_to_square is passed directly (its defaults are fill=0, constant padding)
# instead of being wrapped in a lambda: a lambda cannot be pickled, which
# breaks DataLoader workers on platforms that spawn rather than fork.
train_transform = transforms.Compose([
    transforms.Lambda(pad_to_square),
    transforms.Resize(IMAGE_SIZE),
    RandAugment(num_ops=2, magnitude=9),
    transforms.ToTensor(),
    # RandomErasing operates on tensors, so it must come after ToTensor.
    RandomErasing(p=0.5),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
])

# Validation uses the same geometry and normalisation without augmentation.
val_transform = transforms.Compose([
    transforms.Lambda(pad_to_square),
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
])

# Cell 4: Create Dataset and DataLoader
class CardDataset(Dataset):
    """Image dataset backed by a DataFrame with ``full_path`` and ``label`` columns.

    Args:
        df: frame holding one row per image.
        transform: optional callable applied to the RGB PIL image.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        # Fetch the row once instead of materialising it for every column read.
        row = self.df.iloc[idx]
        label = row['label']
        # Image.open keeps the file open until the pixels are read; the context
        # manager releases the handle as soon as the RGB copy exists.
        with Image.open(row['full_path']) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, label

BATCH_SIZE = 128

# Each split gets its own transform pipeline.
train_dataset = CardDataset(train_df, transform=train_transform)
val_dataset = CardDataset(val_df, transform=val_transform)

# Options common to both loaders; only the training loader shuffles.
loader_options = {'batch_size': BATCH_SIZE, 'num_workers': 4}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
print("Data loaders created")

# Cell 5: Define CORAL Head and Modify Model
class CoralHead(nn.Module):
    """Linear head producing ``num_classes - 1`` logits, one per ordinal threshold."""

    def __init__(self, in_features, num_classes):
        super().__init__()
        threshold_count = num_classes - 1
        # The attribute name ``fc`` is part of the checkpoint key layout.
        self.fc = nn.Linear(in_features, threshold_count)

    def forward(self, x):
        logits = self.fc(x)
        return logits

# Cell 6: Define CORAL Loss Function
def coral_loss(logits, levels, class_weights=None):
    """Binary cross-entropy over ordinal thresholds (CORAL-style loss).

    Args:
        logits: tensor of shape (batch, K) with one logit per threshold.
        levels: integer class indices, one per sample (any shape that
            flattens to ``batch`` elements).
        class_weights: optional 1-D tensor indexed by class index; each
            sample's loss row is multiplied by the weight of its class.

    Returns:
        Scalar tensor: the mean over all (sample, threshold) entries.

    The threshold count and the target device are taken from ``logits``,
    so the function does not depend on module-level state.
    """
    target_device = logits.device
    levels = levels.view(-1, 1).to(target_device)
    # Threshold k is "on" for a sample when its level is strictly greater than k.
    thresholds = torch.arange(logits.size(1), device=target_device)
    targets = (levels > thresholds).float()
    loss = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    if class_weights is not None:
        # Move the weights before indexing so the index and the indexed tensor
        # always live on the same device; view(-1) keeps a batch of one 1-D.
        sample_weights = class_weights.to(target_device)[levels.view(-1)]
        loss = loss * sample_weights.view(-1, 1)
    return loss.mean()

# Cell 7: Define Validation Function
def validate_coral(model, val_loader, criterion):
    """Evaluate ``model`` on ``val_loader`` and return ``(loss, accuracy)``.

    Args:
        model: network returning one logit per ordinal threshold.
        val_loader: iterable of ``(inputs, labels)`` batches.
        criterion: callable ``criterion(logits, labels)`` returning a scalar
            tensor holding the batch-mean loss.

    Returns:
        Tuple ``(val_loss, val_acc)``. The loss is averaged per sample, so a
        short final batch does not count as much as a full one. A prediction
        is correct when the number of thresholds with probability above 0.5
        equals the label.

    Raises:
        ValueError: if the loader yields no samples.
    """
    model.eval()
    # Run on the device the model lives on; fall back to the notebook-level
    # ``device`` for a model without parameters.
    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else device

    loss_sum = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(eval_device), labels.to(eval_device)
            logits = model(inputs)
            batch_size = labels.size(0)
            # criterion returns a batch mean; re-weight it by the batch size.
            loss_sum += criterion(logits, labels).item() * batch_size
            pred_levels = (torch.sigmoid(logits) > 0.5).sum(dim=1)
            correct += (pred_levels == labels).sum().item()
            total += batch_size
    if total == 0:
        raise ValueError("validate_coral received an empty validation loader")
    return loss_sum / total, correct / total

# Cell 8: Define Training Function with Save Path
def train_model_coral(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs, phase='initial', save_path='best_model.pth', patience=None):
    """Train ``model`` with AMP, checkpointing and early stopping on validation loss.

    Args:
        model: network to optimise (already on ``device``).
        train_loader / val_loader: iterables of ``(inputs, labels)`` batches.
        criterion: callable ``criterion(logits, labels)`` returning a scalar loss.
        optimizer: optimiser over the trainable parameters.
        scheduler: LR scheduler, stepped once per epoch.
        num_epochs: maximum number of epochs.
        phase: label used as the prefix of the per-epoch log line.
        save_path: where the state dict with the lowest validation loss is written.
        patience: number of consecutive epochs without improvement after which
            training stops. ``None`` (default) reads the module-level
            ``PATIENCE``, which is how existing callers configure it.

    Returns:
        dict with per-epoch lists ``train_loss``, ``val_loss`` and ``val_accuracy``.
    """
    if patience is None:
        patience = PATIENCE
    best_val_loss = float('inf')
    patience_counter = 0
    scaler = GradScaler()  # scales the loss for mixed-precision backward passes
    history = {'train_loss': [], 'val_loss': [], 'val_accuracy': []}

    for epoch in range(num_epochs):
        model.train()
        loss_sum = 0.0
        seen = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            with autocast():  # mixed-precision forward pass
                logits = model(inputs)
                loss = criterion(logits, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            # Weight each batch mean by its size so a short final batch does
            # not skew the epoch average.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            seen += batch_size
        train_loss = loss_sum / max(seen, 1)

        val_loss, val_acc = validate_coral(model, val_loader, criterion)
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['val_accuracy'].append(val_acc)

        print(f'{phase} Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
        scheduler.step()

        if val_loss < best_val_loss:
            # New best: reset the patience window and checkpoint the weights.
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), save_path)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print('Early stopping triggered')
                break

    return history

# Cell 9: Hyperparameter Tuning Loop
# itertools is already imported with the other imports at the top of this
# cell, so it is not re-imported here.

# Define hyperparameters to tune
lr_initial_list = [1e-3, 5e-4]
lr_fine_tune_list = [1e-4, 5e-5]
activate_weights_list = [0, 1]
dropout_prob_list = [0.0, 0.3]
patience_list = [10, 15]
epochs_list = [50]

# Generate all combinations (full grid); each tuple is ordered as
# (lr_initial, lr_fine_tune, activate_weights, dropout_prob, patience, epochs)
combinations = list(itertools.product(
    lr_initial_list, lr_fine_tune_list, activate_weights_list,
    dropout_prob_list, patience_list, epochs_list
))

# Initialize list to store results (one dict per combination)
results = []

WEIGHT_DECAY = 1e-4  # Fixed weight decay

# Grid search: for every combination, train the head on a frozen backbone,
# then fine-tune the whole network, and record the validation metrics of the
# best fine-tuned checkpoint.
for idx, (lr_initial, lr_fine_tune, activate_weights, dropout_prob, patience, epochs) in enumerate(combinations):
    print(f"\nTraining combination {idx+1}/{len(combinations)}: "
          f"LR_INITIAL={lr_initial}, LR_FINE_TUNE={lr_fine_tune}, "
          f"ACTIVATE_WEIGHTS={activate_weights}, DROPOUT_PROB={dropout_prob}, "
          f"PATIENCE={patience}, EPOCHS={epochs}")

    # Set hyperparameters
    LR_INITIAL = lr_initial
    LR_FINE_TUNE = lr_fine_tune
    ACTIVATE_WEIGHTS_TENSOR = activate_weights
    DROPOUT_PROB = dropout_prob
    PATIENCE = patience  # read as a module-level name by train_model_coral
    EPOCHS_INITIAL = epochs
    EPOCHS_FINE_TUNE = epochs

    # Per-combination checkpoint files
    initial_ckpt = f'best_model_initial_{idx}.pth'
    fine_tune_ckpt = f'best_model_fine-tune_{idx}.pth'

    # Create a fresh model instance with the specified dropout probability
    model = timm.create_model('convnext_base', pretrained=True, drop_rate=DROPOUT_PROB)
    in_features = model.head.fc.in_features
    model.head.fc = CoralHead(in_features, num_classes)
    model = model.to(device)

    # Compute class weights if enabled
    if ACTIVATE_WEIGHTS_TENSOR:
        # compute_class_weight returns one weight per class it is given, in
        # that order. coral_loss indexes the vector by label, so scatter the
        # weights into a vector of length num_classes; a class missing from
        # the training split keeps a neutral weight of 1 instead of shifting
        # every later index.
        present_labels = np.unique(train_df['label'])
        balanced_weights = compute_class_weight('balanced', classes=present_labels, y=train_df['label'])
        class_weights = np.ones(num_classes, dtype=np.float64)
        class_weights[present_labels] = balanced_weights
        class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    else:
        class_weights_tensor = None

    # One criterion for training and evaluation of this combination; the
    # default argument binds this iteration's weights.
    def criterion(logits, labels, _weights=class_weights_tensor):
        return coral_loss(logits, labels, _weights)

    # Initial training (train only the head)
    for name, param in model.named_parameters():
        if 'head' not in name:
            param.requires_grad = False
    optimizer = optim.AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=LR_INITIAL,
        weight_decay=WEIGHT_DECAY
    )
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS_INITIAL, eta_min=1e-6)
    history_initial = train_model_coral(
        model,
        train_loader,
        val_loader,
        criterion,
        optimizer,
        scheduler,
        EPOCHS_INITIAL,
        'Initial',
        save_path=initial_ckpt
    )

    # Load the best model from initial training
    model.load_state_dict(torch.load(initial_ckpt, map_location=device))

    # Fine-tuning (train all layers)
    for param in model.parameters():
        param.requires_grad = True
    optimizer = optim.AdamW(
        model.parameters(),
        lr=LR_FINE_TUNE,
        weight_decay=WEIGHT_DECAY
    )
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS_FINE_TUNE, eta_min=1e-6)
    history_fine = train_model_coral(
        model,
        train_loader,
        val_loader,
        criterion,
        optimizer,
        scheduler,
        EPOCHS_FINE_TUNE,
        'Fine-tune',
        save_path=fine_tune_ckpt
    )

    # Load the best fine-tuned model
    model.load_state_dict(torch.load(fine_tune_ckpt, map_location=device))

    # Evaluate on validation set
    val_loss, val_acc = validate_coral(model, val_loader, criterion)
    print(f"Combination {idx+1}: Final Val Loss: {val_loss:.4f}, Final Val Acc: {val_acc:.4f}")

    # Store the results (appended in loop order, so row i belongs to combination i)
    results.append({
        'LR_INITIAL': lr_initial,
        'LR_FINE_TUNE': lr_fine_tune,
        'ACTIVATE_WEIGHTS': activate_weights,
        'DROPOUT_PROB': dropout_prob,
        'PATIENCE': patience,
        'EPOCHS': epochs,
        'val_acc': val_acc,
        'val_loss': val_loss
    })

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('hyperparameter_tuning_results.csv', index=False)

# Find and display the best combination.
# idxmax returns an index label, so look the row up with .loc.
best_idx = results_df['val_acc'].idxmax()
best_combination = results_df.loc[best_idx]
print(f"\nBest Combination Found:")
print(f"LR_INITIAL: {best_combination['LR_INITIAL']}")
print(f"LR_FINE_TUNE: {best_combination['LR_FINE_TUNE']}")
print(f"ACTIVATE_WEIGHTS: {best_combination['ACTIVATE_WEIGHTS']}")
print(f"DROPOUT_PROB: {best_combination['DROPOUT_PROB']}")
print(f"PATIENCE: {best_combination['PATIENCE']}")
print(f"EPOCHS: {best_combination['EPOCHS']}")
print(f"Validation Accuracy: {best_combination['val_acc']:.4f}")
print(f"Validation Loss: {best_combination['val_loss']:.4f}")

# After the loop `model` still holds the weights of the LAST combination.
# Results are appended in loop order, so row `best_idx` corresponds to the
# checkpoint saved for combination `best_idx`; restore it so that the
# inference cells below evaluate the winning model. Every combination uses
# the same architecture, and dropout is inactive in eval mode, so the state
# dict loads into the current instance.
best_checkpoint = f'best_model_fine-tune_{best_idx}.pth'
model.load_state_dict(torch.load(best_checkpoint, map_location=device))
print(f"Loaded best weights from {best_checkpoint}")

# Cell 10: Prediction Function (Optional)
def predict_grade_coral(img_path, model, le, transform):
    """Predict the grade of a single image.

    Args:
        img_path: path of the image file to classify.
        model: network returning one logit per ordinal threshold.
        le: fitted label encoder used to map the class index back to a grade.
        transform: preprocessing applied to the RGB PIL image; must return a
            tensor.

    Returns:
        The grade obtained by ``le.inverse_transform`` for the predicted
        class index, where the index is the number of thresholds whose
        sigmoid probability exceeds 0.5.

    Note: the model is switched to eval mode and left in that mode.
    """
    model.eval()
    # Close the file handle as soon as the RGB copy has been created.
    with Image.open(img_path) as raw_image:
        image = raw_image.convert('RGB')
    image = transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        logits = model(image)
    # The count ranges from 0 to the number of thresholds, i.e. it is already
    # a valid class index, so no special-casing of the end points is needed.
    pred_level = int((torch.sigmoid(logits) > 0.5).sum().item())
    return le.inverse_transform([pred_level])[0]

# Example usage (uncomment to test)
# sample_img_path = val_df.iloc[0]['full_path']
# predicted_grade = predict_grade_coral(sample_img_path, model, le, val_transform)
# print(f"Predicted grade for {sample_img_path}: {predicted_grade}")

In [None]:
# make sure to use the gpu for inference
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)  # moved once here instead of once per row

# show a progress bar for inference
from tqdm import tqdm
tqdm.pandas()

# infer on only the validation set
# `filename` is the bare file name inside image_dir; the image has to be
# opened through `full_path`, otherwise it is looked up relative to the
# notebook's working directory.
val_df['predicted'] = val_df['full_path'].progress_apply(
    lambda path: predict_grade_coral(path, model, le, val_transform)
)
val_df['correct'] = val_df['predicted'] == val_df['grade']
print(f"Overall Validation Set Accuracy: {val_df['correct'].mean():.4f}")
# print accuracy by grade, sorted by accuracy, accuracy in percent
print(val_df.groupby('grade')['correct'].mean().sort_values(ascending=False) * 100)


In [None]:
# Set device (assumes GPU like H200 is available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Load dataset
big_df = pd.read_csv('../scrape/psa_sales4.csv')  # Replace with your CSV path
image_dir = '../scrape/cropped4'  # Replace with your image directory
# In this frame `filename` holds the full path of each image.
big_df['filename'] = big_df['certNumber'].apply(lambda x: os.path.join(image_dir, f"cert_{x}.jpg"))

# Check the filesystem once per row and reuse the mask for both the report
# and the filter.
image_exists = big_df['filename'].apply(os.path.exists).astype(bool)

# Print how many images are missing
print(f"Missing images: {int((~image_exists).sum())}")

# Remove non-existing images
big_df = big_df[image_exists]

# drop grades that are not in the original dataset
big_df = big_df[big_df['grade'].isin(df['grade'].unique())]
print(f"Dataset size after filtering: {len(big_df)}")

# show a progress bar for inference
from tqdm import tqdm
tqdm.pandas()

SPLIT_FRAC = 0.8

# Split into training and validation sets
big_train_df = big_df.sample(frac=SPLIT_FRAC, random_state=42)
big_val_df = big_df.drop(big_train_df.index)

# Encode labels with an encoder fitted on the grades of the original dataset,
# so class indices match the ones the model was trained with.
le = LabelEncoder()
le.fit(df['grade'])
big_train_df = big_train_df.assign(label=le.transform(big_train_df['grade']))
big_val_df = big_val_df.assign(label=le.transform(big_val_df['grade']))

# infer on only the validation set
model.to(device)  # moved once here instead of once per row
big_val_df['predicted'] = big_val_df['filename'].progress_apply(
    lambda path: predict_grade_coral(path, model, le, val_transform)
)
big_val_df['correct'] = big_val_df['predicted'] == big_val_df['grade']
print(f"Overall Validation Set Accuracy: {big_val_df['correct'].mean():.4f}")
# print accuracy by grade, sorted by accuracy, accuracy in percent
print(big_val_df.groupby('grade')['correct'].mean().sort_values(ascending=False) * 100)