# AI Document Intelligence - Model Training Example

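This notebook demonstrates fine-tuning a custom model for a document intelligence task: a DistilBERT-based document classifier is trained on labeled text, tracked with MLflow, evaluated, and saved for deployment.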

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset
import mlflow
import mlflow.pytorch

# Set up MLflow
# The tracking URI is configured first, then the experiment is selected by name.
# NOTE(review): the URI is hard-coded to a server on localhost:5000 — adjust it
# when running against a different tracking server.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("document-classification")

## Data Preparation

In [None]:
# Load and prepare training data
def load_training_data():
    """Return the placeholder training set as a pandas DataFrame.

    The frame has two columns, ``text`` and ``category``, holding one sample
    document per category. This is sample data only; swap in a loader for
    the actual dataset.
    """
    # (document text, category) pairs, kept side by side for readability
    samples = [
        ("This is a contract document with terms and conditions...", 'contract'),
        ("Financial report showing quarterly earnings...", 'financial'),
        ("Technical specification for software requirements...", 'technical'),
        ("Legal document outlining compliance requirements...", 'legal'),
    ]
    texts = [text for text, _ in samples]
    categories = [category for _, category in samples]
    return pd.DataFrame({'text': texts, 'category': categories})

# Build the sample frame, then print its dimensions followed by its first rows.
df = load_training_data()
print(f"Dataset shape: {df.shape}", df.head(), sep="\n")

## Model Training

In [None]:
# Initialize tokenizer and model
# Both are loaded from the same pretrained checkpoint; the classification head
# is sized to the number of distinct values in the 'category' column.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(pd.unique(df['category'])),
)

# Tokenize data
def tokenize_function(examples):
    """Tokenize the ``text`` entry of *examples* with the module-level tokenizer.

    Truncation and padding are both enabled; the tokenizer's output is
    returned unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

# Create label mapping
# Each distinct category gets a consecutive integer id, numbered in the order
# the categories are returned by unique().
label_to_id = {category: idx for idx, category in enumerate(df['category'].unique())}
# Inverse lookup: iterating label_to_id yields the labels in id order.
id_to_label = dict(enumerate(label_to_id))

# Prepare dataset
# Encode categories as integer ids in a new 'labels' column.
df['labels'] = df['category'].map(label_to_id)
# Only the raw text and the integer labels are carried into the Dataset.
dataset = Dataset.from_pandas(df.loc[:, ['text', 'labels']])
# Tokenize in batches.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

# Split data
# Split exactly once so the train and eval subsets come from the same shuffle
# and are guaranteed disjoint. Calling train_test_split() twice without a seed
# performs two independent shuffles, so the same example could land in both
# subsets (leaking eval data into training) while other examples are dropped.
# The fixed seed keeps the partition reproducible across runs.
_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = _split['train']
eval_dataset = _split['test']

In [None]:
# Training arguments
# NOTE(review): warmup_steps=500 exceeds the number of optimizer steps the
# four-row sample dataset can produce in 3 epochs — revisit once real data is used.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Optimisation
    warmup_steps=500,
    weight_decay=0.01,
    # Logging, evaluation and checkpointing (evaluation and saving both use
    # the "epoch" strategy)
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Initialize trainer
# Wire together the model, its tokenizer, the training configuration and the
# train/eval subsets.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Start MLflow run
with mlflow.start_run():
    # Record the run configuration in one call
    mlflow.log_params({
        "model_name": model_name,
        "num_epochs": training_args.num_train_epochs,
        "batch_size": training_args.per_device_train_batch_size,
    })

    # Fine-tune, then evaluate with the trainer's eval dataset
    trainer.train()
    eval_results = trainer.evaluate()

    # Attach the evaluation metrics and the trained model to the run
    mlflow.log_metrics(eval_results)
    mlflow.pytorch.log_model(model, "model")

    print(f"Training completed. Eval loss: {eval_results['eval_loss']:.4f}")

## Model Evaluation

In [None]:
# Make predictions
predictions = trainer.predict(eval_dataset)
# Highest-scoring class index per example
predicted_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

# Calculate metrics
accuracy = accuracy_score(true_labels, predicted_labels)
# Pass the full list of class ids explicitly. Without `labels`, sklearn infers
# the classes from the values present in true/predicted labels; when the eval
# split does not contain every class (certain for a small split), the inferred
# classes no longer line up with `target_names` and classification_report
# raises. zero_division=0 keeps rows for classes with no samples at 0 instead
# of emitting undefined-metric warnings.
report = classification_report(
    true_labels,
    predicted_labels,
    labels=list(label_to_id.values()),
    target_names=list(label_to_id.keys()),
    zero_division=0,
)

print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)

## Model Deployment Preparation

In [None]:
# Save model for deployment
import json

# Model weights/config and tokenizer files go into the same directory.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

# Create model metadata
# `labels` is listed in id order (label_to_id preserves insertion order).
model_metadata = dict(
    model_name="document-classifier",
    version="1.0.0",
    accuracy=accuracy,
    labels=list(label_to_id),
    model_path="./fine_tuned_model",
)

# Write the metadata next to the saved model files.
with open("./fine_tuned_model/metadata.json", "w") as f:
    f.write(json.dumps(model_metadata, indent=2))

print("Model saved and ready for deployment!")