In [None]:
# uv init
# uv venv torch_env
# uv venv torch_gpu --python=3.13
# uv venv torch_gpu --python "C:\Users\user\AppData\Local\Programs\Python\Python313\python.exe"

# torch_env\Scripts\activate
# uv add matplotlib pycocotools opencv-python Pillow torch ultralytics rfdetr pyyaml ipykernel

## Had to use the pip interface (`uv pip`); `uv add` still has some issues installing the GPU build of torch
# uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# uv pip install ultralytics
# uv pip install rfdetr

#### Single GPU

In [None]:

import os
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from datautils import MyTrainDataset
from utils import print_nvidia_smi


class Trainer:
    """Minimal single-device training loop with periodic checkpointing.

    Args:
        model: Module to train; it is moved to ``gpu_id`` on construction.
        train_dataloader: Iterable yielding ``(source, targets)`` batches.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        gpu_id: Device the model and every batch are moved to.
        save_every: A checkpoint is written whenever
            ``epoch % save_every == 0``.

    Raises:
        ValueError: If ``save_every`` is 0.
    """

    def __init__(
        self,
        model: torch.nn.Module,
        train_dataloader: DataLoader,
        optimizer: torch.optim.Optimizer,
        gpu_id: int,
        save_every: int,
    ) -> None:
        # Fail fast: a zero interval would otherwise only surface as a
        # ZeroDivisionError in train(), after a full epoch has already run.
        if save_every == 0:
            raise ValueError("save_every must be a non-zero integer")
        self.model = model.to(gpu_id)
        self.train_dataloader = train_dataloader
        self.optimizer = optimizer
        self.gpu_id = gpu_id
        self.save_every = save_every

    def _run_batch(self, source, targets):
        """Run one optimisation step (forward, cross-entropy loss, backward, step)."""
        self.optimizer.zero_grad()
        output = self.model(source)
        loss = F.cross_entropy(output, targets)
        loss.backward()
        self.optimizer.step()

    def _run_epoch(self, epoch):
        """Iterate once over the dataloader, moving each batch to ``gpu_id``."""
        print(f"[GPU{self.gpu_id}] Epoch {epoch} | Steps: {len(self.train_dataloader)}")
        for source, targets in self.train_dataloader:
            source = source.to(self.gpu_id)
            targets = targets.to(self.gpu_id)
            self._run_batch(source, targets)

    def _save_checkpoint(self, epoch):
        """Write the model's ``state_dict`` to ``trained_models/checkpoint_epoch_<epoch>.pt``."""
        ckp = self.model.state_dict()
        # exist_ok=True avoids the check-then-create race of a separate
        # os.path.exists() test and is a no-op when the directory exists.
        os.makedirs("trained_models", exist_ok=True)
        PATH = f"trained_models/checkpoint_epoch_{epoch}.pt"
        torch.save(ckp, PATH)
        print(f"Epoch {epoch} | Training checkpoint saved at {PATH}")

    def train(self, max_epochs: int):
        """Train for ``max_epochs`` epochs, checkpointing every ``save_every`` epochs.

        Epochs are numbered from 0, so epoch 0 is always checkpointed.
        """
        for epoch in range(max_epochs):
            self._run_epoch(epoch)
            if epoch % self.save_every == 0:
                self._save_checkpoint(epoch)


def load_train_objs():
    """Create the objects needed for a training run.

    Returns:
        tuple: ``(train_dataset, model, optimizer)`` where the dataset is
        ``MyTrainDataset(2048)``, the model is ``torch.nn.Linear(20, 1)`` and
        the optimizer is SGD (``lr=1e-3``) over that model's parameters.
    """
    # The dataset is built before the model, matching the original order.
    samples = MyTrainDataset(2048)
    net = torch.nn.Linear(20, 1)
    sgd = torch.optim.SGD(net.parameters(), lr=1e-3)
    return samples, net, sgd


def prepare_dataloader(dataset: Dataset, batch_size: int):
    """Wrap ``dataset`` in a shuffling ``DataLoader`` with pinned memory enabled.

    Args:
        dataset: Dataset to draw samples from.
        batch_size: Number of samples per batch.

    Returns:
        A ``DataLoader`` over ``dataset`` with ``shuffle=True`` and
        ``pin_memory=True``.
    """
    loader_options = {
        "batch_size": batch_size,
        "pin_memory": True,
        "shuffle": True,
    }
    return DataLoader(dataset, **loader_options)


def main(device, total_epochs, save_every, batch_size):
    """Build the training objects and run the single-GPU training job.

    Args:
        device: Device handed to ``Trainer`` (the caller passes ``0``).
        total_epochs: Number of epochs to train for.
        save_every: Checkpoint interval, in epochs.
        batch_size: Batch size used by the dataloader.

    Raises:
        RuntimeError: If CUDA is not available.
    """
    # Guard clause: refuse to start without a CUDA device.
    if not torch.cuda.is_available():
        raise RuntimeError("GPU is not available. This script requires a GPU to run.")
    print("GPU is available. Proceeding with training on GPU.")

    train_set, net, sgd = load_train_objs()
    loader = prepare_dataloader(train_set, batch_size)
    Trainer(net, loader, sgd, device, save_every).train(total_epochs)


if __name__ == "__main__":
    # Entry point: parse the command-line options and launch training on GPU 0.
    import argparse
    parser = argparse.ArgumentParser(description='simple distributed training job')
    parser.add_argument('--total_epochs', type=int, default=10, help='Total epochs to train the model')
    parser.add_argument('--save_every', type=int, default=2, help='How often to save a snapshot')
    parser.add_argument('--batch_size', type=int, default=32, help='Input batch size on each device (default: 32)')
    # parse_known_args() rather than parse_args(): when this cell runs inside a
    # Jupyter kernel, sys.argv holds the kernel's own launcher arguments
    # (e.g. "-f <connection file>"), which parse_args() rejects with SystemExit.
    # Unrecognised arguments are ignored; the declared options behave the same
    # when the code is run as a script.
    args, _unrecognized = parser.parse_known_args()

    print_nvidia_smi()

    device = 0  # shorthand for cuda:0
    main(device, args.total_epochs, args.save_every, args.batch_size)

## Usage example: 
# py 1-single_gpu.py --total_epochs 10 --save_every 2 --batch_size 64



In [None]:
from urllib.parse import urlparse


url = "https://www.example.com:8080/path/to/page?query=123&sort=asc#section2"
parsed = urlparse(url)

# Print every URL component as "<Label>: <value>"; the label is the
# capitalised attribute name of the parse result.
for component in ("scheme", "netloc", "hostname", "port", "path", "query", "fragment"):
    print(f"{component.capitalize()}:", getattr(parsed, component))


Scheme: https
Netloc: www.example.com:8080
Hostname: www.example.com
Port: 8080
Path: /path/to/page
Query: query=123&sort=asc
Fragment: section2


In [2]:
# Relies on `parsed` created by the urlparse cell above; shows the URL scheme only.
print(parsed.scheme)

https


# DataParallel Notebook
-------------------------
Inspired by: [PyTorch Data Parallelism Tutorial](https://docs.pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html)

# Imports and Initial Setup
------------------------------------
This cell imports the necessary PyTorch libraries.

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [2]:
print(f"PyTorch Version: {torch.__version__}")
print("-" * 30)

# Start from the CPU defaults and override them only when CUDA is usable.
num_gpus = 0
device = torch.device("cpu")

if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"Found {num_gpus} GPUs.")
    # cuda:0 acts as the primary device
    device = torch.device("cuda:0")
else:
    print("No GPUs found. Running on CPU.")

PyTorch Version: 2.9.0+cu128
------------------------------
Found 1 GPUs.


# Data Parallel
-----------------------------
Source: [DataParallel vs. DistributedDataParallel in PyTorch: What's the Difference?](https://medium.com/@mlshark/dataparallel-vs-distributeddataparallel-in-pytorch-whats-the-difference-0af10bb43bc7)

# Define a Simple Model
-----------------------------
We'll create a basic neural network for this demonstration.
DataParallel will replicate this model on each available GPU.

In [4]:
class SimpleModel(nn.Module):
    """Two-layer perceptron: Linear(input_size, 128) -> ReLU -> Linear(128, output_size)."""

    def __init__(self, input_size, output_size):
        super().__init__()
        # Layers are registered in the same order and under the same names,
        # so state_dict keys and parameter initialisation are unchanged.
        self.fc1 = nn.Linear(input_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, output_size)

    def forward(self, x, debug=False):
        """Apply fc1 -> relu -> fc2 to ``x``.

        When ``debug`` is true, the input and output sizes seen inside this
        call are printed.
        """
        hidden = self.relu(self.fc1(x))
        out = self.fc2(hidden)
        if not debug:
            return out
        print("\tInside the Model: input size", x.size(), "output size", out.size())
        return out







# Data Preparation and Training Loop
------------------------------------------
This is the main part where we wrap our model with DataParallel
and run the training process.

## 1. Hyperparameters and Data

In [7]:
input_size = 784
output_size = 10
batch_size = 256  # A larger batch size helps utilize multiple GPUs
learning_rate = 0.01
num_epochs = 20
num_samples = 10000  # Number of synthetic samples in the dummy dataset
seed = 42

# Seed the global RNG so the synthetic data is identical on every
# Restart & Run All; DataLoader shuffling draws from the same seeded RNG.
torch.manual_seed(seed)

# Create dummy data: random features and random integer class labels
inputs = torch.randn(num_samples, input_size)
targets = torch.randint(0, output_size, (num_samples,))

# Use DataLoader for batching
dataset = TensorDataset(inputs, targets)
# The batch size will be split across GPUs. If you have 2 GPUs,
# each will process batch_size / 2 samples.
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

## 2. Initialize and Wrap the Model
Instantiate the model

In [8]:
model = SimpleModel(input_size, output_size)

# nn.DataParallel is the key step for data parallelism: with more than one
# GPU it replicates the model and distributes each batch across the devices.
# With zero or one GPU the plain model is used as-is.
if num_gpus <= 1:
    print("Training on a single device (CPU or 1 GPU).")
else:
    print(f"Using {num_gpus} GPUs for training!")
    device_ids = [gpu_index for gpu_index in range(num_gpus)]  # every available GPU, listed explicitly
    model = nn.DataParallel(model, device_ids=device_ids)

# Place the (possibly wrapped) model on the primary device; as the last
# expression of the cell, this also displays the model structure.
model.to(device)

Training on a single device (CPU or 1 GPU).


SimpleModel(
  (fc1): Linear(in_features=784, out_features=128, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)

## 3. Loss and Optimizer

In [9]:
# Cross-entropy loss between the model outputs and the integer class targets.
criterion = nn.CrossEntropyLoss()
# Plain SGD over the model's parameters, using the configured learning rate.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

## 4. Training Loop

In [10]:
print("\nStarting training...")

for epoch in range(num_epochs):
    total_loss = 0
    for i, batch in enumerate(data_loader):
        # Put the batch on the primary device; DataParallel scatters it from there.
        batch_inputs, batch_targets = (tensor.to(device) for tensor in batch)

        # Size information is printed only for the very first batch of the run.
        debug = (epoch, i) == (0, 0)

        # Clear gradients left over from the previous step before the new pass.
        optimizer.zero_grad()

        # Forward pass: DataParallel splits the batch, runs the replicas on
        # the GPUs, and gathers the outputs back on the primary device.
        outputs = model(batch_inputs, debug=debug)
        if debug:
            print("Outside: input size", batch_inputs.size(), "output_size", outputs.size())

        # The loss is computed on the primary device; backward() produces the
        # gradients on each GPU, which are then summed on the primary GPU.
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch
    avg_loss = total_loss / len(data_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

print("\nTraining finished!")




Starting training...
	Inside the Model: input size torch.Size([256, 784]) output size torch.Size([256, 10])
Outside: input size torch.Size([256, 784]) output_size torch.Size([256, 10])
Epoch [1/20], Loss: 2.3264
Epoch [2/20], Loss: 2.3170
Epoch [3/20], Loss: 2.3090
Epoch [4/20], Loss: 2.3047
Epoch [5/20], Loss: 2.2997
Epoch [6/20], Loss: 2.2965
Epoch [7/20], Loss: 2.2870
Epoch [8/20], Loss: 2.2838
Epoch [9/20], Loss: 2.2795
Epoch [10/20], Loss: 2.2735
Epoch [11/20], Loss: 2.2701
Epoch [12/20], Loss: 2.2666
Epoch [13/20], Loss: 2.2619
Epoch [14/20], Loss: 2.2582
Epoch [15/20], Loss: 2.2539
Epoch [16/20], Loss: 2.2488
Epoch [17/20], Loss: 2.2424
Epoch [18/20], Loss: 2.2399
Epoch [19/20], Loss: 2.2346
Epoch [20/20], Loss: 2.2285

Training finished!


## 5. Accessing the Original Model
If you need to save the model's state dict or access the original model
without the DataParallel wrapper, you need to use .module

In [11]:
# Unwrap the model if needed so the saved state_dict belongs to the
# underlying module rather than to the DataParallel wrapper.
if not isinstance(model, nn.DataParallel):
    original_model = model
    print("\nModel was not wrapped. Saving the model directly.")
else:
    original_model = model.module
    print("\nModel was wrapped in DataParallel. Accessing the original model via .module")

# Both cases write to the same file, so the save happens once, after the branch.
torch.save(original_model.state_dict(), 'model_state.pth')


Model was not wrapped. Saving the model directly.


#### Distributed Data Parallel (DDP) Tutorial Series

In [1]:
import argparse                     # For parsing command-line arguments (though unused here)
import os                           # For interacting with the operating system (e.g., env vars, paths)
import sys                          # For system-specific parameters and functions (e.g., exit)
import tempfile                     # For creating temporary files/directories (used on Windows)
from urllib.parse import urlparse   # For parsing URL-style init_method strings

import torch                        # Core PyTorch library
import torch.distributed as dist    # PyTorch distributed communication package
import torch.nn as nn               # Neural network modules
import torch.optim as optim         # Optimization algorithms (e.g., SGD)

from torch.nn.parallel import DistributedDataParallel as DDP  # Wrapper for distributed data parallelism (not model parallelism)


In [8]:
def verify_min_gpu_count(min_gpus: int = 2) -> bool:
    """ verification that we have at least 2 gpus to run dist examples """
    has_gpu = torch.accelerator.is_available()              # Check if any accelerator (GPU) is available
    gpu_count = torch.accelerator.device_count()            # Get number of available accelerators
    return has_gpu and gpu_count >= min_gpus                # Return True if enough GPUs exist

# Check the requirement for two devices, then for a single device.
print(verify_min_gpu_count(2))
print(verify_min_gpu_count(1))

False
True


In [9]:
# Display the process ID of the current (kernel) process.
os.getpid()

24388

In [2]:
import torch

# Use the number of visible CUDA devices as the world size.
world_size = torch.cuda.device_count()
print(f"World Size (Number of GPUs): {world_size}")

World Size (Number of GPUs): 1
