In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.datasets import OxfordIIITPet
import torchvision.transforms.functional as TF
import numpy as np
import matplotlib.pyplot as plt

# MyUNet.py として分割予定

In [2]:
class DoubleConv(nn.Module):
    """Two stacked 3x3 convolutions, each followed by an in-place ReLU.

    Both convolutions use ``padding=1``, so the spatial size of the input is
    kept; only the channel count changes from ``in_channels`` to
    ``out_channels``.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        # Listed in execution order: the first convolution changes the channel
        # count, the second one keeps it.
        stages = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
        ]
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        """Apply conv -> ReLU -> conv -> ReLU to ``x`` and return the result."""
        features = self.conv(x)
        return features

class UNet(nn.Module):
    """U-Net with four encoder stages, a bottleneck and four decoder stages.

    Each decoder stage upsamples with a stride-2 transposed convolution,
    concatenates the (centre-cropped) encoder feature map of the same depth
    along the channel axis, and refines the result with a ``DoubleConv``.
    The final 1x1 convolution maps 64 channels to ``n_classes``.
    """

    def __init__(self, n_channels, n_classes):
        super().__init__()
        # Contracting path.
        self.enc1 = DoubleConv(n_channels, 64)
        self.enc2 = DoubleConv(64, 128)
        self.enc3 = DoubleConv(128, 256)
        self.enc4 = DoubleConv(256, 512)
        self.bottleneck = DoubleConv(512, 1024)

        # Expanding path: every decoder sees skip + upsampled channels.
        self.up1 = nn.ConvTranspose2d(1024, 512, 2, stride=2)
        self.dec1 = DoubleConv(1024, 512)
        self.up2 = nn.ConvTranspose2d(512, 256, 2, stride=2)
        self.dec2 = DoubleConv(512, 256)
        self.up3 = nn.ConvTranspose2d(256, 128, 2, stride=2)
        self.dec3 = DoubleConv(256, 128)
        self.up4 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec4 = DoubleConv(128, 64)

        self.out_conv = nn.Conv2d(64, n_classes, 1)

    def forward(self, x):
        """Return the ``n_classes``-channel map predicted for the batch ``x``."""
        # Encoder: keep every stage output for the skip connections.
        features = self.enc1(x)
        skips = [features]
        for encoder in (self.enc2, self.enc3, self.enc4):
            features = encoder(F.max_pool2d(features, 2))
            skips.append(features)

        features = self.bottleneck(F.max_pool2d(features, 2))

        # Decoder: deepest skip first. The skip is cropped to the upsampled
        # size because pooling rounds odd sizes down.
        decoder_stages = (
            (self.up1, self.dec1),
            (self.up2, self.dec2),
            (self.up3, self.dec3),
            (self.up4, self.dec4),
        )
        for (upsample, decode), skip in zip(decoder_stages, reversed(skips)):
            features = upsample(features)
            skip = self.crop_tensor(skip, features)
            features = decode(torch.cat([skip, features], dim=1))

        return self.out_conv(features)

    def crop_tensor(self, source, target):
        """Centre-crop ``source`` to the height and width of ``target``.

        Only dimensions 2 and 3 are cropped; the first two dimensions of
        ``source`` are returned unchanged.
        """
        out_h, out_w = target.shape[2], target.shape[3]

        # Offsets of the crop window (centred, rounded down).
        top = (source.shape[2] - out_h) // 2
        left = (source.shape[3] - out_w) // 2

        return source[:, :, top:top + out_h, left:left + out_w]


# Mydataset.pyとして分割予定

In [None]:
from torch.utils.data import Dataset
import torch
import os
import json
from PIL import Image
import numpy as np
import cv2
import torchvision.transforms.functional as functinal

class PreTrainDataset(Dataset):
    """Page images paired with 4-channel Gaussian heat-map targets (OpenCV version).

    Every regular file in ``input_path`` is a candidate sample. The text before
    ``'_sep_'`` in the file name is compared against ``test_doc_id_list`` to
    decide whether the file belongs to the test split. Annotations are read
    from ``gt_json['files'][image_id]``; each channel entry is a list of
    8-number quadrilaterals ``(p1x, p1y, ..., p4x, p4y)``.

    ``__getitem__`` returns ``(image, tensor_gt)``, both downsampled by
    ``image_downsample_rate``. ``tensor_gt`` is a float32 tensor of shape
    ``(4, H, W)`` whose channels follow ``GT_CHANNELS``.
    """

    # Annotation keys, in the channel order of the ground-truth tensor.
    GT_CHANNELS = ('main_region', 'main_affinity', 'furi_region', 'furi_affinity')

    def __init__(self,
                 test_doc_id_list,
                 test_mode = False,
                 input_path = '../kuzushiji_recognition/synthetic_images/tmp_entire_data/',
                 json_path = '../kuzushiji_recognition/synthetic_images/gt_json.json',
                 transform = None,
                 image_downsample_rate = 10):
        """Collect the image IDs of the requested split and load the annotations.

        Args:
            test_doc_id_list: File-name prefixes (text before ``'_sep_'``) that
                make up the test split.
            test_mode: If true keep only test-split files, otherwise keep only
                the remaining files.
            input_path: Directory holding the ``.jpg`` input images.
            json_path: Path of the annotation JSON file.
            transform: Optional callable applied to the resized PIL image.
            image_downsample_rate: Integer divisor applied to height and width.
        """
        super().__init__()
        self.test_doc_id_list = test_doc_id_list
        self.input_path = input_path
        self.transform = transform
        self.image_downsample_rate = image_downsample_rate

        # Image IDs (file name up to the first '.') of the selected split.
        self.input_imageID_list = []
        for file_name in os.listdir(self.input_path):
            if not os.path.isfile(os.path.join(self.input_path, file_name)):
                continue
            in_test_split = file_name.split('_sep_')[0] in self.test_doc_id_list
            if in_test_split == bool(test_mode):
                self.input_imageID_list.append(file_name.split('.')[0])

        # NOTE: images without an annotation entry are not filtered out here;
        # looking them up in __getitem__ raises KeyError.
        self.gt_json = self.load_GT_json(json_path)

    def __len__(self):
        """Return the number of images in the selected split."""
        return len(self.input_imageID_list)

    def __getitem__(self, index):
        """Return the downsampled ``(image, tensor_gt)`` pair for ``index``."""
        image_id = self.input_imageID_list[index]
        image_path = os.path.join(self.input_path, image_id + '.jpg')

        # The context manager closes the file handle; the resized copy made
        # inside the block is a separate image and stays usable afterwards.
        with Image.open(image_path) as image:
            # Full-resolution 4-channel target built from the annotations.
            tensor_gt = self.return_tensor_gt(gt_info_dic=self.gt_json['files'][image_id], image=image)

            original_w, original_h = image.size
            new_size = (original_h // self.image_downsample_rate,
                        original_w // self.image_downsample_rate)
            resized_image = functinal.resize(
                image, new_size, interpolation=functinal.InterpolationMode.BILINEAR)

        # The targets are continuous heat maps, so bilinear interpolation is
        # used for them as well.
        tensor_gt = functinal.resize(
            tensor_gt, new_size, interpolation=functinal.InterpolationMode.BILINEAR)

        # transform defaults to None; only call it when one was supplied.
        if self.transform is not None:
            resized_image = self.transform(resized_image)
        return resized_image, tensor_gt

    def load_GT_json(self, file_path):
        """Load and return the annotation JSON stored at ``file_path``."""
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print("json データを読み込みました。")
        return data

    def return_ground_truth_canvas(self, image):
        """Return four zero-filled float64 canvases of shape ``(h, w)``.

        ``image`` only needs a ``size`` attribute holding ``(width, height)``.
        """
        w, h = image.size
        main_region = np.zeros((h, w), dtype=np.float64)
        main_affinity = np.zeros((h, w), dtype=np.float64)
        furi_region = np.zeros((h, w), dtype=np.float64)
        furi_affinity = np.zeros((h, w), dtype=np.float64)
        return main_region, main_affinity, furi_region, furi_affinity

    def add_perspective_gaussian_to_canvas(self, canvas, points, amplitude=1.0):
        """Add a Gaussian blob warped onto the quadrilateral ``points`` to ``canvas``.

        Args:
            canvas: 2-D float array, modified in place and also returned.
            points: Four ``(x, y)`` corners; corner ``i`` is mapped from corner
                ``i`` of an axis-aligned rectangle (top-left, top-right,
                bottom-right, bottom-left of the Gaussian).
            amplitude: Peak value of the Gaussian.

        Returns:
            ``canvas`` with the warped Gaussian added.
        """
        src_points = np.array(points, dtype=np.float32)

        # Size of the axis-aligned rectangle the Gaussian is drawn on: the
        # longer of each pair of opposite sides.
        width = int(max(np.linalg.norm(src_points[0] - src_points[1]), np.linalg.norm(src_points[2] - src_points[3])))
        height = int(max(np.linalg.norm(src_points[0] - src_points[3]), np.linalg.norm(src_points[1] - src_points[2])))

        # With a side shorter than 2 px the rectangle corners below coincide,
        # so no perspective mapping can be defined; such boxes are skipped.
        if width < 2 or height < 2:
            return canvas

        dst_points = np.array([[0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]], dtype=np.float32)

        # Axis-aligned Gaussian centred on the rectangle; sigma is 1/5 of each side.
        x = np.linspace(-width / 2, width / 2, width)
        y = np.linspace(-height / 2, height / 2, height)
        x, y = np.meshgrid(x, y)
        sigma_x = width / 5.0
        sigma_y = height / 5.0
        gaussian = amplitude * np.exp(-((x**2) / (2 * sigma_x**2) + (y**2) / (2 * sigma_y**2)))

        # Map the rectangle onto the annotated quadrilateral and render the
        # Gaussian at canvas resolution.
        matrix = cv2.getPerspectiveTransform(dst_points, src_points)
        transformed_gaussian = cv2.warpPerspective(gaussian, matrix, (canvas.shape[1], canvas.shape[0]))

        canvas += transformed_gaussian
        return canvas

    def design_gaussian_map(self, canvas, point_list):
        """Draw one warped Gaussian per quadrilateral of ``point_list`` on ``canvas``.

        Each entry of ``point_list`` must unpack into eight numbers
        ``(p1x, p1y, p2x, p2y, p3x, p3y, p4x, p4y)``.
        """
        for points in point_list:
            p1x, p1y, p2x, p2y, p3x, p3y, p4x, p4y = points
            # Called through self: the class name is rebound by a later
            # notebook cell, so a class-qualified call would hit the wrong class.
            canvas = self.add_perspective_gaussian_to_canvas(
                canvas, ((p1x, p1y), (p2x, p2y), (p3x, p3y), (p4x, p4y)), amplitude=1.0)
        return canvas

    def return_tensor_gt(self, gt_info_dic, image):
        """Build the ``(4, h, w)`` float32 ground-truth tensor for one image.

        Args:
            gt_info_dic: Mapping from the names in ``GT_CHANNELS`` to lists of
                quadrilaterals. A missing or empty entry yields an all-zero
                channel, matching the torch-based dataset defined later in
                this notebook.
            image: Object whose ``size`` is ``(width, height)``.
        """
        canvases = dict(zip(self.GT_CHANNELS, self.return_ground_truth_canvas(image)))

        channel_tensors = []
        for channel_name in self.GT_CHANNELS:
            canvas = self.design_gaussian_map(
                canvases[channel_name], gt_info_dic.get(channel_name) or [])
            # astype() already returns a fresh float32 array, so the tensor
            # does not share memory with the float64 canvas.
            channel_tensors.append(torch.from_numpy(canvas.astype(np.float32)))
        return torch.stack(channel_tensors)

In [4]:
from torch.utils.data import Dataset
import torch
import torch.nn.functional as F
import os
import json
from PIL import Image
import numpy as np
import cv2
import torchvision.transforms.functional as functional
from typing import List, Tuple
import multiprocessing as mp
from functools import lru_cache

class PreTrainDataset(Dataset):
    """Page images paired with 4-channel Gaussian heat-map targets (torch version).

    Every regular file in ``input_path`` is a candidate sample. The text before
    ``'_sep_'`` in the file name is compared against ``test_doc_id_list`` to
    decide whether the file belongs to the test split. Annotations are read
    from ``gt_json['files'][image_id]``; each channel entry is a list of
    8-number quadrilaterals ``(p1x, p1y, ..., p4x, p4y)``.

    ``__getitem__`` returns ``(image, tensor_gt)``, both downsampled by
    ``image_downsample_rate``. ``tensor_gt`` is a float32 CPU tensor of shape
    ``(4, H, W)`` whose channels follow ``GT_CHANNELS``. It is returned on the
    CPU so that ``DataLoader`` can collate and pin it; the caller moves the
    batch to the training device. The heat maps themselves are rendered on
    ``self.device``.
    """

    # Annotation keys, in the channel order of the ground-truth tensor.
    GT_CHANNELS = ('main_region', 'main_affinity', 'furi_region', 'furi_affinity')

    def __init__(self,
                 test_doc_id_list,
                 test_mode = False,
                 input_path = '../kuzushiji-recognition/synthetic_images/input_images/',
                 json_path = '../kuzushiji-recognition/synthetic_images/gt_json.json',
                 transform = None,
                 image_downsample_rate = 10,
                 device = None,
                 precompute_gt = True,
                 num_workers = None):
        """Collect the image IDs of the requested split and load the annotations.

        Args:
            test_doc_id_list: File-name prefixes (text before ``'_sep_'``) that
                make up the test split.
            test_mode: If true keep only test-split files, otherwise keep only
                the remaining files.
            input_path: Directory holding the ``.jpg`` input images.
            json_path: Path of the annotation JSON file.
            transform: Optional callable applied to the resized PIL image.
            image_downsample_rate: Integer divisor applied to height and width.
            device: Device used to render the heat maps; ``None`` selects CUDA
                when available, otherwise the CPU.
            precompute_gt: If true, render the downsampled target of every
                image once during construction and keep it in memory.
            num_workers: Stored on ``self.num_workers`` (default: CPU count
                capped at 4); nothing in this class reads it.
        """
        super().__init__()
        self.test_doc_id_list = test_doc_id_list
        self.input_path = input_path
        self.transform = transform
        self.image_downsample_rate = image_downsample_rate

        # Normalise to torch.device so that strings such as 'cuda' are accepted.
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)
        print(f"Using device: {self.device}")

        self.num_workers = min(mp.cpu_count(), 4) if num_workers is None else num_workers

        # Image IDs (file name up to the first '.') of the selected split.
        self.input_imageID_list = []
        for file_name in os.listdir(self.input_path):
            if not os.path.isfile(os.path.join(self.input_path, file_name)):
                continue
            in_test_split = file_name.split('_sep_')[0] in self.test_doc_id_list
            if in_test_split == bool(test_mode):
                self.input_imageID_list.append(file_name.split('.')[0])

        self.gt_json = self.load_GT_json(json_path)

        # image_id -> downsampled CPU target, filled by _precompute_ground_truth.
        self.precomputed_gt = {}
        if precompute_gt:
            print("Pre-computing ground truth data...")
            self._precompute_ground_truth()
            print("Pre-computation completed.")

    def __len__(self):
        """Return the number of images in the selected split."""
        return len(self.input_imageID_list)

    def __getitem__(self, index):
        """Return the downsampled ``(image, tensor_gt)`` pair for ``index``."""
        image_id = self.input_imageID_list[index]
        self._fall_back_to_cpu_in_worker()

        # The context manager closes the file handle; the resized copy made
        # inside the block is a separate image and stays usable afterwards.
        with Image.open(self._image_path(image_id)) as image:
            new_size = self._downsampled_size(image)

            tensor_gt = self.precomputed_gt.get(image_id)
            if tensor_gt is None:
                # Not cached: render the target now.
                tensor_gt = self._build_downsampled_gt(image_id, image, new_size)

            resized_image = functional.resize(
                image, new_size, interpolation=functional.InterpolationMode.BILINEAR)

        if self.transform:
            resized_image = self.transform(resized_image)

        return resized_image, tensor_gt

    def _image_path(self, image_id):
        """Return the path of the ``.jpg`` file belonging to ``image_id``."""
        return os.path.join(self.input_path, image_id + '.jpg')

    def _downsampled_size(self, image):
        """Return ``(height, width)`` of ``image`` after integer downsampling."""
        original_w, original_h = image.size
        return (original_h // self.image_downsample_rate,
                original_w // self.image_downsample_rate)

    def _build_downsampled_gt(self, image_id, image, new_size):
        """Render the target of ``image_id`` and shrink it to ``new_size`` on the CPU."""
        full_resolution = self.return_tensor_gt_optimized(
            gt_info_dic=self.gt_json['files'][image_id],
            image=image
        )
        downsampled = F.interpolate(
            full_resolution.unsqueeze(0),
            size=new_size,
            mode='bilinear',
            align_corners=False
        ).squeeze(0)
        return downsampled.cpu()

    def _fall_back_to_cpu_in_worker(self):
        """Switch this worker's copy of the dataset to the CPU if it was set to CUDA.

        PyTorch refuses to re-initialise CUDA in a forked DataLoader worker.
        Each worker owns its own copy of the dataset, so changing
        ``self.device`` here does not affect the main process. Workers started
        with the ``spawn`` method are moved to the CPU as well, which is only
        slower, not wrong.
        """
        if torch.device(self.device).type == 'cuda' and torch.utils.data.get_worker_info() is not None:
            self.device = torch.device('cpu')

    def _precompute_ground_truth(self):
        """Render the downsampled target of every image and cache it in memory.

        The cached tensors are exactly what ``__getitem__`` returns, so they
        are stored downsampled and on the CPU instead of at full resolution on
        the rendering device. Images that fail are reported and skipped; they
        are retried lazily by ``__getitem__``.
        """
        for image_id in self.input_imageID_list:
            try:
                with Image.open(self._image_path(image_id)) as image:
                    self.precomputed_gt[image_id] = self._build_downsampled_gt(
                        image_id, image, self._downsampled_size(image))
            except Exception as e:
                print(f"Error precomputing {image_id}: {e}")
                continue

    def load_GT_json(self, file_path):
        """Load and return the annotation JSON stored at ``file_path``."""
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print("jsonデータを読み込みました。")
        return data

    def return_tensor_gt_optimized(self, gt_info_dic, image):
        """Build the full-resolution ``(4, h, w)`` float32 target on ``self.device``.

        Channels whose key is missing from ``gt_info_dic`` or whose list is
        empty stay all-zero. ``image`` only needs a ``size`` attribute holding
        ``(width, height)``.
        """
        w, h = image.size
        canvas_tensors = torch.zeros(4, h, w, dtype=torch.float32, device=self.device)

        for i, channel_name in enumerate(self.GT_CHANNELS):
            point_list = gt_info_dic.get(channel_name)
            if point_list:
                canvas_tensors[i] = self.design_gaussian_map_gpu(
                    canvas_tensors[i],
                    point_list,
                    w, h
                )

        return canvas_tensors

    def design_gaussian_map_gpu(self, canvas_tensor, point_list, width, height):
        """Add one warped Gaussian per quadrilateral of ``point_list`` to ``canvas_tensor``.

        Args:
            canvas_tensor: ``(height, width)`` tensor, modified in place.
            point_list: Sequence of 8-number rows ``(p1x, p1y, ..., p4x, p4y)``.
            width, height: Canvas size in pixels.
        """
        if not point_list:
            return canvas_tensor

        # One (4, 2) corner matrix per box; the explicit row count makes a row
        # that does not hold exactly 8 numbers fail loudly.
        quads = torch.tensor(
            point_list, dtype=torch.float32, device=self.device
        ).reshape(len(point_list), 4, 2)

        for src_points in quads:
            canvas_tensor = self.add_perspective_gaussian_gpu(
                canvas_tensor, src_points, width, height
            )

        return canvas_tensor

    def add_perspective_gaussian_gpu(self, canvas, src_points, canvas_width, canvas_height):
        """Add a Gaussian warped onto the quadrilateral ``src_points`` to ``canvas``.

        ``src_points`` is a ``(4, 2)`` tensor of ``(x, y)`` corners; corner
        ``i`` is mapped from corner ``i`` of the kernel rectangle (top-left,
        top-right, bottom-right, bottom-left). Boxes for which no usable
        perspective transform exists are skipped and ``canvas`` is returned
        unchanged.
        """
        # Kernel size: the longer of each pair of opposite sides, plus one.
        width = max(
            torch.norm(src_points[0] - src_points[1]).item(),
            torch.norm(src_points[2] - src_points[3]).item()
        )
        height = max(
            torch.norm(src_points[0] - src_points[3]).item(),
            torch.norm(src_points[1] - src_points[2]).item()
        )
        width = int(width) + 1
        height = int(height) + 1

        gaussian = self.create_gaussian_kernel_gpu(width, height)

        # The 3x3 homography (kernel -> canvas coordinates) is computed on the CPU.
        src_np = src_points.cpu().numpy()
        dst_np = np.array([
            [0, 0], [width-1, 0], [width-1, height-1], [0, height-1]
        ], dtype=np.float32)

        try:
            matrix = cv2.getPerspectiveTransform(dst_np, src_np)
            transformed_gaussian = self.warp_perspective_gpu(
                gaussian, matrix, canvas_width, canvas_height
            )
        except (cv2.error, ValueError):
            # Degenerate box (OpenCV failure or non-invertible homography).
            return canvas

        canvas += transformed_gaussian
        return canvas

    @staticmethod
    @lru_cache(maxsize=128)
    def _gaussian_kernel(width, height, device):
        """Return a cached ``(height, width)`` Gaussian with peak 1 on ``device``.

        Cached per ``(width, height, device)``. Keeping the cache on a static
        function means it never holds a reference to a dataset instance.
        Callers must not modify the returned tensor in place.
        """
        x = torch.linspace(-width/2, width/2, width, device=device)
        y = torch.linspace(-height/2, height/2, height, device=device)
        y_grid, x_grid = torch.meshgrid(y, x, indexing='ij')

        # Standard deviation is one fifth of each side.
        sigma_x = width / 5.0
        sigma_y = height / 5.0

        return torch.exp(-(x_grid**2 / (2 * sigma_x**2) + y_grid**2 / (2 * sigma_y**2)))

    def create_gaussian_kernel_gpu(self, width, height):
        """Return the ``(height, width)`` Gaussian kernel on ``self.device`` (cached)."""
        return self._gaussian_kernel(width, height, torch.device(self.device))

    def warp_perspective_gpu(self, image_tensor, matrix, output_width, output_height):
        """Warp the 2-D ``image_tensor`` with the homography ``matrix``.

        Args:
            image_tensor: ``(h, w)`` tensor on ``self.device``.
            matrix: 3x3 NumPy array mapping ``image_tensor`` pixel coordinates
                to output pixel coordinates.
            output_width, output_height: Size of the returned canvas.

        Returns:
            ``(output_height, output_width)`` tensor; pixels that map outside
            ``image_tensor`` are zero.

        Raises:
            ValueError: If ``matrix`` cannot be inverted.
        """
        matrix_tensor = torch.from_numpy(np.asarray(matrix)).float().to(self.device)

        # grid_sample normalises coordinates by the size of its *input*, so
        # the kernel size has to be handed to the grid builder.
        source_height, source_width = image_tensor.shape[-2:]
        grid = self.create_transformation_grid(
            matrix_tensor, output_height, output_width,
            source_height=source_height, source_width=source_width
        )

        transformed = F.grid_sample(
            image_tensor.unsqueeze(0).unsqueeze(0),  # [1, 1, h, w]
            grid.unsqueeze(0),                       # [1, H, W, 2]
            mode='bilinear',
            padding_mode='zeros',
            align_corners=False
        )

        return transformed.squeeze(0).squeeze(0)

    def create_transformation_grid(self, matrix, height, width,
                                   source_height=None, source_width=None):
        """Build the ``grid_sample`` sampling grid for a homography.

        Args:
            matrix: 3x3 tensor mapping source pixel coordinates to output
                pixel coordinates.
            height, width: Size of the output canvas.
            source_height, source_width: Size of the image that will be
                sampled; default to the output size.

        Returns:
            ``(height, width, 2)`` tensor of ``(x, y)`` sampling locations in
            the normalised coordinates used by ``grid_sample`` with
            ``align_corners=False``.

        Raises:
            ValueError: If ``matrix`` is not finite or has zero determinant.
        """
        if source_height is None:
            source_height = height
        if source_width is None:
            source_width = width

        # Homogeneous pixel coordinates of every output pixel.
        y_coords = torch.arange(height, dtype=torch.float32, device=self.device)
        x_coords = torch.arange(width, dtype=torch.float32, device=self.device)
        y_grid, x_grid = torch.meshgrid(y_coords, x_coords, indexing='ij')
        coords = torch.stack([x_grid, y_grid, torch.ones_like(x_grid)], dim=-1)  # [H, W, 3]

        # A non-invertible matrix describes a degenerate box. Substituting the
        # identity would paint the kernel at the canvas origin, so report it.
        if not bool(torch.isfinite(matrix).all()) or float(torch.det(matrix)) == 0.0:
            raise ValueError("perspective matrix is not invertible")
        inv_matrix = torch.inverse(matrix)

        # Output pixel -> source pixel (perspective divide).
        source_coords = torch.matmul(coords, inv_matrix.T)  # [H, W, 3]
        source_x = source_coords[..., 0] / source_coords[..., 2]
        source_y = source_coords[..., 1] / source_coords[..., 2]

        # Pixel index i has its centre at (2 * i + 1) / size - 1 when
        # align_corners=False; this is also safe for a size of 1.
        grid_x = (2.0 * source_x + 1.0) / source_width - 1.0
        grid_y = (2.0 * source_y + 1.0) / source_height - 1.0

        return torch.stack([grid_x, grid_y], dim=-1)


# 使用例とパフォーマンス比較
def benchmark_dataset(dataset, num_samples=10):
    """Measure the average time needed to fetch one sample from ``dataset``.

    Args:
        dataset: Indexable object with ``__len__`` whose items unpack into
            ``(image, gt)``.
        num_samples: Maximum number of leading samples to fetch.

    Returns:
        Average seconds per fetched sample. The average is taken over the
        samples actually fetched (``min(num_samples, len(dataset))``), and is
        ``0.0`` when nothing was fetched.
    """
    import time

    sample_count = max(0, min(num_samples, len(dataset)))

    # perf_counter is monotonic and has the best available resolution.
    start_time = time.perf_counter()
    for i in range(sample_count):
        image, gt = dataset[i]
    elapsed = time.perf_counter() - start_time

    avg_time = elapsed / sample_count if sample_count else 0.0
    print(f"Average time per sample: {avg_time:.4f} seconds")
    return avg_time


# データローダー用の高速化設定
def create_optimized_dataloader(dataset, batch_size=8, num_workers=4):
    """Create a shuffling ``DataLoader`` with worker-related speed-ups enabled.

    Args:
        dataset: Dataset to load from.
        batch_size: Number of samples per batch.
        num_workers: Number of loader worker processes; ``0`` loads in the
            calling process.

    Returns:
        A ``DataLoader`` with ``shuffle=True``. Memory pinning is enabled when
        CUDA is available. Persistent workers and a prefetch factor of 2 are
        requested only when ``num_workers > 0``.
    """
    from torch.utils.data import DataLoader

    loader_kwargs = {
        'batch_size': batch_size,
        'shuffle': True,
        'num_workers': num_workers,
        'pin_memory': torch.cuda.is_available(),
    }
    # These two options are only valid with worker processes; DataLoader
    # raises ValueError if they are set while num_workers == 0.
    if num_workers > 0:
        loader_kwargs['persistent_workers'] = True
        loader_kwargs['prefetch_factor'] = 2

    return DataLoader(dataset, **loader_kwargs)

# メインの実行

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms
from PIL import Image
import os
import json
import numpy as np
import cv2
from tqdm import tqdm
import matplotlib.pyplot as plt

def crop_labels_to_match(labels_to_crop, target_tensor):
    """Centre-crop ``labels_to_crop`` to the spatial size of ``target_tensor``.

    Both arguments are indexed as 4-D tensors; only dimensions 2 and 3 are
    cropped, with the crop offset rounded down.
    """
    out_h, out_w = target_tensor.shape[2:]
    in_h, in_w = labels_to_crop.shape[2:]
    top = (in_h - out_h) // 2
    left = (in_w - out_w) // 2
    rows = slice(top, top + out_h)
    cols = slice(left, left + out_w)
    return labels_to_crop[:, :, rows, cols]
# Training run on the backup copy of the synthetic data.
# Images are only converted to tensors; resizing is done inside the dataset.
transform = transforms.Compose([
    transforms.ToTensor()
])

# --- Dataset ---
# Document IDs held out for evaluation.
test_doc_id_list = ['100241706', '100249371', '100249376', '100249416', '100249476', '100249537', '200003076', '200003803', '200003967', '200004107']
train_dataset = PreTrainDataset(test_doc_id_list,
                            test_mode = False,
                            input_path = '../kuzushiji-recognition/synthetic_images_backup/input_images/',
                            json_path = '../kuzushiji-recognition/synthetic_images_backup/gt_json_backup.json',
                            transform = transform)
test_dataset = PreTrainDataset(test_doc_id_list,
                            test_mode = True,
                            input_path = '../kuzushiji-recognition/synthetic_images_backup/input_images/',
                            json_path = '../kuzushiji-recognition/synthetic_images_backup/gt_json_backup.json',
                            transform = transform)

# --- Data loaders ---
train_dl = create_optimized_dataloader(train_dataset, batch_size=1, num_workers=1)
# The evaluation loader must read the held-out split; building it from
# train_dataset would report the training loss a second time.
test_dl = create_optimized_dataloader(test_dataset, batch_size=1, num_workers=1)

# --- Model, loss and optimiser ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UNet(3, 4).to(device)
criterion = nn.MSELoss()  # the targets are continuous heat maps, so this is a regression
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# --- Training loop ---
num_epochs = 100

# Per-epoch average losses, used for the plot at the end.
train_loss_history = []
test_loss_history = []

print("学習を開始します...")
for epoch in range(num_epochs):
    print(f'start epcoch')

    # --- Training phase ---
    model.train()
    train_loss_total = 0

    train_bar = tqdm(train_dl, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
    for imgs, masks in train_bar:
        imgs, masks = imgs.to(device), masks.to(device)

        preds = model(imgs)
        # The network output can be smaller than the target, so the target is
        # centre-cropped to the prediction size before the loss is computed.
        cropped_masks = crop_labels_to_match(masks, preds)

        loss = criterion(preds, cropped_masks)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss_total += loss.item()
        # Show the current batch loss on the progress bar.
        train_bar.set_postfix(loss=loss.item())

    avg_train_loss = train_loss_total / len(train_dl)
    train_loss_history.append(avg_train_loss)

    # --- Evaluation phase ---
    model.eval()
    test_loss_total = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        test_bar = tqdm(test_dl, desc=f"Epoch {epoch+1}/{num_epochs} [Test]")
        for imgs, masks in test_bar:
            imgs, masks = imgs.to(device), masks.to(device)
            preds = model(imgs)
            cropped_masks = crop_labels_to_match(masks, preds)

            loss = criterion(preds, cropped_masks)
            test_loss_total += loss.item()
            test_bar.set_postfix(loss=loss.item())

    avg_test_loss = test_loss_total / len(test_dl)
    test_loss_history.append(avg_test_loss)

    # Epoch summary.
    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Test Loss: {avg_test_loss:.4f}")

print("学習が完了しました。")

# --- Loss curves ---
plt.figure(figsize=(10, 5))
plt.plot(train_loss_history, label="Train Loss")
plt.plot(test_loss_history, label="Test Loss")
plt.title("Loss Trend")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.show()

Using device: cuda


KeyboardInterrupt: 

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms
from PIL import Image
import os
import json
import numpy as np
import cv2
from tqdm import tqdm
import matplotlib.pyplot as plt

def crop_labels_to_match(labels_to_crop, target_tensor):
    """Centre-crop ``labels_to_crop`` to the spatial size of ``target_tensor``.

    Both arguments are indexed as 4-D tensors; only dimensions 2 and 3 are
    cropped, with the crop offset rounded down.
    """
    out_h, out_w = target_tensor.shape[2:]
    in_h, in_w = labels_to_crop.shape[2:]
    top = (in_h - out_h) // 2
    left = (in_w - out_w) // 2
    rows = slice(top, top + out_h)
    cols = slice(left, left + out_w)
    return labels_to_crop[:, :, rows, cols]
# Training run on the dataset's default input and annotation paths.
# NOTE: `transform` is defined but not passed to the datasets below, so they
# run with transform=None.
transform = transforms.Compose([
    transforms.ToTensor()
])

# Selected once and shared by the datasets and the model, so the cell also
# runs on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Dataset ---
# Document IDs held out for evaluation.
test_doc_id_list = ['100241706', '100249371', '100249376', '100249416', '100249476', '100249537', '200003076', '200003803', '200003967', '200004107']

train_dataset = PreTrainDataset(
    test_doc_id_list=test_doc_id_list,
    test_mode=False,
    device=device,
    precompute_gt=False,  # targets are rendered on demand, not pre-computed
)
test_dataset = PreTrainDataset(
    test_doc_id_list=test_doc_id_list,
    test_mode=True,
    device=device,
    precompute_gt=False,  # targets are rendered on demand, not pre-computed
)

# --- Data loaders ---
train_dl = create_optimized_dataloader(train_dataset, batch_size=1, num_workers=1)
# The evaluation loader must read the held-out split; building it from
# train_dataset would report the training loss a second time.
test_dl = create_optimized_dataloader(test_dataset, batch_size=1, num_workers=1)

# --- Model, loss and optimiser ---
model = UNet(3, 4).to(device)
criterion = nn.MSELoss()  # the targets are continuous heat maps, so this is a regression
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# --- Training loop ---
num_epochs = 100

# Per-epoch average losses, used for the plot at the end.
train_loss_history = []
test_loss_history = []

print("学習を開始します...")
for epoch in range(num_epochs):
    print(f'start epcoch')

    # --- Training phase ---
    model.train()
    train_loss_total = 0

    train_bar = tqdm(train_dl, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
    for imgs, masks in train_bar:
        imgs, masks = imgs.to(device), masks.to(device)

        preds = model(imgs)
        # The network output can be smaller than the target, so the target is
        # centre-cropped to the prediction size before the loss is computed.
        cropped_masks = crop_labels_to_match(masks, preds)

        loss = criterion(preds, cropped_masks)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss_total += loss.item()
        # Show the current batch loss on the progress bar.
        train_bar.set_postfix(loss=loss.item())

    avg_train_loss = train_loss_total / len(train_dl)
    train_loss_history.append(avg_train_loss)

    # --- Evaluation phase ---
    model.eval()
    test_loss_total = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        test_bar = tqdm(test_dl, desc=f"Epoch {epoch+1}/{num_epochs} [Test]")
        for imgs, masks in test_bar:
            imgs, masks = imgs.to(device), masks.to(device)
            preds = model(imgs)
            cropped_masks = crop_labels_to_match(masks, preds)

            loss = criterion(preds, cropped_masks)
            test_loss_total += loss.item()
            test_bar.set_postfix(loss=loss.item())

    avg_test_loss = test_loss_total / len(test_dl)
    test_loss_history.append(avg_test_loss)

    # Epoch summary.
    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Test Loss: {avg_test_loss:.4f}")

print("学習が完了しました。")

# --- Loss curves ---
plt.figure(figsize=(10, 5))
plt.plot(train_loss_history, label="Train Loss")
plt.plot(test_loss_history, label="Test Loss")
plt.title("Loss Trend")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.show()

Using device: cuda
jsonデータを読み込みました。
Using device: cuda
jsonデータを読み込みました。
学習を開始します...
start epcoch


Epoch 1/100 [Train]:   0%|          | 0/4965 [00:00<?, ?it/s]

In [9]:
torch.cuda.is_available()

True

# PreTrainDataset　がうまく動作するか確認のためのメイン

In [None]:

# Smoke test: build both splits and run one pass over the training loader.
# Relies on `device`, `model`, `criterion`, `optimizer` and
# `crop_labels_to_match` defined by the training cell above.
test_doc_id_list = ['100241706']

# The dataset returns PIL images when no transform is given, and the default
# collate function of DataLoader cannot batch those; convert to tensors.
to_tensor = transforms.ToTensor()
train_dataset = PreTrainDataset(test_doc_id_list,
                            test_mode = False,
                            input_path = '../kuzushiji_recognition/synthetic_images/input_images/',
                            json_path = '../kuzushiji_recognition/synthetic_images/gt_json.json',
                            transform = to_tensor)
test_dataset = PreTrainDataset(test_doc_id_list,
                            test_mode = True,
                            input_path = '../kuzushiji_recognition/synthetic_images/input_images/',
                            json_path = '../kuzushiji_recognition/synthetic_images/gt_json.json',
                            transform = to_tensor)
# NOTE(review): a batch size of 4 only collates if all pages share one size
# after downsampling - confirm, or use batch_size=1 as the training cells do.
train_dl = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dl = DataLoader(test_dataset, batch_size=4, shuffle=True)

# Accumulator for the summed batch losses (it was never initialised before).
total_loss = 0.0
for imgs, masks in train_dl:
        imgs, masks = imgs.to(device), masks.to(device)
        preds = model(imgs)
        # Centre-crop the targets to the prediction size, as the training
        # loops do, so the loss compares tensors of equal shape.
        loss = criterion(preds, crop_labels_to_match(masks, preds))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

In [18]:
# Document IDs pasted as one comma-and-space separated string; the printed
# list is the form the `test_doc_id_list` literals in this notebook use.
aa = '100241706, 100249371, 100249376, 100249416, 100249476, 100249537, 200003076, 200003803, 200003967, 200004107, 200004148, 200005598, 200005798, 200006663, 200006665, 200008003, 200008316, 200010454, 200014685, 200014740, 200015779, 200015843, 200017458, 200018243, 200019865, 200020019, 200021063, 200021071, 200021086, 200021637, 200021644, 200021660, 200021712, 200021763, 200021802, 200021851, 200021853, 200021869, 200021925, 200022050, 200025191'
print(aa.split(sep=', '))

['100241706', '100249371', '100249376', '100249416', '100249476', '100249537', '200003076', '200003803', '200003967', '200004107', '200004148', '200005598', '200005798', '200006663', '200006665', '200008003', '200008316', '200010454', '200014685', '200014740', '200015779', '200015843', '200017458', '200018243', '200019865', '200020019', '200021063', '200021071', '200021086', '200021637', '200021644', '200021660', '200021712', '200021763', '200021802', '200021851', '200021853', '200021869', '200021925', '200022050', '200025191']
