<a href="https://colab.research.google.com/github/romjiik/DL_cours_HSE/blob/main/hw05_letters_classification_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import cv2
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import matplotlib.pyplot as plt
from torchvision import transforms
from sklearn.model_selection import train_test_split
import random
from PIL import Image, ImageDraw, ImageFont
from collections import Counter
from tqdm.auto import tqdm
import os

# Преобразования картинки

In [None]:
# Disable Pillow's decompression-bomb size check so the very large source sheet can be opened.
Image.MAX_IMAGE_PIXELS = None
# Location of the single sheet image that contains all letters.
path = '/kaggle/input/letters-26/leters_26.png'
# Load the sheet and convert it to 3-channel RGB.
img = Image.open(path).convert('RGB')

Размер исходной картинки — 30000×30000 пикселей. На ней 10000 букв, расположенных сеткой 100×100, значит, каждая буква занимает квадрат 300×300 пикселей.

In [None]:
# plt.imshow(img[:300, :300, :])

Разрежем картинку на отдельные буквы и сохраним каждую букву в отдельный файл. Всего должно получиться 10000 файлов.

In [None]:
save_path = '/kaggle/working/dl_hw/data_test'
# Create the output folder once up front instead of re-checking it for every tile.
os.makedirs(save_path, exist_ok=True)

# Running 1-based file number used for the output names (00001.png, 00002.png, ...).
count = 1
# NOTE: ``i`` is the left (x) edge and ``j`` the upper (y) edge of the crop box,
# so tiles are numbered column by column.
for i in range(0, 30000, 300):
    for j in range(0, 30000, 300):
        box = (i, j, i + 300, j + 300)
        letter_image = img.crop(box)
        # Rotate by 5 degrees per grid step (column index + row index); a negative
        # angle is clockwise in Pillow. ``expand`` grows the canvas so nothing is
        # cut off, and the uncovered corners are filled with white.
        letter_image = letter_image.rotate(
            -(i // 300 + j // 300) * 5, Image.NEAREST, expand=1, fillcolor='white'
        )
        letter_image.save(f"{save_path}/{count:05}.png")
        count += 1


# Генерация датасета

In [None]:
# Font set: the DejaVu TTF files bundled with matplotlib. The directory is
# resolved from the installed matplotlib package instead of a hard-coded
# interpreter-specific path, so the list stays valid across environments
# (e.g. ``dist-packages`` vs. ``site-packages`` installs).
import matplotlib

_mpl_font_dir = os.path.join(matplotlib.get_data_path(), 'fonts', 'ttf')
font_list = [
    os.path.join(_mpl_font_dir, font_file)
    for font_file in (
        'DejaVuSans-Bold.ttf',
        'DejaVuSansMono-Bold.ttf',
        'DejaVuSansMono.ttf',
        'DejaVuSans.ttf',
        'DejaVuSerif-Italic.ttf',
        'DejaVuSerif.ttf',
    )
]

In [None]:
def generate_dataset(letters, quantity, train=True):
    """Render synthetic letter images and write them, plus a CSV index, to disk.

    For every character in ``letters`` the function draws ``quantity`` images
    of 300x300 pixels: a randomly coloured, sized and positioned glyph on a
    white background, covered by 10-40 semi-transparent random circles.
    Images are stored as ``<target>/<letter>/<NNNNN>.png`` and
    ``<target>/my_data.csv`` lists every file path with its letter
    (columns ``image`` and ``letter``).

    Args:
        letters: iterable of characters to render; each gets its own folder.
        quantity: number of images to generate per character.
        train: write to the training folder when true, otherwise to the
            validation folder.
    """
    # Resolve the output folder before the loop so it is defined (and exists)
    # even when ``letters`` is empty and only the CSV index is written.
    if train:
        target_path = '/kaggle/working/dl_hw/data_train'
    else:
        target_path = '/kaggle/working/dl_hw/data_val'
    os.makedirs(target_path, exist_ok=True)

    images_counter = Counter()
    images, chars = [], []
    for letter in letters:
        # One sub-folder per class.
        class_folder = target_path + "/" + str(letter)
        os.makedirs(class_folder, exist_ok=True)

        for _ in range(quantity):
            # RGBA canvas so the translucent circles can be alpha-composited.
            img = Image.new('RGBA', (300, 300), 'white')
            draw = ImageDraw.Draw(img)
            # Glyph colour.
            color_letter = (random.randint(0, 200), random.randint(0, 200), random.randint(0, 200))
            # Glyph size.
            size = random.randint(100, 200)
            # Index 5 was found by the author, by trial, to be the matching font.
            font = ImageFont.truetype(font_list[5], size=size)
            # Draw the glyph at a random offset.
            draw.text(
                (random.randint(0, 300 - size), random.randint(0, 300 - size)),
                letter,
                fill=color_letter,
                font=font,
            )
            # Overlay random circles as noise.
            circles = random.randint(10, 40)
            for _circle in range(circles):
                # Circle colour; the fourth component is a low alpha value.
                color_circle = (
                    random.randint(1, 200),
                    random.randint(1, 200),
                    random.randint(1, 200),
                    random.randint(50, 70),
                )
                # Side of the ellipse bounding box, i.e. the circle diameter.
                diameter = random.randint(5, 70)
                # Top-left corner, chosen so the circle stays inside the canvas.
                x_0 = random.randint(0, 300 - diameter)
                y_0 = random.randint(0, 300 - diameter)
                x_1 = x_0 + diameter
                y_1 = y_0 + diameter
                # Each circle goes on its own transparent layer that is then
                # blended onto the picture.
                overlay = Image.new('RGBA', img.size)
                draw = ImageDraw.Draw(overlay)
                draw.ellipse([x_0, y_0, x_1, y_1], fill=color_circle)
                img = Image.alpha_composite(img, overlay)
            img = img.convert('RGB')
            # Save the picture and record it in the index.
            images_counter[letter] += 1
            file_name = f"/{images_counter[letter]:05}.png"
            full_image_file_name = class_folder + file_name
            images.append(full_image_file_name)
            chars.append(letter)
            # Save through Pillow: cv2.imwrite would interpret the RGB data as
            # BGR and store the picture with red and blue swapped.
            img.save(full_image_file_name)

    pd.DataFrame({'image': images, 'letter': chars}).to_csv(f'{target_path}/my_data.csv', index=False)


In [None]:
import string

# The classes are the 26 uppercase Latin letters.
alphabet = [*string.ascii_uppercase]
# Images generated per letter for each split.
quantity_train, quantity_val = 1000, 200

# Build the training split first, then the validation split.
generate_dataset(alphabet, quantity_train, train=True)
generate_dataset(alphabet, quantity_val, train=False)

In [None]:
# Class index for every uppercase Latin letter: 'A' -> 0, 'B' -> 1, ..., 'Z' -> 25.
letters_mapping = {
    letter: index for index, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
}

# Модель

In [None]:
class LetterDataset(Dataset):
    """Labelled letter images described by a CSV index file.

    The CSV is read with pandas; its ``image`` column holds the path of each
    picture and its ``letter`` column the character shown on it.
    """

    def __init__(self, path, val=True, transform=None):
        """Read the index at ``path`` and remember the optional ``transform``.

        ``val`` is accepted for interface compatibility but is not used.
        """
        super().__init__()
        self.transform = transform
        self.data = pd.read_csv(path)

    def __len__(self):
        """Return the number of rows in the index."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``{'image': ..., 'letter': ...}`` for the row labelled ``index``.

        The image is opened with Pillow and passed through ``transform`` when
        one was given; the letter is converted to its class index.
        """
        file_name = self.data.loc[index, 'image']
        character = self.data.loc[index, 'letter']
        picture = Image.open(file_name)
        if self.transform is not None:
            picture = self.transform(picture)
        return {'image': picture, 'letter': letters_mapping[character]}

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import resnet18, ResNet18_Weights

class CustomResNet(nn.Module):
    """ImageNet-pretrained ResNet-18 whose last layer is replaced by a 26-way classifier."""

    def __init__(self):
        super().__init__()
        backbone = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
        # Swap the original classification head for one with 26 outputs.
        backbone.fc = nn.Linear(backbone.fc.in_features, 26)
        self.model = backbone

    def forward(self, x):
        """Return the class logits produced by the wrapped network."""
        return self.model(x)

    def compute_all(self, batch):
        """Compute the loss and metrics for one batch.

        Args:
            batch: mapping with an ``'image'`` tensor and a ``'letter'`` tensor
                of target class indices.

        Returns:
            ``(loss, metrics)`` where ``loss`` is the cross-entropy tensor and
            ``metrics`` is ``{'acc': <batch accuracy as a NumPy scalar>}``.
        """
        inputs = batch['image'].float()
        targets = batch['letter']
        logits = self.forward(inputs)
        loss = F.cross_entropy(logits, targets)
        # Share of samples whose highest-scoring class equals the target.
        hits = logits.argmax(axis=1) == targets
        accuracy = hits.float().mean().cpu().numpy()
        return loss, {'acc': accuracy}

In [None]:
from torch.utils.tensorboard import SummaryWriter


class Trainer:
    """Small training/validation loop around a model that implements ``compute_all``.

    The model must expose ``compute_all(batch) -> (loss, metrics)`` where
    ``metrics`` is a dict that contains at least the key ``'acc'``.
    """

    def __init__(self, model, optimizer, train_dataset, val_dataset, batch_size=128):
        """Store the training objects and move the model to the GPU if one is available.

        Args:
            model: network to train.
            optimizer: optimizer already bound to the model parameters.
            train_dataset: dataset used for the optimisation steps.
            val_dataset: dataset used for validation after each epoch.
            batch_size: batch size for both data loaders.
        """
        self.model = model
        self.optimizer = optimizer
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset

        self.batch_size = batch_size

        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = torch.cuda.current_device()
            self.model = self.model.to(self.device)

        # Counts optimisation steps across epochs; used as the TensorBoard x-axis.
        self.global_step = 0
        self.writer = SummaryWriter("./tmp/")

    def save_checkpoint(self, path):
        """Write the model's ``state_dict`` to ``path``."""
        torch.save(self.model.state_dict(), path)

    def train(self, num_epochs):
        """Train for ``num_epochs`` epochs, validating after each one.

        Whenever the mean validation loss improves, the weights are saved to
        ``./best_checkpoint.pth``. The accuracy of the last training batch and
        the mean validation accuracy are printed every epoch.

        Returns:
            ``(best_loss, best_acc)``: the lowest mean validation loss and the
            mean validation accuracy of that epoch. If the loss never improved
            (for example ``num_epochs == 0``) this is ``(inf, None)``.
        """
        model = self.model
        optimizer = self.optimizer

        train_loader = DataLoader(self.train_dataset, shuffle=True, pin_memory=True, batch_size=self.batch_size)

        # The validation set should not be shuffled.
        val_loader = DataLoader(self.val_dataset, shuffle=False, pin_memory=True, batch_size=self.batch_size)
        best_loss = float('inf')
        # Defined up front so the return statement cannot hit an unbound name
        # when no epoch improves on ``best_loss``.
        best_acc = None

        for epoch in range(num_epochs):
            model.train()
            for batch in tqdm(train_loader):
                batch = {k: v.to(self.device) for k, v in batch.items()}
                loss, metric = model.compute_all(batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                for k, v in metric.items():
                    self.writer.add_scalar(k, v, global_step=self.global_step)

                self.writer.add_scalar("loss", loss.item(), global_step=self.global_step)

                self.global_step += 1

            # Accuracy of the last training batch of this epoch.
            print(metric['acc'])

            model.eval()

            val_losses = []
            val_metrics = []
            # No gradients are needed for validation; skipping autograd saves
            # memory and time.
            with torch.no_grad():
                for batch in tqdm(val_loader):
                    batch = {k: v.to(self.device) for k, v in batch.items()}
                    loss, metric = model.compute_all(batch)
                    val_losses.append(loss.item())
                    val_metrics.append(metric['acc'])

            # Unweighted means over batches (a smaller last batch counts as much
            # as a full one).
            val_loss = np.mean(val_losses)
            val_metric = np.mean(val_metrics)
            print(val_metric)
            if val_loss < best_loss:
                self.save_checkpoint("./best_checkpoint.pth")
                best_loss = val_loss
                best_acc = val_metric
        return best_loss, best_acc

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


# Обучение

In [None]:
# CSV index files of the generated training and validation splits.
data_path_train = '/kaggle/working/dl_hw/data_train/my_data.csv'
data_path_val = '/kaggle/working/dl_hw/data_val/my_data.csv'

# Training pipeline: resize, elastic distortion as augmentation, tensor conversion.
transform_to_tensor_train = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.ElasticTransform(alpha=25.0),
        transforms.ToTensor(),
    ]
)

# Validation pipeline: the same steps without the augmentation.
transform_to_tensor_val = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.ToTensor(),
    ]
)

train = LetterDataset(data_path_train, transform=transform_to_tensor_train)
val = LetterDataset(data_path_val, transform=transform_to_tensor_val)

# Fine-tune every parameter of the network with Adam.
model = CustomResNet()
opt = optim.Adam(model.parameters(), lr=0.01)

trainer = Trainer(model, opt, train, val, batch_size=128)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 190MB/s]


In [None]:
print(trainer.train(3))
# Trainer.train stores the weights with the lowest validation loss in
# ./best_checkpoint.pth, while `model` itself is left in its last-epoch state.
# That state can be much worse: in the recorded run the last epoch reached about
# 0.51 validation accuracy versus about 0.99 for the best one. Restore the best
# weights so that both the exported file and the later predictions use them.
best_checkpoint_path = "./best_checkpoint.pth"
if os.path.exists(best_checkpoint_path):
    model.load_state_dict(torch.load(best_checkpoint_path))
torch.save(model.state_dict(), "custom_resnet.pt")  # keep the trained weights on disk

  0%|          | 0/204 [00:00<?, ?it/s]

0.875


  0%|          | 0/41 [00:00<?, ?it/s]

0.050990853


  0%|          | 0/204 [00:00<?, ?it/s]

1.0


  0%|          | 0/41 [00:00<?, ?it/s]

0.9946646


  0%|          | 0/204 [00:00<?, ?it/s]

1.0


  0%|          | 0/41 [00:00<?, ?it/s]

0.5050686
(0.01766648275987813, 0.9946646)


# Предсказания

In [None]:
class ImageDataset(Dataset):
    """Unlabelled dataset over every file of a directory, in sorted file-name order."""

    def __init__(self, data_path, transform=None):
        """List the directory once and remember the optional transform.

        Args:
            data_path: directory that contains the image files.
            transform: optional callable applied to every opened image.
        """
        super().__init__()
        self.data_path = data_path
        self.transform = transform
        # Sorted so that zero-padded names (00001.png, 00002.png, ...) come out
        # in numeric order.
        self.files = sorted(os.listdir(f'{data_path}/'))
        self.all_files = list(self.files)

    def __len__(self):
        """Return the number of files found in the directory."""
        return len(self.all_files)

    def __getitem__(self, index):
        """Return ``{'image': ...}`` for the ``index``-th file of the listing.

        The file name is taken from the directory listing that also defines
        ``__len__``, so the two always agree instead of relying on files being
        named ``<index + 1>.png``.
        """
        img = Image.open(f"{self.data_path}/{self.all_files[index]}")
        if self.transform is not None:
            img = self.transform(img)
        return {'image': img}

In [None]:
# Test-time preprocessing: resize, then convert to a tensor (no augmentation).
transform_to_tensor = transforms.Compose(
    [transforms.Resize(256), transforms.ToTensor()]
)

# Dataset over the letter tiles written to ``save_path``.
test = ImageDataset(data_path=save_path, transform=transform_to_tensor)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [None]:
# Iterate the test set in its stored order (no shuffling), 128 images per batch.
test_loader = DataLoader(
    dataset=test,
    batch_size=128,
    shuffle=False,
    pin_memory=True,
)

# Class index for every uppercase Latin letter: 'A' -> 0, 'B' -> 1, ..., 'Z' -> 25.
letters_mapping = {
    letter: index for index, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
}

In [None]:
device = 'cpu'
if torch.cuda.is_available():
    device = torch.cuda.current_device()

# Predicted class index for every test image, in loader order.
pred = []
# Make sure the weights live on the same device the batches are sent to.
model.to(device)
model.eval()
# Inference only: disabling autograd avoids building a graph for every batch,
# which saves memory and time.
with torch.no_grad():
    for images in tqdm(test_loader):
        images = images['image']
        images = images.to(device)
        logits = model(images)
        pred.extend(torch.argmax(logits, dim=1).tolist())

  0%|          | 0/79 [00:00<?, ?it/s]

In [None]:
# Invert the mapping once (class index -> letter) instead of rebuilding the key
# and value lists and searching them linearly for every prediction.
index_to_letter = {index: letter for letter, index in letters_mapping.items()}
letters = [index_to_letter[index] for index in pred]

In [None]:
from collections import Counter

# Count how often each letter was predicted: one row per letter, one count column.
result = pd.DataFrame.from_dict(Counter(letters), orient='index')
# Written without a header row; the path no longer starts with a doubled slash.
result.to_csv('/kaggle/working/letters.csv', header=False)
