In [1]:

from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import csv
import math
import tqdm
from typing import Optional

from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, random_split

from rdkit import Chem
import rdkit.Chem.Draw



In [12]:
# --- Input CSV files --------------------------------------------------------
# Folder with the competition CSVs and the names of the files read from it.
DATA_DIR = Path(r"Z:\Dropbox\www\studs.steelfeet.ru\_hack\2021-22\global-ai")
DATA_FILE = "train.csv"
TEST_FILE = "test.csv"

# --- Rendered image folders -------------------------------------------------
# Images the network is trained on (class sub-folders "0" and "1").
TRAIN_DIR = Path(r"D:\_hack\2021-22\global-ai\train")
# Original, non-augmented renderings.
TRAIN_DIR_1 = Path(r"D:\_hack\2021-22\global-ai\no_aug")

# --- Training settings ------------------------------------------------------
MODEL_FILENAME = "v5-mobilenet_v3_large-pretrained.pth"
VALID_PART = 0.1   # share of the samples held out for validation
BATCH = 128        # DataLoader batch size
EPOCHS = 10        # number of passes over the training split
LR = 0.01
IM_SIZE = 600      # images are resized to IM_SIZE x IM_SIZE



# Загрузка данных

In [24]:
# добавляем аугментированные картинки
# https://github.com/aleju/imgaug
# https://nbviewer.org/github/aleju/imgaug-doc/blob/master/notebooks/A01%20-%20Load%20and%20Augment%20an%20Image.ipynb
# https://imgaug.readthedocs.io/en/latest/source/examples_basics.html
import random
import imageio
import imgaug as ia
from imgaug import augmenters as iaa

# Augmentation pipeline for rendered molecule images: a random down-scale plus
# rotation and an elastic distortion, applied in random order.
# Elastic transformation reference:
# https://imgaug.readthedocs.io/en/latest/source/api_augmenters_geometric.html#imgaug.augmenters.geometric.ElasticTransformation
# Disabled alternatives: iaa.AdditiveGaussianNoise(scale=(10, 60)),
# iaa.AddToHueAndSaturation(60).
_affine_step = iaa.Affine(
    scale=dict(x=(0.7, 0.9), y=(0.6, 0.8)),
    rotate=(-25, 25),
)
_elastic_step = iaa.ElasticTransformation(alpha=10, sigma=9)
seq = iaa.Sequential([_affine_step, _elastic_step], random_order=True)


# Render every training molecule to PNG and record (row_number, smiles) pairs:
#   data_active_noaug - active molecules, one entry per CSV row
#   data_active       - active molecules (same entries at this point)
#   data_no_active    - inactive molecules
data_active_noaug = []
data_active = []
data_no_active = []

# Create the class sub-folders up front so that writing the PNG files below
# does not fail on a machine where the image trees do not exist yet.
for image_root in (TRAIN_DIR_1, TRAIN_DIR):
    for class_name in ("0", "1"):
        Path(image_root, class_name).mkdir(parents=True, exist_ok=True)

n = 1  # 1-based CSV row number, also used as the image file name
with open(DATA_DIR.joinpath(DATA_FILE), 'r', newline='') as file_name:
    reader = csv.DictReader(file_name)
    for row in reader:
        mol = Chem.MolFromSmiles(row["Smiles"])
        if mol is None:
            # A SMILES string that cannot be parsed yields None; skip the row
            # instead of aborting, but keep the numbering aligned with the CSV.
            print("skipping row", n, "- cannot parse SMILES:", row["Smiles"])
            n += 1
            continue

        png_name = str(n) + ".png"
        if row["Active"] == "True":
            data_active.append((n, row["Smiles"]))
            data_active_noaug.append((n, row["Smiles"]))

            # Active molecules are written unmodified to both image trees.
            rdkit.Chem.Draw.MolToFile(mol, str(Path(TRAIN_DIR_1, "1", png_name)), imageType="png")
            rdkit.Chem.Draw.MolToFile(mol, str(Path(TRAIN_DIR, "1", png_name)), imageType="png")
        else:
            data_no_active.append((n, row["Smiles"]))

            noaug_class_path = Path(TRAIN_DIR_1, "0", png_name)
            rdkit.Chem.Draw.MolToFile(mol, str(noaug_class_path), imageType="png")

            # Every inactive image in the training tree is augmented as well.
            image = imageio.imread(noaug_class_path)
            image_aug = seq(image=image)
            imageio.imwrite(Path(TRAIN_DIR, "0", png_name), image_aug)

        n += 1

# Class sizes and their ratio; the ratio is NaN when there are no inactive rows.
active_ratio = len(data_active_noaug) / len(data_no_active) if data_no_active else float("nan")
print(len(data_active_noaug), len(data_no_active), active_ratio)



# Oversample the active class: add augmented copies of randomly chosen
# original active images until the active list is as long as the inactive one.
# New files are numbered from n upwards.
extra_needed = len(data_no_active) - len(data_active_noaug)
for i in range(n, n + extra_needed):
    file_n, smile = random.choice(data_active_noaug)
    source_png = Path(TRAIN_DIR_1, "1", str(file_n) + ".png")

    image_aug = seq(image=imageio.imread(source_png))
    imageio.imwrite(Path(TRAIN_DIR, "1", str(i) + ".png"), image_aug)

    data_active.append((i, smile))


print(len(data_active), len(data_no_active), len(data_active) / len(data_no_active))



206 5351 0.03849747710708279
5351 5351 1.0


# Формируем Датасет

In [25]:
# Collect image paths and integer labels from TRAIN_DIR. Each sub-folder is
# one class; classes are numbered 0, 1, ... in sorted folder order.
X_Train = []
Y_Train = []
NUM_CL = 0
for class_dir in sorted(entry for entry in TRAIN_DIR.iterdir() if entry.is_dir()):
    # iterdir() is used instead of glob('*/'): on Python 3.11+ a glob pattern
    # that ends with a path separator matches directories only, which would
    # skip every image file. Sorting keeps the sample order reproducible.
    for path in sorted(entry for entry in class_dir.iterdir() if entry.is_file()):
        X_Train.append(str(path))
        Y_Train.append(NUM_CL)
    NUM_CL = NUM_CL + 1

print("load image success, X_Train count:", str(len(X_Train)))



load image success, X_Train count: 10702


# Загружаем изображения

In [26]:
from PIL import Image

# Preprocessing used for both the training and the test dataset:
# convert to tensor, resize to IM_SIZE x IM_SIZE, normalise each channel.
Transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((IM_SIZE, IM_SIZE)),
    transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    ),
])


class GetData(Dataset):
    """Image dataset over a list of file paths.

    A substring of ``Dir`` selects what accompanies each image: a value
    containing ``"train"`` yields ``(image, label)`` pairs, a value containing
    ``"test"`` yields ``(image, file_name)`` pairs. ``"train"`` is checked
    first.

    Args:
        Dir: Dataset root as a string; only inspected for the substrings
            ``"train"`` / ``"test"``.
        FNames: Sequence of image file paths.
        Labels: Sequence of labels aligned with ``FNames``; not read in test
            mode.
        Transform: Callable applied to each RGB PIL image.
    """

    def __init__(self, Dir, FNames, Labels, Transform):
        self.dir = Dir
        self.fnames = FNames
        self.transform = Transform
        self.labels = Labels

    def __len__(self):
        """Return the number of image files."""
        return len(self.fnames)

    def __getitem__(self, index):
        """Load image ``index`` as RGB and return it transformed.

        Returns:
            ``(transformed_image, label)`` in train mode,
            ``(transformed_image, file_name)`` in test mode.

        Raises:
            ValueError: if ``Dir`` contains neither "train" nor "test".
        """
        is_train = "train" in self.dir
        if not is_train and "test" not in self.dir:
            # Fail loudly instead of returning None, which would only surface
            # later as an obscure batching error.
            raise ValueError(
                "GetData: Dir must contain 'train' or 'test', got " + repr(self.dir)
            )

        # convert() returns a new, fully loaded image, so the file handle can
        # be released as soon as the block ends.
        with Image.open(self.fnames[index]) as img:
            x = img.convert('RGB')

        if is_train:
            return self.transform(x), self.labels[index]
        return self.transform(x), self.fnames[index]

In [27]:
# Wrap the collected paths and labels in a dataset and split it randomly;
# roughly VALID_PART of the samples go to the validation subset.
full_set = GetData(str(TRAIN_DIR), X_Train, Y_Train, Transform)
train_size = int((1 - VALID_PART) * len(full_set))
valid_size = len(full_set) - train_size
train_set, valid_set = random_split(full_set, [train_size, valid_size])

# Both loaders shuffle and share the same batch size.
trainloader = DataLoader(train_set, shuffle=True, batch_size=BATCH)
validloader = DataLoader(valid_set, shuffle=True, batch_size=BATCH)


In [28]:
# Sanity check: fetch one training batch and show the shape of its images.
print("trainloader shape: ")
first_batch = next(iter(trainloader))
print(first_batch[0].shape)

print()
# Report which device is available.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
print("device: ", device)

# Pretrained MobileNetV3-Large; show the sizes of the first classifier layer.
model = torchvision.models.mobilenet_v3_large(pretrained=True)
print()
# print(model)
# print()
print("prev features: ")
first_head_layer = model.classifier[0]
print(first_head_layer.in_features)
print(first_head_layer.out_features)

trainloader shape: 
torch.Size([128, 3, 600, 600])

device:  cpu

prev features: 
960
1280


# Перенастраиваем модель под наши классы


In [29]:
# Freeze every existing weight; the layer created below is the only one that
# still requires gradients.
for weight in model.parameters():
    weight.requires_grad_(False)

# Replace the whole classifier stack with a single linear layer that maps the
# input size of the old first classifier layer to NUM_CL class scores.
n_inputs = model.classifier[0].in_features
model.classifier = nn.Linear(n_inputs, NUM_CL)
if torch.cuda.is_available():
    model.cuda()

# print()
# print(model)
print("new features: ")
print(model.classifier.out_features)

new features: 
2


# Настройки обучения


In [30]:
# Cross-entropy loss on the class scores; Adam is given only the parameters
# of the new classifier layer.
# NOTE(review): the LR constant from the configuration cell is not passed
# here, so Adam runs with its default learning rate - confirm this is intended.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.classifier.parameters())

# Containers for accuracy / loss values of the training and validation phases.
training_history = dict(accuracy=[], loss=[])
validation_history = dict(accuracy=[], loss=[])


# Training with Validation


In [31]:
# Train for EPOCHS epochs, validate after each one, and keep the checkpoint
# with the lowest validation loss. Per-epoch metrics are appended to
# training_history / validation_history.
min_valid_loss = np.inf

# Make sure the checkpoint folder exists before the first save.
checkpoint_path = Path("models", MODEL_FILENAME)
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

for e in range(EPOCHS):
    train_acc = 0.0
    train_loss = 0.0
    # The validation phase below switches the model to eval mode; switch back
    # at the start of every epoch so that all epochs train in the same mode.
    model.train()
    for data, labels in tqdm.tqdm(trainloader):
        # Transfer data to GPU if available
        if torch.cuda.is_available():
            data, labels = data.cuda(), labels.cuda()

        # Clear the gradients
        optimizer.zero_grad()
        # Forward pass
        target = model(data)
        # Find the loss
        loss = criterion(target, labels)
        # Calculate gradients
        loss.backward()
        # Update weights
        optimizer.step()
        # Accumulate plain floats so no tensors are kept alive between batches
        train_loss += loss.item()
        train_acc += (target.argmax(dim=1) == labels).float().mean().item()
    # Mean over batches; accuracy is reported in percent
    train_acc = train_acc / len(trainloader) * 100
    train_loss = train_loss / len(trainloader)

    valid_acc = 0.0
    valid_loss = 0.0
    model.eval()
    # No gradients are needed for validation
    with torch.no_grad():
        for data, labels in tqdm.tqdm(validloader):
            # Transfer data to GPU if available
            if torch.cuda.is_available():
                data, labels = data.cuda(), labels.cuda()

            # Forward pass
            target = model(data)
            # Find the loss
            loss = criterion(target, labels)
            valid_loss += loss.item()
            valid_acc += (target.argmax(dim=1) == labels).float().mean().item()
    valid_acc = valid_acc / len(validloader) * 100
    valid_loss = valid_loss / len(validloader)

    training_history['accuracy'].append(train_acc)
    training_history['loss'].append(train_loss)
    validation_history['accuracy'].append(valid_acc)
    validation_history['loss'].append(valid_loss)

    print(f'Epoch {e+1} | Train Acc: {train_acc:.6f} | Train Loss: {train_loss:.6f} | Valid Acc: {valid_acc:.6f} | Valid Loss: {valid_loss:.6f}')

    if min_valid_loss > valid_loss:
        print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \t Saving The Model')
        min_valid_loss = valid_loss
        # Saving state dict
        torch.save(model.state_dict(), str(checkpoint_path))

    print()

print()
print("success")

100%|██████████| 76/76 [28:37<00:00, 22.60s/it]
100%|██████████| 9/9 [02:44<00:00, 18.33s/it]


Epoch 1 | Train Acc: 56.385624 | Train Loss: 0.677314 | Valid Acc: 48.358078 | Valid Loss: 0.774146
Validation Loss Decreased(inf--->0.774146) 	 Saving The Model



100%|██████████| 76/76 [24:43<00:00, 19.52s/it]
100%|██████████| 9/9 [02:44<00:00, 18.30s/it]


Epoch 2 | Train Acc: 56.201916 | Train Loss: 0.678282 | Valid Acc: 57.110668 | Valid Loss: 0.667964
Validation Loss Decreased(0.774146--->0.667964) 	 Saving The Model



100%|██████████| 76/76 [24:41<00:00, 19.50s/it]
100%|██████████| 9/9 [02:45<00:00, 18.35s/it]


Epoch 3 | Train Acc: 60.371460 | Train Loss: 0.657979 | Valid Acc: 60.510857 | Valid Loss: 0.657353
Validation Loss Decreased(0.667964--->0.657353) 	 Saving The Model



100%|██████████| 76/76 [24:41<00:00, 19.49s/it]
100%|██████████| 9/9 [02:45<00:00, 18.33s/it]


Epoch 4 | Train Acc: 61.975407 | Train Loss: 0.648040 | Valid Acc: 61.803703 | Valid Loss: 0.650946
Validation Loss Decreased(0.657353--->0.650946) 	 Saving The Model



100%|██████████| 76/76 [24:41<00:00, 19.50s/it]
100%|██████████| 9/9 [02:44<00:00, 18.31s/it]


Epoch 5 | Train Acc: 64.189507 | Train Loss: 0.638180 | Valid Acc: 61.827721 | Valid Loss: 0.645886
Validation Loss Decreased(0.650946--->0.645886) 	 Saving The Model



100%|██████████| 76/76 [24:41<00:00, 19.49s/it]
100%|██████████| 9/9 [02:44<00:00, 18.28s/it]


Epoch 6 | Train Acc: 64.676292 | Train Loss: 0.632128 | Valid Acc: 62.198948 | Valid Loss: 0.633584
Validation Loss Decreased(0.645886--->0.633584) 	 Saving The Model



100%|██████████| 76/76 [24:40<00:00, 19.48s/it]
100%|██████████| 9/9 [02:44<00:00, 18.30s/it]


Epoch 7 | Train Acc: 65.606758 | Train Loss: 0.625804 | Valid Acc: 64.769867 | Valid Loss: 0.633135
Validation Loss Decreased(0.633584--->0.633135) 	 Saving The Model



100%|██████████| 76/76 [24:40<00:00, 19.48s/it]
100%|██████████| 9/9 [02:45<00:00, 18.35s/it]


Epoch 8 | Train Acc: 66.286545 | Train Loss: 0.618916 | Valid Acc: 64.162231 | Valid Loss: 0.625069
Validation Loss Decreased(0.633135--->0.625069) 	 Saving The Model



100%|██████████| 76/76 [24:40<00:00, 19.48s/it]
100%|██████████| 9/9 [02:44<00:00, 18.31s/it]


Epoch 9 | Train Acc: 66.527954 | Train Loss: 0.616308 | Valid Acc: 65.599144 | Valid Loss: 0.619713
Validation Loss Decreased(0.625069--->0.619713) 	 Saving The Model



100%|██████████| 76/76 [24:39<00:00, 19.47s/it]
100%|██████████| 9/9 [02:44<00:00, 18.33s/it]

Epoch 10 | Train Acc: 66.868172 | Train Loss: 0.612954 | Valid Acc: 66.404404 | Valid Loss: 0.617077
Validation Loss Decreased(0.619713--->0.617077) 	 Saving The Model


success





# Предсказание

In [32]:
# Folder with the rendered test images, kept as a plain string.
TEST_DIR = str(Path(r"D:\_hack\2021-22\global-ai\test"))


## Формируем Датасет

In [47]:
# Build the list of test image paths: one "<row number>.png" inside TEST_DIR
# per data row of the test CSV, numbered from 1.
# The PNG files themselves are not written here (the rendering calls that
# used to sit in this loop were commented out).
with open(DATA_DIR.joinpath(TEST_FILE), 'r') as file_name:
    row_total = sum(1 for _ in csv.DictReader(file_name))

X_Test = [str(Path(TEST_DIR, str(idx) + ".png")) for idx in range(1, row_total + 1)]

# n ends one past the last row number, as a running counter would.
n = row_total + 1


# Загружаем веса модели

In [48]:
print()
print("Load state dict:")
# Restore the saved checkpoint onto the device the model currently lives on.
# Without map_location, a checkpoint saved from a CUDA model cannot be loaded
# on a CPU-only machine.
model_device = next(model.parameters()).device
model.load_state_dict(
    torch.load(str(Path("models", MODEL_FILENAME)), map_location=model_device)
)
model.eval()


# Prepare the test image loader: one image per batch, in file-list order.
testset = GetData(TEST_DIR, X_Test, None, Transform)
testloader = DataLoader(testset, batch_size=1, shuffle=False)
print()
print("success")
print("testset len:", str(len(testset)))




Load state dict:

success
testset len: 1614


# Аугментируем тестовую выборку

In [39]:
# Augmentation pipeline for the rendered test images: a random down-scale plus
# rotation and an elastic distortion, applied in random order.
# Elastic transformation reference:
# https://imgaug.readthedocs.io/en/latest/source/api_augmenters_geometric.html#imgaug.augmenters.geometric.ElasticTransformation
# Disabled alternatives: iaa.AdditiveGaussianNoise(scale=(10, 60)),
# iaa.AddToHueAndSaturation(60).
_affine_step = iaa.Affine(
    scale=dict(x=(0.7, 0.9), y=(0.6, 0.8)),
    rotate=(-25, 25),
)
_elastic_step = iaa.ElasticTransformation(alpha=10, sigma=9)
seq = iaa.Sequential([_affine_step, _elastic_step], random_order=True)


# Render every test molecule to "<row number>.png" inside TEST_DIR and then
# overwrite that file with its augmented version.
X_test = []  # not used by the code in this cell

# Create the output folder up front so that writing the PNG files does not
# fail on a machine where it does not exist yet.
Path(TEST_DIR).mkdir(parents=True, exist_ok=True)

n = 1  # 1-based CSV row number, also used as the image file name
with open(DATA_DIR.joinpath(TEST_FILE), 'r', newline='') as file_name:
    reader = csv.DictReader(file_name)
    for row in reader:
        test_class_path = Path(TEST_DIR, str(n) + ".png")
        mol = Chem.MolFromSmiles(row["Smiles"])
        if mol is None:
            # Every test row needs an image, so stop with a message that
            # names the offending row instead of failing inside the drawing call.
            raise ValueError(f"test row {n}: cannot parse SMILES {row['Smiles']!r}")
        rdkit.Chem.Draw.MolToFile(mol, str(test_class_path), imageType="png")

        # Read the rendering back, augment it and save it under the same name.
        image = imageio.imread(test_class_path)
        image_aug = seq(image=image)
        imageio.imwrite(test_class_path, image_aug)

        n += 1

# Распознаем


In [49]:
# Predict a class for every test image.
# y_test collects [image number, predicted class index] pairs, where the
# image number is the numeric file name of the PNG.
y_test = []
model.eval()
# Inputs must live on the same device as the model weights.
model_device = next(model.parameters()).device
with torch.no_grad():
    for image, fname in testloader:
        logits = model(image.to(model_device))
        # The largest logit is the predicted class; exponentiating first
        # would not change the winner and can overflow.
        top_class = logits.argmax(dim=1)

        # Pair every prediction with its own file name so the loop stays
        # correct for any batch size; Path.stem works with either separator.
        for path, pred in zip(fname, top_class):
            image_n = int(Path(path).stem)
            y_test.append([image_n, pred.item()])

# Сохраняем

In [37]:
# Write the submission CSV: one row per prediction, numbered from 1, with the
# "Active" column False for predicted class 0 and True otherwise.
rows = [",Active\n"]
for n, y_pred in enumerate(y_test, start=1):
    # y_pred is [image number, predicted class]. The class is at index 1;
    # index 0 is the image number, which starts at 1, so testing it against 0
    # marked every row as True.
    label = "False" if y_pred[1] == 0 else "True"
    rows.append(str(n) + "," + label + "\n")
text = "".join(rows)

# The context manager closes the file even if the write fails.
with open('submission.csv', 'w') as file:
    file.write(text)
