# Homework 11 - Transfer Learning (Domain Adversarial Training)

> Author: Arvin Liu (r09922071@ntu.edu.tw)

If there are any questions, please contact mlta-2022-spring@googlegroups.com

In [None]:
!nvidia-smi

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import os

# your workspace in your drive
workspace = 'ML2022-hw11'


try:
  os.chdir(os.path.join('/content/gdrive/MyDrive/', workspace))
except:
  os.mkdir(os.path.join('/content/gdrive/MyDrive/', workspace))
  os.chdir(os.path.join('/content/gdrive/MyDrive/', workspace))

In [22]:
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
 
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

import random

In [23]:
train_data = './real_or_drawing/train_data'
test_data = './real_or_drawing/test_data'

batch_size = 64

In [24]:
def same_seeds(seed):
	  torch.manual_seed(seed)
	  if torch.cuda.is_available():
		    torch.cuda.manual_seed(seed)
		    torch.cuda.manual_seed_all(seed)
	  np.random.seed(seed)
	  random.seed(seed)
	  torch.backends.cudnn.benchmark = False
	  torch.backends.cudnn.deterministic = True

same_seeds(43854)

In [25]:
#ref: https://github.com/VisionLearningGroup/VisionLearningGroup.github.io/blob/master/M3SDA/code_MSDA_digit/metric/msda.py

def euclidean(x1,x2):
	return ((x1-x2)**2).sum().sqrt()

def k_moment(output_s1, output_s2, output_s3, output_s4, output_t, k):
	output_s1 = (output_s1**k).mean(0)
	output_s2 = (output_s2**k).mean(0)
	output_s3 = (output_s3**k).mean(0)
	output_t = (output_t**k).mean(0)
	return  euclidean(output_s1, output_t) + euclidean(output_s2, output_t) + euclidean(output_s3, output_t)+ \
		euclidean(output_s1, output_s2) + euclidean(output_s2, output_s3) + euclidean(output_s3, output_s1) +\
		euclidean(output_s4, output_s1) + euclidean(output_s4, output_s2) + euclidean(output_s4, output_s2) + \
		euclidean(output_s4, output_t)

def msda_regulizer(output_s1, output_s2, output_s3, output_s4, output_t, belta_moment):
	# print('s1:{}, s2:{}, s3:{}, s4:{}'.format(output_s1.shape, output_s2.shape, output_s3.shape, output_t.shape))
	s1_mean = output_s1.mean(0)
	s2_mean = output_s2.mean(0)
	s3_mean = output_s3.mean(0)
	t_mean = output_t.mean(0)
	output_s1 = output_s1 - s1_mean
	output_s2 = output_s2 - s2_mean
	output_s3 = output_s3 - s3_mean
	output_t = output_t - t_mean
	moment1 = euclidean(output_s1, output_t) + euclidean(output_s2, output_t) + euclidean(output_s3, output_t)+\
		euclidean(output_s1, output_s2) + euclidean(output_s2, output_s3) + euclidean(output_s3, output_s1) +\
		euclidean(output_s4, output_s1) + euclidean(output_s4, output_s2) + euclidean(output_s4, output_s2) + \
		euclidean(output_s4, output_t)
	reg_info = moment1
	#print(reg_info)
	for i in range(belta_moment-1):
		reg_info += k_moment(output_s1,output_s2,output_s3, output_s4, output_t,i+2)
	
	return reg_info/6
	#return euclidean(output_s1, output_t)

In [26]:
# Canny
Canny_transform = transforms.Compose([
    transforms.Grayscale(),
    transforms.Lambda(lambda x: cv2.Canny(np.array(x), 170, 300)),
    transforms.ToPILImage(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor()
])

# Sobel
Sobel_transform = transforms.Compose([
    transforms.Grayscale(),
    transforms.Lambda(lambda x: cv2.addWeighted(
        cv2.convertScaleAbs(cv2.Sobel(np.array(x), cv2.CV_16S, 1, 0, ksize=3)), 0.5,
        cv2.convertScaleAbs(cv2.Sobel(np.array(x), cv2.CV_16S, 1, 0, ksize=3)), 0.5, 0)),
    transforms.ToPILImage(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor()
])

# Laplacian
Laplacian_transform = transforms.Compose([
    transforms.Grayscale(),
    transforms.Lambda(lambda x: cv2.convertScaleAbs(cv2.Laplacian(np.array(x), cv2.CV_16S, ksize=3))),
    transforms.ToPILImage(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor()
])

# Gray
Gray_transform = transforms.Compose([
    transforms.Grayscale(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor()
])

# target
target_transform = transforms.Compose([
    transforms.Grayscale(),
    transforms.Resize((32, 32)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor()
])

In [27]:
Canny_dataset = ImageFolder(train_data, transform=Canny_transform)
Sobel_dataset = ImageFolder(train_data, transform=Sobel_transform)
Laplacian_dataset = ImageFolder(train_data, transform=Laplacian_transform)
Gray_dataset = ImageFolder(train_data, transform=Gray_transform)
target_dataset = ImageFolder(test_data, transform=target_transform)

In [28]:
Canny_dataloader = DataLoader(Canny_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
Sobel_dataloader = DataLoader(Sobel_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
Laplacian_dataloader = DataLoader(Laplacian_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
Gray_dataloader = DataLoader(Gray_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
target_dataloader = DataLoader(target_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

In [40]:
class FeatureExtractor(nn.Module):

    def __init__(self):
        super(FeatureExtractor, self).__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(1, 64, 3, 1, 1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(64, 128, 3, 1, 1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(128, 256, 3, 1, 1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(256, 256, 3, 1, 1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(256, 512, 3, 1, 1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        
    def forward(self, x):
        x = self.conv(x).squeeze()
        return x

class Classifier(nn.Module):

    def __init__(self):
        super(Classifier, self).__init__()

        self.layer = nn.Sequential(
            nn.Dropout(),
            nn.Linear(512, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),

            nn.Dropout(),
            nn.Linear(512, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),

            nn.Dropout(),
            nn.Linear(512, 10),
        )

    def forward(self, h):
        c = self.layer(h)
        return c

In [41]:
G = FeatureExtractor().cuda()
C1 = Classifier().cuda()
C2 = Classifier().cuda()

criterion = nn.CrossEntropyLoss().cuda()

lr = 0.00002
weight_decay = 0.0005
optimizer_G = optim.Adam(G.parameters(), lr=lr, weight_decay=weight_decay)
optimizer_C1 = optim.Adam(C1.parameters(), lr=lr, weight_decay=weight_decay)
optimizer_C2 = optim.Adam(C2.parameters(), lr=lr, weight_decay=weight_decay)

scheduler_G = optim.lr_scheduler.CosineAnnealingLR(optimizer_G, T_max=10)
scheduler_C1 = optim.lr_scheduler.CosineAnnealingLR(optimizer_C1, T_max=10)
scheduler_C2 = optim.lr_scheduler.CosineAnnealingLR(optimizer_C2, T_max=10)

In [51]:
def discrepancy(t1, t2):
    return torch.mean(torch.abs(F.softmax(t1, dim = 1) - F.softmax(t2, dim = 1)))

def feat_all_domain(img_s1, img_s2, img_s3, img_s4, img_t):
    return G(img_s1), G(img_s2), G(img_s3), G(img_s4), G(img_t)

def C1_all_domain(feat1, feat2, feat3, feat4, feat_t):
    return C1(feat1), C1(feat2), C1(feat3), C1(feat4), C1(feat_t)

def C2_all_domain(feat1, feat2, feat3, feat4, feat_t):
    return C2(feat1), C2(feat2), C2(feat3), C2(feat4), C2(feat_t)

In [32]:
def softmax_loss_all_domain(output1, output2, output3, output4, label_s1, label_s2, label_s3, label_s4):
    return criterion(output1, label_s1), criterion(output2, label_s2), criterion(output3, label_s3), criterion(output4, label_s4)

def loss_all_domain(img_s1, img_s2, img_s3, img_s4, img_t, label_s1, label_s2, label_s3, label_s4):
    feat1, feat2, feat3, feat4, feat_t = feat_all_domain(img_s1, img_s2, img_s3, img_s4, img_t)
    output_s1_c1, output_s2_c1, output_s3_c1, output_s4_c1, output_t_c1 = C1_all_domain(feat1, feat2, feat3, feat4, feat_t)
    output_s1_c2, output_s2_c2, output_s3_c2, output_s4_c2, output_t_c2 = C2_all_domain(feat1, feat2, feat3, feat4, feat_t)
    loss_msda = 0.0005 * msda_regulizer(feat1, feat2, feat3, feat4, feat_t, 5)
    loss_s1_c1, loss_s2_c1, loss_s3_c1, loss_s4_c1 = softmax_loss_all_domain(output_s1_c1, output_s2_c1, output_s3_c1, output_s4_c1, label_s1, label_s2, label_s3, label_s4)
    loss_s1_c2, loss_s2_c2, loss_s3_c2, loss_s4_c2 = softmax_loss_all_domain(output_s1_c2, output_s2_c2, output_s3_c2, output_s4_c2, label_s1, label_s2, label_s3, label_s4)
    return loss_s1_c1, loss_s2_c1, loss_s3_c1, loss_s4_c1, loss_s1_c2, loss_s2_c2, loss_s3_c2, loss_s4_c2 , loss_msda

In [33]:
def reset_grad():
    optimizer_G.zero_grad()
    optimizer_C1.zero_grad()
    optimizer_C2.zero_grad()

In [48]:
def adaptive_lambda(epoch, num_epoch):
    p = epoch / num_epoch
    return 2. / (1 + np.exp(-10 * p)) - 1

In [52]:
def train_msda(lamb=0.01):
    G.train()
    C1.train()
    C2.train()

    mean_loss_c1, mean_loss_c2, mean_loss_dis = 0.0, 0.0, 0.0
    s1_acc, s2_acc, s3_acc, s4_acc, total_num = 0.0, 0.0, 0.0, 0.0, 0.0

    for i, ((Canny_data, Canny_label), (Sobel_data, Sobel_label), (Laplacian_data, Laplacian_label),
            (Gray_data, Gray_label), (target_data, _)) in enumerate(zip(Canny_dataloader, Sobel_dataloader, Laplacian_dataloader, Gray_dataloader, target_dataloader)):
        img_s1, img_s2, img_s3, img_s4, img_t = Canny_data.cuda(), Sobel_data.cuda(), Laplacian_data.cuda(), Gray_data.cuda(), target_data.cuda()
        label_s1, label_s2, label_s3, label_s4 = Canny_label.long().cuda(), Sobel_label.long().cuda(), Laplacian_label.long().cuda(), Gray_label.long().cuda()
        
        reset_grad()

        loss_s1_c1, loss_s2_c1, loss_s3_c1, loss_s4_c1, loss_s1_c2, loss_s2_c2, loss_s3_c2, loss_s4_c2 , loss_msda = loss_all_domain(img_s1, img_s2, img_s3, img_s4, img_t, label_s1, label_s2, label_s3, label_s4)
        loss = loss_s1_c1 + loss_s2_c1 + loss_s3_c1 + loss_s4_c1 + loss_s1_c2 + loss_s2_c2 + loss_s3_c2 + loss_s4_c2 + loss_msda
        loss.backward()

        optimizer_G.step()
        optimizer_C1.step()
        optimizer_C2.step()

        reset_grad()

        loss_s1_c1, loss_s2_c1, loss_s3_c1, loss_s4_c1, loss_s1_c2, loss_s2_c2, loss_s3_c2, loss_s4_c2 , loss_msda = loss_all_domain(img_s1, img_s2, img_s3, img_s4, img_t, label_s1, label_s2, label_s3, label_s4)

        feat_t = G(img_t)
        output_t1 = C1(feat_t)
        output_t2 = C2(feat_t)
        loss_s_c1 = loss_s1_c1 + loss_s2_c1 + loss_s3_c1 + loss_s4_c1
        loss_s_c2 = loss_s1_c2 + loss_s2_c2 + loss_s3_c2 + loss_s4_c2
        loss_s = loss_s_c1 + loss_s_c2 + loss_msda
        loss_dis = discrepancy(output_t1, output_t2)

        loss = loss_s - lamb * loss_dis
        loss.backward()
        optimizer_C1.step()
        optimizer_C2.step()
        reset_grad()

        for j in range(4):
            feat_t = G(img_t)
            output_t1 = C1(feat_t)
            output_t2 = C2(feat_t)
            loss_dis = discrepancy(output_t1, output_t2)
            loss_dis.backward()
            optimizer_G.step()
            reset_grad()
        
        with torch.no_grad():
            feat_s1, feat_s2, feat_s3, feat_s4, feat_t = feat_all_domain(img_s1, img_s2, img_s3, img_s4, img_t)
            output_s1_c1, output_s2_c1, output_s3_c1, output_s4_c1, output_t_c1 = C1_all_domain(feat_s1, feat_s2, feat_s3, feat_s4, feat_t)
            s1_acc += torch.sum(torch.argmax(output_s1_c1, dim=1) == label_s1).item()
            s2_acc += torch.sum(torch.argmax(output_s2_c1, dim=1) == label_s2).item()
            s3_acc += torch.sum(torch.argmax(output_s3_c1, dim=1) == label_s3).item()
            s4_acc += torch.sum(torch.argmax(output_s4_c1, dim=1) == label_s4).item()
            total_num += Canny_data.shape[0]
            mean_loss_c1 += loss_s_c1.item()
            mean_loss_c2 += loss_s_c2.item()
            mean_loss_dis += loss_dis.item()
    
    return mean_loss_c1 / (i + 1), mean_loss_c2 / (i + 1), mean_loss_dis / (i + 1), s1_acc / total_num, s2_acc / total_num, s3_acc / total_num, s4_acc / total_num

In [53]:
num_epoch = 2000
min_loss = np.inf
for epoch in range(num_epoch):
    lamb = adaptive_lambda(epoch, num_epoch)
    loss_c1, loss_c2, loss_g, s1_acc, s2_acc, s3_acc, s4_acc = train_msda(lamb=lamb)
    print('epoch {:>3d} c1 loss: {:6.4f}, c2 loss: {:6.4f}, g loss: {:6.4f}, s1_acc {:6.4f}, s2_acc {:6.4f}, s3_acc {:6.4f}, s4_acc {:6.4f}'.format(epoch, loss_c1, loss_c2, loss_g, s1_acc, s2_acc, s3_acc, s4_acc))
    loss = loss_c1 + loss_c2 + loss_g
    if loss < min_loss:
        min_loss = loss
        torch.save(G.state_dict(), f'G.bin')
        torch.save(C1.state_dict(), f'C1.bin')
        torch.save(C2.state_dict(), f'C2.bin')
        print(f"Saved model!")
    scheduler_G.step()
    scheduler_C1.step()
    scheduler_C2.step()

epoch   0 c1 loss: 9.3673, c2 loss: 9.3279, g loss: 0.0562, s1_acc 0.1412, s2_acc 0.1404, s3_acc 0.1498, s4_acc 0.1274
Saved model!
epoch   1 c1 loss: 8.7475, c2 loss: 8.7169, g loss: 0.0533, s1_acc 0.2055, s2_acc 0.2087, s3_acc 0.2095, s4_acc 0.1705
Saved model!
epoch   2 c1 loss: 8.2536, c2 loss: 8.2337, g loss: 0.0522, s1_acc 0.2528, s2_acc 0.2510, s3_acc 0.2600, s4_acc 0.2224
Saved model!
epoch   3 c1 loss: 7.8176, c2 loss: 7.7600, g loss: 0.0519, s1_acc 0.2800, s2_acc 0.2806, s3_acc 0.2967, s4_acc 0.2817
Saved model!
epoch   4 c1 loss: 7.4728, c2 loss: 7.4443, g loss: 0.0514, s1_acc 0.3231, s2_acc 0.3245, s3_acc 0.3405, s4_acc 0.3137
Saved model!
epoch   5 c1 loss: 7.2167, c2 loss: 7.1838, g loss: 0.0508, s1_acc 0.3397, s2_acc 0.3456, s3_acc 0.3562, s4_acc 0.3504
Saved model!
epoch   6 c1 loss: 7.0409, c2 loss: 7.0126, g loss: 0.0507, s1_acc 0.3576, s2_acc 0.3674, s3_acc 0.3782, s4_acc 0.3660
Saved model!
epoch   7 c1 loss: 6.9781, c2 loss: 6.8989, g loss: 0.0510, s1_acc 0.3640, s

# Inference

We use pandas to generate our csv file.

BTW, the performance of the model trained for 200 epoches might be unstable. You can train for more epoches for a more stable performance.

In [56]:
test_batch_size = 1024
test_dataloader = DataLoader(target_dataset, batch_size=test_batch_size, shuffle=False)

In [57]:
result = []
G.eval()
C1.eval()
C2.eval()
for i, (test_data, _) in enumerate(test_dataloader):
    test_data = test_data.cuda()
    class_logits = C1(G(test_data))

    x = torch.argmax(class_logits, dim=1).cpu().detach().numpy()
    result.append(x)

import pandas as pd
result = np.concatenate(result)

# Generate your submission
df = pd.DataFrame({'id': np.arange(0,len(result)), 'label': result})
df.to_csv('msda_submission.csv',index=False)