# M2608.001300 Machine Learning<br> Assignment #5 Final Projects (PyTorch)


Copyright (C) Data Science & AI Laboratory, Seoul National University. This material is for educational uses only. Some contents are based on the material provided by other paper/book authors and may be copyrighted by them.

**To understand this assignment, please read the provided PPT file carefully.**

Note: certain details are missing or ambiguous on purpose, in order to test your knowledge of the related materials. However, if you really feel that something essential is missing and you cannot proceed to the next step, then contact the teaching staff with a clear description of your problem.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
import random

import matplotlib.pyplot as plt
import numpy as np

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader,Dataset
from torch.autograd import Variable
from PIL import Image
#import resnet
import torchvision.models as models

#from keras.models import Sequential
#from keras.layers import Dense, Activation

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Load datasets


In [2]:
# Vocabulary of the captcha labels: the ten digits, the lowercase Latin
# letters, and one extra 'NONE' symbol that marks an empty position.
NUMBER = list('0123456789')
ALPHABET = list('abcdefghijklmnopqrstuvwxyz')
NONE = ['NONE']
ALL_CHAR_SET = [*NUMBER, *ALPHABET, *NONE]
ALL_CHAR_SET_LEN = len(ALL_CHAR_SET)
# Number of character slots every label is encoded into.
MAX_CAPTCHA = 7

# Show the index that the 'NONE' symbol received.
print(ALL_CHAR_SET.index('NONE'))

def encode(a):
    """Return the one-hot encoding of symbol ``a`` as a list of ints.

    The list has ``ALL_CHAR_SET_LEN`` entries. The entry at the position of
    ``a`` inside ``ALL_CHAR_SET`` is 1 and all other entries are 0.

    Raises:
        ValueError: if ``a`` is not contained in ``ALL_CHAR_SET``
            (propagated from ``list.index``).
    """
    vector = [0 for _ in range(ALL_CHAR_SET_LEN)]
    vector[ALL_CHAR_SET.index(a)] = 1
    return vector

# modified dataset class
class Mydataset(Dataset):
    """Captcha dataset pairing image files in a folder with label strings.

    Sample ``idx`` combines the ``idx``-th entry of ``os.listdir(img_path)``
    with the ``idx``-th line of the label file. Each label is encoded as
    ``MAX_CAPTCHA`` concatenated one-hot vectors; positions beyond the end
    of the label are filled with the 'NONE' symbol.
    """

    # Upper bound on the number of samples kept for each split.
    _TRAIN_LIMIT = 10000
    _EVAL_LIMIT = 1000

    def __init__(self, img_path: str, label_path: str, is_train: bool = True, transform=None):
        """Collect the image file names and the label lines.

        Args:
            img_path: directory that contains the image files.
            label_path: text file with one label per line.
            is_train: keep up to 10000 samples when True, up to 1000 otherwise.
            transform: optional callable applied to the grayscale PIL image.
        """
        self.path = img_path
        self.label_path = label_path
        limit = self._TRAIN_LIMIT if is_train else self._EVAL_LIMIT

        # NOTE(review): images and labels are matched purely by position, and
        # os.listdir() is documented to return entries in arbitrary order.
        # Confirm that this order really matches the line order of the label
        # file; if not, the listing must be ordered to match it.
        self.img = os.listdir(self.path)[:limit]
        self.labels = self._read_labels(self.label_path, limit)

        self.transform = transform
        self.max_length = MAX_CAPTCHA

    @staticmethod
    def _read_labels(label_path: str, limit: int):
        """Return at most ``limit`` label lines read from ``label_path``."""
        # The context manager closes the file handle. splitlines() keeps the
        # final label even when the file does not end with a newline.
        with open(label_path, 'r') as label_file:
            return label_file.read().splitlines()[:limit]

    def __getitem__(self, idx):
        """Return ``(image, one_hot_label, label_string)`` for sample ``idx``."""
        # Image.open() is lazy; convert() loads the pixels and returns a
        # separate image, so the file handle can be closed right afterwards.
        with Image.open(f'{self.path}/{self.img[idx]}') as raw_img:
            img = raw_img.convert('L')
        label = self.labels[idx]
        label_oh = []
        # one-hot for each character slot; pad short labels with 'NONE'
        for i in range(self.max_length):
            symbol = label[i] if i < len(label) else 'NONE'
            label_oh += encode(symbol)

        if self.transform is not None:
            img = self.transform(img)
        return img, np.array(label_oh), label

    def __len__(self) -> int:
        """Return the number of image files kept for this split."""
        return len(self.img)

# Preprocessing pipeline that is handed to the dataset objects.
transform = transforms.Compose(
    [
        # torchvision's Resize reads a two-element size as [height, width].
        # NOTE(review): captcha images are often wider than tall -- confirm
        # that [160, 60] is the intended orientation.
        transforms.Resize(size=[160, 60]),
        # Convert the PIL image into a tensor.
        transforms.ToTensor(),
##############################################################################
#                          IMPLEMENT YOUR CODE                               #
##############################################################################
#        transforms.Normalize(mean=0.456, std=0.224)
##############################################################################
#                          END OF YOUR CODE                                  #
##############################################################################
    ]
)



36


In [3]:
"""Loading DATA"""
# Change to your own data folder path!
gPath = '/content/drive/My Drive/2020Spring_ML_final/'

train_ds = Mydataset(gPath+'Data/train/', gPath+'Data/train.txt', transform=transform)
test_ds = Mydataset(gPath+'Data/test/', gPath+'Data/test.txt', is_train=False, transform=transform)
# Reshuffle the training samples at every epoch so that the mini-batches are
# not composed of the same samples in the same order each time.
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True, num_workers=4)
# Evaluation reads one sample per batch, in dataset order.
test_dl = DataLoader(test_ds, batch_size=1, num_workers=4)

In [4]:
"""To CUDA for local run"""
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Optional GPU selection for a local run (left disabled):
#GPUID = '4' # define GPUID
#os.environ["CUDA_VISIBLE_DEVICES"] = str(GPUID)


Problem 1: Design an LSTM model for captcha image recognition. (10 points)

In [5]:
class LSTM(nn.Module):
    """LSTM-cell decoder that maps one feature vector to per-step logits.

    At step 0 the cell is fed the (optionally projected) feature vector; at
    every later step ``t`` it is fed the embedding of ``captions[:, t]``.
    After each step the hidden state is mapped to ``vocab_size`` logits.

    Args:
        cnn_dim: width of the incoming feature vector handled by ``fc_in``.
        hidden_size: width of the LSTM hidden and cell state.
        vocab_size: number of embedding entries, cell input width and
            number of output logits per step.
        num_layers: accepted for interface compatibility; not used.
    """

    def __init__(self, cnn_dim, hidden_size, vocab_size, num_layers=1):
        super(LSTM, self).__init__()

        # define the properties
        self.cnn_dim = cnn_dim
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        # lstm cell
        self.lstm_cell = nn.LSTMCell(input_size=self.vocab_size, hidden_size=hidden_size)

        # input projection (cnn_dim -> vocab_size) and output projection
        # (hidden_size -> vocab_size)
        self.fc_in = nn.Linear(in_features=self.cnn_dim, out_features=self.vocab_size)
        self.fc_out = nn.Linear(in_features=self.hidden_size, out_features=self.vocab_size)

        # embedding layer
        self.embed = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.vocab_size)

        # activations (not applied in forward, which returns raw logits)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, features, captions):
        """Run the cell for ``captions.size(1)`` steps.

        Args:
            features: float tensor of shape (batch, cnn_dim) or
                (batch, vocab_size).
            captions: integer index tensor of shape (batch, steps), with
                values valid for the embedding table.

        Returns:
            Tensor of shape (batch, steps, vocab_size) holding the logits of
            every step, on the same device as ``features``.
        """
        batch_size = features.size(0)
        steps = captions.size(1)

        # Features whose width differs from the cell input width are projected
        # with fc_in. Features that already have vocab_size columns are fed to
        # the cell unchanged, as before.
        if features.size(1) != self.vocab_size:
            features = self.fc_in(features)

        # Allocate state and output next to the input instead of hard-coding
        # .cuda(), so the module also runs on a CPU-only machine.
        hidden_state = features.new_zeros((batch_size, self.hidden_size))
        cell_state = features.new_zeros((batch_size, self.hidden_size))

        # define the output tensor placeholder
        outputs = features.new_empty((batch_size, steps, self.vocab_size))

        # embed the captions
        captions_embed = self.embed(captions)

        ##############################################################################
        #                          IMPLEMENT YOUR CODE                               #
        ##############################################################################
        for t in range(steps):
            step_input = features if t == 0 else captions_embed[:, t, :]
            hidden_state, cell_state = self.lstm_cell(step_input, (hidden_state, cell_state))
            outputs[:, t, :] = self.fc_out(hidden_state)
        ##############################################################################
        #                          END OF YOUR CODE                                  #
        ##############################################################################
        return outputs




Problem 2: 

*   1. Connect the CNN model to the designed LSTM model.
*   2. Replace ResNet with your own CNN model from Assignment 3.


          


In [10]:

##############################################################################
#                          IMPLEMENT YOUR CODE                               #
##############################################################################
 #CNN

class BetterNet(nn.Module):
    """DenseNet-121 backbone for one-channel images with a linear output head.

    The backbone's first convolution is replaced by a newly constructed
    one-channel convolution and its classifier by a 1024 -> 512 linear layer.
    The 512 values go through PReLU and a final linear layer that produces
    ``embed_size`` outputs.
    """

    def __init__(self,embed_size=ALL_CHAR_SET_LEN*MAX_CAPTCHA):
        super().__init__()

        backbone = models.densenet121(pretrained=True)
        # The stem convolution is swapped for a fresh layer that takes a
        # single input channel (the dataset converts images to mode 'L').
        backbone.features.conv0 = nn.Conv2d(
            in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3, bias=False
        )
        # The classifier becomes a fully connected 1024 -> 512 layer.
        backbone.classifier = nn.Linear(1024, 512)
        self.densenet = backbone

        # Output head: 512 -> embed_size.
        self.embed = nn.Linear(512, embed_size)

        # Dropout module; it is created here but not applied in forward().
        self.dropout = nn.Dropout(p=0.3)

        # Activation between the backbone and the output head.
        self.prelu = nn.PReLU()

    def forward(self, images):
        """Return ``embed(prelu(densenet(images)))`` for a batch of images."""
        backbone_out = self.densenet(images)
        activated = self.prelu(backbone_out)
        return self.embed(activated)
#CNN
# Build the CNN and move it to the selected device.
betternet = BetterNet().to(device)
"""
#Resnet
betternet = resnet.resnet18(pretrained=False)
betternet.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
betternet.fc = nn.Linear(in_features=512, out_features=ALL_CHAR_SET_LEN*MAX_CAPTCHA, bias=True)
"""

##############################################################################
#                          END OF YOUR CODE                                  #
##############################################################################


# LSTM
cnn_dim = 512  # resnet18-512
hidden_size = 8
vocab_size = ALL_CHAR_SET_LEN * MAX_CAPTCHA  # modified
lstm = LSTM(cnn_dim=cnn_dim, hidden_size=hidden_size, vocab_size=vocab_size).to(device)

# loss, optimizer -- these are the things to change
##############################################################################
#                          IMPLEMENT YOUR CODE                               #
##############################################################################
loss_func = nn.MultiLabelSoftMarginLoss()
cnn_optim = optim.SGD(betternet.parameters(), lr=0.00025, momentum=0.9)
#lstm_optim = optim.Adam(lstm.parameters(), lr=0.001)
##############################################################################
#                          END OF YOUR CODE                                  #
##############################################################################

Problem 3: Find hyper-parameters.


In [11]:
"""TRAINING"""
print_interval = 15
max_epoch = 100

for epoch in range(max_epoch):
    # Make sure the CNN is in training mode. The evaluation cell switches it
    # to eval mode, so re-running this cell afterwards would otherwise train
    # with eval-mode layers.
    betternet.train()
    for step, (img, label_oh, label) in enumerate(train_dl):
        # Move the batch to the device the model lives on rather than
        # assuming that CUDA is present.
        img = img.to(device)
        # The one-hot target is converted to float before the loss call.
        label_oh = label_oh.float().to(device)

        pred = betternet(img)
        loss = loss_func(pred, label_oh)

        cnn_optim.zero_grad()
        loss.backward()
        cnn_optim.step()
##############################################################################
#                          IMPLEMENT YOUR CODE                               #
##############################################################################
        # Notes for the (currently disabled) CNN-LSTM combination:
        #   label_oh_train = label_oh[:, :label_oh.shape[1]-1].to(device)  # added for the LSTM connection
        #   label_oh_target = label_oh[:, 1:].to(device)                   # added for the LSTM connection
        #   betternet.zero_grad(); lstm.zero_grad()
        #   betternet.train(); lstm.train()
        #   pred = lstm(pred, label_oh_train)
        #   if lstm_optim is not None: lstm_optim.zero_grad() ... lstm_optim.step()
        # The model would be the combined one from above -- presumably the LSTM
        # receives the CNN features, so the loss would have to change as well.
        # It is unclear which parameters need changing; running as-is works too.
        # The parameter values above can be adjusted, and the models could also
        # be combined here.
##############################################################################
#                          END OF YOUR CODE                                  #
##############################################################################
        if (step+1)%print_interval == 0:
            print('epoch:', epoch+1, 'step:', step+1, 'loss:', loss.item())

epoch: 1 step: 15 loss: 0.6994454860687256
epoch: 1 step: 30 loss: 0.6959024667739868
epoch: 1 step: 45 loss: 0.6929036378860474
epoch: 1 step: 60 loss: 0.6894955635070801
epoch: 1 step: 75 loss: 0.6859017610549927
epoch: 2 step: 15 loss: 0.6819623112678528
epoch: 2 step: 30 loss: 0.6782490611076355
epoch: 2 step: 45 loss: 0.6752631664276123
epoch: 2 step: 60 loss: 0.6718862056732178
epoch: 2 step: 75 loss: 0.6683573722839355
epoch: 3 step: 15 loss: 0.6643806099891663
epoch: 3 step: 30 loss: 0.6606832146644592
epoch: 3 step: 45 loss: 0.6576210260391235
epoch: 3 step: 60 loss: 0.6541764736175537
epoch: 3 step: 75 loss: 0.650532066822052
epoch: 4 step: 15 loss: 0.6463741660118103
epoch: 4 step: 30 loss: 0.6425571441650391
epoch: 4 step: 45 loss: 0.6392848491668701
epoch: 4 step: 60 loss: 0.6356703042984009
epoch: 4 step: 75 loss: 0.6317853331565857
epoch: 5 step: 15 loss: 0.6272999048233032
epoch: 5 step: 30 loss: 0.6232743859291077
epoch: 5 step: 45 loss: 0.6196931600570679
epoch: 5 ste

In [12]:
"""TEST"""
def get_char_count(arg1):
    """Decode a flat score vector into one symbol per captcha slot.

    ``arg1`` is split into ``MAX_CAPTCHA`` consecutive chunks of
    ``ALL_CHAR_SET_LEN`` values. For each chunk the symbol of
    ``ALL_CHAR_SET`` at the arg-max position is selected.

    Args:
        arg1: 1-D tensor of scores (anything offering ``.cpu().tolist()``).

    Returns:
        Tuple with ``MAX_CAPTCHA`` symbols, in slot order.
    """
    # Convert to a Python list once instead of once per slot.
    scores = arg1.cpu().tolist()
    decoded = []
    for slot in range(MAX_CAPTCHA):
        start = slot * ALL_CHAR_SET_LEN
        best = np.argmax(scores[start:start + ALL_CHAR_SET_LEN])
        decoded.append(ALL_CHAR_SET[best])
    return tuple(decoded)
 


# Running counters: correctly predicted character slots, fully correct
# captchas, and the number of evaluated samples.
char_correct = 0
word_correct = 0
total = 0

betternet.eval()
lstm.eval()

with torch.no_grad():
    for img, label_oh, label in test_dl:
        # Use the configured device instead of assuming CUDA is present.
        img = img.to(device)
        label_oh = label_oh.float().to(device)
        pred = betternet(img)
        # pred = lstm(pred, label_oh)  # the LSTM call (outputs = lstm(feature, ...)) still has to be fixed

        # Score every sample of the batch, so any batch size works.
        for pred_row, target_row in zip(pred, label_oh):
            pred_chars = get_char_count(pred_row)
            true_chars = get_char_count(target_row)

            char_correct += sum(p == t for p, t in zip(pred_chars, true_chars))

            # A captcha counts as correct only when every slot matches,
            # padding included. A substring test would also accept wrong
            # predictions that merely contain the label text.
            if pred_chars == true_chars:
                word_correct += 1

            total += 1

if total:
    # Character accuracy and whole-captcha accuracy, both in percent.
    print(100/MAX_CAPTCHA*char_correct/total)
    print(100*word_correct/total)
else:
    print('no test samples were evaluated')
"""END TEST"""

37.1
0.0


'END TEST'

In [13]:
# Write the CNN's parameters (its state_dict) to model.pth in the current
# working directory. The lstm object is not saved here.
torch.save(betternet.state_dict(), './model.pth')