# Homework 6 - Berkeley STAT 157

**Your name: XX, SID YY, teammates A,B,C (Please add your name, SID and teammates to make grading easier for Ryan and Rachel.)**

Handout 3/5/2019, due 3/12/2019 by 4pm. Please submit through gradescope.

In this homework, we will train a CNN model on CIFAR-10 and submit the results to [Kaggle](https://www.kaggle.com/c/cifar-10). The rules are similar to homework 4:

- work as a team
- submit your results to Kaggle
- take a screenshot of your best score and insert it below
- the top 3 teams/individuals will each be awarded 500 dollars in AWS credits

The rest of this notebook contains a baseline VGG-16 model to train on CIFAR-10. Please use it as a starting point. The end of this notebook has several hints to improve your results.

First, import the packages or modules required for the competition.

In [None]:
import torch
from torch import nn,optim,tensor
from torch.utils.data import DataLoader
from torchvision.utils import make_grid
from torchvision import datasets,transforms
import numpy as np
from matplotlib import pyplot as plt
import time
import pandas as pd

In [None]:
pip install py7zr

In [None]:
import py7zr

# Unpack the Kaggle test archive.
# Must have a root folder wrapper on the test dataset folder.
# Otherwise, the ImageFolder will complain.
# The context manager closes the archive even if the extraction fails.
with py7zr.SevenZipFile('../input/cifar-10/test.7z', mode='r') as archive:
    archive.extractall(path="./root_test")

In [None]:
# Training hyper-parameters shared by the cells below.
epoch_num = 50   # number of passes over the training set
lr = 0.01        # initial learning rate handed to the optimizer
step_size = 10   # number of epochs between two learning-rate decays
batch_size = 32  # samples per mini-batch
# The running loss is printed every `num_print` mini-batches, i.e. roughly
# four times per epoch for a 50000-image training set.
num_print = (50000 // batch_size) // 4

In [None]:
def transforms_RandomHorizontalFlip():
    """Build the CIFAR-10 training and test datasets.

    Both datasets convert images to tensors and normalize every channel with
    the same mean / standard deviation; the training set additionally applies
    a random horizontal flip as data augmentation. The data is downloaded to
    ``./data`` when it is not already there.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple.
    """
    channel_mean = (0.485, 0.456, 0.406)
    channel_std = (0.229, 0.224, 0.225)

    # Steps shared by the training and the evaluation pipeline.
    common_steps = [
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
    transform_train = transforms.Compose(
        [transforms.RandomHorizontalFlip(), *common_steps]
    )
    transform = transforms.Compose(common_steps)

    train_dataset = datasets.CIFAR10(
        root='./data', train=True, transform=transform_train, download=True
    )
    test_dataset = datasets.CIFAR10(
        root='./data', train=False, transform=transform, download=True
    )
    return train_dataset, test_dataset

In [None]:
train_dataset,test_dataset = transforms_RandomHorizontalFlip()

train_loader = DataLoader(train_dataset, batch_size = batch_size,shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = batch_size,shuffle=False)

In [None]:
# Human-readable CIFAR-10 class names, indexed by label id.
classes = ('plane', 'car', 'bird', 'cat','deer', 'dog', 'frog', 'horse', 'ship', 'truck')


def denormalize(img, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """Undo the per-channel normalization of a (C, H, W) image tensor.

    The default ``mean`` / ``std`` are the values passed to
    ``transforms.Normalize`` when the datasets are built; they have to stay in
    sync with that transform, otherwise the displayed colours are wrong.

    Args:
        img: image tensor laid out as (channels, height, width).
        mean: per-channel mean that was subtracted during normalization.
        std: per-channel standard deviation the image was divided by.

    Returns:
        A NumPy array laid out as (height, width, channels) with values
        clipped to ``[0, 1]`` so it can be given to ``plt.imshow`` directly.
    """
    npimg = img.detach().cpu().numpy()
    mean = np.asarray(mean).reshape(-1, 1, 1)
    std = np.asarray(std).reshape(-1, 1, 1)
    # Normalization computed (x - mean) / std, so invert it channel by channel.
    restored = npimg * std + mean
    return np.clip(np.transpose(restored, (1, 2, 0)), 0.0, 1.0)


def image_show(img):
    """Display a normalized (C, H, W) image tensor with matplotlib."""
    plt.imshow(denormalize(img))
def label_show(loader):
    """Show the first batch of ``loader`` as an image grid and print its labels.

    Args:
        loader: a DataLoader yielding ``(images, labels)`` batches.

    Returns:
        The ``(images, labels)`` tensors of the batch that was displayed.
    """
    # The built-in next() is used because DataLoader iterators do not provide
    # a .next() method in current PyTorch releases.
    images, labels = next(iter(loader))
    image_show(make_grid(images))
    # Iterate over the labels actually present instead of range(batch_size):
    # the batch may hold fewer than batch_size samples.
    print(' '.join('%5s' % classes[label] for label in labels.tolist()))
    return images,labels
label_show(train_loader)

## Define the Model


In [None]:
from torch import nn

def vgg_block(num_convs, in_channels, out_channels):
    """Assemble one VGG stage.

    The stage is ``num_convs`` repetitions of (3x3 convolution, batch
    normalization, in-place ReLU) followed by a single 2x2 max-pooling layer
    with stride 2. Only the first convolution maps ``in_channels`` to
    ``out_channels``; the following ones keep ``out_channels``.

    Returns:
        An ``nn.Sequential`` holding the layers in order.
    """
    layers = []
    channels = in_channels
    for _ in range(num_convs):
        layers += [
            nn.Conv2d(channels, out_channels, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        ]
        # Every convolution after the first one consumes the stage's own output.
        channels = out_channels
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*layers)

class Vgg16_Net(nn.Module):
    """VGG-style classifier: stacked ``vgg_block`` stages plus a 3-layer MLP.

    Args:
        conv_arch: iterable of ``(num_convs, in_channels, out_channels)``
            triples, one per convolutional stage.
        fc_features: number of features per sample once the convolutional
            output is flattened; it is the input width of the classifier.
        fc_hidden_units: width of the second hidden fully connected layer.
    """

    def __init__(self,conv_arch,fc_features,fc_hidden_units):
        super().__init__()
        self.conv_arch = conv_arch
        self.fc_features = fc_features
        self.fc_hidden_units = fc_hidden_units

        # Convolutional feature extractor, one named sub-module per stage.
        self.conv_layer = nn.Sequential()
        for i, (num_convs, in_channels, out_channels) in enumerate(self.conv_arch):
            self.conv_layer.add_module(
                'vgg_block_' + str(i + 1),
                vgg_block(num_convs, in_channels, out_channels),
            )

        # Classifier head ending in 10 output logits.
        self.fc_layer = nn.Sequential(
                nn.Linear(self.fc_features, self.fc_features),
                nn.ReLU(inplace=True),
                nn.Dropout(0.5),

                nn.Linear(self.fc_features, self.fc_hidden_units),
                nn.ReLU(inplace=True),
                nn.Dropout(0.5),

                nn.Linear(self.fc_hidden_units, 10)
                )

    def forward(self, x):
        """Return the class logits, shape ``(batch, 10)``, for a batch ``x``."""
        x = self.conv_layer(x)
        # Flatten per sample and keep the batch dimension fixed. Reshaping with
        # view(-1, fc_features) would silently fold a feature-size mismatch
        # into the batch dimension; this way the first Linear layer raises.
        x = x.view(x.size(0), -1)
        x = self.fc_layer(x)
        return x


## Same Model in Another Way

In [None]:
# Disabled alternative: the text below is a bare string literal, so none of it
# is executed. It spells out a Vgg16_Net layer by layer (five convolutional
# stages followed by a three-layer classifier) instead of building the stages
# in a loop. The non-English annotations inside the string read
# "2 convolutional layers and 1 max-pooling layer" (stages 1-2) and
# "3 convolutional layers and 1 max-pooling layer" (stages 3-5).
'''
from torch import nn

class Vgg16_Net(nn.Module):
    def __init__(self):
        super(Vgg16_Net, self).__init__()
        #2个卷积层和1个最大池化层
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size = 3, stride=1, padding=1),             # (32-3+2)/1+1 = 32  32*32*64
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            
            nn.Conv2d(64,64, kernel_size = 3, stride=1, padding=1),             # (32-3+2)/1+1 = 32  32*32*64
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            
            nn.MaxPool2d(2, 2)                                                  # (32-2)/2+1 = 16    16*16*64
            
            )
        #2个卷积层和1个最大池化层
        self.layer2 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size = 3, stride=1, padding=1),           # (16-3+2)/1+1 = 16  16*16*128
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),

            nn.Conv2d(128, 128, kernel_size = 3, stride=1, padding=1),          # (16-3+2)/1+1 = 16  16*16*128
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            
            nn.MaxPool2d(2, 2)                                                  # (16-2)/2+1 = 8    8*8*128
            )
        #3个卷积层和1个最大池化层
        self.layer3 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size = 3, stride=1, padding=1),          # (8-3+2)/1+1 = 8  8*8*256
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),

            nn.Conv2d(256, 256, kernel_size = 3, stride=1, padding=1),          # (8-3+2)/1+1 = 8  8*8*256
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),

            nn.Conv2d(256, 256, kernel_size = 3, stride=1, padding=1),          # (8-3+2)/1+1 = 8  8*8*256
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            
            nn.MaxPool2d(2, 2),                                                 # (8-2)/2+1 = 4    4*4*256
            )
        #3个卷积层和1个最大池化层
        self.layer4 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size = 3, stride=1, padding=1),          # (4-3+2)/1+1 = 4  4*4*512
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.Conv2d(512, 512, kernel_size = 3, stride=1, padding=1),          # (4-3+2)/1+1 = 4  4*4*512
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.Conv2d(512, 512, kernel_size = 3, stride=1, padding=1),          # (4-3+2)/1+1 = 4  4*4*512
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            
            nn.MaxPool2d(2, 2)                                                  # (4-2)/2+1 = 2    2*2*512
            )
        #3个卷积层和1个最大池化层
        self.layer5 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size = 3, stride=1, padding=1),          # (2-3+2)/1+1 = 2  2*2*512
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.Conv2d(512, 512, kernel_size = 3, stride=1, padding=1),          # (2-3+2)/1+1 = 2  2*2*512
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),

            nn.Conv2d(512, 512, kernel_size = 3, stride=1, padding=1),          # (2-3+2)/1+1 = 2  2*2*512
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            
            nn.MaxPool2d(2, 2)                                                  # (2-2)/2+1 = 1    1*1*512
            )
        self.conv = nn.Sequential(
            self.layer1,
            self.layer2,
            self.layer3,
            self.layer4,
            self.layer5
            )
        self.fc = nn.Sequential(    
            nn.Linear(512, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            
            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
    
            nn.Linear(256, 10)
            )

    def forward(self, x):
        x = self.conv(x)
        x = x.view(-1, 512)
        x = self.fc(x)
        return x

'''

## Model and Optimizer

In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# One (num_convs, in_channels, out_channels) triple per convolutional stage.
conv_arch = (
    (2, 3, 64),
    (2, 64, 128),
    (3, 128, 256),
    (3, 256, 512),
    (3, 512, 512),
)
# Width of the flattened convolutional output fed to the classifier
# (presumably 512 channels at 1x1 after five poolings of a 32x32 image).
fc_features = 512
# Width of the second hidden fully connected layer.
fc_hidden_units = 256

model = Vgg16_Net(conv_arch, fc_features, fc_hidden_units)
model = model.to(device)

In [None]:
# Loss, optimizer and learning-rate schedule used by the training loop.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=0.8,
    weight_decay=0.001,
)
# Halve the learning rate every `step_size` scheduler steps.
scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=step_size,
    gamma=0.5,
    last_epoch=-1,
)

## Train and Validate the Model


In [None]:
# Per-mini-batch training loss, kept for the loss plot further down.
loss_list = []
start = time.time()

# train
for epoch in range(epoch_num):
    # Make sure dropout and batch normalization are in training mode, even if
    # an evaluation cell switched the model to eval() before this cell is run.
    model.train()
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        # The loss is computed from tensors that already live on `device`.
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        loss_list.append(batch_loss)
        # Report the mean loss of the last `num_print` mini-batches.
        if i % num_print == num_print-1 :
            print('[%d epoch, %d] loss: %.6f' %(epoch + 1, i + 1, running_loss / num_print))
            running_loss = 0.0
    lr_1 = optimizer.param_groups[0]['lr']
    print('learn_rate : %.15f'%lr_1)
    # Advance the learning-rate schedule once per epoch, after the optimizer steps.
    scheduler.step()

end = time.time()
print('time:{}'.format(end-start))

In [None]:
# Plot the raw mini-batch loss together with a 200-iteration moving average.
smoothing_window = 200
averaging_kernel = np.ones(smoothing_window) / smoothing_window
running_average = np.convolve(loss_list, averaging_kernel, mode='valid')

plt.plot(loss_list, label='Minibatch cost')
plt.plot(running_average, label='Running average')
plt.xlabel('Iteration')
plt.ylabel('Cross Entropy')
plt.legend()
plt.show()

In [None]:
# prediction with images
images,labels = label_show(test_loader)
images, labels = images.to(device), labels.to(device)
# Inference must run in eval mode and without gradients: in training mode
# dropout randomizes the outputs and batch normalization would update its
# running statistics with test data.
model.eval()
with torch.no_grad():
    outputs = model(images)
predicted = outputs.argmax(dim = 1)
# Iterate over the predictions actually returned instead of range(batch_size),
# which fails when the batch holds fewer than batch_size samples.
print('Predicted: ', ' '.join('%5s' % classes[j] for j in predicted.tolist()))

In [None]:
# test
# Overall and per-class accuracy are gathered in a single pass over the test
# set, with the model in eval mode and gradient tracking disabled.
model.eval()
correct = 0.0
total = 0
class_correct = [0.0] * len(classes)
class_total = [0.0] * len(classes)
with torch.no_grad():  # No need to back propagate
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        pred = outputs.argmax(dim = 1)
        hits = torch.eq(pred, labels)
        total += inputs.size(0)
        correct += hits.sum().item()
        # Count every sample of the batch (not just the first four) towards
        # the accuracy of its true class.
        for label, hit in zip(labels.tolist(), hits.tolist()):
            class_total[label] += 1
            class_correct[label] += float(hit)
print('Accuracy of the network on the 10000 test images: %.2f %%' % (100.0 * correct / total))

for i in range(len(classes)):
    if class_total[i] == 0:
        # Avoid a division by zero for a class without any test sample.
        print('Accuracy of %5s : n/a (no test samples)' % classes[i])
        continue
    print('Accuracy of %5s : %.2f %%' % (classes[i], 100 * class_correct[i] / class_total[i]))

## Feature Map

In [None]:
# show feature_map
# Running index used to name the saved feature-map images (0.jpg, 1.jpg, ...).
a = 0
def viz(module, input):
    """Forward pre-hook that plots and saves the input feature maps of a layer.

    Shows up to four channels of the first sample of the layer input side by
    side, saves the figure as ``./<a>.jpg`` and increments the global counter
    ``a``.

    Args:
        module: the module the hook is attached to (unused).
        input: tuple of positional inputs passed to the module's forward.
    """
    global a
    # First sample of the batch; detached and moved to the CPU for matplotlib.
    x = input[0][0].detach().cpu()
    min_num = min(4,x.size()[0])
    # Configure the figure defaults once, before the figure is created, so
    # they already apply to the figure drawn by this call.
    plt.rcParams['figure.figsize'] = (20, 20)
    plt.rcParams['savefig.dpi'] = 480
    plt.rcParams['figure.dpi'] = 480
    for i in range(min_num):
        plt.subplot(1, min_num, i+1)
        plt.xticks([])
        plt.yticks([])
        plt.axis('off')
        plt.imshow(x[i])
    plt.savefig('./'+str(a)+'.jpg')
    a += 1
    plt.show()

In [None]:
dataiter = iter(test_loader)
# The built-in next() is used because DataLoader iterators do not provide a
# .next() method in current PyTorch releases.
images, labels = next(dataiter)

# Attach the visualization hook to every convolution and keep the handles so
# the hooks can be removed again afterwards.
hook_handles = [
    m.register_forward_pre_hook(viz)
    for m in model.modules()
    if isinstance(m, torch.nn.Conv2d)
]

model.eval()
try:
    with torch.no_grad():
        model(images[2].unsqueeze(0).to(device))
finally:
    # Without this, every later forward pass (including the prediction over
    # the whole Kaggle test set) would plot and save feature maps again, and
    # re-running this cell would stack duplicate hooks.
    for handle in hook_handles:
        handle.remove()

## Classify the Testing Set and Submit Results on Kaggle

After obtaining a satisfactory model design and hyper-parameters, we use the entire training data set (including the validation set) to retrain the model and then classify the testing set.

In [None]:
# The Kaggle images have to be normalized with the same per-channel statistics
# the model was trained with (the values given to transforms.Normalize in
# transforms_RandomHorizontalFlip); different statistics at inference time
# shift the input distribution and degrade the predictions.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

# ImageFolder reads the images from the sub-folder(s) of 'root_test'.
test_ds = datasets.ImageFolder('root_test', transform=transform_test)

# Keep the dataset order: the submission ids are derived from it.
test_iter = torch.utils.data.DataLoader(test_ds, 128, shuffle=False)

In [None]:
preds = []
# Class names in the spelling used for the submission file, by label id.
labels = {0:'airplane', 1:'automobile', 2:'bird', 3:'cat', 4:'deer', 5:'dog', 6:'frog', 7:'horse', 8:'ship', 9:'truck'}

# Predict in eval mode (dropout off, batch-norm running statistics) and on the
# device the model lives on, instead of a hard-coded CUDA device, so the cell
# also works on a CPU-only machine.
model.eval()
with torch.no_grad():
    for X, _ in test_iter:
        y_hat = model(X.to(device))
        preds.extend(y_hat.argmax(dim=1).tolist())
sorted_ids = list(range(1, len(test_ds) + 1))
# This version of the test dataset is sorted in this (string) order, so the
# ids are sorted as strings to line up with the predictions.
sorted_ids.sort(key=str)
df = pd.DataFrame({'id': sorted_ids, 'label': preds})
df['label'] = df['label'].apply(lambda x: labels[x])
df.to_csv('submission.csv', index=False)

After executing the above code, we will get a "submission.csv" file. The format of this file is consistent with the Kaggle competition requirements. 

## Hints to Improve Your Results

* You should use the complete CIFAR-10 dataset to get meaningful results.
* You should use a GPU machine to run it, otherwise it will be quite slow. (Please DON'T FORGET to stop or terminate your instance when you are not using it, otherwise AWS will keep charging you.)
* Change the `batch_size` and the number of epochs `epoch_num` to 128 and 100, respectively. (It will take a while to run.)
* Change to another network, such as ResNet-34 or Inception