# part a

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np 
import torch.nn.functional as F


In [17]:
transform_data = transforms.Compose([transforms.Resize(224),transforms.ToTensor()])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_data, target_transform = transforms.Lambda(lambda y: torch.zeros(10, dtype=torch.float).scatter_(dim=0, index=torch.tensor(y),value=1) ))

trainloader = DataLoader(trainset, batch_size=16, shuffle=True)
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_data)
testloader = torch.utils.data.DataLoader(testset, batch_size=16, shuffle=True)



Files already downloaded and verified
Files already downloaded and verified


In [18]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")


Using cuda device


In [19]:
model1=models.resnet50(pretrained=True)

for params in model1.parameters():
    params.requires_grad=False

ftrs=model1.fc.in_features
model1.fc=nn.Linear(ftrs, 10)


In [20]:
model1.to(device)
next(model1.parameters()).is_cuda

True

In [21]:
learning_rate = 1e-3
batch_size =16
epochs = 10

In [22]:
# Initialize the loss function
loss_fn = nn.CrossEntropyLoss()

In [23]:
optimizer = torch.optim.SGD(model1.parameters(), lr=learning_rate)

In [26]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    correct=0
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X=X.to(device)
        y=y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #correct += (pred.argmax(1) == y).type(torch.float).sum().item()
        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            

def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X=X.to(device)
            y=y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")



In [27]:
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(trainloader, model1, loss_fn, optimizer)
    test_loop(testloader, model1, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 2.240924  [    0/50000]
loss: 2.226423  [ 1600/50000]
loss: 1.974952  [ 3200/50000]
loss: 1.808619  [ 4800/50000]
loss: 1.892246  [ 6400/50000]
loss: 1.603985  [ 8000/50000]
loss: 1.652094  [ 9600/50000]
loss: 1.543158  [11200/50000]
loss: 1.503549  [12800/50000]
loss: 1.318970  [14400/50000]
loss: 1.406009  [16000/50000]
loss: 1.320396  [17600/50000]
loss: 1.398761  [19200/50000]
loss: 1.251215  [20800/50000]
loss: 1.192538  [22400/50000]
loss: 1.039668  [24000/50000]
loss: 1.413914  [25600/50000]
loss: 0.991171  [27200/50000]
loss: 1.012909  [28800/50000]
loss: 1.116941  [30400/50000]
loss: 0.925442  [32000/50000]
loss: 1.013851  [33600/50000]
loss: 1.061969  [35200/50000]
loss: 1.326345  [36800/50000]
loss: 1.064024  [38400/50000]
loss: 1.306992  [40000/50000]
loss: 1.157890  [41600/50000]
loss: 1.015867  [43200/50000]
loss: 0.959195  [44800/50000]
loss: 1.194677  [46400/50000]
loss: 1.017253  [48000/50000]
loss: 1.071112  [49600/50000]


In [30]:
torch.save(model1, 'p2_model1.pth')

# part b

In [124]:
trainloader4 = DataLoader(trainset, batch_size=16, shuffle=True)
testloader4 = torch.utils.data.DataLoader(testset, batch_size=16, shuffle=True)

In [125]:
model2=models.resnet18(pretrained=False)
ftrs=model2.fc.in_features
model2.fc=nn.Linear(ftrs, 10)

model2.to(device)
next(model2.parameters()).is_cuda

True

In [126]:
torch.cuda.empty_cache()


In [127]:
model1 = torch.load('p2_model1.pth')


In [128]:
next(model1.parameters()).is_cuda

True

In [129]:
learning_rate = 1e-3
batch_size =16
epochs = 15

In [134]:
import torch.nn.functional as F

def newloss(predicted, targets, teacher_target, tau, alpha):
    s1 = F.softmax(predicted, dim=1)
    t = F.softmax(teacher_target/tau, dim=1)
    s2= F.softmax(predicted/tau, dim=1)
    soft_loss = alpha * tau * tau * F.cross_entropy(t, s2)
    hard_loss = (1-alpha) * F.cross_entropy(targets, s1)
    loss = soft_loss + hard_loss 
    return loss




In [135]:
optimizer = torch.optim.Adam(model2.parameters(), lr=learning_rate)

In [138]:
def train_loop(dataloader, smodel, tmodel, loss_fn, optimizer, tau, alpha):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X=X.to(device)
        y=y.to(device)
        pred = smodel(X)
        t_target=tmodel(X)
        
        loss = newloss(pred, y, t_target, tau, alpha)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X=X.to(device)
            y=y.to(device)
            pred = model(X)
           
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")



In [139]:
tau=2
alpha=0.4
ls_fn=nn.CrossEntropyLoss()
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(trainloader4, model2, model1, newloss, optimizer, tau, alpha)
    test_loop(testloader4, model2, ls_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 4.987925  [    0/50000]
loss: 4.606635  [ 1600/50000]
loss: 4.711490  [ 3200/50000]
loss: 4.990084  [ 4800/50000]
loss: 5.040056  [ 6400/50000]
loss: 4.876433  [ 8000/50000]
loss: 4.547595  [ 9600/50000]
loss: 4.678495  [11200/50000]
loss: 4.708873  [12800/50000]
loss: 4.975726  [14400/50000]
loss: 5.098983  [16000/50000]
loss: 4.645491  [17600/50000]
loss: 4.794865  [19200/50000]
loss: 4.736393  [20800/50000]
loss: 4.850601  [22400/50000]
loss: 4.844409  [24000/50000]
loss: 4.835175  [25600/50000]
loss: 4.730545  [27200/50000]
loss: 4.743885  [28800/50000]
loss: 4.791839  [30400/50000]
loss: 4.705799  [32000/50000]
loss: 4.804523  [33600/50000]
loss: 5.015117  [35200/50000]
loss: 4.551866  [36800/50000]
loss: 4.949496  [38400/50000]
loss: 4.649533  [40000/50000]
loss: 4.928356  [41600/50000]
loss: 4.998233  [43200/50000]
loss: 4.839613  [44800/50000]
loss: 4.704820  [46400/50000]
loss: 4.884536  [48000/50000]
loss: 4.494431  [49600/50000]


In [87]:
torch.save(model2, 'p2_model2.pth')

Higher accuracies can be achieved by training in more epochs, but because it takes several hours, just 15 epochs is chosen.

# part c

In [5]:
trainloader5 = DataLoader(trainset, batch_size=16, shuffle=True)
testloader5 = torch.utils.data.DataLoader(testset, batch_size=16, shuffle=True)

In [6]:
model3=models.resnet18(pretrained=False)

ftrs=model3.fc.in_features
model3.fc=nn.Linear(ftrs, 10)



In [7]:
torch.cuda.empty_cache()


In [8]:
model3.to(device)
next(model3.parameters()).is_cuda

True

In [10]:
learning_rate = 1e-3
batch_size =16
epochs = 15

In [11]:
loss_fn = nn.CrossEntropyLoss()

In [12]:
optimizer = torch.optim.Adam(model3.parameters(), lr=learning_rate)

In [13]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X=X.to(device)
        y=y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X=X.to(device)
            y=y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")



In [14]:
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(trainloader5, model3, loss_fn, optimizer)
    test_loop(testloader5, model3, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 2.896867  [    0/50000]
loss: 2.226656  [ 1600/50000]
loss: 1.574036  [ 3200/50000]
loss: 1.675238  [ 4800/50000]
loss: 1.926916  [ 6400/50000]
loss: 1.811888  [ 8000/50000]
loss: 1.530740  [ 9600/50000]
loss: 1.541551  [11200/50000]
loss: 1.801526  [12800/50000]
loss: 1.494005  [14400/50000]
loss: 1.756647  [16000/50000]
loss: 1.653669  [17600/50000]
loss: 1.270565  [19200/50000]
loss: 1.515260  [20800/50000]
loss: 1.416307  [22400/50000]
loss: 1.088918  [24000/50000]
loss: 0.874840  [25600/50000]
loss: 1.032787  [27200/50000]
loss: 1.201496  [28800/50000]
loss: 0.985781  [30400/50000]
loss: 1.264638  [32000/50000]
loss: 1.202904  [33600/50000]
loss: 1.396457  [35200/50000]
loss: 0.936910  [36800/50000]
loss: 1.212113  [38400/50000]
loss: 0.853512  [40000/50000]
loss: 0.701064  [41600/50000]
loss: 0.764038  [43200/50000]
loss: 0.625422  [44800/50000]
loss: 0.850141  [46400/50000]
loss: 1.374852  [48000/50000]
loss: 0.933962  [49600/50000]


In [17]:
torch.save(model3, 'model3.pth')

If the model is trained in more epochs, we can conclude that using teacher model and distilling the knowledge in a neural network increase the accuracy. May be it takes more time to train with a teacher but the results are more accurate. 

# part d

In [4]:
trainloader7 = DataLoader(trainset, batch_size=16, shuffle=True)
testloader7 = torch.utils.data.DataLoader(testset, batch_size=16, shuffle=True)

In [6]:
model4=models.resnet50(pretrained=True)

ftrs=model4.fc.in_features
model4.fc=nn.Linear(ftrs, 10)


In [7]:
torch.cuda.empty_cache()


In [8]:
model4.to(device)
next(model4.parameters()).is_cuda

True

In [9]:
learning_rate = 1e-3
batch_size =16
epochs = 10

In [10]:
loss_fn = nn.CrossEntropyLoss()

In [11]:
optimizer = torch.optim.SGD(model4.parameters(), lr=learning_rate)

In [12]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X=X.to(device)
        y=y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 50 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X=X.to(device)
            y=y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")



In [13]:
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(trainloader7, model4, loss_fn, optimizer)
    test_loop(testloader7, model4, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 2.382523  [    0/50000]
loss: 2.266681  [  800/50000]
loss: 2.069983  [ 1600/50000]
loss: 1.983108  [ 2400/50000]
loss: 1.983583  [ 3200/50000]
loss: 1.706721  [ 4000/50000]
loss: 1.784302  [ 4800/50000]
loss: 1.439089  [ 5600/50000]
loss: 1.433651  [ 6400/50000]
loss: 1.254860  [ 7200/50000]
loss: 1.321148  [ 8000/50000]
loss: 1.228954  [ 8800/50000]
loss: 1.032186  [ 9600/50000]
loss: 1.020627  [10400/50000]
loss: 0.741668  [11200/50000]
loss: 1.157192  [12000/50000]
loss: 0.899214  [12800/50000]
loss: 0.689110  [13600/50000]
loss: 0.732615  [14400/50000]
loss: 0.956569  [15200/50000]
loss: 1.176852  [16000/50000]
loss: 0.924006  [16800/50000]
loss: 0.804347  [17600/50000]
loss: 0.592983  [18400/50000]
loss: 0.548827  [19200/50000]
loss: 0.444268  [20000/50000]
loss: 0.436632  [20800/50000]
loss: 0.286955  [21600/50000]
loss: 0.643605  [22400/50000]
loss: 0.214450  [23200/50000]
loss: 0.697296  [24000/50000]
loss: 0.353383  [24800/50000]


In [14]:
torch.save(model4, 'model4.pth')

Fine tuning has better results. Applying fine-tuning allows us to utilize pre-trained networks to recognize classes they were not originally trained on. And furthermore, this method can lead to higher accuracy than transfer learning via feature extraction. However linear tuning is much faster than fine tuning. Linear tuning acts better when the two datasets of pretrained model and new model are similar and rather small. If datasets are similar but huge, it's better to train more layers of FC part. For different datasets, it's maybe better to use fine tuning.  