Loading the packages and the datasets from Google Drive.

In [1]:
from google.colab import files
from google.colab import drive  
# Mount Google Drive at /content/gdrive; the later cells read the dataset
# archives and CSV files from paths under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# List the GPU(s) attached to this runtime (one line per device).
!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-e77e8c94-b6b0-dc89-5d8d-d8b3799ef423)


In [3]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Dec 11 22:03:35 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
from psutil import virtual_memory
# Total memory reported by psutil, expressed in decimal gigabytes (bytes / 1e9).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [5]:
!unzip "/content/gdrive/MyDrive/dl_reddit/train_set.zip" -d "/content"
!unzip "/content/gdrive/MyDrive/dl_reddit/valid_set.zip" -d "/content"
!unzip "/content/gdrive/MyDrive/dl_reddit/test_set.zip" -d "/content"

[1;30;43mA streamkimeneten csak az utolsó 5000 sor látható.[0m
  inflating: /content/test_set/0/2021.10.16_90.png  
  inflating: /content/test_set/0/2021.10.14_85.png  
  inflating: /content/test_set/0/2021.09.22_93.png  
  inflating: /content/test_set/0/2021.09.13_66.png  
  inflating: /content/test_set/0/2021.09.11_41.png  
  inflating: /content/test_set/0/2021.08.29_124.png  
  inflating: /content/test_set/0/2021.09.14_103.png  
  inflating: /content/test_set/0/2021.10.03_135.png  
  inflating: /content/test_set/0/2021.10.01_118.png  
  inflating: /content/test_set/0/2021.09.22_134.png  
  inflating: /content/test_set/0/2021.09.24_64.png  
  inflating: /content/test_set/0/2021.08.26_117.png  
  inflating: /content/test_set/0/2021.09.05_90.png  
  inflating: /content/test_set/0/2021.09.03_126.png  
  inflating: /content/test_set/0/2021.08.27_40.png  
  inflating: /content/test_set/0/2021.09.29_157.png  
  inflating: /content/test_set/0/2021.09.22_47.png  
  inflating: /content/test

In [6]:
# Copy the per-split CSV files from Drive into the current working directory.
# NOTE(review): none of the cells visible in this notebook read these CSVs — confirm they are still needed.
!cp "/content/gdrive/MyDrive/dl_reddit/train_set.csv" "./"
!cp "/content/gdrive/MyDrive/dl_reddit/valid_set.csv" "./"
!cp "/content/gdrive/MyDrive/dl_reddit/test_set.csv" "./"

In [1]:
from __future__ import print_function

import glob
from itertools import chain
import os
import random
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from linformer import Linformer
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from tqdm.notebook import tqdm
from torch.utils.data import WeightedRandomSampler

from vit_pytorch.efficient import ViT

Fixing some parameters.

In [2]:
# Global settings for the experiment.
batch_size = 128  # mini-batch size handed to every DataLoader
epochs = 10  # presumably the number of training epochs
lr = 0.00005  # NOTE(review): the optimizer cell passes lr=0.0001 literally instead of this value — confirm which is intended
seed = 42  # passed to seed_everything()

In [3]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Sets the seed for Python's ``random``, NumPy and PyTorch (CPU and all
    CUDA devices), exports it as ``PYTHONHASHSEED`` and switches cuDNN to
    deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True

# Seed all generators once, before any data loader or model is created.
seed_everything(seed)

In [4]:
# Target device for tensors and the model. The model-loading cell further
# down re-assigns this name to torch.device('cuda').
device = 'cuda'

Locating the datasets.

In [5]:
# Collect the PNG paths of every split, one list per class folder (0 / 1).
# glob.glob returns files in arbitrary, filesystem-dependent order; sorting
# makes the lists (and therefore the seeded shuffling built on top of them)
# reproducible across runtimes.
train_list0 = sorted(glob.glob(os.path.join('./train_set/0/', '*.png')))
train_list1 = sorted(glob.glob(os.path.join('./train_set/1/', '*.png')))
val_list0 = sorted(glob.glob(os.path.join('./valid_set/0/', '*.png')))
val_list1 = sorted(glob.glob(os.path.join('./valid_set/1/', '*.png')))
test_list0 = sorted(glob.glob(os.path.join('./test_set/0/', '*.png')))
test_list1 = sorted(glob.glob(os.path.join('./test_set/1/', '*.png')))

In [6]:
# Per-split file lists: all class-0 paths first, then all class-1 paths.
train_list = train_list0 + train_list1
val_list = val_list0 + val_list1
test_list = test_list0 + test_list1

# Label lists aligned with the concatenated file lists above
# (one 0 per class-0 file followed by one 1 per class-1 file).
train_labels = [0 for _ in train_list0] + [1 for _ in train_list1]
val_labels = [0 for _ in val_list0] + [1 for _ in val_list1]
test_labels = [0 for _ in test_list0] + [1 for _ in test_list1]

In [7]:
# Show how many image files each split contains, one split per line.
print(
    f"Train Data: {len(train_list)}",
    f"Validation Data: {len(val_list)}",
    f"Test Data: {len(test_list)}",
    sep="\n",
)

Train Data: 154462
Validation Data: 5000
Test Data: 5000


Creating the datasets: each image has to be transformed before it is returned, and its label is read from the name of the folder that contains the file.

In [8]:
class Dataset(torch.utils.data.Dataset):
    """Image dataset over file paths laid out as ``<split>/<label>/<name>.png``.

    The base class is referenced by its fully qualified name so that this
    class, which deliberately keeps the name ``Dataset``, does not depend on
    the very name it shadows.

    Args:
        file_list: Sequence of image file paths. The name of each file's
            parent directory must be an integer class label.
        transform: Optional callable applied to the loaded RGB image. When
            ``None`` the PIL image itself is returned.
    """

    def __init__(self, file_list, transform=None):
        self.file_list = file_list
        self.transform = transform

    def __len__(self):
        # `filelength` is kept only because the original implementation
        # exposed it as an attribute after the first len() call.
        self.filelength = len(self.file_list)
        return self.filelength

    @staticmethod
    def _label_from_path(img_path):
        """Return the integer label encoded in the parent directory name."""
        return int(os.path.basename(os.path.dirname(img_path)))

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            ``(image, label)`` where ``image`` is the transformed image (or
            the RGB PIL image when no transform was given) and ``label`` is
            the integer taken from the parent directory name.

        Raises:
            ValueError: If the parent directory name is not an integer.
        """
        img_path = self.file_list[idx]

        # The context manager closes the file handle; convert() forces the
        # pixel data to be read first and normalises the image to 3 channels.
        with Image.open(img_path) as img:
            img = img.convert("RGB")

        if self.transform is not None:
            img = self.transform(img)

        return img, self._label_from_path(img_path)

# Transfer Learning

Installing the package required for a pretrained ViT.

In [10]:
# Install the Hugging Face `transformers` library straight from the GitHub
# repository. No tag or commit is given, so whatever the default branch holds
# at install time is used.
# NOTE(review): pin a release or commit to keep the notebook reproducible.
!pip install git+https://github.com/huggingface/transformers.git

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-nr_x55ry
  Running command git clone -q https://github.com/huggingface/transformers.git /tmp/pip-req-build-nr_x55ry
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 429 kB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 18.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 63.0 MB/s 
Collecting pyyaml>=5.1
  Downloading P

Loading the packages, and the model.

In [11]:
from transformers import ViTForImageClassification
from transformers import ViTFeatureExtractor
import torch

# Both the preprocessing config and the weights come from the same checkpoint.
PRETRAINED_NAME = 'google/vit-base-patch16-224'

# Replaces the plain-string `device` defined earlier with a torch.device.
device = torch.device('cuda')

# Preprocessing object; it is passed as `transform` to the datasets later on.
feature_extractor = ViTFeatureExtractor.from_pretrained(PRETRAINED_NAME)

# Load the pretrained classifier, switch it to eval mode and move it to the
# device. The last expression makes the notebook display the architecture.
model = ViTForImageClassification.from_pretrained(PRETRAINED_NAME).eval()
model.to(device)

Downloading:   0%|          | 0.00/160 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/68.0k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/330M [00:00<?, ?B/s]

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): PatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_fea

Freezing the layers.

In [12]:
# Turn off gradient tracking for every parameter currently in the model, so
# only layers attached after this cell are trainable.
for frozen_param in model.parameters():
    frozen_param.requires_grad_(False)

Changing the output layer.

In [13]:
# Replace the pretrained head with a fresh 768 -> 2 linear layer. The new
# layer is created after the freeze, so its parameters require gradients.
# The Sequential wrapper is kept so the head's parameter names stay unchanged.
model.classifier = torch.nn.Sequential(torch.nn.Linear(768, 2))

# Move the new head to the device; as the last expression this also displays the model.
model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): PatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_fea

Building the data loaders, the loss function, and the optimizer.

In [14]:
# Wrap each split's file list in a Dataset that preprocesses images with the
# ViT feature extractor.
train_data, val_data, test_data = (
    Dataset(split_files, transform=feature_extractor)
    for split_files in (train_list, val_list, test_list)
)

# All three loaders share the same batching, shuffling and worker settings.
loader_kwargs = dict(batch_size=batch_size, shuffle=True, num_workers=4)
train_loader = DataLoader(dataset=train_data, **loader_kwargs)
valid_loader = DataLoader(dataset=val_data, **loader_kwargs)
test_loader = DataLoader(dataset=test_data, **loader_kwargs)

In [16]:
# loss function
# Class-weighted cross-entropy: class 0 contributes with weight 0.1 and class 1
# with weight 1 (presumably to offset class imbalance — confirm).
# torch.tensor(..., device=device) replaces the legacy torch.cuda.FloatTensor
# constructor and places the weights on the same device as the model.
criterion = nn.CrossEntropyLoss(weight=torch.tensor([0.1, 1.0], device=device))
# optimizer
# Frozen parameters never receive gradients, so Adam only updates the new head.
# NOTE(review): the learning rate is the literal 0.0001, not the `lr` value
# from the configuration cell — confirm which one is intended.
optimizer = optim.Adam(model.parameters(), lr=0.0001)

Training the model.

In [None]:
# Train the model for `epochs` passes over the training set and report
# training / validation loss and accuracy after every epoch.
#
# Running totals are plain Python numbers (via .item()), so no autograd graph
# is kept alive between batches. Accuracy is computed from sample counts,
# which stays exact when the last batch is smaller than the others. Loss is
# the mean of the per-batch losses.
for epoch in range(epochs):
    # Explicit mode switches. The architecture printed above shows
    # Dropout(p=0.0), so this is not expected to change the numbers.
    model.train()
    train_loss_sum = 0.0
    train_correct = 0
    train_seen = 0

    for data, label in tqdm(train_loader):
        # The feature extractor wraps pixel_values in a one-element list;
        # after collation element 0 is the batched image tensor.
        data = data['pixel_values'][0].to(device)
        label = label.to(device)

        output = model(data)[0]  # element 0 of the model output holds the logits
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss_sum += loss.item()
        train_correct += (output.argmax(dim=1) == label).sum().item()
        train_seen += label.size(0)

    epoch_loss = train_loss_sum / len(train_loader)
    epoch_accuracy = train_correct / train_seen

    model.eval()
    val_loss_sum = 0.0
    val_correct = 0
    val_seen = 0
    with torch.no_grad():
        for data, label in valid_loader:
            data = data['pixel_values'][0].to(device)
            label = label.to(device)

            val_output = model(data)[0]
            val_loss = criterion(val_output, label)

            val_loss_sum += val_loss.item()
            val_correct += (val_output.argmax(dim=1) == label).sum().item()
            val_seen += label.size(0)

    epoch_val_loss = val_loss_sum / len(valid_loader)
    epoch_val_accuracy = val_correct / val_seen

    print(
        f"Epoch : {epoch+1} - loss : {epoch_loss:.4f} - acc: {epoch_accuracy:.4f} - val_loss : {epoch_val_loss:.4f} - val_acc: {epoch_val_accuracy:.4f}\n"
    )

  0%|          | 0/716 [00:00<?, ?it/s]

Epoch : 1 - loss : 0.6296 - acc: 0.6322 - val_loss : 0.6544 - val_acc: 0.6106



  0%|          | 0/716 [00:00<?, ?it/s]

Epoch : 2 - loss : 0.6292 - acc: 0.6335 - val_loss : 0.6521 - val_acc: 0.6114



  0%|          | 0/716 [00:00<?, ?it/s]

Epoch : 3 - loss : 0.6288 - acc: 0.6344 - val_loss : 0.6493 - val_acc: 0.6197



  0%|          | 0/716 [00:00<?, ?it/s]

Epoch : 4 - loss : 0.6275 - acc: 0.6325 - val_loss : 0.6614 - val_acc: 0.6279



  0%|          | 0/716 [00:00<?, ?it/s]

Epoch : 5 - loss : 0.6279 - acc: 0.6313 - val_loss : 0.6502 - val_acc: 0.5958



  0%|          | 0/716 [00:00<?, ?it/s]

Epoch : 6 - loss : 0.6284 - acc: 0.6330 - val_loss : 0.6611 - val_acc: 0.6828



  0%|          | 0/716 [00:00<?, ?it/s]

Epoch : 7 - loss : 0.6267 - acc: 0.6356 - val_loss : 0.6546 - val_acc: 0.6203



  0%|          | 0/716 [00:00<?, ?it/s]

Epoch : 8 - loss : 0.6271 - acc: 0.6329 - val_loss : 0.6539 - val_acc: 0.6668



  0%|          | 0/716 [00:00<?, ?it/s]

Epoch : 9 - loss : 0.6265 - acc: 0.6380 - val_loss : 0.6553 - val_acc: 0.6871



  0%|          | 0/716 [00:00<?, ?it/s]

Epoch : 10 - loss : 0.6261 - acc: 0.6343 - val_loss : 0.6580 - val_acc: 0.7172



After countless hours of training, the saved model is loaded back from Google Drive.

In [18]:
# Replace `model` with a model object deserialized from a file on Drive.
# NOTE(review): no cell visible in this notebook writes NotSoBadVIT.pth, so a
# fresh "Run All" depends on that file already existing.
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code — only load checkpoints from a trusted source.
model=torch.load('/content/gdrive/My Drive/NotSoBadVIT.pth')
model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): PatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_fea

Evaluate the model on the datasets.

In [None]:
# Evaluate the model on the TRAINING split without tracking gradients.
# Accumulates, for class 1: the number of actual positives (target_true),
# predicted positives (predicted_true) and true positives (correct_true),
# plus running accuracy / loss sums.
with torch.no_grad():
        target_true=0
        predicted_true=0
        correct_true=0
        epoch_val_accuracy = 0
        epoch_val_loss = 0
        for data, label in train_loader:
            # Element 0 of 'pixel_values' is the batched image tensor.
            data = data['pixel_values'][0].to(device)
            label = label.to(device)
            val_output = model(data)[0]
            val_loss = criterion(val_output, label)
            # Boolean mask: True where the predicted class is 1.
            predicted_classes = torch.argmax(val_output, dim=1) == 1
            target_classes = label
            target_true += torch.sum(target_classes == 1).float()
            predicted_true += torch.sum(predicted_classes).float()
            # True positives: prediction matches the label AND the prediction is class 1.
            correct_true += torch.sum(
            (predicted_classes == target_classes) * (predicted_classes == 1)).float()
            acc = (val_output.argmax(dim=1) == label).float().mean()
            # NOTE(review): these sums are divided by len(valid_loader) although the
            # loop runs over train_loader. The reporting cell rescales the accuracy by
            # len(valid_loader)/len(train_loader) to compensate; epoch_val_loss is
            # never rescaled, so its value here is not a per-batch mean.
            epoch_val_accuracy += acc / len(valid_loader)
            epoch_val_loss += val_loss / len(valid_loader)

In [None]:
# Class-1 metrics from the counts accumulated by the preceding evaluation cell.
# recall = TP / actual positives, precision = TP / predicted positives,
# F1 = harmonic mean of the two.
# NOTE(review): no guard for a zero denominator (e.g. no positive predictions) — confirm this cannot occur.
recall = correct_true / target_true
precision = correct_true / predicted_true
f1_score = 2 * precision * recall / (precision + recall)

In [None]:
# Report the training-set metrics rounded to 3 decimals.
# The accuracy sum was divided by len(valid_loader) per batch during the
# evaluation loop, so it is multiplied by len(valid_loader)/len(train_loader)
# here to turn it into a mean over the training batches.
print('Measures on train set:')
print(' Accuracy:',round(epoch_val_accuracy.item()*len(valid_loader)/len(train_loader),3),'\n','Precision:',round(precision.item(),3),'\n','Recall',round(recall.item(),3),'\n','F1',round(f1_score.item(),3))

Measures on train set:
 Accuracy: 0.873 
 Precision: 0.125 
 Recall 0.262 
 F1 0.169


In [None]:
# Evaluate the model on the validation split without tracking gradients.
# Collects class-1 counts (actual positives, predicted positives, true
# positives) together with the mean per-batch accuracy and loss.
with torch.no_grad():
    target_true = 0
    predicted_true = 0
    correct_true = 0
    epoch_val_accuracy = 0
    epoch_val_loss = 0
    for data, label in valid_loader:
        # Element 0 of 'pixel_values' is the batched image tensor.
        data = data['pixel_values'][0].to(device)
        label = label.to(device)

        val_output = model(data)[0]
        val_loss = criterion(val_output, label)

        # Boolean mask of the samples predicted as class 1.
        pred_positive = val_output.argmax(dim=1).eq(1)

        target_true += label.eq(1).sum().float()
        predicted_true += pred_positive.sum().float()
        # True positives: the prediction agrees with the label and is class 1.
        correct_true += ((pred_positive == label) & pred_positive).sum().float()

        batch_acc = val_output.argmax(dim=1).eq(label).float().mean()
        epoch_val_accuracy += batch_acc / len(valid_loader)
        epoch_val_loss += val_loss / len(valid_loader)

In [None]:
# Class-1 metrics from the counts accumulated by the preceding evaluation cell.
# recall = TP / actual positives, precision = TP / predicted positives,
# F1 = harmonic mean of the two.
# NOTE(review): no guard for a zero denominator (e.g. no positive predictions) — confirm this cannot occur.
recall = correct_true / target_true
precision = correct_true / predicted_true
f1_score = 2 * precision * recall / (precision + recall)

In [None]:
# Report the validation-set metrics, each rounded to 3 decimals.
print('Measures on validation set:')
print(
    ' Accuracy:', round(epoch_val_accuracy.item(), 3), '\n',
    'Precision:', round(precision.item(), 3), '\n',
    'Recall', round(recall.item(), 3), '\n',
    'F1', round(f1_score.item(), 3)
)

Measures on validation set:
 Accuracy: 0.873 
 Precision: 0.122 
 Recall 0.236 
 F1 0.161


In [19]:
# Evaluate the model on the held-out TEST split without tracking gradients.
# Collects class-1 counts (actual positives, predicted positives, true
# positives) together with the mean per-batch accuracy and loss.
model.eval()  # make sure the loaded model is in inference mode
with torch.no_grad():
    target_true = 0
    predicted_true = 0
    correct_true = 0
    epoch_val_accuracy = 0
    epoch_val_loss = 0
    # Normalise by the number of TEST batches. The original divided by
    # len(valid_loader), which is only correct while both splits happen to
    # contain the same number of batches.
    num_test_batches = len(test_loader)
    for data, label in test_loader:
        # Element 0 of 'pixel_values' is the batched image tensor.
        data = data['pixel_values'][0].to(device)
        label = label.to(device)

        val_output = model(data)[0]
        val_loss = criterion(val_output, label)

        # Boolean mask of the samples predicted as class 1.
        predicted_classes = torch.argmax(val_output, dim=1) == 1
        target_classes = label

        target_true += torch.sum(target_classes == 1).float()
        predicted_true += torch.sum(predicted_classes).float()
        # True positives: the prediction agrees with the label and is class 1.
        correct_true += torch.sum(
            (predicted_classes == target_classes) & predicted_classes
        ).float()

        acc = (val_output.argmax(dim=1) == label).float().mean()
        epoch_val_accuracy += acc / num_test_batches
        epoch_val_loss += val_loss / num_test_batches

In [20]:
# Class-1 metrics from the counts accumulated by the preceding evaluation cell.
# recall = TP / actual positives, precision = TP / predicted positives,
# F1 = harmonic mean of the two.
# NOTE(review): no guard for a zero denominator (e.g. no positive predictions) — confirm this cannot occur.
recall = correct_true / target_true
precision = correct_true / predicted_true
f1_score = 2 * precision * recall / (precision + recall)

In [21]:
# Report the test-set metrics, each rounded to 4 decimals.
print('Measures on test set:')
print(
    ' Accuracy:', round(epoch_val_accuracy.item(), 4), '\n',
    'Precision:', round(precision.item(), 4), '\n',
    'Recall', round(recall.item(), 4), '\n',
    'F1', round(f1_score.item(), 4)
)

Measures on test set:
 Accuracy: 0.8803 
 Precision: 0.1106 
 Recall 0.2208 
 F1 0.1474


In [23]:
from sklearn.metrics import average_precision_score
# Average precision on the test split.
# Logits and labels are gathered batch by batch and concatenated once. The
# metric counters used by the previous cells (target_true, predicted_true, ...)
# are no longer reset here, because this cell never used them and resetting
# them discarded the earlier results.
logit_batches = []
label_batches = []
model.eval()  # make sure the loaded model is in inference mode
with torch.no_grad():
    for data, label in test_loader:
        # Element 0 of 'pixel_values' is the batched image tensor.
        data = data['pixel_values'][0].to(device)
        logit_batches.append(model(data)[0].cpu())
        # Labels are only needed on the CPU for scoring, so they are not moved to the device.
        label_batches.append(label)

# float64 matches what np.array() produced from the Python-float lists before.
preds = torch.cat(logit_batches).numpy().astype(np.float64)
labels = torch.cat(label_batches).numpy()

# Ranking score for class 1: the margin between the class-1 and class-0 logits.
scores = preds[:, 1] - preds[:, 0]
average_precision_score(labels, scores)

0.10205056919085867

So the average_precision_score is 0.102.