%pip install datasets
%pip install transformers
%pip install sentencepiece
%pip install diffusers --upgrade
%pip install invisible_watermark accelerate safetensors
%pip install accelerate
%pip install evaluate

# Image Classification Model
This notebook implements models for classifying art pieces by author and by style. We use a convolutional neural network (a pre-trained VGG19 with batch normalization) augmented with visual attention blocks; it takes an image as input and predicts the corresponding label.

## Overview
1. **Label Tokenization**: Convert text labels into numerical indices.
2. **Image Processing**: Standardize and preprocess images for model input.
3. **Model Architecture**: Define and train the attention-augmented CNN.
4. **Evaluation**: Assess model performance using appropriate metrics.

In [None]:
import pandas as pd
import torch
import torchvision
import numpy as np
from PIL import Image, ImageFile
Image.LOAD_TRUNCATED_IMAGES = True
ImageFile.LOAD_TRUNCATED_IMAGES = True
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from torchvision import transforms
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import optuna
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import json


### Label Tokenization
To facilitate feeding labels into the model, we tokenize them by pairing each unique label with a corresponding index. This process converts textual labels into numerical form, making them compatible with the model's input requirements.

**Example**:
- **Label**: "Leonardo DaVinci" -> **ID**: 3

**Code**:
```python
from sklearn.preprocessing import LabelEncoder

# Sample labels
labels = ["Leonardo DaVinci", "Vincent van Gogh", "Pablo Picasso"]
label_encoder = LabelEncoder()
label_ids = label_encoder.fit_transform(labels)

print(label_ids)
```

In [None]:
# Load the tab-separated metadata, shuffle it and keep a 20,000-row working subset.
# NOTE(review): the shuffle is unseeded, so the subset differs between runs.
data = pd.read_csv('../described_dataset_label.csv', sep='\t', encoding='utf-8')
data = data.sample(frac=1).reset_index(drop=True)
data = data[:20000]

# Keep only the three columns used downstream, under lower-case names.
data = data.rename(columns={'FILE': 'image', 'AUTHOR': 'author', 'TECHNIQUE': 'style'})
data = data[['image', 'author', 'style']]
data['image'] = [f'.{x}' for x in data['image']]  # prefix the stored path with '.'
data['author'] = [x.lower() for x in data['author']]
# Only the first comma-separated entry of the technique is used as the style.
data['style'] = [x.split(',')[0].lower() for x in data['style']]

# The JSON files map stringified ids to label names. They are read explicitly
# as UTF-8 (like the CSV above) so non-ASCII names do not depend on the
# platform's default encoding.
with open('../label_author.json', 'r', encoding='utf-8') as f:
    labels_author = json.load(f)
id2label_auth = {int(i): label for i, label in labels_author.items()}
label2id_auth = {label: i for i, label in id2label_auth.items()}

with open('../label_style.json', 'r', encoding='utf-8') as f:
    labels_sty = json.load(f)
id2label_sty = {int(i): label for i, label in labels_sty.items()}
label2id_sty = {label: i for i, label in id2label_sty.items()}

In [None]:
# Preview the first five rows of the cleaned metadata.
data.head(5)

In [None]:
# Replace the textual labels with their integer ids.
data['author'] = data['author'].map(label2id_auth)
data['style'] = data['style'].map(label2id_sty)

# `Series.map` yields NaN for any label missing from the lookup tables, which
# silently turns the whole column into floats. Drop such rows (reporting how
# many) and restore an integer dtype so the labels stay usable as class ids.
_unmapped = data[['author', 'style']].isna().any(axis=1)
if _unmapped.any():
    print(f"Dropping {int(_unmapped.sum())} rows whose author/style has no id")
    data = data[~_unmapped].reset_index(drop=True)
data = data.astype({'author': 'int64', 'style': 'int64'})
data.columns

### Image Processing
To achieve optimal results, we need to preprocess the images. Key steps include:

#### Standardization
Resize all images to a uniform size (e.g., 256x256 pixels) to ensure consistency in model input.

#### Mean and Standard Deviation Calculation
Calculate the mean and standard deviation of the images for normalization purposes. Normalization helps in stabilizing and speeding up the training process.

#### Transformation Pipeline
Use the `Compose` function from the `transforms` library to build a pipeline that applies a series of transformations to each image systematically.

### Transformation Details
Separate transformation pipelines are defined for training and testing datasets to include operations such as resizing, cropping, normalization, and data augmentation.

**Code**:
```python
from torchvision import transforms

# Define transformation pipelines
train_transforms = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_transforms = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
```


In [None]:
# Pipeline used only to estimate normalization statistics: a random 256x256
# crop followed by conversion to a tensor.
transform = transforms.Compose([
    transforms.RandomResizedCrop(256),
    transforms.ToTensor(),
])


def _load_rgb_tensor(path):
    """Open ``path``, force three RGB channels and apply ``transform``.

    Converting to RGB keeps the channel count of the statistics identical to
    what ``ArtDataset`` feeds the model (it also converts to RGB); the context
    manager closes the file handle as soon as the pixels are read.
    """
    with Image.open(path) as img:
        return transform(img.convert('RGB'))


# NOTE(review): this keeps one 3x256x256 tensor per image in memory although
# only `img_tr[0]` is read by the visible cells; consider loading fewer images.
img_tr = [_load_rgb_tensor(img) for img in tqdm(data['image'])]

# Per-channel statistics (reduced over the two spatial axes) of the first
# image, in the same form the following cell computes them.
mean, std = img_tr[0].mean([1, 2]), img_tr[0].std([1, 2])

In [None]:
# Per-channel mean and standard deviation of the first transformed image,
# reducing over the two spatial axes (dims 1 and 2).
_first_image = img_tr[0]
mean = _first_image.mean([1, 2])
std = _first_image.std([1, 2])

print("mean and std before normalize:")
print(f"Mean of the image: {mean}")
print(f"Std of the image: {std}")

In [None]:
def _resize_and_crop():
    """Return fresh resize + centre-crop steps producing a 256x256 image."""
    return [transforms.Resize(256), transforms.CenterCrop(256)]


def _to_normalized_tensor():
    """Return fresh tensor-conversion + normalization steps using ``mean``/``std``."""
    return [transforms.ToTensor(), transforms.Normalize(mean, std)]


# Training pipeline: same geometry as evaluation plus light augmentation
# (rotation of up to 10 degrees and a random horizontal flip).
train_transform = transforms.Compose(
    _resize_and_crop()
    + [transforms.RandomRotation(10), transforms.RandomHorizontalFlip()]
    + _to_normalized_tensor()
)

# Evaluation pipeline: deterministic, no augmentation.
test_transform = transforms.Compose(_resize_and_crop() + _to_normalized_tensor())

class ArtDataset(torch.utils.data.Dataset):
    """Dataset pairing image file paths with labels.

    Parameters
    ----------
    images : indexable of paths accepted by ``PIL.Image.open``.
    labels : indexable of labels, aligned with ``images``.
    transform : optional callable applied to the RGB PIL image.
    """

    def __init__(self, images, labels, transform=None,):
        self.data = images
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of images."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(image, label)`` for ``index``.

        The image is loaded from disk, converted to RGB and passed through
        ``self.transform`` when one was given; the label is wrapped in a
        ``torch.tensor``.
        """
        # The context manager releases the file handle deterministically;
        # `convert` returns an independent, fully loaded image.
        with Image.open(self.data[index]) as source:
            img_pil = source.convert('RGB')
        if self.transform is not None:
            img_pil = self.transform(img_pil)
        label = torch.tensor(self.labels[index])
        return (img_pil, label)

## Author Train and Eval data

In [None]:
# Hold out 20% of the images as a validation set for the author label.
# The fixed random_state makes the split repeatable for a given `data`.
X_train_auth, X_val_auth, y_train_auth, y_val_auth = train_test_split(
    data['image'],
    data['author'],
    random_state=42,
    test_size=0.2,
)
train_dataset_auth = ArtDataset(
    images=X_train_auth.values,
    labels=y_train_auth.values,
    transform=train_transform,
)
val_dataset_auth = ArtDataset(
    images=X_val_auth.values,
    labels=y_val_auth.values,
    transform=test_transform,
)

## Style Train and Eval data

In [None]:
# Hold out 20% of the images as a validation set for the style label.
# The fixed random_state makes the split repeatable for a given `data`.
X_train_style, X_val_style, y_train_style, y_val_style = train_test_split(
    data['image'],
    data['style'],
    random_state=42,
    test_size=0.2,
)
train_dataset_style = ArtDataset(
    images=X_train_style.values,
    labels=y_train_style.values,
    transform=train_transform,
)
val_dataset_style = ArtDataset(
    images=X_val_style.values,
    labels=y_val_style.values,
    transform=test_transform,
)

In [None]:
# Show the first training example together with its decoded labels.
# NOTE(review): this reads index 0 from both datasets and presumes they hold
# the same image there; confirm the two splits are aligned.
_sample_idx = 0
image = train_dataset_auth.data[_sample_idx]
author = train_dataset_auth.labels[_sample_idx]
sty = train_dataset_style.labels[_sample_idx]

display(Image.open(image))
for _caption in (f"Author: {id2label_auth[author]}", f"Style: {id2label_sty[sty]}"):
    print(_caption)

In [None]:
# The datasets hold their own references to the arrays, so the intermediate
# split variables are no longer needed.
del X_train_auth, X_val_auth, y_train_auth, y_val_auth
del X_train_style, X_val_style, y_train_style, y_val_style

## The Model

Regarding the model, we decided to go with a Convolutional Neural Network (CNN) augmented with visual attention mechanisms.

### AttentionBlock

To enhance the model's ability to focus on relevant regions within the input images, we designed and implemented the AttentionBlock. This component enables the network to dynamically adjust the importance of local features based on their relevance to the task at hand. By incorporating attention mechanisms, we aimed to improve the model's discriminative power and performance, particularly in scenarios where selective feature attention is beneficial.


In [None]:
class AttentionBlock(nn.Module):
    """Spatial attention over a local feature map, guided by a global one.

    Both inputs are projected to ``attn_features`` channels with 3x3
    convolutions, the global projection is bilinearly resized to the local
    map's spatial size, and a 1x1 convolution turns their rectified sum into a
    single-channel score map.

    Parameters
    ----------
    in_features_l : channels of the local feature map ``l``.
    in_features_g : channels of the global feature map ``g``.
    attn_features : channels of the shared projection space.
    up_factor : stored on the instance; not read by ``forward``.
    normalize_attn : if True, scores are softmax-normalised over all spatial
        positions and the output is the attention-weighted sum of ``l``;
        otherwise scores go through a sigmoid and the re-weighted map is
        average-pooled.
    """

    def __init__(self, in_features_l, in_features_g, attn_features, up_factor, normalize_attn=True):
        super().__init__()
        self.up_factor = up_factor
        self.normalize_attn = normalize_attn
        projection = dict(out_channels=attn_features, kernel_size=3, padding=1, bias=False)
        self.W_l = nn.Conv2d(in_channels=in_features_l, **projection)
        self.W_g = nn.Conv2d(in_channels=in_features_g, **projection)
        self.phi = nn.Conv2d(in_channels=attn_features, out_channels=1, kernel_size=1, padding=0, bias=True)

    def forward(self, l, g):
        """Return ``(attention_map, pooled_features)``.

        ``attention_map`` has shape (N, 1, d2, d3) matching the last two
        dimensions of ``l``; ``pooled_features`` has shape (N, C) where C is
        the channel count of ``l``.
        """
        batch, channels, dim_a, dim_b = l.size()
        local_proj = self.W_l(l)
        global_proj = F.interpolate(
            self.W_g(g), size=(dim_a, dim_b), mode='bilinear', align_corners=False
        )
        scores = self.phi(F.relu(local_proj + global_proj))  # N x 1 x dim_a x dim_b

        if not self.normalize_attn:
            # Independent per-position gates, then global average pooling.
            a = torch.sigmoid(scores)
            gated = a.expand_as(l) * l
            return a, F.adaptive_avg_pool2d(gated, (1, 1)).view(batch, channels)

        # Softmax across every spatial position, then a weighted sum.
        a = F.softmax(scores.view(batch, 1, -1), dim=2).view(batch, 1, dim_a, dim_b)
        gated = a.expand_as(l) * l
        return a, gated.view(batch, channels, -1).sum(dim=2)

### Transfer Learning with VGG19

Initially, we attempted to implement the VGG network architecture from scratch. However, due to the computational complexity and time constraints associated with training such a deep network, we opted for transfer learning. Specifically, we leveraged the pre-trained VGG19 model (with batch normalization) as our base network.

By utilizing transfer learning, we could benefit from the features learned by VGG19 on a large dataset (e.g., ImageNet) and focus our efforts on fine-tuning the model for our specific task. This approach significantly reduced the training time and computational resources required while still enabling us to achieve satisfactory performance.


In [None]:
# #VGG
# class VGG(nn.Module):
#     def __init__(self, num_classes):
#         super(VGG,self).__init__()
#         self.layer1 = nn.Sequential(
#             nn.Conv2d(3,64,kernel_size=3,stride=1,padding=1),
#             nn.BatchNorm2d(64),
#             nn.ReLU(),
#         )
#         self.layer2 = nn.Sequential(
#             nn.Conv2d(64,64,kernel_size=3,stride=1,padding=1),
#             nn.BatchNorm2d(64),
#             nn.ReLU(),
#             nn.MaxPool2d(2,2)
#         )
#         self.layer3 = nn.Sequential(
#             nn.Conv2d(64,128,kernel_size=3,stride=1,padding=1),
#             nn.BatchNorm2d(128),
#             nn.ReLU(),
#         )
#         self.layer4 = nn.Sequential(
#             nn.Conv2d(128,128,kernel_size=3,stride=1,padding=1),
#             nn.BatchNorm2d(128),
#             nn.ReLU(),
#             nn.MaxPool2d(2,2)
#         )
#         self.layer5 = nn.Sequential(
#             nn.Conv2d(128,256,kernel_size=3,stride=1,padding=1),
#             nn.BatchNorm2d(256),
#             nn.ReLU(),
#         )
#         self.layer6 = nn.Sequential(
#             nn.Conv2d(256,256,kernel_size=3,stride=1,padding=1),
#             nn.BatchNorm2d(256),
#             nn.ReLU(),
#         )
#         self.layer7 = nn.Sequential(
#             nn.Conv2d(256,256,kernel_size=3,stride=1,padding=1),
#             nn.BatchNorm2d(256),
#             nn.ReLU(),
#             nn.MaxPool2d(2,2)
#         )
#         self.layer8 = nn.Sequential(
#             nn.Conv2d(256,512,kernel_size=3,stride=1,padding=1),
#             nn.BatchNorm2d(512),
#             nn.ReLU(),
#         )
#         self.layer9 = nn.Sequential(
#             nn.Conv2d(512,512,kernel_size=3,stride=1,padding=1),
#             nn.BatchNorm2d(512),
#             nn.ReLU(),
#         )
#         self.layer10 = nn.Sequential(
#             nn.Conv2d(512,512,kernel_size=3,stride=1,padding=1),
#             nn.BatchNorm2d(512),
#             nn.ReLU(),
#             nn.MaxPool2d(2,2)
#         )
#         self.layer11 = nn.Sequential(
#             nn.Conv2d(512,512,kernel_size=3,stride=1,padding=1),
#             nn.BatchNorm2d(512),
#             nn.ReLU(),
#         )
#         self.layer12 = nn.Sequential(
#             nn.Conv2d(512,512,kernel_size=3,stride=1,padding=1),
#             nn.BatchNorm2d(512),
#             nn.ReLU(),
#         )
#         self.layer13 = nn.Sequential(
#             nn.Conv2d(512,512,kernel_size=3,stride=1,padding=1),
#             nn.BatchNorm2d(512),
#             nn.ReLU(),
#             nn.MaxPool2d(2,2)
#         )
#         # self.fc = nn.Sequential(
#         #     nn.Dropout(0.5),
#         #     nn.Linear(7*7*512,4096),
#         #     nn.ReLU(),
#         # )
#         # self.fc1 = nn.Sequential(
#         #     nn.Dropout(0.5),
#         #     nn.Linear(4,4096),
#         #     nn.ReLU(),
#         # )
#         # self.fc2 = nn.Sequential(
#         #     nn.Linear (4096, num_classes))
        
#     def forward(self, x):
#         x = self.layer1(x)
#         x = self.layer2(x)
#         x = self.layer3(x)
#         x = self.layer4(x)
#         x = self.layer5(x)
#         x = self.layer6(x)
#         l = self.layer7(x)
#         x = self.layer8(l)
#         x = self.layer9(x)
#         x = self.layer10(x)
#         x = self.layer11(x)
#         x = self.layer12(x)
#         g = self.layer13(x)
#         x = x.reshape(x.size(0),-1)
#         # x = self.fc(x)
#         # x = self.fc1(x)
#         # x = self.fc2(x)
#         return x

### Construction of Our Model

Building upon the VGG19 base, we constructed our network by adding additional layers and incorporating attention mechanisms. This allowed us to tailor the model to our particular classification task while capitalizing on the robust feature extraction capabilities of VGG19.


### Benefits of Transfer Learning and Attention Mechanisms

Utilizing transfer learning with VGG19 and integrating attention mechanisms provided several advantages:
- **Time Efficiency:** Transfer learning expedited the model development process by leveraging pre-trained weights, reducing the need for extensive training on our dataset.
- **Performance Enhancement:** Attention mechanisms allowed the model to focus on salient features, potentially improving classification accuracy and robustness.
- **Resource Conservation:** By reusing pre-trained weights and incorporating attention mechanisms, we optimized resource utilization, making the model more practical for deployment in resource-constrained environments.


In [None]:
class ArtCNN(nn.Module):
    """VGG19-BN feature extractor with two attention branches and a linear head.

    The pre-trained feature stack is split into its five convolutional stages
    (the layers between consecutive max-pooling layers). Pooling is applied
    functionally in ``forward`` so the pooled outputs of stages 3 and 4 can be
    fed to the attention blocks.

    Parameters
    ----------
    num_classes : size of the classification output.
    normalize_attn : forwarded to both ``AttentionBlock`` instances.
    dropout : dropout probability applied before the classifier, or None to
        disable dropout.
    """

    def __init__(self, num_classes, normalize_attn=False, dropout=None):
        super(ArtCNN, self).__init__()
        net = torchvision.models.vgg19_bn(weights=torchvision.models.VGG19_BN_Weights.DEFAULT)

        # The stage boundaries are derived from the MaxPool2d positions instead
        # of hard-coded slice indices. The previous indices ([14:23], [24:33],
        # [34:43]) matched the 16-layer layout; applied to vgg19_bn they
        # dropped convolutions and pulled stray BatchNorm/MaxPool layers into
        # the wrong stage.
        stages = self._split_at_pooling(net.features)
        if len(stages) != 5:
            raise ValueError(f"expected 5 convolutional stages, found {len(stages)}")
        self.conv_block1 = stages[0]
        self.conv_block2 = stages[1]
        self.conv_block3 = stages[2]
        self.conv_block4 = stages[3]
        self.conv_block5 = stages[4]

        self.pool = nn.AdaptiveAvgPool2d(7)
        self.dpt = None
        if dropout is not None:
            self.dpt = nn.Dropout(dropout)
        # Classifier input: flattened 512x7x7 global map + 256 + 512 attended
        # channels (= 25856).
        self.cls = nn.Linear(in_features=512 * 7 * 7 + 256 + 512, out_features=num_classes, bias=True)

        # Attention over the pooled stage-3 (256 ch) and stage-4 (512 ch)
        # outputs, both guided by the pooled stage-5 output (512 ch).
        self.attn1 = AttentionBlock(256, 512, 256, 8, normalize_attn=normalize_attn)
        self.attn2 = AttentionBlock(512, 512, 256, 4, normalize_attn=normalize_attn)

    @staticmethod
    def _split_at_pooling(features):
        """Split a VGG feature stack into ``nn.Sequential`` stages at each MaxPool2d."""
        stages, current = [], []
        for layer in features.children():
            if isinstance(layer, nn.MaxPool2d):
                stages.append(nn.Sequential(*current))
                current = []
            else:
                current.append(layer)
        if current:
            stages.append(nn.Sequential(*current))
        return stages

    def forward(self, x):
        """Return ``[out, a1, a2]``: class scores and the two attention maps."""
        block1 = self.conv_block1(x)       # /1
        pool1 = F.max_pool2d(block1, 2, 2) # /2
        block2 = self.conv_block2(pool1)   # /2
        pool2 = F.max_pool2d(block2, 2, 2) # /4
        block3 = self.conv_block3(pool2)   # /4
        pool3 = F.max_pool2d(block3, 2, 2) # /8
        block4 = self.conv_block4(pool3)   # /8
        pool4 = F.max_pool2d(block4, 2, 2) # /16
        block5 = self.conv_block5(pool4)   # /16
        pool5 = F.max_pool2d(block5, 2, 2) # /32
        N = pool5.size(0)
        g = self.pool(pool5).view(N, -1)   # flattened global descriptor
        a1, g1 = self.attn1(pool3, pool5)
        a2, g2 = self.attn2(pool4, pool5)
        g_hat = torch.cat((g, g1, g2), dim=1) # batch_size x C
        if self.dpt is not None:
            g_hat = self.dpt(g_hat)
        out = self.cls(g_hat)

        return [out, a1, a2]

In [None]:
def train_and_evaluate(model, optimizer, criterion, train_loader, val_loader, epochs=10):
    """Train ``model`` for ``epochs`` epochs, then measure validation accuracy.

    Parameters
    ----------
    model : module whose forward pass returns three values, the first being
        the class scores.
    optimizer, criterion : optimizer and loss used for the training passes.
    train_loader, val_loader : iterables of ``(inputs, labels)`` batches.
    epochs : number of passes over ``train_loader``.

    Returns
    -------
    int
        Validation accuracy as a floored integer percentage; 0 when
        ``val_loader`` yields no samples.
    """
    with tqdm(total=epochs) as pbar:
        for _ in range(epochs):
            # Training phase
            model.train()
            for batch in train_loader:
                inputs, labels = batch[0].to(device), batch[1].to(device)
                optimizer.zero_grad()
                outputs, _a1, _a2 = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            pbar.update(1)

    # Switch to inference mode so dropout and batch-norm do not perturb the
    # validation predictions.
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            images, labels = batch[0].to(device), batch[1].to(device)
            outputs, _a1, _a2 = model(images)
            # the class with the highest score is the prediction
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct // total if total else 0
    print(f'Accuracy: {accuracy}%')
    return accuracy

In [None]:
def calculate_accuracy(val_loader, model, criterion, device):
    """Print and return the accuracy of ``model`` on ``val_loader``.

    Parameters
    ----------
    val_loader : iterable of ``(inputs, labels)`` batches.
    model : module whose forward pass returns three values, the first being
        the class scores.
    criterion : accepted for interface compatibility; not used.
    device : device the batches are moved to.

    Returns
    -------
    int
        Accuracy as a floored integer percentage; 0 when ``val_loader``
        yields no samples.
    """
    # Evaluate in inference mode (dropout off, batch-norm using running
    # statistics) and put the model back in its previous mode afterwards.
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for batch in val_loader:
                images, labels = batch[0].to(device), batch[1].to(device)
                outputs, _a1, _a2 = model(images)
                # the class with the highest score is the prediction
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        model.train(was_training)
    accuracy = 100 * correct // total if total else 0
    print(f'Accuracy: {accuracy}%')
    return accuracy

# Model Training

## Training Process
1. **Load and Preprocess Data**: Prepare the dataset by loading and applying necessary preprocessing steps.
2. **Define Model Architecture**: Set up the attention-augmented CNN (pre-trained VGG19-BN feature blocks, two attention blocks and a linear classifier).
3. **Set Loss Function and Optimizer**: Choose appropriate methods for training the model.
4. **Training Loop**: Iterate through the dataset, updating model parameters to minimize the loss function.
5. **Validation**: Evaluate the model on a separate validation dataset to monitor performance and avoid overfitting.
6. **Metrics Monitoring**: Track metrics like loss and accuracy to assess training progress.

### Hyperparameters
- **Learning Rate**: Determines the step size during gradient descent.
- **Batch Size**: Number of samples processed before updating the model.
- **Epochs**: Number of times the entire training dataset is passed through the model.
- **Optimizer**: Algorithm used for updating model parameters (e.g., AdamW).

In [None]:
def train(model, optimizer, criterion, train_loader, val_loader, epochs=10, early_stopping_patience=25):
    """Fit ``model`` with per-epoch validation and patience-based early stopping.

    Parameters
    ----------
    model : module whose forward pass returns three values, the first being
        the class scores.
    optimizer, criterion : optimizer and loss function.
    train_loader, val_loader : iterables of ``(inputs, labels)`` batches.
    epochs : maximum number of epochs.
    early_stopping_patience : number of consecutive epochs without a lower
        summed validation loss after which training stops.

    Returns
    -------
    tuple[list, list]
        Mean per-batch training loss and validation loss for every completed
        epoch.
    """
    epoch_train_means, epoch_val_means = [], []
    lowest_val_total = float('inf')
    stalled_epochs = 0

    with tqdm(total=epochs) as progress:
        for epoch_idx in range(epochs):
            # Optimisation pass over the training data.
            model.train()
            train_total = 0.0
            for batch in train_loader:
                batch_inputs = batch[0].to(device)
                batch_targets = batch[1].to(device)
                optimizer.zero_grad()
                logits, _, _ = model(batch_inputs)
                batch_loss = criterion(logits, batch_targets)
                batch_loss.backward()
                optimizer.step()
                train_total += batch_loss.item()
            epoch_train_means.append(train_total / len(train_loader))

            # Gradient-free pass over the validation data.
            model.eval()
            val_total = 0.0
            with torch.no_grad():
                for batch in val_loader:
                    batch_inputs = batch[0].to(device)
                    batch_targets = batch[1].to(device)
                    logits, _, _ = model(batch_inputs)
                    val_total += criterion(logits, batch_targets).item()
            epoch_val_means.append(val_total / len(val_loader))

            # Early stopping tracks the summed (not averaged) validation loss.
            improved = val_total < lowest_val_total
            if improved:
                lowest_val_total = val_total
                stalled_epochs = 0
            else:
                stalled_epochs += 1
            if not improved and stalled_epochs >= early_stopping_patience:
                print(f'Early stopping at epoch {epoch_idx + 1}')
                break

            progress.set_description(
                f'Train loss: {epoch_train_means[-1]:.3f} | Val loss: {epoch_val_means[-1]:.3f}'
            )
            progress.update(1)

    return epoch_train_means, epoch_val_means

### Hyperparameters Search

To find the best hyperparameters for each model, we use the Optuna library, a hyperparameter optimization framework for machine learning models. It is designed with a focus on simplicity and ease of use, while still providing a wide range of features and capabilities. Optuna is a popular choice for hyperparameter optimization due to its flexibility and scalability. It also supports a variety of use cases, including traditional machine learning models, deep learning models, and more.
 

In [None]:
def objective_auth(trial):
    """Optuna objective for the author classifier.

    Samples a hyperparameter set from ``trial``, trains a fresh ``ArtCNN`` on
    the author datasets and returns the validation accuracy reported by
    ``calculate_accuracy``.
    """
    # Define search space for hyperparameters.
    # Log-scaled ranges must not specify `step`: Optuna rejects the
    # combination of `log=True` with a step.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64])
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5, step=0.1)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-4, log=True)
    epochs = trial.suggest_int('epochs', 5, 50, step=5)
    early_stopping_patience = trial.suggest_int('early_stopping_patience', 5, 20, step=5)

    # Train the model with current hyperparameters
    model = ArtCNN(num_classes=len(labels_author), dropout=dropout_rate).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()
    train_loader = DataLoader(train_dataset_auth, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset_auth, batch_size=batch_size, shuffle=False)
    train(model, optimizer, criterion, train_loader, val_loader, epochs=epochs, early_stopping_patience=early_stopping_patience)
    accuracy = calculate_accuracy(val_loader, model, criterion, device)

    return accuracy

# Run a 10-trial Optuna search that maximises the author model's validation accuracy.
study = optuna.create_study(direction='maximize')
study.optimize(objective_auth, n_trials=10)

# Keep the winning configuration and its score for the final training run.
best_params_auth, best_accuracy_auth = study.best_params, study.best_value
#{'learning_rate': 5.1481519513568155e-05, 'batch_size': 32, 'dropout_rate': 0.25189548904234554, 'weight_decay': 3.701365753127736e-05, 'epochs': 24, 'early_stopping_patience': 17}
print("Best hyperparameters Author:", best_params_auth)
print("Best accuracy Author:", best_accuracy_auth)

# Release cached GPU memory left over from the trial models.
torch.cuda.empty_cache()

In [None]:
def objective_style(trial):
    """Optuna objective for the style classifier.

    Samples a hyperparameter set from ``trial``, trains a fresh ``ArtCNN`` on
    the style datasets and returns the validation accuracy reported by
    ``calculate_accuracy``.
    """
    # Define search space for hyperparameters.
    # Log-scaled ranges must not specify `step`: Optuna rejects the
    # combination of `log=True` with a step.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64])
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5, step=0.1)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-4, log=True)
    epochs = trial.suggest_int('epochs', 5, 50, step=5)
    early_stopping_patience = trial.suggest_int('early_stopping_patience', 5, 20, step=5)

    # Train the model with current hyperparameters
    model = ArtCNN(num_classes=len(labels_sty), dropout=dropout_rate).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()
    train_loader = DataLoader(train_dataset_style, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset_style, batch_size=batch_size, shuffle=False)
    train(model, optimizer, criterion, train_loader, val_loader, epochs=epochs, early_stopping_patience=early_stopping_patience)
    accuracy = calculate_accuracy(val_loader, model, criterion, device)

    return accuracy

# Run a 100-trial Optuna search that maximises the style model's validation accuracy.
# This rebinds `study`, replacing the author study object.
study = optuna.create_study(direction='maximize')
study.optimize(objective_style, n_trials=100)

# Keep the winning configuration and its score for the final training run.
best_params_sty, best_accuracy_sty = study.best_params, study.best_value

print("Best hyperparameters Style:", best_params_sty)
print("Best accuracy Style:", best_accuracy_sty)

# Release cached GPU memory left over from the trial models.
torch.cuda.empty_cache()

In [None]:
# #Hyperparameters Grid Search
# from itertools import product
 
# learning_rates = [0.1, 0.01, 0.001,0.0001]
# weight_decays = [0.1, 0.01, 0.001,0.0001]
# dropouts = [0.2, 0.3,0.5]
# batch_sizes = [32, 64]

# best_accuracy = 0
# best_learning_rate = 0
# for lr, batch_size, dropout_rate, weight_decay in product(learning_rates, batch_sizes, dropouts, weight_decays):
#     model = ArtCNN(num_classes=len(labels_auth), normalize_attn=True, dropout=dropout_rate)
#     model.to(device)
#     train_loader = DataLoader(train_dataset_auth, batch_size=batch_size, shuffle=True)
#     val_loader = DataLoader(val_dataset_auth, batch_size=batch_size, shuffle=False)
#     optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
#     accuracy,_,_ = train_and_evaluate(model, optimizer, nn.CrossEntropyLoss(), train_loader, val_loader)
#     if accuracy > best_accuracy:
#         best_accuracy = accuracy
#         best_learning_rate = lr
#         best_batch_size = batch_size
#         best_dropout_rate = dropout_rate
#         best_weight_decay = weight_decay
# print(f'Best accuracy: {best_accuracy}, Best learning rate: {best_learning_rate}, Best batch size: {best_batch_size}, Best dropout rate: {best_dropout_rate}, Best weight decay: {best_weight_decay}')

## Create Dataloader

In [None]:
# Fall back to previously found hyperparameters when the Optuna cells were
# skipped. `globals().get` is used because a bare reference to an undefined
# name raises NameError instead of evaluating as falsy, and the style branch
# now checks the parameters themselves rather than the accuracy value.
if not globals().get('best_params_auth'):
    best_params_auth = {'learning_rate': 5.1481519513568155e-05, 'batch_size': 32, 'dropout_rate': 0.25189548904234554, 'weight_decay': 3.701365753127736e-05, 'epochs': 24, 'early_stopping_patience': 17}
if not globals().get('best_params_sty'):
    best_params_sty = {'learning_rate': 1.0225636410036674e-05, 'batch_size': 32, 'dropout_rate': 0.4978375095238207, 'weight_decay': 1.1995772075099324e-06, 'epochs': 50, 'early_stopping_patience': 12}

# Training loaders shuffle every epoch; validation loaders keep a fixed order.
trainloader_auth = DataLoader(train_dataset_auth, batch_size=best_params_auth['batch_size'], shuffle=True)
testloader_auth = DataLoader(val_dataset_auth, batch_size=best_params_auth['batch_size'], shuffle=False)
trainloader_style = DataLoader(train_dataset_style, batch_size=best_params_sty['batch_size'], shuffle=True)
testloader_style = DataLoader(val_dataset_style, batch_size=best_params_sty['batch_size'], shuffle=False)

In [None]:
# Pull one training batch and show its first image with the decoded author.
features, labels = next(iter(trainloader_auth))
print(f"Feature batch shape: {features.size()}")
print(f"Labels batch shape: {labels.size()}")
img = features[0].squeeze().permute(1, 2, 0)  # channels-first -> channels-last for matplotlib
label = labels[0]
plt.imshow(img)
plt.show()
# `labels_author` is the raw JSON dict keyed by strings; decode the tensor
# label through the int-keyed `id2label_auth` instead.
print(f"Author: {id2label_auth[label.item()]}")

In [None]:
# Build the final models. The hyperparameter dictionaries store the dropout
# probability under 'dropout_rate' (the name used in the Optuna objectives and
# in the fallback values), not 'dropout'.
model_auth = ArtCNN(len(labels_author), dropout=best_params_auth['dropout_rate']).to(device)
model_style = ArtCNN(len(labels_sty), dropout=best_params_sty['dropout_rate']).to(device)

### Choice of Loss Function and Optimizer
Selecting an appropriate loss function and optimizer is critical for effective model training.

1. **Cross-Entropy Loss**:
   - **Applicability**: Suitable for classification tasks, where the goal is to categorize input images into predefined classes.
   - **Compatibility**: Works well with models that output probabilities for different classes via a softmax layer.
   - **Training Objective**: Minimizing this loss function helps the model improve its prediction accuracy.

2. **AdamW Optimizer**:
   - **Adaptive Learning Rate**: Adjusts learning rates for each parameter based on the magnitude of gradients, promoting efficient convergence.
   - **Weight Decay**: Regularizes the model by penalizing large weights, reducing the risk of overfitting.
   - **Robustness**: Performs well across various tasks and architectures.
   - **Stability**: Ensures stable and faster convergence by maintaining separate learning rates for each parameter.


In [None]:
# One shared loss function; one AdamW optimizer per model, each configured
# from that model's own hyperparameter dictionary.
criterion = nn.CrossEntropyLoss()
optimizer_auth = optim.AdamW(
    model_auth.parameters(),
    weight_decay=best_params_auth['weight_decay'],
    lr=best_params_auth['learning_rate'],
)
optimizer_style = optim.AdamW(
    model_style.parameters(),
    weight_decay=best_params_sty['weight_decay'],
    lr=best_params_sty['learning_rate'],
)

# Models Training

In [None]:
# Train both final models with their selected hyperparameters and keep the
# per-epoch (train, validation) loss histories for plotting.
print('Training the model for author...')
auth_train_losses, auth_test_loss = train(
    model_auth,
    optimizer_auth,
    criterion,
    trainloader_auth,
    testloader_auth,
    early_stopping_patience=best_params_auth['early_stopping_patience'],
    epochs=best_params_auth['epochs'],
)

print('Training the model for style...')
style_train_losses, style_test_loss = train(
    model_style,
    optimizer_style,
    criterion,
    trainloader_style,
    testloader_style,
    early_stopping_patience=best_params_sty['early_stopping_patience'],
    epochs=best_params_sty['epochs'],
)

In [None]:
import matplotlib.pyplot as plt

def plot_loss(loss_values, title="Loss Plot"):
    """
    Plot training and validation loss curves on one figure.

    Parameters:
    - loss_values: A two-item sequence; item 0 holds the per-epoch training
      losses and item 1 the per-epoch validation losses.
    - title: The title of the plot (optional).
    """
    plt.figure(figsize=(10, 6))
    # Index lazily so each series is looked up right before it is drawn.
    for position, series_name in enumerate(('train', 'val')):
        plt.plot(loss_values[position], marker='o', linestyle='-', label=series_name)
    plt.title(title)
    plt.legend()
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.grid(True)
    plt.show()

In [None]:
# Draw the train/validation loss curves of each model.
plot_loss(loss_values=[auth_train_losses, auth_test_loss], title="Author Training Loss")
plot_loss(loss_values=[style_train_losses, style_test_loss], title="Style Training Loss")

In [None]:
def imshow(img, norm_mean=None, norm_std=None):
    """Display a normalized channels-first image tensor.

    Parameters
    ----------
    img : tensor of shape (C, H, W) that went through ``transforms.Normalize``.
    norm_mean, norm_std : statistics used for that normalization. When
        omitted, the notebook-level ``mean`` and ``std`` (the values passed to
        ``Normalize`` in the transform pipelines) are used, so the image is
        un-normalized with the same numbers it was normalized with rather than
        with a fixed 0.5 / 0.5.
    """
    norm_mean = mean if norm_mean is None else norm_mean
    norm_std = std if norm_std is None else norm_std

    img = img.detach().cpu()  # `.numpy()` below requires a CPU tensor
    norm_mean = torch.as_tensor(norm_mean, dtype=img.dtype).reshape(-1, 1, 1)
    norm_std = torch.as_tensor(norm_std, dtype=img.dtype).reshape(-1, 1, 1)
    # Invert the normalization; clamp to the range matplotlib accepts for floats.
    img = (img * norm_std + norm_mean).clamp(0, 1)
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

In [None]:
dataiter_auth = iter(testloader_auth)
dataiter_style = iter(testloader_style)

# NOTE(review): `images` is overwritten by the style batch; the author labels
# only describe these images if both loaders yield the same images in the
# same order — confirm.
images, label_auth = next(dataiter_auth)
images, label_sty = next(dataiter_style)

imshow(torchvision.utils.make_grid(images))
# Labels are tensors, while `labels_author` / `labels_sty` are the raw JSON
# dicts keyed by strings; decode through the int-keyed id2label maps.
print('GroundTruth Author: ', '; '.join('%5s' % id2label_auth[label_auth[j].item()] for j in range(4)))
print('GroundTruth Style: ', '; '.join('%5s' % id2label_sty[label_sty[j].item()] for j in range(4)))

In [None]:
# Predict authors for the preview batch.
model_auth.eval()
images = images.to(device)

# Inference only: skip gradient tracking. The scores are already on the
# model's device, so no extra transfer is needed.
with torch.no_grad():
    outputs = model_auth(images)[0]
_, predicted = torch.max(outputs, 1)
# Decode the predicted class indices through the int-keyed map; the raw JSON
# dict `labels_author` is keyed by strings.
print('Predicted: ', '; '.join('%5s' % id2label_auth[predicted[j].item()] for j in range(4)))

In [None]:
# Predict styles for the preview batch.
model_style.eval()
images = images.to(device)
# Use the style model here (the author model was called by mistake), without
# gradient tracking.
with torch.no_grad():
    outputs = model_style(images)[0]
_, predicted = torch.max(outputs, 1)
# Decode the predicted class indices through the int-keyed map; the raw JSON
# dict `labels_sty` is keyed by strings.
print('Predicted: ', '; '.join('%5s' % id2label_sty[predicted[j].item()] for j in range(4)))

In [None]:
# Validation accuracy of the final author model.
calculate_accuracy(val_loader=testloader_auth, model=model_auth, criterion=criterion, device=device)

In [None]:
# Validation accuracy of the final style model.
calculate_accuracy(val_loader=testloader_style, model=model_style, criterion=criterion, device=device)

In [None]:
import os

# NOTE(review): this variable is set after all model work above has already
# run; unclear whether it has any effect at this point — confirm.
os.environ['TORCH_USE_CUDA_DSA'] = '1'

# `torch.save` does not create missing directories, so make sure the target
# folder exists before writing the checkpoints.
os.makedirs('./model_checkpoints', exist_ok=True)
torch.save(model_auth.state_dict(), './model_checkpoints/model_auth.pth')
torch.save(model_style.state_dict(), './model_checkpoints/model_style.pth')