# Data Analysis

## Import Libraries

In [1]:
import torch
import torch.nn as nn

from torch.utils.data import DataLoader
from torchvision import transforms, models
from torchvision.models import ResNet18_Weights, MobileNet_V3_Large_Weights, EfficientNet_B2_Weights
from dataset import MultiLabelDataset
from tqdm import tqdm
from tools import get_data, load_data, remove_class



## Import Data

### Performing Text Cleaning & Data Extraction (Possibly for NLP)

We will see how to code and clean the textual data for the following methods.
- Lowecasing the data
- Removing Puncuatations
- Removing Numbers
- Removing extra space
- Replacing the repetitions of punctations
- Removing Emojis
- Removing emoticons
- Removing Contractions
Reference: https://www.analyticsvidhya.com/blog/2022/01/text-cleaning-methods-in-nlp/

Manual Data Cleaning:
- Remove `\n` character that breaks the caption into two lines (1014.jpg)
- Remove `\n` character that breaks the caption into two lines (2259.jpg)
- Remove `\n` character that breaks the caption into two lines (6751.jpg)
- Remove **a couple of** `\n` character that breaks the caption into two lines (24624.jpg)

**Note: there is no label 12! [Label starts from 1 to 19]**

### Class Labels

There are 19 classes: class starts from 1 to 19. There is no label 12 assigned to any image. <br>
For one-hot encoding: the 0-the index will represent class 1.

In [2]:
# import data
train_data = get_data("./dataset/train.csv")
test_data = get_data("./dataset/test.csv")

# perform text cleaning and get the pandas' dataframe
train_data = load_data(train_data)
test_data = load_data(test_data, has_label=False)

In [3]:
print(f"Number of training instances: {train_data.shape[0]}")
print(f"Number of testing instances:  {test_data.shape[0]}")

Number of training instances: 30000
Number of testing instances:  10000


### Remove instances with only class 1 (Data Imbalance Problem)

In [4]:
# remove an imbalanced class
train_data = remove_class(train_data, class_no=1)
print(f"Number of training instances: {train_data.shape[0]}")

Number of training instances: 15925


## Preprocessing for Images and Caption

In [5]:
# define the image transformation: currently following resnet18
transform = transforms.Compose([
    transforms.Resize((288, 288)),
    transforms.CenterCrop(224),
    transforms.ToTensor(), # converts images to [0, 1]
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )
])

## Create Dataset and DataLoader

In [6]:
# initialize the dataset
train_dataset = MultiLabelDataset(
    csv_file=train_data,
    root_dir='./dataset/data/',
    vectorizer=None,
    transform=transform,
)
test_dataset = MultiLabelDataset(
    csv_file=test_data,
    root_dir='./dataset/data/',
    vectorizer=None,
    transform=transform,
)

BATCH_SIZE=16

# load the dataset into batches 
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

## Define Model, Optimizer, Loss Function, etc.

In [7]:
# model = models.mobilenet_v3_large(weights=MobileNet_V3_Large_Weights.IMAGENET1K_V2)
# model = models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
model = models.efficientnet_b2(weights=EfficientNet_B2_Weights.IMAGENET1K_V1)

# freeze the top layers of the model
for name, params in model.named_parameters():

    # MobileNet V3
    # if ("classifier" not in name):
    #     params.requires_grad = False

    # ResNet18
    # if ("fc" not in name):
    #     params.requires_grad = False

    # EfficentNet B2
    if ("classifier" not in name):
        params.requires_grad = False

# define the classifier layer again
n_out = 19

# MobileNet V3
# model.classifier = nn.Sequential(
#     nn.Linear(960, 1280),
#     nn.Hardswish(inplace=True),
#     nn.Dropout(p=0.2, inplace=True),
#     nn.Linear(1280, n_out),
# )

# ResNet18
# n_features = model.fc.in_features
# model.fc = nn.Sequential(
#     nn.Linear(in_features=n_features, out_features=n_out),
# )

# EfficientNet B2
model.classifier = nn.Sequential(
    nn.Dropout(p=0.3, inplace=True),
    nn.Linear(in_features=1408, out_features=n_out),
)

# define hyperparameters
EPOCHS = 5
THRESHOLD = 0.5
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.01,
)

# utilise GPU
if torch.cuda.is_available():
    print('using GPU')
    model = model.to('cuda')

using GPU


## Perform Training

In [8]:
train_losses = []
train_accs = []
for epoch in range(EPOCHS):

	n_total = 0
	n_correct = 0
	train_loss = 0.
	model.train()
	for _, images, _, labels in tqdm(train_dataloader, desc=f"Epoch {epoch+1} Training: "):

		if torch.cuda.is_available():
			images = images.to('cuda')
			labels = labels.to('cuda')

		y_pred = model(images)

		# backward
		loss = loss_fn(y_pred, labels)
		loss.backward()

		# update
		optimizer.step()

		# compare
		predicted = (y_pred > THRESHOLD).int()

		train_loss += loss.item()
		n_correct += torch.all(torch.eq(predicted, labels), dim=1).sum()
		n_total += labels.shape[0]

	train_losses.append(train_loss / len(train_dataloader))
	train_accs.append(n_correct / n_total)

	print("Epoch {:d}, Train Loss: {:.7f}, Train Accuracy: {:.3f}%".format(epoch+1, train_losses[-1], train_accs[-1]*100))

  labels = torch.Tensor(self.df.iloc[idx, 2:])
Epoch 1 Training: 100%|██████████| 996/996 [01:06<00:00, 15.06it/s]


Epoch 1, Train Loss: 12.9891046, Train Accuracy: 7.359%


Epoch 2 Training: 100%|██████████| 996/996 [01:06<00:00, 14.92it/s]


Epoch 2, Train Loss: 22.4438925, Train Accuracy: 11.661%


Epoch 3 Training: 100%|██████████| 996/996 [01:06<00:00, 15.01it/s]


Epoch 3, Train Loss: 25.4430100, Train Accuracy: 11.906%


Epoch 4 Training: 100%|██████████| 996/996 [01:06<00:00, 15.05it/s]


Epoch 4, Train Loss: 28.7335936, Train Accuracy: 12.100%


Epoch 5 Training: 100%|██████████| 996/996 [01:06<00:00, 15.01it/s]

Epoch 5, Train Loss: 30.5462551, Train Accuracy: 12.873%





## 

## Multi-Label Image Classification

Tutorials:
- [Build First Multi-Label Image Classification Model Python](https://www.analyticsvidhya.com/blog/2019/04/build-first-multi-label-image-classification-model-python/)