# Video classification using transfer learning 

The dataset consists of 1641* videos (1240 for train, 200 for val, and 201 for test).  The dataset is balanced for *reduced* vs. *not_reduced* left ventricular ejection fraction.

#### Video Pre-Processing:

Using ffmpeg in another notebook, videos are pre-processed as follows;:

- convert to mp4
- reduce frame per sec to obtain a 50-frame clip 

####  Setting Details:
- frame_transform = v2.Compose([v2.Compose([v2.ToImage(),
                                  v2.Resize((172,172)),
                                  v2.Grayscale(num_output_channels = 3),
                                  v2.ToDtype(torch.float32, scale=True),
                                  v2.Normalize(mean=[0.43216, 0.394666, 0.37645],
                                                     std=[0.22803, 0.22145, 0.216989])

- Hyperparameters: common to the four studied models

  *It was needed for this model to avoid a last incomplete batch and thus reduced our standard dataset accordingly and used a batch size of 20. 

In [None]:
# Some parts of this code are based on the Python script:
# https://github.com/pytorch/tutorials/blob/master/beginner_source/transfer_learning_tutorial.py
# License: BSD
# For the MoViNet model, the code is in part based on th python script:
#https://github.com/Atze00/MoViNet-pytorch/blob/main/movinet_tutorial.ipynb
# License: BSD: MIT

import time
import os
import copy
import numpy as np
import random
import itertools
import pandas as pd 

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torchvision
from torchvision import datasets, transforms
import torchvision.transforms.functional as F
from torchvision.io import read_video
from torch.utils.data import DataLoader, TensorDataset
from torchvision.datasets.folder import make_dataset
from torchvision.transforms import v2

from movinets import MoViNet
from movinets.config import _C

torch.manual_seed(42)
np.random.seed(42)
rng = np.random.default_rng(seed=42)
# OpenMP: number of parallel threads.
os.environ["OMP_NUM_THREADS"] = "1"

## Setting of the main hyper-parameters of the models

In [9]:
step = 0.001               # Learning rate
batch_size = 20            # Number of samples for each training step
num_epochs = 10              # Number of training epochs
gamma_lr_scheduler = 0.0001    # Learning rate reduction applied every 10 epochs.
start_time = time.time()    # Start of the computation timer
clip_len = 50

In [13]:
# for local laptop
path_video_data = ""

## Creating a dataloader

In [14]:
data_dir =  "movienet_50f/" 

In [15]:
def _find_classes(dir):
    classes = [d.name for d in os.scandir(dir) if d.is_dir()]
    classes.sort()
    class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
    return classes, class_to_idx

def get_samples(root, extensions=(".mp4", ".avi")):
    _, class_to_idx = _find_classes(root)
    return make_dataset(root, class_to_idx, extensions=extensions)

In [16]:
class_names = _find_classes(data_dir + "train")
class_names

(['not_reduced', 'reduced'], {'not_reduced': 0, 'reduced': 1})

In [17]:
class RandomDataset(torch.utils.data.IterableDataset):
    def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16):
        super(RandomDataset).__init__()

        self.samples = get_samples(root)

        # Allow for temporal jittering
        if epoch_size is None:
            epoch_size = len(self.samples)
        self.epoch_size = epoch_size

        self.clip_len = clip_len
        self.frame_transform = frame_transform
        self.video_transform = video_transform

    def __iter__(self):
        rng = np.random.default_rng(seed=42)
        random_index_list = rng.choice(self.epoch_size, size= self.epoch_size,
                                replace = False).tolist()  
        
        for i in range(self.epoch_size):
            # Get random sample (seed = 42)
            path, target = self.samples[random_index_list[i]]  
            # Get video object
            vid = torchvision.io.VideoReader(path, "video")
            metadata = vid.get_metadata()
            video_frames = []  # video frame buffer

            # Seek and return frames
            #max_seek is not used here:
            #max_seek = metadata["video"]['duration'][0]  - (self.clip_len / metadata["video"]['fps'][0])
            start = 0. # was: random.uniform(0., max_seek), now no more random, start at 0
            for frame in itertools.islice(vid.seek(start), self.clip_len):
                video_frames.append(self.frame_transform(frame['data']))                
                current_pts = frame['pts']
            # Stack it into a tensor
            video = torch.stack(video_frames, 0)
            if self.video_transform:
                video = self.video_transform(video)
            output = {
                'path': path,
                'video': video,
                'target': target,
                'start': start,
                'end': current_pts}
            yield output

## Dataset Loading

In [18]:
video_transform = None 

In [19]:
# our videos are already with size (112, 112)
frame_transform = v2.Compose([v2.ToImage(),
                              v2.Resize((172,172)), # for MoViNet-pytorch in tutorial
                              v2.Grayscale(num_output_channels = 3),
                              v2.ToDtype(torch.float32, scale=True),
                              v2.Normalize(mean=[0.43216, 0.394666, 0.37645],
                                                     std=[0.22803, 0.22145, 0.216989]),
                  
])                            

In [21]:
dataset_sizes ={}
for phase in ["train", "val","test"]: 
    dataset_sizes[phase] = len(get_samples(data_dir+phase+"/") ) 
dataset_sizes

{'train': 1243, 'val': 214, 'test': 201}

In [22]:
dataset_dic = {}
dataloaders = {}

for phase in ["train", "val","test"]: 
    dataset_dic[phase] = RandomDataset(data_dir+phase+"/", epoch_size=None,
                                       frame_transform=frame_transform,
                                       video_transform = video_transform,
                                       clip_len = clip_len
                                      )
    # drop_last and shuffled added for MoViNet
    dataloaders[phase] = DataLoader(dataset_dic[phase],
                                    batch_size=batch_size,
                                    #shuffle = True, # not allowed for IterableDataset
                                    drop_last = True, # necessary for MoVieNet
                                    )

### stream

In [24]:
def train_model(model, criterion, optimizer, scheduler, num_epochs):
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    best_loss = 10000.0  # Large arbitrary number
    best_acc_train = 0.0
    best_loss_train = 10000.0  # Large arbitrary number
    print("Training started:")

    for epoch in range(num_epochs):

        # Each epoch has a training and validation phase
        for phase in ["train", "val"]:
            if phase == "train":
                # Set model to training mode
                model.train()
            else:
                # Set model to evaluate mode
                model.eval()
            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            n_batches = dataset_sizes[phase] // batch_size
            it = 0
           
            for batch in dataloaders[phase]:
                since_batch = time.time()
                #next_data = next(data_iter)
                inputs = batch['video']
                labels = batch['target']
                batch_size_ = len(inputs)
                
                inputs = torch.permute(inputs, (0,2,1,3,4)) 
                
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()

                # Track/compute gradient and make an optimization step only when training
                with torch.set_grad_enabled(phase == "train"):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    if phase == "train":
                        loss.backward()
                        optimizer.step()

                # Print iteration results
                running_loss += loss.item() * batch_size_
                batch_corrects = torch.sum(preds == labels.data).item()
                running_corrects += batch_corrects
                print(
                    "Phase: {} Epoch: {}/{} Iter: {}/{} Batch time: {:.4f}".format(
                        phase,
                        epoch + 1,
                        num_epochs,
                        it + 1,
                        n_batches + 1,
                        time.time() - since_batch,
                    ),
                    end="\r",
                    flush=True,
                )
                it += 1

            # Print epoch results
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]
            print(
                "Phase: {} Epoch: {}/{} Loss: {:.4f} Acc: {:.4f}        ".format(
                    "train" if phase == "train" else "val",
                    epoch + 1,
                    num_epochs,
                    epoch_loss,
                    epoch_acc,
                )
            )

            # Check if this is the best model wrt previous epochs
            if phase == "val" and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == "val" and epoch_loss < best_loss:
                best_loss = epoch_loss
            if phase == "train" and epoch_acc > best_acc_train:
                best_acc_train = epoch_acc
            if phase == "train" and epoch_loss < best_loss_train:
                best_loss_train = epoch_loss

            # Update learning rate
            if phase == "train":
                scheduler.step()

    # Print final results
    model.load_state_dict(best_model_wts)
    time_elapsed = time.time() - since
    print(
        "Training completed in {:.0f}m {:.0f}s".format(time_elapsed // 60, time_elapsed % 60)
    )
    print("Best test loss: {:.4f} | Best test accuracy: {:.4f}".format(best_loss, best_acc))
    return model

In [25]:
def predict_for_phase(model, count_max=10000, phase = 'test'):
    path_list = []
    prob_list = []
    pred_list = []
    class_list_pred = []
    class_list_label = []
    label_list = []
    counter = 0
    
    model.eval()
    with torch.no_grad():
        
        for next_data in dataset_dic[phase]: 
            
            path = next_data['path']
            inputs = next_data['video']
            label = next_data['target']
            
            inputs = inputs[None, :, :, :, :]

            inputs = torch.permute(inputs, (0,2,1,3,4))      
                   
            inputs = inputs.to(device)
            
            outputs = model(inputs)
            outputs = nn.Sigmoid()(outputs)
            outputs = torch.nn.functional.normalize(outputs, p=1)
            prob = outputs[0][1].item()
            _, preds = torch.max(outputs, 1)
            pred = int(preds[0].item())
            class_pred = class_names[0][preds[0]]
            class_label = class_names[0][label]         
    
            if counter%50 == 0:
                print(counter)
                
            path_list.append(path)
            class_list_pred.append(class_pred)
            class_list_label.append(class_label)
            prob_list.append(prob)                  
            pred_list.append(pred)
            label_list.append(label)
            
            counter += 1 
            if counter == count_max: break
    # create csv
    df = pd.DataFrame(list(zip(path_list, class_list_label, class_list_pred, 
                               label_list, pred_list, prob_list)),
               columns =['path', 'true_class', 'pred_class', 'label', 'pred', 'prob']) 
        
    return df 

# Training

In [27]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [28]:
model = MoViNet(_C.MODEL.MoViNetA0, causal = False, pretrained = True ) # CAUSAL IS FALSE

In [30]:
for param in model.parameters():
    param.requires_grad = False

In [None]:
model.classifier[3] = nn.Conv3d(2048, 2, kernel_size = (1,1,1))

In [32]:
# Use CUDA or CPU according to the "device" object.
model = model.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that only parameters of final layer are being optimized
optimizer = optim.Adam(model.classifier[3].parameters(), lr=step)

exp_lr_scheduler = lr_scheduler.StepLR(
    optimizer, step_size=10, gamma=gamma_lr_scheduler
)

In [34]:
# train
model = train_model(
    model, criterion, optimizer, exp_lr_scheduler, num_epochs=num_epochs
)

Training started:
Phase: train Epoch: 1/10 Loss: 0.6329 Acc: 0.6597        
Phase: val Epoch: 1/10 Loss: 0.5328 Acc: 0.7009        
Phase: train Epoch: 2/10 Loss: 0.5585 Acc: 0.7273        
Phase: val Epoch: 2/10 Loss: 0.4881 Acc: 0.7103        
Phase: train Epoch: 3/10 Loss: 0.5233 Acc: 0.7498        
Phase: val Epoch: 3/10 Loss: 0.4637 Acc: 0.7477        
Phase: train Epoch: 4/10 Loss: 0.4997 Acc: 0.7659        
Phase: val Epoch: 4/10 Loss: 0.4487 Acc: 0.7477        
Phase: train Epoch: 5/10 Loss: 0.4828 Acc: 0.7747        
Phase: val Epoch: 5/10 Loss: 0.4382 Acc: 0.7523        
Phase: train Epoch: 6/10 Loss: 0.4672 Acc: 0.7788        
Phase: val Epoch: 6/10 Loss: 0.4305 Acc: 0.7570        
Phase: train Epoch: 7/10 Loss: 0.4594 Acc: 0.7844        
Phase: val Epoch: 7/10 Loss: 0.4251 Acc: 0.7617        
Phase: train Epoch: 8/10 Loss: 0.4503 Acc: 0.7981        
Phase: val Epoch: 8/10 Loss: 0.4203 Acc: 0.7617        
Phase: train Epoch: 9/10 Loss: 0.4471 Acc: 0.7908        
Phase: val E

## Predictions and accuracy for the test dataset

In [37]:
df = predict_for_phase(model, count_max = 20000, phase = 'test')



0
50
100
150
200


In [38]:
correct = 0
for i, label in enumerate(df['label']):
    if label == df.pred[i]:
        correct += 1
accuracy = correct / len(df)
accuracy

0.7761194029850746

In [39]:
csv_name = "redo_movinet_A0_modified.csv"

In [40]:
df.to_csv(csv_name, index = False)