In [1]:
import numpy as np
import torch
import io
import torch.utils.data as utils
import torchvision.models as models
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import os
from PIL import Image
import tensorflow as tf
from tensorboardX import SummaryWriter


In [2]:
save_model_path = "/home/rajanie/models/RNN_ckpt/"

In [3]:
from torch.utils import data
class Dataload_RNN(data.Dataset):
    "Characterizes a dataset for PyTorch"

    def __init__(self, data_path, transform=None):
        "Initialization"
        self.transform = transform
        self.folders = data_path

    def __len__(self):
        "Denotes the total number of samples"
        return len(os.listdir(self.folders))

    def read_images(self, data_path, use_transform):
        X = []
        for i in os.listdir(data_path):
            # print("file name is ",i)
            image = Image.open(os.path.join(data_path, i))

            # print(image.shape)
            if use_transform is not None:
                image = use_transform(image)
                # print(image.size)
            image = torch.from_numpy(np.asarray(image))
            X.append(image)
        # print(X)
        # X = np.array(X)
        X = torch.stack(X, dim=0)

        return X

    def __getitem__(self, index):
        "Generates one sample of data"
        # Select sample
        # print("index passed is ",index)
        # print(self.folders)
        data_path = os.path.join(self.folders, os.listdir(self.folders)[index])
        # data_path = self.folders+ str(index)
        # print("Data path is ",data_path)

        # Load data
        X = self.read_images(data_path, self.transform)  # (input) spatial images

        y = 1
        if 'orig' in data_path:
            y = 0
        # print(X.shape)
        return X, torch.from_numpy(np.array(y)).type(torch.LongTensor)



## ---------------------- end of Dataloaders ---------------------- ##


In [4]:
TRANSFORM_IMG = transforms.Compose([
    transforms.Resize((224,224)),
    #transforms.CenterCrop(224),
    #transforms.ToTensor()
    #transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         #std=[0.229, 0.224, 0.225] )
    ])
train_path = "/home/chinmay/datatset/train/"
train_data = Dataload_RNN(train_path, transform=TRANSFORM_IMG)
val_path = "/home/chinmay/datatset/val/"
val_data = Dataload_RNN(val_path,  transform=TRANSFORM_IMG)


In [5]:
## ------------------------ CRNN module ---------------------- ##

def conv2D_output_size(img_size, padding, kernel_size, stride):
    # compute output shape of conv2D
    outshape = (np.floor((img_size[0] + 2 * padding[0] - (kernel_size[0] - 1) - 1) / stride[0] + 1).astype(int),
                np.floor((img_size[1] + 2 * padding[1] - (kernel_size[1] - 1) - 1) / stride[1] + 1).astype(int))
    return outshape


# 2D CNN encoder train from scratch (no transfer learning)
class EncoderCNN(nn.Module):
    def __init__(self, img_x=90, img_y=120, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=300):
        super(EncoderCNN, self).__init__()

        self.img_x = img_x
        self.img_y = img_y
        self.CNN_embed_dim = CNN_embed_dim

        # CNN architechtures
        self.ch1, self.ch2, self.ch3, self.ch4 = 32, 64, 128, 256
        self.k1, self.k2, self.k3, self.k4 = (5, 5), (3, 3), (3, 3), (3, 3)
        self.s1, self.s2, self.s3, self.s4 = (2, 2), (2, 2), (2, 2), (2, 2)      # 2d strides
        self.pd1, self.pd2, self.pd3, self.pd4 = (0, 0), (0, 0), (0, 0), (0, 0)  # 2d padding
        # self.ch1, self.ch2  = 32, 64,
        # self.k1, self.k2 = (5, 5), (3, 3)
        # self.s1, self.s2 = (2, 2), (2, 2)
        # self.pd1, self.pd2 = (0, 0), (0, 0)  # 2d padding


        # conv2D output shapes
        self.conv1_outshape = conv2D_output_size((self.img_x, self.img_y), self.pd1, self.k1, self.s1)  # Conv1 output shape
        self.conv2_outshape = conv2D_output_size(self.conv1_outshape, self.pd2, self.k2, self.s2)
        self.conv3_outshape = conv2D_output_size(self.conv2_outshape, self.pd3, self.k3, self.s3)
        self.conv4_outshape = conv2D_output_size(self.conv3_outshape, self.pd4, self.k4, self.s4)

        # fully connected layer hidden nodes
        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=self.ch1, kernel_size=self.k1, stride=self.s1, padding=self.pd1),
            nn.BatchNorm2d(self.ch1, momentum=0.01),
            nn.ReLU(inplace=True),                      
            # nn.MaxPool2d(kernel_size=2),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=self.ch1, out_channels=self.ch2, kernel_size=self.k2, stride=self.s2, padding=self.pd2),
            nn.BatchNorm2d(self.ch2, momentum=0.01),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2),
        )

        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=self.ch2, out_channels=self.ch3, kernel_size=self.k3, stride=self.s3, padding=self.pd3),
            nn.BatchNorm2d(self.ch3, momentum=0.01),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2),
        )

        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=self.ch3, out_channels=self.ch4, kernel_size=self.k4, stride=self.s4, padding=self.pd4),
            nn.BatchNorm2d(self.ch4, momentum=0.01),
            nn.ReLU(inplace=True),
            # nn.MaxPool2d(kernel_size=2),
        )

        self.drop = nn.Dropout2d(self.drop_p)
        self.pool = nn.MaxPool2d(2)
        self.fc1 = nn.Linear(self.ch2 * self.conv2_outshape[0] * self.conv2_outshape[1], self.fc_hidden1)   # fully connected layer, output k classes
        self.fc2 = nn.Linear(self.fc_hidden1, self.fc_hidden2)
        self.fc3 = nn.Linear(self.fc_hidden2, self.CNN_embed_dim)   # output = CNN embedding latent variables

    def forward(self, x_3d):
        cnn_embed_seq = []
        x_3d = x_3d.permute(0, 1, 4, 2, 3)
        for t in range(x_3d.size(1)):
            # CNNs
            
            x = self.conv1(x_3d[:, t, :, :, :])
            x = self.conv2(x)
            # x = self.conv3(x)
            # x = self.conv4(x)
            x = x.view(x.size(0), -1)           # flatten the output of conv

            # FC layers
            x = F.relu(self.fc1(x))
            # x = F.dropout(x, p=self.drop_p, training=self.training)
            # x = F.relu(self.fc2(x))
            # x = F.dropout(x, p=self.drop_p, training=self.training)
            # x = self.fc3(x)
            x = self.fc2(x)
            cnn_embed_seq.append(x)

        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)

        return cnn_embed_seq

# 2D CNN encoder using ResNet-152 pretrained
class ResCNNEncoder(nn.Module):
    def __init__(self, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=300):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(ResCNNEncoder, self).__init__()

        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        resnet = models.resnet50(pretrained=False)
        modules = list(resnet.children())[:-1]      # delete the last fc layer.
        self.resnet = nn.Sequential(*modules)
        self.fc1 = nn.Linear(resnet.fc.in_features, fc_hidden1)
        self.bn1 = nn.BatchNorm1d(fc_hidden1, momentum=0.01)
        self.fc2 = nn.Linear(fc_hidden1, fc_hidden2)
        self.bn2 = nn.BatchNorm1d(fc_hidden2, momentum=0.01)
        self.fc3 = nn.Linear(fc_hidden2, CNN_embed_dim)
        
    def forward(self, x_3d):
        cnn_embed_seq = []
        x_3d = x_3d.permute(0, 1, 4, 2, 3)
        with torch.no_grad():
            for t in range(x_3d.size(1)):
                # CNNs
                x = self.resnet(x_3d[:, t, :, :, :]) # ResNet
                x = x.view(x.size(0), -1)            # flatten output of conv

                # FC layers
                x = self.bn1(self.fc1(x))
                x = F.relu(x)
                x = self.bn2(self.fc2(x))
                x = F.relu(x)
                x = F.dropout(x, p=self.drop_p, training=self.training)
                x = self.fc3(x)

                cnn_embed_seq.append(x)

        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)

        return cnn_embed_seq

class DecoderRNN(nn.Module):
    def __init__(self, CNN_embed_dim=300, h_RNN_layers=3, h_RNN=256, h_FC_dim=128, drop_p=0.3, num_classes=50):
        super(DecoderRNN, self).__init__()

        self.RNN_input_size = CNN_embed_dim
        self.h_RNN_layers = h_RNN_layers   # RNN hidden layers
        self.h_RNN = h_RNN                 # RNN hidden nodes
        self.h_FC_dim = h_FC_dim
        self.drop_p = drop_p
        self.num_classes = num_classes

        self.LSTM = nn.LSTM(
            input_size=self.RNN_input_size,
            hidden_size=self.h_RNN,        
            num_layers=h_RNN_layers,       
            batch_first=True,       # input & output will has batch size as 1s dimension. e.g. (batch, time_step, input_size)
        )

        self.fc1 = nn.Linear(self.h_RNN, self.h_FC_dim)
        self.fc2 = nn.Linear(self.h_FC_dim, self.num_classes)

    def forward(self, x_RNN):
        
        self.LSTM.flatten_parameters()
        RNN_out, (h_n, h_c) = self.LSTM(x_RNN, None)  
        """ h_n shape (n_layers, batch, hidden_size), h_c shape (n_layers, batch, hidden_size) """ 
        """ None represents zero initial hidden state. RNN_out has shape=(batch, time_step, output_size) """

        # FC layers
        x = self.fc1(RNN_out[:, -1, :])   # choose RNN_out at the last time step
        x = F.relu(x)
        x = F.dropout(x, p=self.drop_p, training=self.training)
        x = self.fc2(x)

        return x

In [6]:

def train_func(log_interval, model, device, train_loader, optimizer, epoch):
    # set model as training mode
    cnn_encoder, rnn_decoder = model
    cnn_encoder.train()
    rnn_decoder.train()

    losses = []
    scores = []
    N_count = 0   # counting total trained sample in one epoch
    for batch_idx, (X, y) in enumerate(train_loader):
        # distribute data to device
        X, y = X.to(device, dtype=torch.float), y.to(device)

        N_count += X.size(0)

        optimizer.zero_grad()
        output = rnn_decoder(cnn_encoder(X))   # output has dim = (batch, number of classes)
        loss = F.cross_entropy(output, y)
        losses.append(loss.item())

        # to compute accuracy
        y_pred = torch.max(output, 1)[1]  # y_pred != output
        #print(y_pred)
        
        #step_score = accuracy_score(y, y_pred)
        #test_score = accuracy_score(y.cpu().data.squeeze().numpy(), y_pred.cpu().data.squeeze().numpy())
        step_score = accuracy_score(y, y_pred)

        #print(step_score)
        scores.append(step_score)         # computed on CPU

        loss.backward()
        optimizer.step()

        # show information
        if (batch_idx + 1) % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, Accu: {:.2f}%'.format(
                epoch + 1, N_count, len(train_loader.dataset), 100. * (batch_idx + 1) / len(train_loader), loss.item(), 100 * step_score))
            
    print('\nTrain set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(np.mean(losses) ,100 * np.mean(scores)))


    return np.mean(losses), np.mean(scores)


In [7]:

def validation(model, device, optimizer, test_loader):
    # set model as testing mode
    cnn_encoder, rnn_decoder = model
    cnn_encoder.eval()
    rnn_decoder.eval()

    test_loss = 0
    all_y = []
    all_y_pred = []
    with torch.no_grad():
        for X, y in test_loader:
            # distribute data to device
            X, y = X.to(device,  dtype=torch.float), y.to(device)

            output = rnn_decoder(cnn_encoder(X))

            loss = F.cross_entropy(output, y, reduction='sum')
            test_loss += loss.item()                 # sum up batch loss
            y_pred = output.max(1, keepdim=True)[1]  # (y_pred != output) get the index of the max log-probability

            # collect all y and y_pred in all batches
            all_y.extend(y)
            all_y_pred.extend(y_pred)

    test_loss /= len(test_loader.dataset)

    # compute accuracy
    all_y = torch.stack(all_y, dim=0)
    all_y_pred = torch.stack(all_y_pred, dim=0)
    test_score = accuracy_score(all_y.cpu().data.squeeze().numpy(), all_y_pred.cpu().data.squeeze().numpy())
    #test_score = accuracy_score(all_y.cpu().data.squeeze().numpy(), all_y_pred.cpu().data.squeeze().numpy())

    # show information
    print('\nTest set ({:d} samples): Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(len(all_y), test_loss, 100* test_score))

   

    return test_loss, test_score


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

                        
# EncoderCNN architecture
CNN_fc_hidden1, CNN_fc_hidden2 = 1024, 512
CNN_embed_dim = 512      # latent dim extracted by 2D CNN
img_x, img_y = 224, 224  # resize video 2d frame size
dropout_p_enco = 0.0      # dropout probability
dropout_p_deco = 0.0

# DecoderRNN architecture
RNN_hidden_layers = 3
RNN_hidden_nodes = 512
RNN_FC_dim = 256

writer = SummaryWriter('runs/ff_resnet50_scratch')

# training parameters
epochs = 500        # training epochs
batch_size = 100
learning_rate = 1e-3
log_interval = 10   # interval for displaying training info


# Detect devices
use_cuda = torch.cuda.is_available()                   # check if GPU exists
device = torch.device("cuda" if use_cuda else "cpu")   # use CPU or GPU

# Data loading parameters
params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 4, 'pin_memory': True} if use_cuda else {}


# train_overfit = []
# for i in range(10):
#     train_overfit.append(train_data[i])
train_loader = data.DataLoader(train_data, **params)
valid_loader = data.DataLoader(val_data, **params)

# Create model
cnn_encoder = ResCNNEncoder(fc_hidden1=CNN_fc_hidden1, fc_hidden2=CNN_fc_hidden2,
                         drop_p=dropout_p_enco, CNN_embed_dim=CNN_embed_dim).to(device)

# cnn_encoder = EncoderCNN(img_x=img_x, img_y=img_y, fc_hidden1=CNN_fc_hidden1, fc_hidden2=CNN_fc_hidden2,
#                          drop_p=dropout_p, CNN_embed_dim=CNN_embed_dim).to(device)

rnn_decoder = DecoderRNN(CNN_embed_dim=CNN_embed_dim, h_RNN_layers=RNN_hidden_layers, h_RNN=RNN_hidden_nodes, 
                         h_FC_dim=RNN_FC_dim, drop_p=dropout_p_deco, num_classes=2).to(device)
                         
#cnn_encoder.load_state_dict(torch.load(os.path.join(save_model_path, 'cnn_encoder_best.pth')))
#rnn_decoder.load_state_dict(torch.load(os.path.join(save_model_path, 'rnn_decoder_best.pth')))

crnn_params = list(cnn_encoder.parameters()) + list(rnn_decoder.parameters())
#optimizer = optim.SGD(crnn_params, lr = learning_rate , momentum=0.94, weight_decay = 1)




#optimizer = torch.optim.Adam(crnn_params, lr=1e-2)
optimizer = torch.optim.RMSprop(crnn_params, lr=learning_rate)
#optimizer.load_state_dict(torch.load(os.path.join(save_model_path, 'optimizer_best.pth')))

#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 50, gamma=0.5)


# record training process
epoch_train_losses = []
epoch_train_scores = []
epoch_test_losses = []
epoch_test_scores = []

# start training
best_loss = 10
for epoch in range(epochs):
    # train, test model
    scheduler.step()
    train_losses, train_scores = train_func(log_interval, [cnn_encoder, rnn_decoder], device, train_loader, optimizer, epoch)
    epoch_test_loss, epoch_test_score = validation([cnn_encoder, rnn_decoder], device, optimizer, valid_loader)
    
    
    ##tensorboard logs
    writer.add_scalars("loss",{'Train' : train_losses , 'Val' :epoch_test_loss}, epoch)
    writer.add_scalars("Accuracy", {'Train' :train_scores, 'Val' :epoch_test_score} ,epoch)
   
    #lr = scheduler.get_lr()[0]
#     tbc.save_value("Scheduler LR","LR", epoch,lr )
   
    #writer.add_scalars("Learning rate", lr ,epoch)
  
    # save results
    epoch_train_losses.append(train_losses)
    epoch_train_scores.append(train_scores)
    epoch_test_losses.append(epoch_test_loss)
    epoch_test_scores.append(epoch_test_score)
    
    ##save best model
    if epoch_test_loss < best_loss:
        best_loss= epoch_test_loss
         # save Pytorch models of best record
        torch.save(cnn_encoder.state_dict(), os.path.join(save_model_path, 'cnn_encoder_best_resnet50_scratch.pth'))  # save spatial_encoder
        torch.save(rnn_decoder.state_dict(), os.path.join(save_model_path, 'rnn_decoder_best_resnet50_scratch.pth'))  # save motion_encoder
        torch.save(optimizer.state_dict(), os.path.join(save_model_path, 'optimizer_best_resnet50_scratch.pth'))      # save optimizer
        print("Epoch {} model saved!".format(epoch + 1))
    

    # save all train test results
#     A = np.array(epoch_train_losses)
#     B = np.array(epoch_train_scores)
#     C = np.array(epoch_test_losses)
#     D = np.array(epoch_test_scores)
#     np.save('./CRNN_epoch_training_losses.npy', A)
#     np.save('./CRNN_epoch_training_scores.npy', B)
#     np.save('./CRNN_epoch_test_loss.npy', C)
#     np.save('./CRNN_epoch_test_score.npy', D)

# # plot
# fig = plt.figure(figsize=(10, 4))
# plt.subplot(121)
# plt.plot(np.arange(1, epochs + 1), A[:, -1])  # train loss (on epoch end)
# plt.plot(np.arange(1, epochs + 1), C)         #  test loss (on epoch end)
# plt.title("model loss")
# plt.xlabel('epochs')
# plt.ylabel('loss')
# plt.legend(['train', 'test'], loc="upper left")
# # 2nd figure
# plt.subplot(122)
# plt.plot(np.arange(1, epochs + 1), B[:, -1])  # train accuracy (on epoch end)
# plt.plot(np.arange(1, epochs + 1), D)         #  test accuracy (on epoch end)
# plt.title("training scores")
# plt.xlabel('epochs')
# plt.ylabel('accuracy')
# plt.legend(['train', 'test'], loc="upper left")
# title = "./fig_CRNN.png"
# plt.savefig(title, dpi=600)
# # plt.close(fig)
# plt.show()



Train set: Average loss: 0.8575, Accuracy: 51.97%


Test set (300 samples): Average loss: 0.7756, Accuracy: 50.00%

Epoch 1 model saved!

Train set: Average loss: 0.7075, Accuracy: 50.17%


Test set (300 samples): Average loss: 0.6970, Accuracy: 49.00%

Epoch 2 model saved!

Train set: Average loss: 0.6959, Accuracy: 48.33%


Test set (300 samples): Average loss: 0.6956, Accuracy: 50.00%

Epoch 3 model saved!

Train set: Average loss: 0.6953, Accuracy: 53.50%


Test set (300 samples): Average loss: 0.7046, Accuracy: 49.67%


Train set: Average loss: 0.6980, Accuracy: 50.90%


Test set (300 samples): Average loss: 0.6996, Accuracy: 52.67%


Train set: Average loss: 0.6955, Accuracy: 46.87%


Test set (300 samples): Average loss: 0.6938, Accuracy: 50.33%

Epoch 6 model saved!

Train set: Average loss: 0.6936, Accuracy: 51.97%


Test set (300 samples): Average loss: 0.6935, Accuracy: 52.33%

Epoch 7 model saved!

Train set: Average loss: 0.6925, Accuracy: 52.23%


Test set (300 samples):

### Resnet upto 20 epochs lr 1e-4 after that lr 1e-5