<a href="https://colab.research.google.com/github/sarahgin/DeepLeaningProj/blob/master/InstancePreTrainClass_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive into the Colab runtime at /content/gdrive so that the
# dataset archives stored on Drive can be read like local files.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# Create the local working directory /Drive and list its contents.
! mkdir /Drive
!ls /Drive

In [0]:
import shutil
import glob
import os
from zipfile import ZipFile

# Stage every training archive from Google Drive onto the local disk,
# leaving archives that were already copied untouched.
for src in glob.glob('/content/gdrive/My Drive/videos_2/yt_bb_detection_train/*.zip'):
    dst = f'/Drive/{os.path.basename(src)}'
    print(src, dst)
    if not os.path.exists(src) or os.path.exists(dst):
        continue
    print(f'copy {src} to {dst}')
    shutil.copy2(src, dst)

# Unpack each staged archive directly into /Drive.
for z in glob.glob('/Drive/*zip'):
    with ZipFile(z, 'r') as zipObj:
        zipObj.extractall('/Drive')
    
!ls /Drive

In [0]:
import shutil
import os
import pathlib
import random

def list_dirs(directory):
    """Return the immediate sub-directories of ``directory``.

    The entries are ``pathlib.Path`` objects in the order produced by
    ``Path.iterdir``; plain files are left out.
    """
    subdirs = []
    for entry in pathlib.Path(directory).iterdir():
        if entry.is_dir():
            subdirs.append(entry)
    return subdirs

def list_files(directory):
    """Return the regular files directly inside ``directory``.

    Entries whose name starts with a dot are skipped, as are
    sub-directories. The result is a list of ``pathlib.Path`` objects.
    """
    visible_files = []
    for entry in pathlib.Path(directory).iterdir():
        if not entry.is_file():
            continue
        if entry.name.startswith("."):
            continue
        visible_files.append(entry)
    return visible_files

def setup_files(class_dir, seed):
    """Return the files of ``class_dir`` in a seeded, reproducible random order.

    The module-level ``random`` generator is re-seeded with ``seed`` on
    every call, and the files are sorted before shuffling so the result
    does not depend on the directory listing order.
    """
    # Seed first so the shuffle below is the same on every run.
    random.seed(seed)

    ordered = sorted(list_files(class_dir))
    random.shuffle(ordered)
    return ordered

def ratio(input, output="output", seed=1337, ratio=(0.8, 0.1, 0.1)):
    """Split every ``<input>/<class>/<instance>`` folder by the given fractions.

    Args:
        input: root folder holding one sub-folder per class, each of which
            holds one sub-folder per instance.
        output: root folder that receives the split copies.
        seed: seed forwarded to the per-folder shuffle.
        ratio: two or three fractions that must sum to 1.

    Raises:
        AssertionError: if the fractions do not sum to 1 or their count is
            not 2 or 3.
    """
    # Rounding absorbs floating-point imprecision in the sum of the fractions.
    assert round(sum(ratio), 5) == 1
    assert len(ratio) in (2, 3)

    for class_path in list_dirs(input):
        class_name = os.path.basename(class_path)
        for instance_path in list_dirs(class_path):
            # Keep the "<class>/<instance>" layout below each split folder.
            relative_target = os.path.join(class_name, os.path.basename(instance_path))
            split_class_dir_ratio(instance_path, output, relative_target, ratio, seed, None)


def split_class_dir_fixed(class_dir, output, fixed, seed, prog_bar, fulloutput=None):
    """Split one folder using fixed held-out counts instead of fractions.

    Args:
        class_dir: ``pathlib.Path`` of the folder whose files are split.
        output: root folder that receives the split copies.
        fixed: sequence of held-out counts; ``fixed[0]`` files go to the
            first held-out split and, when two counts are given, the
            remainder goes to the second one. Everything before that is
            training data.
        seed: seed for the reproducible shuffle.
        prog_bar: optional object with an ``update()`` method, or ``None``.
        fulloutput: sub-path created below each split folder. Defaults to
            the name of ``class_dir``.

    Returns:
        The number of files found in ``class_dir``.

    Raises:
        ValueError: if the folder does not hold more files than ``fixed``
            requires.
    """
    files = setup_files(class_dir, seed)

    if len(files) <= sum(fixed):
        raise ValueError(
            f'The number of samples in class "{class_dir.stem}" are too few. There are only {len(files)} samples available but your fixed parameter {fixed} requires at least {sum(fixed)} files. You may want to split your classes by ratio.'
        )

    # copy_files needs the relative target sub-path; the previous call left
    # it out and therefore raised TypeError on every invocation.
    if fulloutput is None:
        fulloutput = pathlib.Path(class_dir).name

    split_train = len(files) - sum(fixed)
    split_val = split_train + fixed[0]

    li = split_files(files, split_train, split_val, len(fixed) == 2)
    copy_files(li, class_dir, output, fulloutput, prog_bar)
    return len(files)

def split_class_dir_ratio(class_dir, output, fulloutput, ratio, seed, prog_bar):
    """Split the files of one folder by fractions and copy them to ``output``.

    ``ratio[0]`` is the training fraction and ``ratio[1]`` the fraction of
    the next split; both boundaries are truncated to whole files. A third
    split is requested when ``ratio`` has three entries.
    """
    shuffled = setup_files(class_dir, seed)
    n_files = len(shuffled)

    train_end = int(ratio[0] * n_files)
    val_end = train_end + int(ratio[1] * n_files)
    three_way = len(ratio) == 3

    partitions = split_files(shuffled, train_end, val_end, three_way)
    copy_files(partitions, class_dir, output, fulloutput, prog_bar)


def split_files(files, split_train, split_val, use_test):
    """Partition ``files`` along the given indices.

    Args:
        files: sequence of files, already in their final order.
        split_train: index where the training slice ends.
        split_val: index where the validation slice ends (only used when
            ``use_test`` is true).
        use_test: whether to produce three splits instead of two.

    Returns:
        A list of ``(files, folder_name)`` pairs. A two-way split yields
        "train" and "test"; a three-way split yields "train", "val" and
        "test".
    """
    files_train = files[:split_train]

    if not use_test:
        # Two-way split: everything after the training slice is the test set.
        return [(files_train, "train"), (files[split_train:], "test")]

    # Three-way split: the middle slice gets its own "val" folder. It used
    # to be labelled "test" as well, which merged it with the real test set.
    return [
        (files_train, "train"),
        (files[split_train:split_val], "val"),
        (files[split_val:], "test"),
    ]


def copy_files(files_type, class_dir, output, fulloutput, prog_bar):
    """Copy each split's files into ``<output>/<split>/<fulloutput>``.

    ``files_type`` is an iterable of ``(files, split_name)`` pairs. The
    target folder is created when missing, also for an empty split.
    ``prog_bar.update()`` is called once per copied file when a progress
    bar is given. ``class_dir`` is accepted but not used here.
    """
    for split_members, split_name in files_type:
        target_dir = os.path.join(output, split_name, fulloutput)
        pathlib.Path(target_dir).mkdir(parents=True, exist_ok=True)

        for source_file in split_members:
            if prog_bar is not None:
                prog_bar.update()
            shutil.copy2(source_file, target_dir)
            
            
# Split every instance folder found under /Drive and copy the result below
# /Drive/data/.
# NOTE(review): the output folder lives inside the input folder, so a re-run
# will also traverse /Drive/data - confirm this is acceptable.
ratio('/Drive', output='/Drive/data/', seed=1337, ratio=(.8, .2))  #the partition to 80% train 20% test

In [0]:
# DataSet object
import pandas as pd
import os
from torch.utils.data.dataset import Dataset
import imageio
import numpy as np  
from PIL import Image

class ClassDataset(Dataset):
    """Image dataset labelled by the class folder of each ``.jpg`` file.

    Files are discovered recursively below ``basedir``; the label of a file
    is the folder two levels above it (``<class>/<instance>/<image>.jpg``).
    Every item is an ``(image, class_num)`` pair whose image is a
    ``float32`` array of shape ``(3, 128, 128)``.
    """

    # Spatial size (height, width) of every returned image.
    IMAGE_SIZE = (128, 128)

    def __init__(self, basedir, transform, percentage):
        """Index the image files and build the class-name -> index mapping.

        Args:
            basedir: folder searched recursively for ``.jpg`` files.
            transform: optional callable applied to each PIL image, or a
                falsy value to skip it.
            percentage: share of the files to keep, from 0 to 100. Below
                100 a random subset is drawn without replacement.

        Raises:
            ValueError: if ``percentage`` is outside ``[0, 100]``.
        """
        super().__init__()
        # Values above 100 used to fall through both branches and crash
        # later with an unbound local variable.
        if not 0 <= percentage <= 100:
            raise ValueError(f'percentage must be within [0, 100], got {percentage}')

        files = glob.glob(os.path.join(basedir, '**/*.jpg'), recursive=True)
        if percentage < 100:
            files_num = len(files)
            choosefilesnumber = int(files_num * percentage / 100)
            print(f'choosefilesnumber: {choosefilesnumber}  filesnum: {files_num}')
            selectedfiles = np.random.choice(files, choosefilesnumber, replace=False).tolist()
        else:
            selectedfiles = files

        self.data = pd.DataFrame([self._split_file(f) for f in selectedfiles],
                                 columns=['class_id', 'file_path'])
        names = np.unique(self.data['class_id'])

        # Class indices follow the sorted order of the class names present
        # in this particular dataset.
        self.classDict = {name: index for index, name in enumerate(names)}
        self.data['class_num'] = self.data['class_id'].map(self.classDict)
        self.transform = transform

    def _split_file(self, f):
        """Return ``(class_id, file_path)`` for one image path."""
        parts = f.split(os.sep)[-3:-1]
        return parts[0], f   # the label is originally a str

    def _to_chw_float32(self, img):
        """Return ``img`` as a channel-first float32 array of IMAGE_SIZE.

        A PIL image (no transform applied) is resampled and transposed from
        HWC to CHW. Anything else is assumed to be a channel-first tensor
        or array and is interpolated only when its spatial size differs.
        """
        height, width = self.IMAGE_SIZE
        if isinstance(img, Image.Image):
            # PIL's resize takes (width, height) and yields an HWC array.
            pixels = np.asarray(img.resize((width, height)), dtype=np.float32)
            return np.ascontiguousarray(pixels.transpose(2, 0, 1))

        # Local imports: this cell only imports Dataset from torch.
        import torch
        import torch.nn.functional as F

        tensor = torch.as_tensor(img, dtype=torch.float32)
        if tuple(tensor.shape[-2:]) != (height, width):
            # np.resize only truncates/repeats the flat buffer and scrambles
            # the picture; interpolate performs a real spatial resize.
            tensor = F.interpolate(tensor.unsqueeze(0), size=(height, width),
                                   mode='bilinear', align_corners=False).squeeze(0)
        return tensor.numpy()

    def __getitem__(self, index):
        """Load, transform and resize the image at ``index``.

        Returns:
            ``(image, class_num)`` with ``image`` a ``(3, 128, 128)``
            float32 array.
        """
        dat = self.data.iloc[index]
        # convert() reads the pixel data, so the file can be closed right
        # away; it also guarantees three channels.
        with Image.open(dat['file_path']) as handle:
            img = handle.convert('RGB')
        if self.transform:
            img = self.transform(img)
        return (self._to_chw_float32(img), dat['class_num'])

    def __len__(self):
        """Return the number of selected image files."""
        return len(self.data)

In [0]:
import glob
import torchvision.transforms as transforms
from PIL import Image, ImageEnhance, ImageOps

#basedir = '/content/gdrive/My Drive/video_data/'       
trainbasedir = '/Drive/data/train'
testbasedir = '/Drive/data/test'

# Channel statistics shared by the training and evaluation pipelines.
normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

# Training pipeline: random crop/flip augmentation, then tensor conversion
# and normalisation.
data_transforms = transforms.Compose([transforms.Resize((256,256)),transforms.RandomResizedCrop(224),
                         transforms.RandomHorizontalFlip(), #ImageNetPolicy(),
                         transforms.ToTensor(), normalize])

# Evaluation pipeline: no random augmentation, but the same tensor
# conversion and normalisation as training. The test set used to be built
# without any transform, so the model was evaluated on raw, unnormalised
# pixel values it never saw during training.
test_transforms = transforms.Compose([transforms.Resize((224, 224)),
                                      transforms.ToTensor(), normalize])

# Train on a random 60% of the training files; evaluate on every test file.
train_set = ClassDataset(trainbasedir, data_transforms, 60)
test_set = ClassDataset(testbasedir, test_transforms, 100)
#train_set = InstanceDataset(trainbasedir, data_transforms, 100) 
#test_set = InstanceDataset(testbasedir, None, 100)             

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable


batch_size = 50

# Settings shared by both loaders.
loader_kwargs = {'batch_size': batch_size, 'num_workers': 4}

# Training batches are reshuffled; the test loader keeps a fixed order.
train_loader = torch.utils.data.DataLoader(dataset=train_set, shuffle=True, **loader_kwargs)
test_loader = torch.utils.data.DataLoader(dataset=test_set, shuffle=False, **loader_kwargs)

# len() of a DataLoader is its number of batches, not of samples.
print('Train size: {}'.format(len(train_loader)))


In [0]:
def perf_measure(y_actual, y_pred):
    """Build a 2x2 confusion matrix with label 1 as positive and 0 as negative.

    Args:
        y_actual: sequence of true labels.
        y_pred: sequence of predicted labels, aligned with ``y_actual``.

    Returns:
        A float numpy array laid out as ``[[TP, FP], [FN, TN]]``.
        Predictions other than 0 or 1 do not contribute to any cell.
    """
    TP = FP = TN = FN = 0
    for actual, predicted in zip(y_actual, y_pred):
        if predicted == 1:
            # A positive prediction is a true positive only when it is right.
            if actual == predicted:
                TP += 1
            else:
                FP += 1
        elif predicted == 0:
            # TN used to share TP's condition and so counted every correct
            # prediction; it now requires a correct negative prediction.
            if actual == predicted:
                TN += 1
            else:
                FN += 1

    return np.array([[TP, FP], [FN, TN]], dtype=float)
  
def create_confusion_matrix_fig(c_cm):
    """Draw a confusion matrix as an image and return the figure.

    Args:
        c_cm: matrix of counts passed to ``plt.imshow``.

    Returns:
        The newly created figure (14 x 12 inches) holding the image, a
        title and a colorbar.
    """
    # `plt` is not imported in this cell; matplotlib.pyplot must already be
    # in scope under that name when the function is called.
    fig = plt.figure(figsize=(14, 12))
    # 'nearest' keeps every matrix cell a flat block of colour.
    plt.imshow(c_cm, interpolation='nearest')
    plt.title('Confusion matrix')
    plt.colorbar()
    return fig

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

# Load the previously trained instance network from Google Drive.
# NOTE(review): torch.load deserialises a pickled object - only load
# checkpoints that come from a trusted source.
model = torch.load('/content/gdrive/My Drive/InstanceModel2.pth') #for third net to load our trained instance net


In [0]:
# Input width of the loaded model's final fully connected layer (model.fc).
num_final_in = model.fc.in_features
print(f'last fc number: {num_final_in}')

last fc number: 512


In [0]:
import torch
from torchvision import models
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
import pylab as pl

import time 
import tqdm

# Fine-tune the pre-trained instance network as a class classifier: swap the
# final layer, train for 50 epochs, evaluate after every epoch and log
# metrics and confusion-matrix figures to TensorBoard.
use_cuda = torch.cuda.is_available()
assert use_cuda  # this notebook is written for a GPU runtime

# NOTE(review): torch.load deserialises a pickled object - only load
# checkpoints that come from a trusted source.
model = torch.load('/content/gdrive/My Drive/InstanceModel2.pth') #for third net to load our trained instance net

# Writer will output to the ./with_cm directory
writer = SummaryWriter('with_cm')

num_final_in = model.fc.in_features

NUM_CLASSES_Instance = 6476  #len(np.unique(dataset.data['instance_num']))
NUM_CLASSES_Class = 23

# Replace the instance head with a fresh head for the class labels.
model.fc = nn.Linear(num_final_in, NUM_CLASSES_Class)

# Move the model to the GPU before the optimizer is built on its parameters.
if use_cuda:
    model = model.cuda()

optimizer = optim.Adam(model.parameters(), lr=0.003)
criterion = nn.CrossEntropyLoss()

# Start the clock once so the elapsed time covers the whole run rather than
# only the last epoch.
tf = time.time()

for epoch in range(50):
    # training
    model.train()  # enable training behaviour of layers such as batch norm
    sum_loss = 0.0
    total_cnt = 0
    correct_cnt = 0
    for batch_idx, (x, target) in enumerate(train_loader):
        optimizer.zero_grad()
        if use_cuda:
            x, target = x.cuda(), target.cuda()

        out = model(x)
        loss = criterion(out, target)
        loss.backward()
        optimizer.step()

        _, pred_label = torch.max(out.data, 1)
        total_cnt += x.size(0)
        correct_cnt += (pred_label == target).sum().item()
        # .item() stores a Python float instead of a tensor.
        sum_loss += loss.item()

        if (batch_idx+1) % 100 == 0 or (batch_idx+1) == len(train_loader):
            # batch_idx is zero-based, so batch_idx+1 batches have been summed.
            print ('==>>> epoch: {}, batch index: {}, train loss: {:.6f}, acc: {}'.format(
                epoch, batch_idx+1, sum_loss/(batch_idx+1), correct_cnt/total_cnt))

    # The confusion matrix covers the last training batch only.
    cm = perf_measure(target.tolist(), pred_label.tolist())
    cur_fig = create_confusion_matrix_fig(cm)
    writer.add_figure('train_confusion_matrix', cur_fig, global_step=epoch, close=True)
    writer.add_scalar('train accuracy', correct_cnt/total_cnt, epoch)
    writer.add_scalar('train loss', sum_loss/len(train_loader), epoch)

    # testing
    model.eval()  # switch layers such as batch norm to inference behaviour
    correct_cnt, sum_loss = 0, 0.0
    total_cnt = 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for batch_idx, (x, target) in enumerate(test_loader):
            x = x.float()
            if use_cuda:
                x, target = x.cuda(), target.cuda()
            out = model(x)
            loss = criterion(out, target)

            _, pred_label = torch.max(out.data, 1)
            total_cnt += x.size(0)
            correct_cnt += (pred_label == target).sum().item()
            sum_loss += loss.item()

            if (batch_idx+1) % 100 == 0 or (batch_idx+1) == len(test_loader):
                print ('==>>> epoch: {}, batch index: {}, test loss: {:.6f}, acc: {:.3f}'.format(
                    epoch, batch_idx+1, sum_loss/(batch_idx+1), correct_cnt/total_cnt))

    # A failure while drawing the figure is reported but must not prevent
    # the test scalars below from being logged.
    try:
        cm = perf_measure(target.tolist(), pred_label.tolist())
        cur_fig = create_confusion_matrix_fig(cm)
        writer.add_figure('test_confusion_matrix', cur_fig, global_step=epoch, close=True)
    except Exception as err:
        print(f"An exception in test_confusion_matrix: {err!r}")
    writer.add_scalar('test accuracy', correct_cnt/total_cnt, epoch)
    writer.add_scalar('test loss', sum_loss/len(test_loader), epoch)

#torch.save(model.state_dict(), 'yt_bb_detection_train/mymodel3')
writer.close()

elapsed = time.time() - tf
print(f'Elapsed time: {elapsed}')

#torch.save(model, '/content/gdrive/My Drive/myModels/newInstanceModel.pth')