# Exploration of Split Test/Train

The notebook aims to explore how to split the train set in train and validation set with the current settings.



In [8]:
sys.path.append('../src')
import sys
import warnings
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from sklearn.decomposition import PCA
from tqdm import tqdm
import numpy as np
from torchvision import models, transforms
import time
import os
import random
import pandas as pd
import logging
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from torch.nn import functional as F
from cld_ivado.utils.context import get_context
from cld_ivado.utils.compute_metrics import get_metrics, get_majority_vote,log_test_metrics
from cld_ivado.utils.dataframe_creation import create_dataframe_preproccessing
from cld_ivado.utils.split import train_test_split, get_train_test_patients_id
from cld_ivado.dataset.dl_dataset import CldIvadoDataset
import copy
from pathlib import Path
import yaml
warnings.filterwarnings("ignore")
logging.basicConfig(level = logging.INFO)


In [11]:
# Get the current project path (where you open the notebook)
# and go up two levels to get the project path
current_dir = Path.cwd()
#proj_path = current_dir.parent
proj_path = current_dir.parent
# make the code in src available to import in this notebook
sys.path.append(os.path.join(proj_path, 'src'))

# Catalog contains all the paths related to datasets
with open(os.path.join(proj_path, 'conf/data_catalog.yml'), "r") as f:
    catalog = yaml.safe_load(f)

# Params contains all of the dataset creation parameters and model parameters
with open(os.path.join(proj_path, 'conf/parameters.yml'), "r") as f:
    params = yaml.safe_load(f)


# Split training set

In [12]:
def reshape_raw_images(df, M, N):
    # Reshape the data appropriately
    logging.info('Using Raw Images')
    data = df['img'].iloc[0].view(1, M * N)
    for i in tqdm(range(1, df['img'].shape[0])):
        data = torch.cat([data, df['img'].iloc[i].view(1, M * N)])
    data = pd.DataFrame(data.numpy())
    return data



In [14]:
# panda dataframe containing flatten images - this will be use to compute eigenvectors
df = pd.read_pickle(os.path.join(catalog['data_root'], catalog['02_interim_pd']))
data = reshape_raw_images(df, params['preprocess']['dimension']['M'], params['preprocess']['dimension']['N'] )
df = pd.concat([df, data], axis=1)
df = df.drop(['img','fat'], axis=1)


INFO:root:Using Raw Images
100%|██████████| 549/549 [00:07<00:00, 74.64it/s] 


In [15]:
df


Unnamed: 0,id,class,0,1,2,3,4,5,6,7,...,276014,276015,276016,276017,276018,276019,276020,276021,276022,276023
0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,55,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
546,55,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
547,55,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
548,55,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# panda dataframe with path to images
dataset = pd.read_pickle(os.path.join(catalog['data_root'], catalog['02_interim_pd']))
dataset = create_dataframe_preproccessing(dataset)
test_n_splits = params['cross_val']['test_n_splits']
group_kfold_test = GroupKFold(n_splits=test_n_splits)
seed = params['cross_val']['seed']
fold_c = 1
df_pid = df['id']
df_y = df['class']
test_metrics = {}  
test_metrics_mv = {}     


In [19]:
dataset


Unnamed: 0,id,labels,fat,fname
0,1,0,3,data/01_raw/raw_images/P1_image1.jpg
1,1,0,3,data/01_raw/raw_images/P1_image2.jpg
2,1,0,3,data/01_raw/raw_images/P1_image3.jpg
3,1,0,3,data/01_raw/raw_images/P1_image4.jpg
4,1,0,3,data/01_raw/raw_images/P1_image5.jpg
...,...,...,...,...
545,55,1,20,data/01_raw/raw_images/P55_image6.jpg
546,55,1,20,data/01_raw/raw_images/P55_image7.jpg
547,55,1,20,data/01_raw/raw_images/P55_image8.jpg
548,55,1,20,data/01_raw/raw_images/P55_image9.jpg


In [17]:
data_transforms = {'train': transforms.Compose([
    #transforms.RandomResizedCrop(224),
    #transforms.Resize(224),
    #transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.485], [0.229])
]),'val': transforms.Compose([
    #transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485], [0.229])
]),}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
logging.info('Cross-validation Started')


INFO:root:Cross-validation Started


In [22]:
df_pid

0       1
1       1
2       1
3       1
4       1
       ..
545    55
546    55
547    55
548    55
549    55
Name: id, Length: 550, dtype: uint8

In [31]:
for train_index, test_index in group_kfold_test.split(df, df_y, df_pid):
    random.seed(seed)
    random.shuffle(train_index)
    X_train, X_test = dataset.iloc[train_index], dataset.iloc[test_index]

    # split training set in subtrain and validation set
    train_id, test_id = get_train_test_patients_id(df_pid, train_sz=params['model']['train_pct'], seed=seed)
    subtrain_data = dataset[dataset['id'].isin(train_id)].reset_index(drop=True).sample(frac=1,random_state =seed)
    val_data = dataset[dataset['id'].isin(test_id)].reset_index(drop=True)
    subtrain_data_flatten = df[df['id'].isin(train_id)].reset_index(drop=True).sample(frac=1,random_state =seed)
    subtrain_data_flatten= subtrain_data_flatten.drop(['id','class'], axis=1)
    # pca is used for dimensionality reduction
    logging.info(f'FOLD {fold_c}: Apply PCA on train data points')
    pca = PCA(n_components = params['pca']['n_components'], random_state = seed)          
    pca.fit(subtrain_data_flatten)
    
    break

INFO:root:FOLD 1: Apply PCA on train data points


In [32]:
subtrain_data_flatten

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,276014,276015,276016,276017,276018,276019,276020,276021,276022,276023
165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
subtrain_data

Unnamed: 0,id,labels,fat,fname
165,21,1,7,data/01_raw/raw_images/P21_image6.jpg
346,41,1,25,data/01_raw/raw_images/P41_image7.jpg
323,39,1,50,data/01_raw/raw_images/P39_image4.jpg
432,50,1,25,data/01_raw/raw_images/P50_image3.jpg
448,51,1,15,data/01_raw/raw_images/P51_image9.jpg
...,...,...,...,...
337,40,1,70,data/01_raw/raw_images/P40_image8.jpg
91,10,0,1,data/01_raw/raw_images/P10_image2.jpg
80,9,0,2,data/01_raw/raw_images/P9_image1.jpg
191,24,1,20,data/01_raw/raw_images/P24_image2.jpg
