In [1]:
%matplotlib inline
# !pip install --upgrade plotly 
# deepchecks for tabular data:
!pip install deepchecks --upgrade -qq
# for installing deepchecks including the computer vision subpackage (note - Pytorch should be installed separately):
!pip install "deepchecks[vision]" --upgrade -qq


In [68]:
import os,argparse
from skimage import io, transform
from PIL import Image
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset,DataLoader
import torchvision.transforms as transforms

import deepchecks
from deepchecks.vision import VisionData
from deepchecks.vision.checks import ImagePropertyDrift

from typing import Any, Dict, Mapping, Optional, Sequence, Union

import torch
from ignite.metrics import Metric
from torch import nn

from deepchecks.core.check_result import CheckResult
from deepchecks.core.checks import DatasetKind, ModelOnlyBaseCheck, SingleDatasetBaseCheck, TrainTestBaseCheck
from deepchecks.utils.ipython import ProgressBarGroup
from deepchecks.vision import deprecation_warnings  # pylint: disable=unused-import # noqa: F401
from deepchecks.vision._shared_docs import docstrings
from deepchecks.vision.batch_wrapper import Batch
from deepchecks.vision.context import Context
from deepchecks.vision.utils.vision_properties import STATIC_PROPERTIES_FORMAT
from deepchecks.vision.vision_data import VisionData

In [69]:
train_data_dir = "datasets/DataSets/train/"
test_data_dir = "datasets/DataSets/val/"
# train_data_dir = "red"
# test_data_dir = "green"
recursive=True
class DatasetLoader(Dataset):
    def __init__(self, root):
        self.root = root
        img_paths = []
        img_labels = []
        label = root.split('/')[-1]       
        if recursive :
            for filename in os.listdir(root):
                if filename.split('.')[1] not in ['png','jpg','jpeg'] : 
                    continue
                img_paths.append(os.path.join(root,filename))
                img_labels.append(label)
        else :
            categories = os.listdir(root)
            for cat_index, cat in enumerate(categories):
                directory = os.path.join(root,cat)
                for filename in os.listdir(directory):
                    if filename.split('.')[1] not in ['png','jpg','jpeg'] : 
                        continue
                    img_paths.append(os.path.join(directory,filename))
                    img_labels.append(cat_index)


        self.images_filepaths = img_paths
        self.labels = img_labels

    def image_from_path(self,path) :
        trans = transforms.ToTensor()
        return trans(Image.open(path))
    def __getitem__(self, idx):
        return self.image_from_path(self.images_filepaths[idx]), self.labels[idx]
    def __len__(self):
        return len(self.images_filepaths)

class DeepCheckData(VisionData):
    def batch_to_images(self, batch):
        imgs = batch[0].detach().numpy().transpose((0, 2, 3, 1))
        return imgs*255

In [70]:
labels = os.listdir(train_data_dir)
train_images,test_images=[],[]
for label in labels:
    train_path=os.path.join(train_data_dir,label)
    test_path=os.path.join(test_data_dir,label)
    for image in os.listdir(train_path):
        train_images.append(image)
        
    for image in os.listdir(test_path):
        test_images.append(image)

### Class wise drift score

In [5]:
os.chdir("/project/datasets/")
# !ls datasets

In [72]:
res={}
for label in labels :
    train_dataset = DatasetLoader(train_data_dir+label)
    val_dataset = DatasetLoader(test_data_dir+label)

    train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True,generator=torch.Generator())
    test_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=True,generator=torch.Generator())

    train_ds = DeepCheckData(train_dataloader)
    test_ds = DeepCheckData(test_dataloader)

    check = ImagePropertyDrift()#.add_condition_drift_score_less_than(0.1)
    result = check.run(train_ds, test_ds)
    res[label]=result





















In [8]:
# properties=['Aspect Ratio','Area','Brightness','RMS Contrast','Mean Red Relative Intensity','Mean Green Relative Intensity',
#             'Mean Blue Relative Intensity']
intermediate_result={}
for label in labels:
    intermediate_result[label]={}
    for j in range(1,8):
        intermediate_result[label][result.display[j].layout.title.text]={"drift_score":res[label].value[result.display[j].layout.title.text]}
        intermediate_result[label][result.display[j].layout.title.text]['Train Dataset']={}
        intermediate_result[label][result.display[j].layout.title.text]['Test Dataset']={}
        intermediate_result[label][result.display[j].layout.title.text]['Train Dataset']['x']=list(res[label].display[j].data[1]['x'])
        intermediate_result[label][result.display[j].layout.title.text]['Train Dataset']['Probability Density']=res[label].display[j].data[1]['y']
        intermediate_result[label][result.display[j].layout.title.text]['Test Dataset']['x']=list(res[label].display[j].data[4]['x'])
        intermediate_result[label][result.display[j].layout.title.text]['Test Dataset']['Probability Density']=res[label].display[j].data[4]['y']


In [166]:
train_dataset = DatasetLoader(train_data_dir)
val_dataset = DatasetLoader(test_data_dir)

train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True,generator=torch.Generator())
test_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=True,generator=torch.Generator())

train_ds = DeepCheckData(train_dataloader)
test_ds = DeepCheckData(test_dataloader)

check = ImagePropertyDrift()#.add_condition_drift_score_less_than(0.1)
result = check.run(train_ds, test_ds)





In [173]:
result.save_as_html("car_truck_drift.html")
# for i in range(1,8):
#     result.display[i].data[-6]['name']='car Dataset'
#     result.display[i].data[-5]['name']='car Mean'
#     result.display[i].data[-4]['name']='car Median'
#     result.display[i].data[-3]['name']='truck Dataset'
#     result.display[i].data[-2]['name']='truck Mean'
#     result.display[i].data[-1]['name']='truck Median'


'car_truck_drift.html'

=> Dockerize requirements.txt, config.txt,.py in a function, args from command line

In [156]:
# import configparser
# config=configparser.ConfigParser()
# config["DeepChecks"]=check.config()
# config["Conditions"]={"Classes":"All_ classes","Drift_score_less_than":0.001}
# with open("config.ini",'w') as configfile:
#     config.write(configfile)

In [None]:
# def ImagePropertyDrift():
#     parser=argparse.ArgumentParser()
#     parser.add_atgument("--train_dir",help="path of training images",default="/project/datasets/DataSets/train/")
#     parser.add_atgument("--test_dir",help="path of testing images",default="/project/datasets/DataSets/val/")
#     parser.add_argument("--thres_drift",help"threshold drift score",default=0.001,type=float)
#     args=parser.parse_args()

In [63]:
# !unzip train.zip

In [164]:
os.chdir("/project")

## Image Dataset Drift

In [65]:
from deepchecks.vision.checks import ImageDatasetDrift 

In [66]:
check_data=ImageDatasetDrift()
result_datacheck=check_data.run(train,test)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [193]:
result_datacheck.save_as_html("dataset_check_car_truck.html")

'dataset_check_car_truck.html'

In [191]:
for i in range(5,8):
    result_datacheck.display[i].data[0]['name']='car Dataset'
    result_datacheck.display[i].data[1]['name']='car Mean'
    result_datacheck.display[i].data[2]['name']='car Median'
    result_datacheck.display[i].data[3]['name']='truck Dataset'
    result_datacheck.display[i].data[4]['name']='truck Mean'
    result_datacheck.display[i].data[5]['name']='truck Median'

In [34]:
# train_csv,test_csv={'images':train_images},{'images':test_images}
train_csv,test_csv={},{}

for label in labels:
    for key in intermediate_result[label].keys():
        train_csv['x_'+key+'_'+label]=intermediate_result[label][key]['Train Dataset']['x']
        train_csv['y_'+key+'_'+label]=intermediate_result[label][key]['Train Dataset']['Probability Density']      
        test_csv['x_'+key+'_'+label]=intermediate_result[label][key]['Train Dataset']['x']
        test_csv['y_'+key+'_'+label]=intermediate_result[label][key]['Train Dataset']['Probability Density']

In [59]:
df_train=pd.DataFrame(train_csv,index=None)
df_test=pd.DataFrame(test_csv,index=None)
# sum=0
# for label in labels:
#     sum+=len(train_csv['y_'+key+'_'+label])
# print(sum)

In [None]:
# df_train.to_csv("code/Ravi Gautam/train.csv")
# df_test.to_csv("code/Ravi Gautam/test.csv")

In [61]:
# keys=[]
# for key in train_csv.keys():
#     if 'x' in key.split('_'):
#         keys.append(key)
train=df_train[keys]
test=df_test[keys]

In [64]:
train=np.array(train)
test=np.array(test)

In [115]:
result.__sizeof__()

32

In [129]:
# dir(result)
# result.display

In [4]:
import os
len(os.listdir("Data folder/train/traffic"))

400