In [1]:
import pandas as pd
import os
from PIL import Image
from sklearn.model_selection import train_test_split

In [2]:
def prepare_data(df,
                 root='/kaggle/input/endoscopic-bladder-tissue-classification-dataset/EndoscopicBladderTissue',
                 tissue_types=('HGC', 'LGC', 'NST', 'NTL')):
    """Turn the file names in the ``HLY`` column into full image paths.

    Every row whose ``tissue type`` is one of ``tissue_types`` gets its
    ``HLY`` value joined onto ``<root>/<tissue type>``. Rows with any other
    tissue type keep their ``HLY`` value unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table with at least the columns ``'tissue type'`` and
        ``'HLY'``.
    root : str, optional
        Directory that contains one sub-directory per tissue type. Defaults
        to the Kaggle dataset location used throughout this notebook.
    tissue_types : iterable of str, optional
        Tissue-type labels that have a sub-directory under ``root``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the ``HLY`` column is replaced
        in place, so the caller's frame is modified as well.
    """
    # One directory per class, built with '/' exactly like the original literals.
    class_dirs = {tissue: root + '/' + tissue for tissue in tissue_types}

    # Build the whole column positionally in one pass instead of writing cell
    # by cell through index labels (slow, and ambiguous with duplicate labels).
    df['HLY'] = [
        os.path.join(class_dirs[tissue], file_name) if tissue in class_dirs else file_name
        for tissue, file_name in zip(df['tissue type'], df['HLY'])
    ]

    return df

In [3]:
def get_problematic_image_paths_only(data):
    """
    Find the images referenced in ``data['HLY']`` that cannot be loaded.

    Each path is opened with PIL and converted to RGB; any ordinary exception
    raised along the way marks the image as problematic.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``'HLY'`` column holding image paths.

    Returns
    -------
    tuple of (list, list)
        ``(problematic_paths, problematic_idx)``: the failing paths and the
        index labels of their rows, in row order.
    """

    problematic_paths = []
    problematic_idx = []

    for idx, img_path in data['HLY'].items():
        try:
            # The context manager closes the file handle opened by Image.open
            # even when decoding fails; previously only the converted copy
            # was closed and the original handle leaked.
            with Image.open(img_path) as image:
                # convert() forces the pixel data to be read, so truncated or
                # corrupt files fail here rather than later during training.
                image.convert('RGB')
        except Exception:
            # Best-effort scan: any load failure marks the image as bad, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            problematic_paths.append(img_path)
            problematic_idx.append(idx)

    return problematic_paths, problematic_idx

In [4]:
# Annotation table of the Kaggle endoscopic bladder tissue dataset.
ann_csv = "/kaggle/input/endoscopic-bladder-tissue-classification-dataset/EndoscopicBladderTissue/annotations.csv"

ann_df = pd.read_csv(ann_csv)

# prepare_data edits its argument in place and returns it, so from here on
# `df` and `ann_df` are two names for the same DataFrame.
df = prepare_data(ann_df)
# Find rows whose image cannot be opened and remove them. Because of the
# aliasing above, the in-place drop removes those rows from `ann_df` as well.
prob_path, prob_idx = get_problematic_image_paths_only(df)
df.drop(index=prob_idx, inplace=True)

# Splits as labelled in the 'sub_dataset' column of the annotations.
train = df[df['sub_dataset'] == 'train']
test = df[df['sub_dataset'] == 'test']
valid = df[df['sub_dataset'] == 'val']

In [5]:
# Share of each tissue type, in percent, over the annotation frame.
# NOTE: ann_df is the same object as df (prepare_data returns its argument),
# so rows dropped for unreadable images are already excluded here.
(ann_df['tissue type'].value_counts() / len(ann_df) * 100).round(2)

tissue type
LGC    37.68
HGC    27.32
NST    27.20
NTL     7.80
Name: count, dtype: float64

In [6]:
# Share of each tissue type, in percent, within the dataset-provided train split.
(train['tissue type'].value_counts() / len(train) * 100).round(2)

tissue type
LGC    39.38
NST    28.27
HGC    25.25
NTL     7.11
Name: count, dtype: float64

In [7]:
# Share of each tissue type, in percent, within the dataset-provided test split.
(test['tissue type'].value_counts() / len(test) * 100).round(2)

tissue type
HGC    39.15
LGC    28.04
NST    19.58
NTL    13.23
Name: count, dtype: float64

In [8]:
# Share of each tissue type, in percent, within the dataset-provided validation split.
(valid['tissue type'].value_counts() / len(valid) * 100).round(2)

tissue type
LGC    36.84
HGC    28.29
NST    27.63
NTL     7.24
Name: count, dtype: float64

In [9]:
# Persist the dataset-provided splits; index=False keeps the row index out of the files.
for split_file, split_frame in (('train.csv', train), ('test.csv', test), ('valid.csv', valid)):
    split_frame.to_csv(split_file, index=False)

In [10]:
# Re-split the cleaned frame with stratification on the tissue type.
# First hold out 20% as the test set ...
train_val_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['tissue type'],
)
# ... then take 20% of the remaining 80% as validation
# (overall 64% train / 16% validation / 20% test).
train_df, valid_df = train_test_split(
    train_val_df,
    test_size=0.2,
    random_state=42,
    stratify=train_val_df['tissue type'],
)

In [11]:
# Persist the stratified splits; index=False keeps the row index out of the files.
for split_file, split_frame in (
    ('train_split.csv', train_df),
    ('test_split.csv', test_df),
    ('valid_split.csv', valid_df),
):
    split_frame.to_csv(split_file, index=False)

In [12]:
# Share of each tissue type, in percent, within the stratified train split.
(train_df['tissue type'].value_counts() / len(train_df) * 100).round(2)

tissue type
LGC    37.61
HGC    27.32
NST    27.23
NTL     7.83
Name: count, dtype: float64

In [13]:
# Share of each tissue type, in percent, within the stratified test split.
(test_df['tissue type'].value_counts() / len(test_df) * 100).round(2)

tissue type
LGC    37.79
HGC    27.33
NST    27.03
NTL     7.85
Name: count, dtype: float64

In [14]:
# Share of each tissue type, in percent, within the stratified validation split.
(valid_df['tissue type'].value_counts() / len(valid_df) * 100).round(2)

tissue type
LGC    37.82
HGC    27.27
NST    27.27
NTL     7.64
Name: count, dtype: float64