In [None]:
!pip install --upgrade seaborn

In [None]:
import numpy as np, pandas as pd
from glob import glob
import shutil, os
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
from tqdm.notebook import tqdm
import seaborn as sns
import torch
from IPython.display import Image, clear_output  # Hiển thị ảnh

# Image resolution of the input dataset to use. It is interpolated into the
# dataset folder name and decides the image file extension further down.
dim = 512  # one of: 512, 256, 'original'
# Index of the fold that is held out as the validation split (fold indices start at 0).
fold = 4

In [None]:
# Read the annotation table (train.csv) that ships with the image dataset selected by `dim`.
train_csv_path = f'../input/vinbigdata-{dim}-image-dataset/vinbigdata/train.csv'
data = pd.read_csv(train_csv_path)

In [None]:
# Build the absolute path of every annotated image.
# The extension is '.jpg' only for the 'original' dataset and '.png' otherwise.
image_ext = '.jpg' if dim == 'original' else '.png'
image_dir = f'/kaggle/input/vinbigdata-{dim}-image-dataset/vinbigdata/train/'
data['image_path'] = image_dir + data.image_id + image_ext
data.head()

# Tiền xử lý dữ liệu

In [None]:
# Keep only the 14 finding classes: rows whose class_id is 14 ("no finding") are removed,
# then the index is rebuilt so it runs 0..n-1 again.
has_finding = data.class_id != 14
data = data.loc[has_finding].reset_index(drop=True)

In [None]:
# Express box corners as fractions of the image width/height, then derive the
# box centre, size and area from the normalised corners.
# Plain column arithmetic replaces the row-wise `apply(axis=1)` lambdas: same values,
# computed in one vectorised pass per column instead of one Python call per row.
# NOTE: this cell rewrites x_min/y_min/x_max/y_max in place, so running it twice
# divides the coordinates twice -- reload `data` before re-running.
data['x_min'] = data['x_min'] / data['width']
data['y_min'] = data['y_min'] / data['height']

data['x_max'] = data['x_max'] / data['width']
data['y_max'] = data['y_max'] / data['height']

data['x_mid'] = (data['x_max'] + data['x_min']) / 2
data['y_mid'] = (data['y_max'] + data['y_min']) / 2

data['w'] = data['x_max'] - data['x_min']
data['h'] = data['y_max'] - data['y_min']

data['area'] = data['w'] * data['h']
data.head()

In [None]:
# Geometric box descriptors used as the feature matrix; class_id is kept separately as the label.
features = [
    'x_min', 'y_min', 'x_max', 'y_max',
    'x_mid', 'y_mid',
    'w', 'h', 'area',
]
X = data.loc[:, features]
y = data.class_id
X.shape, y.shape

In [None]:
# Human-readable class names; the position in the list is the class_id (0-13).
classes = [
    'Aortic enlargement',   # 0
    'Atelectasis',          # 1
    'Calcification',        # 2
    'Cardiomegaly',         # 3
    'Consolidation',        # 4
    'ILD',                  # 5
    'Infiltration',         # 6
    'Lung Opacity',         # 7
    'Nodule/Mass',          # 8
    'Other lesion',         # 9
    'Pleural effusion',     # 10
    'Pleural thickening',   # 11
    'Pneumothorax',         # 12
    'Pulmonary fibrosis',   # 13
]

# t-SNE Visualization

In [None]:
%%time
from sklearn.manifold import TSNE

# Embed the box descriptors in 2-D with t-SNE (random_state fixed to 1).
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` -- confirm
# against the installed version before upgrading.
tsne = TSNE(
    n_components=2,
    perplexity=40,
    random_state=1,
    n_iter=5000,
)
data_X = X
# Labels selected by the feature frame's index so both stay row-aligned
data_y = y.loc[data_X.index]
embs = tsne.fit_transform(data_X)
# Split the embedding into its two coordinates for plotting
plot_x, plot_y = embs[:, 0], embs[:, 1]

In [None]:
import matplotlib.pyplot as plt
# Scatter plot of the t-SNE embedding: one point per box, coloured by class id.
plt.figure(figsize=(15, 15))
plt.axis('off')
scatter = plt.scatter(plot_x, plot_y, marker='o', s=50, c=data_y.tolist(), alpha=0.5, cmap='viridis')
# legend_elements() defaults to num="auto", which returns only a subset of handles
# when there are many distinct values; pairing that subset with the full `classes`
# list attaches the wrong names to the colours. num=None yields exactly one handle
# per distinct class id, in ascending order, so labels are looked up for those ids.
handles, _ = scatter.legend_elements(num=None)
present_ids = sorted(set(data_y.tolist()))
plt.legend(handles=handles, labels=[classes[int(class_id)] for class_id in present_ids]);

# Chia tỉ lệ

In [None]:
# Assign every annotation row to one of 5 folds. Rows are grouped by image_id,
# so all boxes of an image land in the same fold.
gkf = GroupKFold(n_splits=5)
data['fold'] = -1
# The loop variable is deliberately NOT called `fold`: that name holds the validation
# fold chosen in the configuration cell, and reusing it here would silently overwrite
# it with the last fold index.
for fold_idx, (train_idx, val_idx) in enumerate(gkf.split(data, groups=data.image_id.tolist())):
    # split() yields row positions; translate them to index labels before using .loc
    data.loc[data.index[val_idx], 'fold'] = fold_idx
data.head()

In [None]:
# Unique image paths per split: images of the selected fold are used for validation,
# all remaining images for training.
is_val = data.fold == fold
val_files = list(data.loc[is_val, 'image_path'].unique())
train_files = list(data.loc[~is_val, 'image_path'].unique())
len(train_files), len(val_files)

# Sao chép tệp

In [None]:
label_dir = '/kaggle/input/vinbigdata-yolo-labels-dataset/labels'
working_root = '/kaggle/working/vinbigdata'

# Build the images/<split> and labels/<split> folders under the working directory and
# fill them. train_files / val_files hold each image path once (they were built with
# .unique()), so an image with several boxes is copied a single time.
# One loop over both splits replaces the two copy-pasted loops.
for split, files in (('train', train_files), ('val', val_files)):
    image_out = os.path.join(working_root, 'images', split)
    label_out = os.path.join(working_root, 'labels', split)
    os.makedirs(image_out, exist_ok=True)
    os.makedirs(label_out, exist_ok=True)
    for file in tqdm(files, desc=split):
        shutil.copy(file, image_out)
        # The label file is named after the image: base name up to the first dot + '.txt'
        filename = file.split('/')[-1].split('.')[0]
        shutil.copy(os.path.join(label_dir, filename + '.txt'), label_out)

# YOLOv5 Stuff

In [None]:
from os import listdir
from os.path import isfile, join
import yaml

cwd = '/kaggle/working/'

# Write train.txt / val.txt: one copied image path per line for each split.
for split in ('train', 'val'):
    with open(join(cwd, f'{split}.txt'), 'w') as f:
        for path in glob(f'/kaggle/working/vinbigdata/images/{split}/*'):
            f.write(path + '\n')

# Dataset description written to vinbigdata.yaml. It gets its own name instead of
# reusing `data`, which would replace the annotation DataFrame with a dict.
yaml_config = dict(
    train=join(cwd, 'train.txt'),
    val=join(cwd, 'val.txt'),
    nc=len(classes),
    names=classes,
)

with open(join(cwd, 'vinbigdata.yaml'), 'w') as outfile:
    yaml.dump(yaml_config, outfile, default_flow_style=False)

# Echo the generated file; the context manager closes the handle, which the
# previous bare open() never did.
with open(join(cwd, 'vinbigdata.yaml'), 'r') as f:
    print('\nyaml:')
    print(f.read())

In [None]:
# Copy the YOLOv5 sources into the writable working directory and run from there.
yolo_src = '/kaggle/input/yolov5-official-v31-dataset/yolov5'
yolo_dst = '/kaggle/working/yolov5'
# shutil.copytree raises if the destination already exists, so skip the copy
# when the cell is re-run in the same session.
if not os.path.exists(yolo_dst):
    shutil.copytree(yolo_src, yolo_dst)
os.chdir(yolo_dst)

clear_output()
# Report the torch version and the first CUDA device's properties, or 'CPU' when CUDA is unavailable.
device_info = torch.cuda.get_device_properties(0) if torch.cuda.is_available() else 'CPU'
print('Setup complete. Using torch %s %s' % (torch.__version__, device_info))

# Train

In [None]:
# Launch YOLOv5's train.py on /kaggle/working/vinbigdata.yaml, starting from the yolov5x.pt
# weights, with --img 640, --batch 16, --epochs 30 and --cache; WANDB_MODE is set to "dryrun"
# for this command. The relative path to train.py is resolved against the current working directory.
!WANDB_MODE="dryrun" python train.py --img 640 --batch 16 --epochs 30 --data /kaggle/working/vinbigdata.yaml --weights yolov5x.pt --cache

# Trực quan hóa dự đoán

In [None]:
# 3x2 grid of images saved under runs/train/exp: one row per test batch (0-2),
# left column = the *_labels.jpg image, right column = the matching *_pred.jpg image.
fig, ax = plt.subplots(3, 2, figsize=(2*5, 3*5), constrained_layout=True)
for row, (label_ax, pred_ax) in enumerate(ax):
    label_ax.imshow(plt.imread(f'runs/train/exp/test_batch{row}_labels.jpg'))
    label_ax.set_title(f'runs/train/exp/test_batch{row}_labels.jpg - Vùng tin tưởng', fontsize=12)

    pred_ax.imshow(plt.imread(f'runs/train/exp/test_batch{row}_pred.jpg'))
    pred_ax.set_title(f'runs/train/exp/test_batch{row}_pred.jpg - Hộp dự đoán', fontsize=12)

    # Images need no axis ticks
    for panel in (label_ax, pred_ax):
        panel.set_xticks([])
        panel.set_yticks([])