In [1]:
# Mount Google Drive inside the Colab runtime at /content/gdrive/ so that the
# cells below can reach the project folder stored there.
from google.colab import drive

drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
%cd gdrive/MyDrive/petrobras2

/content/gdrive/MyDrive/petrobras2


In [3]:
# List the contents of the current working directory.
!ls

data			imres_data_setup.py  imres_train.py
imres_data_setup.ipynb	imres_data_split.py  mask_rcnn_coco.h5


# ====================================================================
# **Projeto de Classificação da Qualidade de Perfis de Imagem LWD**

# Script para segregar as imagens em um set de treino e outro de teste

# Data: Outubro de 2020
# Autor: Ralph Piazza 
# ====================================================================

In [4]:
# import libraries

import pandas as pd
from os import listdir
import numpy as np
from sklearn.model_selection import KFold

In [27]:
# global definitions

# Directory whose entries are listed and read with pd.read_csv as per-image
# annotation files (trailing slash required: paths are built by concatenation).
ANNOT_DIR = 'data/annots/'
# Path prefix under which train_list.txt and test_list.txt are written.
SPLIT_DEST = 'data/'

# Relative deviation allowed between a class's share in the training set and
# its share in the whole data set for check_balance to accept a split.
TOLERANCE = 0.25

In [6]:
# check whether the attempted split generated balanced sets

def check_balance(train_img, df_img_info, fracs, tolerance=None):
    """Tell whether a candidate training set has a balanced class mix.

    The share of each class (1, 2 and 3) is measured as its summed
    ``BB_LENGTH`` over the selected images divided by the summed
    ``BB_LENGTH`` of all three classes over those images. The split is
    balanced when every share lies strictly inside
    ``((1 - tolerance) * target, (1 + tolerance) * target)``.

    Parameters
    ----------
    train_img : array-like of int
        0-based positions of the training images (as produced by
        ``KFold.split``). Position ``p`` refers to image number ``p + 1``.
    df_img_info : pandas.DataFrame
        Table indexed by image number with columns ``CLASS`` and
        ``BB_LENGTH`` (one row per image/class pair).
    fracs : sequence of float
        Target shares for classes 1, 2 and 3, in that order.
    tolerance : float, optional
        Relative tolerance. Defaults to the module-level ``TOLERANCE``.

    Returns
    -------
    bool
        True when all three class shares are within tolerance, else False.
        False is also returned when the selection carries no annotated
        length at all (nothing to balance).

    Notes
    -----
    Totals start at 0. The earlier version seeded each total with 1, which
    skewed the shares relative to the exact targets in ``fracs``.
    """
    tol = TOLERANCE if tolerance is None else tolerance

    # KFold hands back 0-based positions, image numbers start at 1.
    image_ids = np.asarray(train_img, dtype=int) + 1

    # Boolean selection instead of .loc[[id]]: an image without any annotation
    # row simply contributes nothing rather than raising KeyError.
    selected = df_img_info[df_img_info.index.isin(image_ids)]

    class_totals = [
        selected.loc[selected['CLASS'] == label, 'BB_LENGTH'].sum()
        for label in (1, 2, 3)
    ]
    grand_total = sum(class_totals)
    if grand_total == 0:
        # empty selection: avoid dividing by zero, report "not balanced"
        return False

    for class_total, target in zip(class_totals, fracs):
        share = class_total / grand_total
        if not ((1 - tol) * target < share < (1 + tol) * target):
            return False
    return True

In [7]:
# cycle through annotation files and consolidate the occurrence of each class

# Build one table holding the total bounding-box length per (image, class).
# Per-file frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and copied the whole table on
# every iteration.
per_image_frames = []
image_numbers = []

# sorted(): listdir returns entries in arbitrary order; sorting makes the
# run deterministic.
for annot_file in sorted(listdir(ANNOT_DIR)):
    if annot_file.startswith('.'):
        # hidden entries (e.g. checkpoint folders) are not annotation files
        continue

    df_annot_single = pd.read_csv(ANNOT_DIR + annot_file, decimal='.', sep=',', usecols=['CLASS', 'YMAX', 'YMIN'], index_col=False)
    # the first five characters of the file name carry the image number
    image_number = int(annot_file[0:5])
    image_numbers.append(image_number)

    # inclusive extent of each bounding box along Y
    df_annot_single['BB_LENGTH'] = df_annot_single['YMAX'] - df_annot_single['YMIN'] + 1
    # one row per class with the summed length for this image
    df_annot_single = df_annot_single.groupby('CLASS', as_index=False)['BB_LENGTH'].sum()
    df_annot_single.insert(0, 'IMAGE', image_number)

    per_image_frames.append(df_annot_single)

if not per_image_frames:
    raise FileNotFoundError('no annotation files found in ' + ANNOT_DIR)

# The split cell reads image_number as the highest image number, so pin it to
# the maximum instead of to whichever file happened to be read last.
image_number = max(image_numbers)

df_img_info = pd.concat(per_image_frames, ignore_index=True)

# Total length per class over the whole data set (columns: CLASS, BB_LENGTH).
df_class_info = df_img_info.groupby('CLASS', as_index=False)['BB_LENGTH'].sum()
df_img_info = df_img_info.set_index('IMAGE')

In [28]:
# select images to compose the train and test data sets

# Overall share of each class, looked up by class label rather than by row
# position so the three fractions stay aligned with classes 1, 2, 3 (the
# classes check_balance measures) even if one is absent from the annotations.
class_lengths = df_class_info.set_index('CLASS')['BB_LENGTH'].reindex([1, 2, 3], fill_value=0)
total_length = class_lengths.sum()
fracs = (class_lengths / total_length).to_numpy(dtype=float)
frac_1, frac_2, frac_3 = fracs

# Images are numbered 1..N. N is taken from the annotation table itself, not
# from the loop variable left behind by the consolidation cell, whose final
# value depends on the directory listing order.
last_image = int(max(df_img_info.index))
img_list = np.arange(1, last_image + 1)

# NOTE: the shuffle is unseeded, so each execution proposes different folds.
kfold = KFold(5, shuffle=True)
run = 0
for train_img, test_img in kfold.split(img_list):
    run += 1
    print(run)
    # check_balance expects the 0-based positions produced by KFold
    if check_balance(train_img, df_img_info, fracs):
        # shift positions to 1-based image numbers before writing them out
        train_img += 1
        test_img += 1
        np.savetxt(SPLIT_DEST + 'train_list.txt', train_img, fmt = '%4u')
        np.savetxt(SPLIT_DEST + 'test_list.txt', test_img, fmt = '%4u')
        print("saving files")
        break
else:
    # reached only when none of the folds passed the balance check
    print("no balanced split found; nothing saved")

1
saving files
