In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
from PIL import Image
import os
from pathlib import Path

In [2]:
# Label table read from the MuReD training CSV.
# NOTE(review): the helpers below skip the first column and treat the rest as
# binary labels; presumably the first column is the image id — confirm.
labels_df = pd.read_csv('../data/fundus/MuReD/train_data.csv')
# Directory the source images are read from.
image_path = Path('../data/fundus/MuReD/images/images')
# Directory the synthetic (MLSMOTE) images are written to.
da_image_path = Path('../data/fundus/MuReD/images/mlsmote')

In [3]:
def get_tail_label(df):
    """Return the under-represented ("tail") label columns of ``df``.

    The first column of ``df`` is skipped; every remaining column is treated
    as a binary label. For each label the imbalance ratio is
    ``max_count / count``, where ``count`` is the number of rows equal to 1
    and ``max_count`` is the largest such count. A label is a tail label when
    its ratio is strictly greater than the mean ratio.

    Labels with no positive row at all are ignored: they have no instance
    that could be oversampled, and they would otherwise cause a lookup error
    or a division by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns after the first hold 0/1 label indicators.

    Returns
    -------
    list
        Names of the tail label columns, in column order. Empty when there
        are no label columns or no label has a positive row.
    """
    label_columns = df.columns[1:]
    if len(label_columns) == 0:
        return []

    # Count positives per label in one vectorised pass; unlike
    # ``value_counts()[1]`` this yields 0 (not KeyError) for absent labels.
    positives = df[label_columns].eq(1).sum(axis=0).to_numpy(dtype=float)
    present = positives > 0
    if not present.any():
        return []

    # Imbalance ratio per label; NaN marks labels without positives so that
    # they never compare greater than the mean below.
    irpl = np.full(len(label_columns), np.nan)
    irpl[present] = positives[present].max() / positives[present]
    mir = irpl[present].mean()

    return [column for column, ratio in zip(label_columns, irpl) if ratio > mir]

def get_index(df):
    """Collect the index labels of all rows that carry at least one tail label.

    The tail labels come from ``get_tail_label(df)``; a row qualifies when its
    value in any of those columns equals 1.

    Returns a list of index labels without duplicates.
    """
    collected = set()
    for label in get_tail_label(df):
        positive_rows = df.index[df[label] == 1]
        collected = collected.union(set(positive_rows))
    return list(collected)

def process_and_flatten_image(image_path, size=(256, 256)):
    """Load an image, resize it and return its pixels as a flat 1-D array.

    The image is converted to RGB before resizing so that every file,
    whatever mode it is stored in (greyscale, palette, RGBA, ...), produces a
    vector of the same length ``size[0] * size[1] * 3``. Without this, images
    with a different channel count yield vectors of a different length and
    cannot be stacked into one matrix or reshaped back to ``(h, w, 3)``.

    Parameters
    ----------
    image_path : str or path-like
        Location of the image file.
    size : tuple of int, optional
        Target ``(width, height)`` passed to ``Image.resize``. Defaults to
        ``(256, 256)``.

    Returns
    -------
    numpy.ndarray
        1-D array of the resized RGB pixel values.
    """
    # The context manager closes the underlying file once the pixels have
    # been read; ``Image.open`` alone keeps the file handle open.
    with Image.open(image_path) as img:
        resized = img.convert('RGB').resize(size)
        return np.array(resized).flatten()

def get_minority_instace(y, images_path):
    """Build the feature matrix and label frame for the minority samples.

    Rows of ``y`` selected by ``get_index`` are kept and re-indexed from 0.
    For each kept row the image named by the first column is looked up in
    ``images_path`` as ``<name>.png``; when that file does not exist the
    ``<name>.tif`` path is used instead. Each image is loaded through
    ``process_and_flatten_image``.

    Returns
    -------
    tuple
        ``(x_sub, y_sub)`` where ``x_sub`` is a numpy array with one flattened
        image per row and ``y_sub`` holds every column of the kept rows except
        the first.
    """
    minority_rows = y.loc[y.index.isin(get_index(y))].reset_index(drop=True)

    flattened = []
    for row in range(len(minority_rows)):
        image_name = minority_rows.iloc[row, 0]
        candidate = images_path / f"{image_name}.png"
        if not os.path.exists(candidate):
            candidate = images_path / f"{image_name}.tif"
        flattened.append(process_and_flatten_image(candidate))

    return np.array(flattened), minority_rows.iloc[:, 1:]



# Load the flattened images and label columns of the rows that carry a tail label.
x_sub, y_sub = get_minority_instace(labels_df, image_path)
# Show both shapes as a quick sanity check.
x_sub.shape, y_sub.shape

((350, 196608), (350, 20))

In [23]:
def nearest_neighbour(X, n_neighbors=5):
    """Return, for every row of ``X``, the indices of its nearest rows in ``X``.

    Neighbours are found with Euclidean distance. Because ``X`` is queried
    against itself, the first index of each row is typically the row itself.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points to index and query.
    n_neighbors : int, optional
        Number of neighbours returned per row (including the row itself).
        Defaults to 5.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_samples, n_neighbors)``.
    """
    # NOTE(review): a kd-tree gives little benefit for very high-dimensional
    # inputs; 'brute' may be faster here — kept as is to leave results unchanged.
    model = NearestNeighbors(
        n_neighbors=n_neighbors, metric='euclidean', algorithm='kd_tree'
    ).fit(X)
    _, indices = model.kneighbors(X)
    return indices

def MLSMOTE(X, y, n_sample, random_state=None):
    """Generate synthetic multi-label samples by interpolating between neighbours.

    For each synthetic sample a reference row is drawn at random, one of its
    nearest neighbours is drawn at random, and the new feature vector is
    placed on the segment between the two:
    ``reference + ratio * (neighbour - reference)`` with ``ratio`` in [0, 1).
    The new label vector sets a label to 1 when more than one row of the
    reference's neighbourhood (the reference included) carries it.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
        Feature matrix of the minority samples.
    y : pandas.DataFrame of shape (n_samples, n_labels)
        Binary label indicators, row-aligned with ``X`` by position.
    n_sample : int
        Number of synthetic samples to generate.
    random_state : int or None, optional
        Seed for a private random generator. When ``None`` (default) the
        module-level ``random`` state is used, as before.

    Returns
    -------
    tuple
        ``(new_X, target)``: a float array of shape ``(n_sample, n_features)``
        and a DataFrame of shape ``(n_sample, n_labels)`` with ``y``'s columns.
    """
    rng = random.Random(random_state) if random_state is not None else random

    indices2 = nearest_neighbour(X)
    n = len(indices2)
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))

    for i in range(n_sample):
        reference = rng.randint(0, n - 1)
        neighbourhood = indices2[reference]
        # Convert to a plain list: random.choice on a numpy array is not
        # reliable across Python versions.
        neighbour = rng.choice(neighbourhood[1:].tolist())

        # Neighbour indices are positions, so select rows by position rather
        # than by index label.
        votes = y.iloc[neighbourhood].sum(axis=0, skipna=True)
        # NOTE(review): a label is kept when at least two neighbourhood rows
        # carry it (> 1); confirm this threshold is the intended vote rule.
        target[i] = (votes.to_numpy() > 1).astype(float)

        ratio = rng.random()
        # Work in float: subtracting unsigned integer data (e.g. uint8 pixels)
        # wraps around instead of producing negative differences.
        reference_vec = X[reference, :].astype(np.float64)
        neighbour_vec = X[neighbour, :].astype(np.float64)
        # Interpolate towards the neighbour so the sample stays between the
        # two points instead of being pushed away from the neighbour.
        new_X[i] = reference_vec + ratio * (neighbour_vec - reference_vec)

    target = pd.DataFrame(target, columns=y.columns)
    return new_X, target


# Seed the module-level RNG that MLSMOTE draws from so that a fresh
# "Restart & Run All" regenerates the same synthetic samples.
random.seed(42)
# Number of synthetic samples to generate.
N_SYNTHETIC_SAMPLES = 600
X_res, y_res = MLSMOTE(x_sub, y_sub, N_SYNTHETIC_SAMPLES)
# Show both shapes as a quick sanity check.
X_res.shape, y_res.shape

((600, 196608), (600, 20))

In [25]:
def save_new_images(X_res, y_res, labels_df, da_image_path,
                    csv_path='../data/fundus/MuReD/mlsmote_train_data.csv'):
    """Write synthetic images to disk and save the extended label table.

    Row ``i`` of ``X_res`` is reshaped to ``(256, 256, 3)`` and saved as
    ``DA_<i+1>.png`` inside ``da_image_path``. One label row per image is
    appended to ``labels_df`` and the combined table is written to
    ``csv_path`` without the index. The caller's ``labels_df`` is not modified.

    Parameters
    ----------
    X_res : numpy.ndarray
        One flattened image per row; each row must hold 256 * 256 * 3 values.
    y_res : pandas.DataFrame
        Label rows for the images, one column per label column of ``labels_df``.
    labels_df : pandas.DataFrame
        Existing label table; its first column receives the new image names.
    da_image_path : pathlib.Path or str
        Output directory for the images; created when missing.
    csv_path : str or path-like, optional
        Destination of the combined label CSV. The default is the location
        the function has always written to.
    """
    da_image_path = Path(da_image_path)
    da_image_path.mkdir(parents=True, exist_ok=True)

    new_names = []
    for i in range(len(y_res)):
        new_img_name = 'DA_' + str(i + 1)
        # Round and clip before the uint8 cast: a bare cast wraps values
        # outside [0, 255] around instead of saturating them.
        pixels = np.clip(np.rint(X_res[i]), 0, 255).astype('uint8')
        Image.fromarray(pixels.reshape(256, 256, 3)).save(
            da_image_path / f"{new_img_name}.png"
        )
        new_names.append(new_img_name)

    if new_names:
        label_columns = labels_df.columns[1:]
        new_rows = pd.DataFrame(y_res.to_numpy(), columns=label_columns)
        # Keep the label dtypes of the existing table so that new rows are
        # written like the original ones (e.g. 1 rather than 1.0).
        new_rows = new_rows.astype(labels_df.dtypes.iloc[1:].to_dict())
        new_rows.insert(0, labels_df.columns[0], new_names)
        # Build all rows first and concatenate once; appending row by row
        # copies the whole table on every iteration.
        combined = pd.concat([labels_df, new_rows], ignore_index=True)
    else:
        combined = labels_df

    combined.to_csv(csv_path, index=False)
    
    
# Write the synthetic images to da_image_path and save the extended label CSV.
save_new_images(X_res, y_res, labels_df, da_image_path)