In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
from PIL import Image
import os
from pathlib import Path
import math

In [2]:
labels_df = pd.read_csv('../data/fundus/MuReD/remedial_train_data.csv')
image_file_path = Path('../data/fundus/MuReD/images/images')
remedial_image_file_path = Path('../data/fundus/MuReD/images/remedial')
da_image_path = Path('../data/fundus/MuReD/images/mlsmote_remedial')

print(len(labels_df))

2057


In [3]:
def get_tail_label(df):
    columns = df.columns[1:]
    n = len(columns)
    irpl = np.zeros(n)
    for column in range(n):
        irpl[column] = df[columns[column]].value_counts()[1]
    irpl = max(irpl)/irpl
    mir = np.average(irpl)
    tail_label = []
    for i in range(n):
        if irpl[i] > mir:
            tail_label.append(columns[i])
    return tail_label

def get_index(df):
  tail_labels = get_tail_label(df)
  index = set()
  for tail_label in tail_labels:
    sub_index = set(df[df[tail_label]==1].index)
    index = index.union(sub_index)
  return list(index)

def process_and_flatten_image(image_path):
    img = Image.open(image_path)
    img = img.resize((1024,1024))
    img = np.array(img)
    img = img.flatten()
    return img

def get_minority_instace(y, images_path, remedial_image_file_path=None):
    index = get_index(y)
    y = y[y.index.isin(index)].reset_index(drop = True)
    y_sub = y.iloc[:, 1:]
    x_sub = []
    for i in range(len(y)):
        if y.iloc[i,0].startswith('DA_'):
            image_path = remedial_image_file_path / f"{y.iloc[i,0]}.png" if os.path.exists(remedial_image_file_path / f"{y.iloc[i,0]}.png") else remedial_image_file_path / f"{y.iloc[i,0]}.tif"
        else:
            image_path = images_path / f"{y.iloc[i,0]}.png" if os.path.exists(images_path / f"{y.iloc[i,0]}.png") else images_path / f"{y.iloc[i,0]}.tif"
        x_sub.append(process_and_flatten_image(image_path))
    x_sub = np.array(x_sub)
    print(f"img resize to ({math.sqrt(x_sub.shape[1]/3)}, {math.sqrt(x_sub.shape[1]/3)})")
    return x_sub, y_sub



x_sub, y_sub = get_minority_instace(labels_df, image_file_path, remedial_image_file_path)
x_sub.shape, y_sub.shape

img resize to (1024.0, 1024.0)


((350, 3145728), (350, 20))

In [4]:
def nearest_neighbour(X):
    nbs=NearestNeighbors(n_neighbors=5,metric='euclidean',algorithm='kd_tree').fit(X)
    euclidean,indices= nbs.kneighbors(X)
    return indices

def MLSMOTE(X ,y, n_sample):
    indices2 = nearest_neighbour(X)
    n = len(indices2)
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0,n-1)
        neighbour = random.choice(indices2[reference,1:])
        all_point = indices2[reference]
        nn_df = y[y.index.isin(all_point)]
        ser = nn_df.sum(axis = 0, skipna = True)
        target[i] = np.array([1 if val>1 else 0 for val in ser])
        ratio = random.random()
        gap = X[reference,:] - X[neighbour,:]
        new_X[i] = np.array(X[reference,:] + ratio * gap)
        
    target = pd.DataFrame(target, columns=y.columns)
    return new_X, target


# p = 0.4
add_sample = int(len(labels_df) * 0.2)
print(f"add {add_sample} samples")
X_res,y_res = MLSMOTE(x_sub, y_sub, add_sample)
X_res.shape, y_res.shape

add 411 samples


((411, 3145728), (411, 20))

In [6]:
def save_new_images(X_res, y_res, labels_df, da_image_path):
    id_name = 1
    for i in range(len(y_res)):
        new_img_name = 'DA_DA_' + str(id_name)
        image_path = da_image_path / f"{new_img_name}.png"
        img = X_res[i].reshape(1024,1024,3)
        img = Image.fromarray(img.astype('uint8'))
        img.save(image_path)
        id_name += 1
        
        new_row = pd.DataFrame(columns = labels_df.columns)
        new_row.loc[0] = [new_img_name] + y_res.iloc[i,:].tolist()
        # print(new_row)
        labels_df = labels_df._append(new_row)
    
    labels_df.to_csv('../data/fundus/MuReD/mlsmote_remedial_train_data.csv', index = False)
    return labels_df
    
os.makedirs(da_image_path, exist_ok=True)
labels_df = save_new_images(X_res, y_res, labels_df, da_image_path)

In [7]:
counts = labels_df.sum(axis=0)
counts.to_dict()

{'ID': 'aria_c_25_1aria_c_7_2aria_c_38_2aria_c_2_8aria_c_26_15aria_c_20_18aria_c_27_3aria_c_47_2aria_c_31_20aria_c_31_2aria_c_39_a_10aria_c_32_2aria_c_32_21aria_c_5_4aria_c_26_2(0031)aria_c_37aria_c_35_27aria_c_9_6aria_c_2_3aria_c_35_26aria_c_6_5aria_c_24_2aria_c_40_a_6aria_c_1_7aria_c_22_1aria_c_46_12aria_c_9_1aria_c_36_44aria_c_20_10aria_c_21_3aria_c_35_2aria_c_5_1aria_c_21_12aria_c_30_2aria_c_38_42aria_c_1_8aria_c_22_9aria_c_6_2aria_c_41_15aria_c_4_1aria_c_4_8aria_c_24_12aria_c_44_12aria_c_20_1aria_c_34_1aria_c_45_2aria_c_45_12aria_c_23_4aria_c_44_3aria_d_13_22aria_d_3_2aria_d_18_21aria_d_16_22aria_d_8_2aria_d_24_6aria_d_15_22aria_d_22_22aria_d_2_6aria_d_5_22aria_d_21_21aria_d_7_22aria_d_8_23aria_d_4_31aria_d_7_17aria_d_18_2aria_d_12_2aria_d_3_32aria_d_20_17(0001)aria_d_27aria_d_22_2aria_d_15_2(0026)aria_d_27aria_d_25_2aria_d_5_2aria_d_3_33aria_d_20_2aria_d_17_12aria_d_12_20(0024)aria_d_30aria_d_4_5(0003)aria_d_29aria_d_9_24aria_d_16_2aria_d_2_31aria_d_17_2(0001)aria_d_26aria_d_13_2

In [None]:
import random
import numpy as np
from collections import Counter
from sklearn.neighbors import NearestNeighbors

def calculate_mean_ir(D, labels):
    irs = [calculate_ir_per_label(D, label) for label in labels]
    return np.mean(irs)

def calculate_ir_per_label(D, label):
    label_counts = Counter(label in instance['labels'] for instance in D)
    majority_count = max(label_counts.values())
    minority_count = label_counts[True] if True in label_counts else 0
    return majority_count / minority_count if minority_count > 0 else float('inf')

def get_all_instances_of_label(D, label):
    return [instance for instance in D if label in instance['labels']]

def new_sample(sample, ref_neigh, neighbors):
    synth_smpl = {'features': [], 'labels': []}
    
    for i, feat in enumerate(sample['features']):
        if isinstance(feat, (int, float)):
            diff = ref_neigh['features'][i] - sample['features'][i]
            offset = diff * random.uniform(0, 1)
            value = sample['features'][i] + offset
        else:
            values = [n['features'][i] for n in neighbors]
            value = Counter(values).most_common(1)[0][0]
        synth_smpl['features'].append(value)
    
    lbl_counts = Counter(sample['labels'])
    for neighbor in neighbors:
        lbl_counts.update(neighbor['labels'])
    
    labels = [label for label, count in lbl_counts.items() if count > (len(neighbors) + 1) / 2]
    synth_smpl['labels'] = labels
    
    return synth_smpl

def preprocess_dataset(D, k):
    labels = set(label for instance in D for label in instance['labels'])
    mean_ir = calculate_mean_ir(D, labels)
    
    for label in labels:
        ir_label = calculate_ir_per_label(D, label)
        if ir_label > mean_ir:
            min_bag = get_all_instances_of_label(D, label)
            features = [instance['features'] for instance in min_bag]
            nn = NearestNeighbors(n_neighbors=k).fit(features)
            distances, indices = nn.kneighbors(features)
            
            for i, sample in enumerate(min_bag):
                neighbors = [min_bag[idx] for idx in indices[i]]
                ref_neigh = random.choice(neighbors)
                synth_smpl = new_sample(sample, ref_neigh, neighbors)
                D.append(synth_smpl)
    
    return D

# Example usage:
# dataset = [{'features': [0.1, 0.2], 'labels': ['A']}, {'features': [0.4, 0.5], 'labels': ['B']}, ...]
# k = 5
# preprocessed_dataset = preprocess_dataset(dataset, k)
