In [None]:
# import libraries
import numpy as np
import pandas as pd
import os
# Work from the parent of the directory the kernel started in. The target is
# resolved only once, so re-running this cell does not climb one level higher
# on every execution (a bare relative chdir("..") is not idempotent).
if "_PROJECT_ROOT" not in globals():
    _PROJECT_ROOT = os.path.abspath("..")
os.chdir(_PROJECT_ROOT)

## Load real dataset

In [None]:
# CSV describing the real images; the path is relative to the current working directory.
real_data_path = (
    "datasets/eye2gene_new_filepaths/"
    "all_baf_valid_50deg_filtered_train_0_edited.csv"
)

In [None]:
# Load the real-image table and keep only the rows whose gene is listed in
# classes.txt (read as one class name per line).
real_df = pd.read_csv(real_data_path)
with open("classes.txt") as class_file:
    classes = class_file.read().splitlines()
gene_is_known = real_df["gene"].isin(classes)
real_df = real_df[gene_is_known]

In [None]:
# Quick summary of the filtered real dataset: row count and column labels.
print(f"Length of dataset = {len(real_df)}")
print("Keys = ", real_df.columns)

In [None]:
# Preview the first rows of the real dataset.
real_df.head()

## Generate Dummy Synthetic Datasets (generator cells currently commented out)

In [None]:
# from PIL import Image

# # fixed images per class
# n_images_per_class = 100
# os.makedirs("synthetic_datasets/dummy_synthetic_{}".format(n_images_per_class), exist_ok=True)

# np.random.seed(1399)
# df = pd.DataFrame(columns=["file.path", "gene"])
# for i, gene in enumerate(classes):
#     for j in range(n_images_per_class):
#         img = np.random.randint(low=0, high=255, size=(512, 512))
#         img = Image.fromarray(np.uint8(img))
        
#         fname = "synthetic_datasets/dummy_synthetic_{}/{}_image_{}.png".format(n_images_per_class, gene, j)
#         fname = os.path.abspath(fname)
#         row = pd.DataFrame(data=[[fname, gene]], columns=["file.path", "gene"])
#         df = df.append(row)
        
#         # save images
#         img.save(fname)
# df.to_csv("synthetic_datasets/dummy_synthetic_{}/generated_examples.csv".format(n_images_per_class), index=False)

In [None]:
# from tqdm import tqdm
# from concurrent.futures import ThreadPoolExecutor

# classes2, class_sizes = np.unique(real_df.gene, return_counts=True)
# sizes_dict = dict(zip(classes2, class_sizes))
# largest_class = np.max(class_sizes)
# differences = {c:largest_class - sizes_dict[c] for c in classes}
# class_repeats = np.repeat(classes, list(differences.values()))

# dset_path = os.path.abspath("synthetic_datasets/dummy_rebalanced/")
# os.makedirs(dset_path, exist_ok=True)
# df = pd.DataFrame(columns=["file.path", "gene"])

# np.random.seed(1399)
# for i, c in tqdm(enumerate(class_repeats)):
#     img = np.random.randint(low=0, high=255, size=(512, 512))
#     img = Image.fromarray(np.uint8(img))
#     fname = os.path.join(dset_path, "{}_img_{}.png".format(c, i))
#     row = pd.DataFrame(data=[[fname, c]], columns=["file.path", "gene"])
#     df = df.append(row)

#     # save images
#     img.save(fname)
# df.to_csv(os.path.join(dset_path, "generated_examples.csv"), index=False)

## Load Synthetic Dataset

In [None]:
# Read the generated-examples table of the rebalanced dummy dataset and
# preview its first rows.
synthetic_data_path = (
    "synthetic_datasets/dummy_rebalanced/"
    "generated_examples.csv"
)
synthetic_df = pd.read_csv(synthetic_data_path)
# Disabled adjustments, kept for reference:
# synthetic_df["file.path"] = list(map(os.path.abspath, "synthetic_datasets/"+synthetic_df["file.path"]))
# synthetic_df.gene = class_repeats
synthetic_df.head()

In [None]:
# Quick summary of the synthetic dataset: row count and column labels.
print(f"Length of dataset = {len(synthetic_df)}")
print("Keys = ", synthetic_df.columns)

In [None]:
# Combine real and synthetic rows with an outer join. No `on=` is given, so
# pandas joins on the columns the two frames have in common; columns present
# in only one frame are filled with NaN for rows coming from the other.
# NOTE(review): if plain row-wise stacking is the intent, pd.concat would state
# it more directly (an outer merge also reorders rows by key) - confirm.
combined_dataset = pd.merge(real_df, synthetic_df, how="outer")

In [None]:
# Display the combined real + synthetic table.
combined_dataset

In [None]:
# Class balance check: returns (unique gene labels, number of rows per label).
np.unique(combined_dataset.gene, return_counts=True)

In [None]:
# Persist the combined table. The output folder is created first because
# to_csv does not create missing parent directories and would raise instead.
os.makedirs("datasets/syntheye", exist_ok=True)
combined_dataset.to_csv("datasets/syntheye/real+dummy3600.csv", index=False)

## Synthetic + Real Combined (For Clinical Graders)

In [None]:
# Inputs for the clinical-grader study: three genes, the real-image table and
# the StyleGAN2 generated-examples table.
# Note: this cell rebinds real_data_path, synthetic_data_path, real_df and
# synthetic_df, replacing the values assigned by the earlier cells.
selected_genes = ["ABCA4", "PRPH2", "BEST1"]
real_data_path = "datasets/eye2gene/all_baf_valid_50deg_filtered3.csv"
synthetic_data_path = "synthetic_datasets/stylegan2_synthetic_100perclass/generated_examples.csv"

# Real images restricted to the selected genes.
real_df = pd.read_csv(real_data_path)
real_df = real_df[real_df["gene"].isin(selected_genes)]

# Synthetic images: prefix the stored paths with "synthetic_datasets/" and make
# them absolute (resolved against the current working directory), then apply
# the same gene filter.
synthetic_df = pd.read_csv(synthetic_data_path)
prefixed_paths = "synthetic_datasets/" + synthetic_df["file.path"]
synthetic_df["file.path"] = [os.path.abspath(p) for p in prefixed_paths]
synthetic_df = synthetic_df[synthetic_df["gene"].isin(selected_genes)]

In [None]:
# Build the three sheets of the blinded grading study.
#   image_paths : ID -> path of the image shown to the graders
#   graders_df  : ID, gene and an empty column for the grader's verdict
#   answers_df  : ID, gene and the ground truth ("Real" or "Fake")
# For each selected gene, n_samples_wanted_per_class images are drawn; each one
# comes from the synthetic set with probability `threshold`, otherwise from the
# real set.
#
# Rows are collected in plain lists and turned into DataFrames once at the end:
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row is
# quadratic anyway.
np.random.seed(1399)
n_samples_wanted_per_class = 100
threshold = 0.5

path_records = []
grader_records = []
answer_records = []
for j, gene in enumerate(selected_genes):
    # Per-gene candidate pools, filtered once per gene instead of once per draw.
    pools = {
        "Fake": synthetic_df[synthetic_df.gene == gene],
        "Real": real_df[real_df.gene == gene],
    }
    for i in range(n_samples_wanted_per_class):
        # Order of random draws is unchanged: one rand() to pick the source,
        # then one sample() from that source (both use NumPy's global state).
        label = "Fake" if np.random.rand() < threshold else "Real"
        # NOTE: every draw samples from the full pool, so the same image can be
        # selected more than once.
        sample_row = pools[label].sample()

        sample_id = "{:03}".format(n_samples_wanted_per_class * j + i)
        # str() keeps every cell a string, as the previous NumPy-array based
        # row construction did.
        sample_gene = str(sample_row["gene"].item())
        path_records.append([sample_id, str(sample_row["file.path"].item())])
        grader_records.append([sample_id, sample_gene, ""])
        answer_records.append([sample_id, sample_gene, label])

# Frames are created with a fresh 0..n-1 index, so no reset_index is needed.
image_paths = pd.DataFrame(path_records, columns=["ID", "Image Path"])
graders_df = pd.DataFrame(grader_records, columns=["ID", "gene", "Predict (Real/Unsure/Fake)"])
answers_df = pd.DataFrame(answer_records, columns=["ID", "gene", "Actual (Real/Unsure/Fake)"])

In [None]:
# Display the ID -> image path table.
image_paths

In [None]:
# Display the sheet handed to the graders (prediction column left blank).
graders_df

In [None]:
# Display the answer key ("Real" / "Fake" per ID).
answers_df

In [None]:
# Sanity check: returns (unique gene labels, number of sampled rows per label).
np.unique(graders_df.gene, return_counts=True)

In [None]:
# Write the three study workbooks. The output folder is created here because
# to_excel does not create missing directories, so on a fresh checkout this
# cell would otherwise fail before any later cell creates the folder.
os.makedirs("graders_eval_images_stylegan2", exist_ok=True)
image_paths.to_excel("graders_eval_images_stylegan2/image_paths.xlsx", index=False)
graders_df.to_excel("graders_eval_images_stylegan2/graders_eval.xlsx", index=False)
answers_df.to_excel("graders_eval_images_stylegan2/answers_eval.xlsx", index=False)

In [None]:
from shutil import copyfile
from PIL import Image

# Export every selected image to
# graders_eval_images_stylegan2/images/<row index>_<gene>.png.
# Images that are not 512x512 are resized and re-saved; images that already
# have that size are copied unchanged.
os.makedirs("graders_eval_images_stylegan2/images", exist_ok=True)
for i, row in image_paths.iterrows():
    filepath = row["Image Path"]
    # graders_df is looked up by position, so this relies on image_paths and
    # graders_df sharing the same 0..n-1 row order.
    save_path = "graders_eval_images_stylegan2/images/{:03}_{}.png".format(i, graders_df.iloc[i].gene)

    # The context manager closes the source file handle after each image
    # instead of leaving it open until garbage collection.
    with Image.open(filepath) as img:
        needs_resize = img.size != (512, 512)
        if needs_resize:
            img.resize((512, 512)).save(save_path)

    if not needs_resize:
        # NOTE(review): copyfile keeps the source bytes as they are; if a source
        # image is not a PNG, the ".png" name will not match its content - confirm.
        copyfile(filepath, save_path)