# Import packages and fix random seeds

In [1]:
import os
import random
import os.path as osp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn

from keras.layers import Dense, Flatten, Embedding, Multiply, Concatenate, Input
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Model, Sequential
from joblib import dump, load

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef, balanced_accuracy_score, roc_curve, auc
def fix_all_seeds(seed):
    """Seed the random number generators imported by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable and then seeds, in
    order, Python's ``random`` module, NumPy's global generator and
    PyTorch (CPU, current CUDA device, all CUDA devices).

    Args:
        seed: integer seed passed unchanged to every seeding function.
    """
    # NOTE(review): no Keras/TensorFlow seeding call is made here, although the
    # models defined further down are Keras models -- confirm whether weight
    # initialisation is expected to be reproducible.
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


fix_all_seeds(2021)

# Load data from Google Drive or download from GitHub


In [2]:
from google.colab import drive

# Mount Google Drive so the project folders below become reachable under
# /content/drive.
drive.mount('/content/drive')

# Input folder (read from) and output folder (created below if missing).
BASE_DIR = "/content/drive/MyDrive/DSM-AS/input"
OUTPUT_DIR = "/content/drive/MyDrive/DSM-AS/output"

# Full dataset as stored in data.csv inside the input folder.
data_df = pd.read_csv(osp.join(BASE_DIR, "data.csv"))

# exist_ok=True keeps the cell safe to re-run and avoids the separate
# "check, then create" step.
os.makedirs(OUTPUT_DIR, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Define hyperparameters

In [3]:
# Name of the target column; train_gans maps its values 'M' -> 1 and 'B' -> 0.
Y_FEATURE = 'diagnosis'

# The 30 feature columns: the `_mean`, `_se` and `_worst` variants of ten base
# measurements. The order matters: generate_synthetic_data labels its output
# columns positionally in this same order, so every name must appear exactly
# once ('symmetry_mean' sits between 'concave points_mean' and
# 'fractal_dimension_mean').
X_FEATURES = [
    # *_mean
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
    # *_se
    'radius_se', 'texture_se', 'perimeter_se', 'area_se',
    'smoothness_se', 'compactness_se', 'concavity_se',
    'concave points_se', 'symmetry_se', 'fractal_dimension_se',
    # *_worst
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'compactness_worst', 'concavity_worst',
    'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst',
]

# Sub-folder names and per-station CSV base names; the driver cell reads
# "<BASE_DIR>/<distribution>/<station>.csv" for every combination.
DISTRIBUTION_MODES = ["setting1", "setting2", "setting3"]
STATION_LIST = ["uka", "umg", "ukk", "ukl", "imise", "hsm"]

# Define Models

In [4]:
##################################################################################
## Synthetic Data
##################################################################################
def build_gen(zdim):
    """Build the (unconditional) generator network.

    Args:
        zdim: width of the input vector accepted by the first Dense layer.

    Returns:
        An uncompiled ``Sequential`` model:
        Dense(31) -> LeakyReLU(alpha=0.01) -> Dense(31, tanh).
    """
    # NOTE(review): tanh bounds every output to [-1, 1], while train_gans
    # trains against StandardScaler-transformed rows -- confirm this is intended.
    stack = [
        Dense(31, input_dim=zdim),
        LeakyReLU(alpha=0.01),
        Dense(1 * 31, activation='tanh'),
    ]
    return Sequential(stack)

def build_cgen(zdim, num_classes=2):
    """Build the conditional generator.

    The integer class label is embedded into a ``zdim``-wide vector,
    multiplied element-wise with the noise vector, and the product is fed
    through the network returned by ``build_gen``.

    Args:
        zdim: width of the noise vector (and of the label embedding).
        num_classes: number of distinct label values the embedding accepts.

    Returns:
        An uncompiled ``Model`` taking ``[noise, label]`` and returning the
        generator output.
    """
    noise_in = Input(shape=(zdim,))
    label_in = Input(shape=(1,), dtype='int32')

    # Embedding yields (1, zdim) per sample; flatten it to (zdim,).
    label_vec = Embedding(num_classes, zdim, input_length=1)(label_in)
    label_vec = Flatten()(label_vec)

    conditioned_noise = Multiply()([noise_in, label_vec])
    generated = build_gen(zdim)(conditioned_noise)
    return Model([noise_in, label_in], generated)

def build_dis(img_shape, num_classes=2):
    """Build the (unconditional) discriminator network.

    Args:
        img_shape: input shape tuple handed to the leading ``Flatten`` layer.
        num_classes: accepted for signature compatibility; not used here.

    Returns:
        An uncompiled ``Sequential`` model:
        Flatten -> Dense(31) -> LeakyReLU(alpha=0.01) -> Dense(1, sigmoid).
    """
    stack = [
        Flatten(input_shape=img_shape),
        Dense(31),
        LeakyReLU(alpha=0.01),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(stack)

def build_cdis(img_shape, img_rows=1, img_cols=31, num_classes=2):
    """Build the conditional discriminator.

    The integer class label is embedded into an ``img_cols``-wide vector and
    concatenated with the sample, giving a ``2 * img_cols``-wide input for the
    network returned by ``build_dis``.

    Args:
        img_shape: accepted for signature compatibility; not used here.
        img_rows: accepted for signature compatibility; not used here.
        img_cols: width of one sample row (and of the label embedding).
        num_classes: number of distinct label values the embedding accepts.

    Returns:
        An uncompiled ``Model`` taking ``[sample, label]`` and returning the
        sigmoid output of the discriminator.
    """
    sample_in = Input(shape=(img_cols,))
    label_in = Input(shape=(1,), dtype='int32')

    # The embedding width must follow img_cols: the concatenation below has to
    # be exactly img_cols * 2 wide to match the shape given to build_dis.
    # (A fixed width of 31 only works for the default img_cols.)
    label_vec = Embedding(num_classes, img_cols, input_length=1)(label_in)
    label_vec = Flatten()(label_vec)

    sample_and_label = Concatenate(axis=-1)([sample_in, label_vec])
    verdict = build_dis((img_cols * 2,))(sample_and_label)
    return Model([sample_in, label_in], verdict)

def build_cgan(genrator, discriminator, zdim=100):
    """Chain a conditional generator and discriminator into one model.

    Args:
        genrator: model called as ``genrator([noise, label])``.
        discriminator: model called as ``discriminator([sample, label])``.
        zdim: width of the noise input.

    Returns:
        An uncompiled ``Model`` taking ``[noise, label]`` and returning the
        discriminator's output for the generated sample under the same label.
    """
    noise_in = Input(shape=(zdim,))
    label_in = Input(shape=(1,), dtype='int32')
    generated = genrator([noise_in, label_in])
    verdict = discriminator([generated, label_in])
    return Model([noise_in, label_in], verdict)

def train_gans(data, batch_size=128, iterations=5000, interval=1000):
    """Train a conditional GAN on one table of labelled rows.

    Args:
        data: DataFrame containing the ``Y_FEATURE`` column (values mapped via
            ``{'M': 1, 'B': 0}``) and every column named in ``X_FEATURES``.
        batch_size: number of rows sampled (with replacement) per step.
        iterations: number of training steps.
        interval: a progress line is printed every ``interval`` steps.

    Returns:
        Tuple ``(generator, scaler, combined_model, discriminator)``. The
        scaler is a ``StandardScaler`` fitted on the label column followed by
        the feature columns, i.e. on the same row layout the generator emits.
    """
    sample_rows, sample_cols = 1, 31
    latent_dim = 100
    n_classes = 2

    # The discriminator is compiled first and its `trainable` flag is cleared
    # afterwards, before the combined model is compiled (the usual Keras GAN
    # arrangement, so that the combined model updates only the generator).
    discriminator = build_cdis((sample_rows, sample_cols))
    discriminator.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])
    discriminator.trainable = False
    generator = build_cgen(latent_dim)
    combined = build_cgan(generator, discriminator)
    combined.compile(loss='binary_crossentropy', optimizer="adam")

    # Assemble rows as [label, feature_1, ..., feature_n].
    diagnosis = data[Y_FEATURE].map(dict(M=1, B=0))
    labelled = pd.concat([diagnosis, data[X_FEATURES]], axis=1, join='inner')
    raw_rows = labelled.values.tolist()
    class_ids = np.array([row[0] for row in raw_rows])  # unscaled labels

    # The scaler sees the label column too; generate_synthetic_data relies on
    # inverse_transform restoring it in column 0.
    scaler = StandardScaler()
    scaled_rows = scaler.fit_transform(pd.DataFrame(data=raw_rows))

    real_targets = np.ones((batch_size, 1))
    fake_targets = np.zeros((batch_size, 1))

    for step in range(1, iterations + 1):
        # --- discriminator step: one real batch, one generated batch ---
        picks = np.random.randint(0, scaled_rows.shape[0], batch_size)
        real_batch, real_labels = scaled_rows[picks], class_ids[picks]
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        fake_batch = generator.predict([noise, real_labels])
        d_on_real = discriminator.train_on_batch([real_batch, real_labels], real_targets)
        d_on_fake = discriminator.train_on_batch([fake_batch, real_labels], fake_targets)
        dloss, accuracy = 0.5 * np.add(d_on_real, d_on_fake)

        # --- generator step: fresh noise and random labels, target "real" ---
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        random_labels = np.random.randint(0, n_classes, batch_size).reshape(-1, 1)
        gloss = combined.train_on_batch([noise, random_labels], real_targets)

        if step % interval == 0:
            print("%d [D loss: %f , acc: %.2f] [G loss: %f]"%(step, dloss, 100.0 * accuracy, gloss))

    return generator, scaler, combined, discriminator

def generate_synthetic_data(gen, scaler, number_of_rows):
    """Sample synthetic rows from a generator and return them as a DataFrame.

    Args:
        gen: object whose ``predict([noise, labels])`` returns one row per
            sample, with the label in column 0 and features in columns 1..30.
        scaler: object whose ``inverse_transform`` undoes the scaling applied
            to the training rows.
        number_of_rows: how many rows to generate.

    Returns:
        DataFrame with columns ``patient_id``, ``gender``, ``birthDate``, the
        30 feature columns and ``diagnosis``. All cells are strings, because
        the table is assembled through a single mixed-type ``np.c_`` array.
    """
    noise = np.random.normal(0, 1, (number_of_rows, 100))
    cond_labels = np.random.randint(2, size=number_of_rows)
    rows = scaler.inverse_transform(gen.predict([noise, cond_labels]))

    # Features keep four decimals; column 0 (the label) is rounded to a whole number.
    rows = np.around(rows, 4)
    rows[:, 0] = np.around(rows[:, 0], 0)

    # very simple approach for the missing data generation, note: can be improved
    row_count = len(rows)
    diagnosis = ["M" if row[0] == 1 else "B" for row in rows]
    patient_ids = ["bbmri" + str(position) for position in range(row_count)]
    genders = ["female"] * row_count
    birthdays = ["01.01.2000"] * row_count

    # Drop the label column; only the 30 feature columns go into the table body.
    features = rows[:, 1:31]

    column_names = [
        "patient_id", "gender", "birthDate",
        "radius_mean", "texture_mean", "perimeter_mean", "area_mean",
        "smoothness_mean", "compactness_mean", "concavity_mean",
        "concave points_mean", "symmetry_mean", "fractal_dimension_mean",
        "radius_se", "texture_se", "perimeter_se", "area_se",
        "smoothness_se", "compactness_se", "concavity_se",
        "concave points_se", "symmetry_se", "fractal_dimension_se",
        "radius_worst", "texture_worst", "perimeter_worst", "area_worst",
        "smoothness_worst", "compactness_worst", "concavity_worst",
        "concave points_worst", "symmetry_worst", "fractal_dimension_worst",
        "diagnosis",
    ]
    table = np.c_[patient_ids, genders, birthdays, features, diagnosis]
    return pd.DataFrame(table, columns=column_names)

In [5]:
# Full dataset, minus the "Unnamed: 32" column.
df_data = pd.read_csv(osp.join(BASE_DIR, "data.csv")).drop(["Unnamed: 32"], axis=1)

# For every distribution setting and station: train a conditional GAN on that
# station's rows, save the models and the scaler, then write two synthetic
# tables (one as large as the station's data, one with 1000 rows).
for distribution in DISTRIBUTION_MODES:
    print("##################################################################################")
    print(f"Start {distribution}")
    print("##################################################################################")
    for station in STATION_LIST:
        print(f"On Station {station}")
        out_dir = osp.join(BASE_DIR, "synthetic", distribution, station)

        # The station file only identifies patients; the first six characters
        # of each patient_id are dropped and the remainder is parsed as the
        # integer id used to look the rows up in the full dataset.
        station_csv = osp.join(BASE_DIR, "{}/{}.csv".format(distribution, station))
        station_df = pd.read_csv(station_csv).rename(columns={"patient_id":"id"})
        station_df["id"] = station_df["id"].map(lambda x: int(x[6:]))
        station_df = pd.merge(df_data, station_df, on='id', how='right')

        gen, scaler, gan_v, dis_v = train_gans(station_df, iterations=5000, interval=500)

        # Persist the three models first, then the fitted scaler.
        for model, file_name in ((gen, "genv.model"), (gan_v, "ganv.model"), (dis_v, "disv.model")):
            model.save(osp.join(out_dir, file_name))
        dump(scaler, osp.join(out_dir, "standard_scaler.joblib"))

        # S.csv matches the station's row count; S1000.csv always has 1000 rows.
        for row_count, csv_name in ((len(station_df), "S.csv"), (1000, "S1000.csv")):
            synthetic_df = generate_synthetic_data(gen, scaler, row_count)
            synthetic_df.to_csv(osp.join(out_dir, csv_name))

##################################################################################
Start setting1
##################################################################################
On Station uka
500 [D loss: 0.324845 , acc: 84.77] [G loss: 1.969732]
1000 [D loss: 0.418214 , acc: 83.59] [G loss: 2.357870]
1500 [D loss: 0.467107 , acc: 74.22] [G loss: 1.639418]
2000 [D loss: 0.476552 , acc: 75.78] [G loss: 1.265846]
2500 [D loss: 0.348558 , acc: 87.89] [G loss: 1.659308]
3000 [D loss: 0.362375 , acc: 83.98] [G loss: 1.761549]
3500 [D loss: 0.468168 , acc: 74.61] [G loss: 1.439487]
4000 [D loss: 0.378986 , acc: 83.59] [G loss: 1.821488]
4500 [D loss: 0.324169 , acc: 87.89] [G loss: 2.016687]
5000 [D loss: 0.314681 , acc: 85.94] [G loss: 2.090788]
INFO:tensorflow:Assets written to: /content/drive/MyDrive/DSM-AS/input/synthetic/setting1/uka/genv.model/assets
INFO:tensorflow:Assets written to: /content/drive/MyDrive/DSM-AS/input/synthetic/setting1/uka/ganv.model/assets
INFO:tensorflow:Asset