In [3]:
# Mount Google Drive at /content/drive (relies on the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
# Make the project folder on the mounted Drive the working directory so that
# relative paths used later resolve against it.
os.chdir('/content/drive/MyDrive/Thesis OM/GAN/GAN/SyntheticDataGAN')

In [5]:
import math
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class Generator(nn.Module):
    """Transformer-encoder generator producing per-position vocabulary scores.

    Pipeline: embedding -> scale by sqrt(ninp) -> positional encoding ->
    stacked ``TransformerEncoderLayer``s -> linear projection to ``ntoken``.

    Args:
        ntoken: number of embedding rows and width of the output projection.
        ninp: embedding / model dimension.
        nhead: attention heads per encoder layer.
        nhid: feed-forward width inside each encoder layer.
        nlayers: number of stacked encoder layers.
        padding_index: forwarded to ``nn.Embedding`` as ``padding_idx``.
        dropout: dropout probability for positional encoding and encoder layers.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, padding_index, dropout=0.5):
        super().__init__()
        # Sub-modules are created in a fixed order so seeded initialisation is reproducible.
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        layer = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(layer, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp, padding_idx=padding_index)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)
        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) float mask: 0.0 on and below the diagonal, -inf above it."""
        blocked = torch.full((sz, sz), float('-inf'))
        return torch.triu(blocked, diagonal=1)

    def init_weights(self):
        """Re-draw embedding and decoder weights from U(-0.1, 0.1); zero the decoder bias."""
        bound = 0.1
        with torch.no_grad():
            # The whole embedding matrix is re-drawn, including the padding_idx row.
            self.encoder.weight.uniform_(-bound, bound)
            self.decoder.bias.zero_()
            self.decoder.weight.uniform_(-bound, bound)

    def forward(self, src, src_mask):
        """Embed ``src``, run the encoder under ``src_mask`` and project to scores.

        The positional encoding is sliced by ``src.size(0)``, i.e. dim 0 is
        treated as the sequence position.
        """
        embedded = self.encoder(src) * math.sqrt(self.ninp)
        hidden = self.transformer_encoder(self.pos_encoder(embedded), src_mask)
        return self.decoder(hidden)

In [6]:
import math
import torch
from torch import nn


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to an input, then apply dropout.

    Args:
        d_model: size of the last (feature) dimension; odd values are supported.
        dropout: dropout probability applied after the addition.
        max_len: number of positions pre-computed in the ``pe`` buffer.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns.
        table[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Stored as (max_len, 1, d_model) so it broadcasts over dim 1 of the input.
        self.register_buffer('pe', table.unsqueeze(1))

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``; dim 0 of ``x`` indexes positions."""
        seq_len = x.size(0)
        return self.dropout(x + self.pe[:seq_len, :])

In [7]:
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class Discriminator(nn.Module):
    """Transformer-encoder discriminator scoring a sequence with a sigmoid output.

    There is no embedding layer: ``forward`` feeds its input straight into the
    positional encoding, so the input's last dimension must already be ``ninp``.

    Args:
        dim_out: width of the final linear layer.
        ninp: model dimension.
        nhead: attention heads per encoder layer.
        nhid: feed-forward width inside each encoder layer.
        nlayers: number of stacked encoder layers.
        dropout: dropout probability for positional encoding and encoder layers.
    """

    def __init__(self, dim_out, ninp, nhead, nhid, nlayers, dropout):
        super().__init__()
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        layer = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(layer, nlayers)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, dim_out)
        self.init_weights()
        self.sigmoid = nn.Sigmoid()

    def generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) float mask: 0.0 on and below the diagonal, -inf above it."""
        blocked = torch.full((sz, sz), float('-inf'))
        return torch.triu(blocked, diagonal=1)

    def init_weights(self):
        """Zero the decoder bias and re-draw the decoder weight from U(-0.1, 0.1)."""
        bound = 0.1
        with torch.no_grad():
            self.decoder.bias.zero_()
            self.decoder.weight.uniform_(-bound, bound)

    def forward(self, src, src_mask):
        """Encode ``src`` and classify from the encoder state at the last index of dim 0."""
        encoded = self.transformer_encoder(self.pos_encoder(src), src_mask)
        # Only the final position along dim 0 feeds the classifier head.
        last_state = encoded[-1]
        return self.sigmoid(self.decoder(last_state))

In [8]:
from torch.utils.data import Dataset
import torch


class load_ar_data(Dataset):
    """Dataset of (input, target) sequence pairs for autoregressive models.

    Items are returned as a pair of ``int64`` tensors.
    """

    def __init__(self, Input, Target):
        self.Input, self.Target = Input, Target

    def __len__(self):
        # The number of samples is taken from the inputs.
        return len(self.Input)

    def __getitem__(self, index):
        source = torch.Tensor(self.Input[index]).long()
        target = torch.IntTensor(self.Target[index]).long()
        return source, target


class load_nar_data(Dataset):
    """Dataset that serves target sequences only, each as an ``int64`` tensor."""

    def __init__(self, Target):
        self.Target = Target

    def __len__(self):
        return len(self.Target)

    def __getitem__(self, index):
        return torch.IntTensor(self.Target[index]).long()


In [9]:
import torch
import numpy as np


# prepare the dataset for autoregressive models (rnn, transformer-ar)
def prepare_ar_data(path, seq_len, vocab, start_token):
    """Read whitespace-separated integer sequences and build shifted input/target arrays.

    Every line of the file is one sequence. Each sequence is truncated or
    right-padded with ``vocab + 1`` to exactly ``seq_len + 1`` tokens.

    Args:
        path: text file with one sequence per line.
        seq_len: sequences are normalised to ``seq_len + 1`` tokens.
        vocab: vocabulary size; ``vocab + 1`` is the padding value.
        start_token: value placed in column 0 of every input row.

    Returns:
        ``(input, target)``: ``target`` is an integer array of shape
        ``(n_lines, seq_len + 1)``; ``input`` is a float array of the same
        shape holding ``start_token`` followed by ``target`` shifted right by one.
    """
    width = seq_len + 1
    pad_token = vocab + 1
    all_seq = []
    # The context manager closes the file even if a token fails to parse.
    with open(path) as f:
        for line in f:
            # Tokens past the window are ignored and never parsed.
            seq = [int(tok) for tok in line.split()[:width]]
            seq.extend([pad_token] * (width - len(seq)))
            all_seq.append(seq)

    # The explicit reshape keeps an empty file 2-D instead of failing on unpacking.
    target = np.array(all_seq).reshape(len(all_seq), width)
    inputs = np.zeros(target.shape)
    inputs[:, 0] = start_token
    inputs[:, 1:] = target[:, :width - 1]
    return inputs, target


# get authentic data for Transformer_Non-autoregressive model
def prepare_nar_data(path, seq_len, token_num):
    """Read integer sequences and return them zero-based and padded to a fixed width.

    Every line of the file is one sequence of whitespace-separated integers.
    Each token is shifted down by one; sequences are truncated or right-padded
    with ``token_num`` to exactly ``seq_len + 1`` entries.

    Args:
        path: text file with one sequence per line.
        seq_len: sequences are normalised to ``seq_len + 1`` tokens.
        token_num: value used to pad short sequences.

    Returns:
        ``numpy`` array built from the list of normalised sequences.
    """
    width = seq_len + 1
    seq_list = []
    # The context manager closes the file even if a token fails to parse.
    with open(path) as f:
        for line in f:
            # Tokens past the window are ignored and never parsed.
            seq = [int(tok) - 1 for tok in line.split()[:width]]
            seq.extend([token_num] * (width - len(seq)))
            seq_list.append(seq)
    return np.array(seq_list)


# prepare the one-hot format authentic data for discriminator of GAN models
def prepare_onehot_aut_data(path, ntoken, seq_len):
    """Read integer sequences and one-hot encode them for the discriminator.

    Each one-hot vector has ``ntoken + 1`` entries. Token ``t`` sets index
    ``t - 1``; positions past the end of a line set the last index.
    Every sequence is encoded over exactly ``seq_len + 1`` positions.

    Args:
        path: text file with one whitespace-separated sequence per line.
        ntoken: number of real tokens; one extra slot is added for the end marker.
        seq_len: sequences are normalised to ``seq_len + 1`` positions.

    Returns:
        ``numpy`` array built from the nested one-hot lists
        (lines x positions x ``ntoken + 1`` when the file is non-empty).
    """
    end_token = ntoken + 1
    onehot_data = []
    # The context manager closes the file even if a token fails to parse.
    with open(path) as f:
        for line in f:
            tokens = line.split()
            seq = []
            for pos in range(seq_len + 1):
                onehot = [0] * end_token
                hot = int(tokens[pos]) - 1 if pos < len(tokens) else end_token - 1
                onehot[hot] = 1
                seq.append(onehot)
            onehot_data.append(seq)
    return np.array(onehot_data)


# prepare discriminator labels
def prepare_dis_label(size):
    """Return ``(pos_label, neg_label)``: float32 tensors of ones and of zeros shaped ``size``."""
    pos_label, neg_label = (
        torch.tensor(fill(size), dtype=torch.float32)
        for fill in (np.ones, np.zeros)
    )
    return pos_label, neg_label

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


def get_freq_dict2(seqs):
    """Count how many sequences have each length; returns ``{length: count}``."""
    freqdict = {}
    for seq in seqs:
        size = len(seq)
        freqdict[size] = freqdict.get(size, 0) + 1
    return freqdict


def get_length_stats(seqs):
    """Return ``(mean, population std-dev, max)`` of the sequence lengths.

    Also prints the maximum length.
    """
    lengths = [len(seq) for seq in seqs]
    count = len(seqs)

    mean = sum(lengths) / count
    variance = sum([(size - mean) ** 2 for size in lengths]) / count
    longest = max(lengths)
    print('max_length',longest)

    return mean, variance ** 0.5, longest


def write(item, save_path):
    """Append ``item`` as one line to ``<save_path>dif_log.txt``."""
    log_file = save_path + 'dif_log.txt'
    with open(log_file, 'a') as handle:
        handle.write('%s\n' % item)


def save_len_difference(gen_seqs, aut_seqs, save_path):
    """Log and print length statistics of both sequence sets, then plot them.

    Mean and standard deviation are appended to the difference log through
    ``write``; the length histogram is produced by ``save_len_diff_figure``.

    Args:
        gen_seqs: generated (synthetic) sequences.
        aut_seqs: authentic sequences.
        save_path: path prefix for the log file and the figure.
    """
    gen_mean, gen_std, gen_max_len = get_length_stats(gen_seqs)
    aut_mean, aut_std, aut_max_len = get_length_stats(aut_seqs)
    for shown in (gen_seqs, aut_seqs, save_path):
        print(shown)

    logged = (
        ('aut_mean: ', aut_mean),
        ('aut_std: ', aut_std),
        ('syn_mean: ', gen_mean),
        ('syn_std: ', gen_std),
    )
    for label, value in logged:
        write(label + str(value), save_path)

    printed = (
        ('syn_mean: ', gen_mean),
        ('aut_mean: ', aut_mean),
        ('syn_std: ', gen_std),
        ('aut_std: ', aut_std),
    )
    for label, value in printed:
        print(label + str(value))
    print('gen max length',gen_max_len)
    print('auto max',aut_max_len)

    save_len_diff_figure(gen_seqs, aut_seqs, save_path)

def save_len_diff_figure(gen_seqs, aut_seqs, save_path):
    """Plot overlaid length distributions of authentic and generated sequences.

    For every sequence length seen in either set, the share of sequences with
    that length is computed per set. The figure is written to
    ``<save_path>length_distribution.png`` and then shown.

    Args:
        gen_seqs: generated (synthetic) sequences.
        aut_seqs: authentic sequences.
        save_path: path prefix for the saved figure.
    """
    gen_freq_dict = get_freq_dict2(gen_seqs)
    aut_freq_dict = get_freq_dict2(aut_seqs)
    print('gen_seqs inside image ',gen_seqs)
    print('aut_seqs inside image ',aut_seqs)
    print('gen length',gen_freq_dict)
    print('auto length',aut_freq_dict)
    aut_max_len = get_length_stats(aut_seqs)[2]

    # length -> [authentic share, generated share]; a missing side is the integer 0.
    all_freq_dict = {}
    for length, aut_count in aut_freq_dict.items():
        if length in gen_freq_dict:
            gen_share = gen_freq_dict[length] / len(gen_seqs)
        else:
            gen_share = 0
        all_freq_dict[length] = [aut_count / len(aut_seqs), gen_share]
    for length, gen_count in gen_freq_dict.items():
        if length not in all_freq_dict:
            all_freq_dict[length] = [0, gen_count / len(gen_seqs)]

    rows = [[length, *shares] for length, shares in all_freq_dict.items()]
    my_df = pd.DataFrame(
        rows,
        columns=['sequence length', 'authentic processes', 'semi-synthetic processes'],
    ).sort_values(by=['sequence length'])
    print(my_df)

    # Font sizes are set through the global matplotlib rc settings.
    SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 15, 15, 18
    rc_settings = (
        ('font', 'size', MEDIUM_SIZE),
        ('axes', 'titlesize', MEDIUM_SIZE),
        ('axes', 'labelsize', MEDIUM_SIZE),
        ('xtick', 'labelsize', SMALL_SIZE),
        ('ytick', 'labelsize', SMALL_SIZE),
        ('legend', 'fontsize', MEDIUM_SIZE),
        ('figure', 'titlesize', BIGGER_SIZE),
    )
    for group, option, size in rc_settings:
        plt.rc(group, **{option: size})

    bar_width = 1
    aut_share = my_df['authentic processes']
    syn_share = my_df['semi-synthetic processes']
    max_height = max(aut_share.max(), syn_share.max()) + 0.02
    lengths = np.array(my_df['sequence length'])

    plt.figure(figsize=(10, 6))
    # Both series share the same x positions, so the bars overlap.
    plt.bar(lengths, aut_share, label='Authentic', width=bar_width, color='blue', alpha=0.4)
    plt.bar(lengths, syn_share, label='Synthetic', width=bar_width, color='red', alpha=0.3)

    plt.title('Sequence Length Distribution')
    plt.xlabel('Sequence Length')
    plt.ylabel('Frequency')
    plt.xlim(0, 25)
    print(aut_max_len)
    plt.xticks(np.arange(0, aut_max_len + 1, step=1))
    plt.xticks(rotation=75)
    plt.ylim(0, max_height)
    plt.legend()
    plt.savefig(save_path + 'length_distribution.png')
    plt.show()

def get_seqs_from_path(path):
    """Read a text file of whitespace-separated integers, one sequence per line.

    Returns:
        list of lists of ``int``; a blank line yields an empty list.
    """
    # The context manager closes the file; the original left the handle open.
    with open(path) as f:
        return [[int(ind) for ind in line.split()] for line in f]


def get_length_dif(aut_path, seq_path, save_path):
    """Load authentic and generated sequences from files and report their length difference."""
    # Keyword arguments keep the authentic file being read first.
    save_len_difference(
        aut_seqs=get_seqs_from_path(aut_path),
        gen_seqs=get_seqs_from_path(seq_path),
        save_path=save_path,
    )

In [11]:
import random
import numpy as np
import torch
import torch.nn.functional as F
from matplotlib import pyplot as plt



def generate_random_data(bs, vocab_size, seq_len):
    """Return ``bs`` random token lists of length ``seq_len + 1``.

    Tokens are drawn with replacement from ``0 .. vocab_size`` inclusive.
    """
    token_pool = range(0, vocab_size + 1)
    return [random.choices(token_pool, k=seq_len + 1) for _ in range(bs)]


def gen_data_from_rand(size, g_model, ntokens, device, result_file, save_path, seq_len):
    """Sample ``size`` sequences from the generator, starting from random token noise.

    For each sample a random token sequence is fed through ``g_model``, the
    scores are discretised with a hard Gumbel-softmax, and the chosen token
    ids are collected up to (not including) the first ``ntokens`` id, which
    acts as the end marker. Kept ids are shifted by +1. All samples are
    appended to ``<save_path><result_file>.txt``, one sequence per line.

    Args:
        size: number of sequences to generate.
        g_model: generator exposing ``generate_square_subsequent_mask`` and ``__call__``.
        ntokens: id of the end marker; ids ``0 .. ntokens`` are decoded.
        device: device the noise and mask are moved to.
        result_file: file name (without extension) of the output file.
        save_path: path prefix of the output file.
        seq_len: sequences are generated over ``seq_len + 1`` positions.

    Returns:
        list of generated sequences (lists of ``int``).
    """
    gen_list = []
    print('gen_data_from_rand')
    print('size',size)
    print('seq_len',seq_len)
    # Only plain Python lists leave this function, so no autograd graph is needed.
    with torch.no_grad():
        for _ in range(size):
            noise = generate_random_data(1, ntokens, seq_len)
            noise = torch.tensor(noise, dtype=torch.int64).to(device)
            noise = torch.transpose(noise, 0, 1)
            src_mask = g_model.generate_square_subsequent_mask(noise.size(0)).to(device)
            g_output = g_model(noise, src_mask).permute(1, 0, 2)
            out = F.gumbel_softmax(g_output, tau=1, hard=True)
            # The straight-through output is not guaranteed to be bit-exactly 1.0 at
            # the hot index, so pick it by argmax instead of comparing against 1.
            hot_ids = out[0].argmax(dim=-1).tolist()

            sub_samp = []
            for tok in hot_ids[:seq_len + 1]:
                if tok > ntokens:
                    # Outside the decoded id range 0..ntokens; skipped, as before.
                    continue
                if tok == ntokens:
                    break
                sub_samp.append(tok + 1)
            gen_list.append(sub_samp)

    with open(save_path + result_file + '.txt', 'a') as f:
        f.writelines(' '.join(str(token) for token in seq) + '\n' for seq in gen_list)
    return gen_list


def write_log(save_path, log, file_name):
    """Append every entry of ``log`` as its own line to ``<save_path><file_name>``."""
    with open(save_path + file_name, 'a') as handle:
        handle.writelines('%s\n' % entry for entry in log)


def plot_loss(save_path, log, file_name, type):
    """Plot a loss history and save the figure to ``<save_path><file_name>``.

    A 2-D ``log`` is read as rows of ``(train, val)`` values and drawn as two
    curves; any other shape is drawn as a single curve labelled ``type``.
    ``type`` is also used as the y-axis label.
    """
    print("loss plot fun",save_path, log, file_name, type)
    fig, ax = plt.subplots()
    losses = np.array(log)
    if losses.ndim == 2:
        for column, label in enumerate(('train loss', 'val loss')):
            ax.plot(losses[:, column], label=label)
    else:
        ax.plot(losses, label=type)
    ax.set_xlabel('epochs')
    ax.set_ylabel(type)
    ax.legend()
    fig.savefig(save_path + file_name)

def eval_result(save_path, gen_list, test_list):
    """Run the length, activity and variance comparisons of generated vs. test sequences.

    Each comparison writes its own results under ``save_path``; the inputs
    are also echoed to stdout.
    """
    print(save_path)
    save_len_difference(gen_list, test_list, save_path)
    print(save_path)
    print("eval_result func")
    for seqs in (gen_list, test_list):
        print(seqs,"-------------------------------")
    save_act_difference(gen_list, test_list, save_path)
    save_variance_dif(gen_list, test_list, save_path)

def get_pad_mask(output, batch_size, seq_len, vocab_size, padding_ind, device):
    """Build multiplicative and additive masks that force padding after the first pad token.

    For each sample, the first position whose ``padding_ind`` entry equals 1
    is located (``seq_len`` if there is none). Positions before it get an
    all-ones row in the multiplicative mask and an all-zeros row in the
    additive mask; positions from it onward get an all-zeros multiplicative
    row and an additive one-hot row with the last entry set.

    Args:
        output: array-like indexed as ``[sample][position][token]``; converted with ``tolist()``.
        batch_size: number of samples to process.
        seq_len: number of positions per sample.
        vocab_size: width of every mask row.
        padding_ind: token index that marks padding in ``output``.
        device: device the masks are moved to.

    Returns:
        ``(pad_mask_mul, pad_mask_add)``: int64 tensors of shape
        ``(batch_size, seq_len, vocab_size)``.
    """
    rows = output.tolist()
    first_pad = [
        next((pos for pos in range(seq_len) if rows[i][pos][padding_ind] == 1), seq_len)
        for i in range(batch_size)
    ]

    keep_row = [1] * vocab_size
    zero_row = [0] * vocab_size
    pad_row = [0] * (vocab_size - 1) + [1]

    mul_mask = [
        [keep_row if pos < cut else zero_row for pos in range(seq_len)]
        for cut in first_pad
    ]
    add_mask = [
        [zero_row if pos < cut else pad_row for pos in range(seq_len)]
        for cut in first_pad
    ]

    pad_mask_mul = torch.tensor(mul_mask, dtype=torch.int64).to(device)
    pad_mask_add = torch.tensor(add_mask, dtype=torch.int64).to(device)
    return pad_mask_mul, pad_mask_add


def pad_after_end_token(g_output_t, pad_mask_mul, pad_mask_add):
    """Apply the two padding masks, then swap the first two dimensions.

    Computes ``g_output_t * pad_mask_mul + pad_mask_add`` and returns the
    result permuted with ``(1, 0, 2)``.
    """
    masked = g_output_t * pad_mask_mul + pad_mask_add
    return masked.permute(1, 0, 2)


def get_act_distribution(g_output, aut_seqs):
    """Sum both inputs over their first two dimensions.

    Returns:
        ``(generated_totals, authentic_totals)``, each reduced by two
        successive ``sum(0)`` calls.
    """
    generated_totals = g_output.sum(0).sum(0)
    authentic_totals = aut_seqs.sum(0).sum(0)
    return generated_totals, authentic_totals


def reverse_torch_to_list(seqs, vocab_num):
    """Cut every sequence at the first ``vocab_num`` token and shift kept tokens by +1."""
    result = []
    for seq in seqs:
        kept = []
        for tok in seq:
            if tok == vocab_num:
                break
            kept.append(tok + 1)
        result.append(kept)
    return result


def remove_end_token(seqs, vocab_num):
    """Cut every sequence at the first ``vocab_num + 1`` or ``0`` token.

    The accumulated result is printed after each processed sequence.
    """
    print('inside remove end token')
    stop_token = vocab_num + 1
    result = []
    for seq in seqs:
        kept = []
        for tok in seq:
            if tok == stop_token or tok == 0:
                break
            kept.append(tok)
        result.append(kept)
        print(result)

    return result


def write_generated_seqs(save_path, model, gen_seqs):
    """Append sequences to ``<save_path>result_<model>.txt``, space-separated, one per line."""
    target = save_path + 'result_' + model + '.txt'
    with open(target, 'a') as handle:
        for seq in gen_seqs:
            handle.write(' '.join(str(token) for token in seq) + '\n')

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


def get_freq_dic1(seqs):
    """Count occurrences of every token across all sequences; returns ``{token: count}``."""
    freqdict = {}
    for seq in seqs:
        for pos in range(len(seq)):
            token = seq[pos]
            freqdict[token] = freqdict.get(token, 0) + 1
    return freqdict


def write(item, save_path):
    """Append ``item`` as one line to ``<save_path>dif_log.txt``."""
    log_file = save_path + 'dif_log.txt'
    with open(log_file, 'a') as handle:
        handle.write('%s\n' % item)


def get_act_stats(gen_seqs, aut_seqs):
    """Compare activity-type frequencies of generated and authentic sequences.

    Only activities that occur in ``aut_seqs`` are considered. For each of
    them the share among the authentic counts and the share among the
    generated counts (restricted to those same activities) is computed.

    Args:
        gen_seqs: generated sequences of activity ids.
        aut_seqs: authentic sequences of activity ids.

    Returns:
        ``(act_type_distance, frequency_dict)``: the sum of absolute share
        differences, and ``{activity: [authentic_share, generated_share]}``.
        If no generated token matches an authentic activity, every generated
        share is ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    gen_freq_dict = get_freq_dic1(gen_seqs)
    aut_freq_dict = get_freq_dic1(aut_seqs)
    print(gen_freq_dict)
    print(aut_freq_dict)

    # activity -> [authentic count, generated count]
    all_freq_dict = {
        activity: [aut_total, gen_freq_dict.get(int(activity), 0)]
        for activity, aut_total in aut_freq_dict.items()
    }

    aut_count = sum(counts[0] for counts in all_freq_dict.values())
    gen_count = sum(counts[1] for counts in all_freq_dict.values())

    frequency_dict = {}
    act_type_distance = 0  # the summation of activity type fraction difference
    for act, (aut_total, gen_total) in all_freq_dict.items():
        aut_share = aut_total / aut_count
        # gen_count is 0 when the generated data shares no activity with the authentic data.
        gen_share = gen_total / gen_count if gen_count else 0.0
        frequency_dict[act] = [aut_share, gen_share]
        act_type_distance += abs(aut_share - gen_share)
    print('act difference: ' + str(act_type_distance))

    return act_type_distance, frequency_dict
import numpy as np
import matplotlib.pyplot as plt

# Mapping from the integer activity codes used in the sequences to readable
# activity names; save_act_dif_figure uses it to label the bars.
# NOTE(review): the names look dataset-specific (presumably one particular event
# log) — confirm before reusing this notebook with other data.
activity_mapping = {
    1: 'A_ACCEPTED',
    2: 'A_ACTIVATED',
    3: 'A_APPROVED',
    4: 'A_CANCELLED',
    5: 'A_DECLINED',
    6: 'A_FINALIZED',
    7: 'A_PARTLYSUBMITTED',
    8: 'A_PREACCEPTED',
    9: 'A_REGISTERED',
    10: 'A_SUBMITTED',
    11: 'O_ACCEPTED',
    12: 'O_CANCELLED',
    13: 'O_CREATED',
    14: 'O_DECLINED',
    15: 'O_SELECTED',
    16: 'O_SENT',
    17: 'O_SENT_BACK',
    18: 'W_Afhandelen leads',
    19: 'W_Beoordelen fraude',
    20: 'W_Completeren aanvraag',
    21: 'W_Nabellen incomplete dossiers',
    22: 'W_Nabellen offertes',
    23: 'W_Valideren aanvraag',
    24: 'W_Wijzigen contractgegevens'
}

def save_act_dif_figure(frequency_dict, save_path):
    """Draw a horizontal bar chart comparing real and synthetic activity frequencies.

    Activities are labelled through ``activity_mapping`` (codes without a
    mapping fall back to their string form), sorted by real frequency in
    descending order, saved to ``<save_path>act_distribution.png`` and shown.

    Args:
        frequency_dict: ``{activity_code: [real_share, synthetic_share]}``.
        save_path: path prefix for the saved figure.
    """
    real_color = '#76b5c5'  # A softer blue
    syn_color = '#f2a65a'   # A softer orange
    bar_width = 0.35

    rows = [
        [activity_mapping.get(int(code), str(code)), *shares]
        for code, shares in frequency_dict.items()
    ]
    my_df = pd.DataFrame(rows, columns=['activity', 'real', 'syn'])
    df_sorted = my_df.sort_values(by='real', ascending=False)
    index_sorted = np.arange(len(df_sorted['activity']))

    plt.figure(figsize=(30, 15))
    plt.barh(index_sorted, df_sorted['real'], bar_width,
             label='Real', color=real_color, edgecolor='grey')
    plt.barh(index_sorted + bar_width, df_sorted['syn'], bar_width,
             label='Synthetic', color=syn_color, edgecolor='grey')

    # Centre each tick between the two bars of its activity.
    plt.yticks(index_sorted + bar_width / 2, df_sorted['activity'], fontsize=10)
    plt.xlabel('Frequency', fontsize=12)
    plt.title('Comparison of Activity Frequency between Real and Synthetic Data', fontsize=14)
    plt.legend()

    plt.grid(color='grey', linestyle='--', linewidth=0.5, axis='x')
    # Highest real frequency at the top.
    plt.gca().invert_yaxis()

    # The layout must be adjusted before saving, otherwise the PNG misses it.
    plt.tight_layout()
    plt.savefig(save_path + 'act_distribution.png')

    plt.show()


# Example usage (replace 'frequency_dict' and 'save_path' with your actual data and desired save location)
# save_act_dif_figure(frequency_dict, '/path/to/save/')


# def save_act_dif_figure(frequency_dict, save_path):
#     print(frequency_dict)
#     my_df = pd.DataFrame([[k, *v] for k, v in frequency_dict.items()], columns=['activity', 'real', 'syn'])
#     print(my_df,"_____________________")
#     # draw histogram
#     df_sorted = my_df.sort_values(by='real', ascending=False)

#     # Set the figure size for better visibility
#     plt.figure(figsize=(10, 6))
#     bar_width = 0.35
#     # Set up the bar widths and positions after sorting
#     index_sorted = np.arange(len(df_sorted['activity']))

#     # Create the bars for the sorted plot
#     plt.bar(index_sorted, df_sorted['real'], bar_width, label='Real', color='b')
#     plt.bar(index_sorted + bar_width, df_sorted['syn'], bar_width, label='Synthetic', color='orange')

#     # Set the plot details for the sorted data
#     plt.xlabel('Activity Type')
#     plt.ylabel('Frequency')
#     plt.title('Comparison of Activity Frequency between Real and Synthetic Data (Sorted by Real)')
#     plt.xticks(index_sorted + bar_width / 2, df_sorted['activity'])
#     plt.legend()
#     plt.savefig(save_path + 'act_distribution.png')


#     # Show the sorted plot
#     plt.show()

def save_act_difference(gen_seqs, aut_seqs, save_path):
    """Compute the activity-type distance, log it and draw the comparison figure."""
    distance, shares = get_act_stats(gen_seqs, aut_seqs)
    write('act difference: ' + str(distance), save_path)
    print('save_act_dif_figure')
    save_act_dif_figure(shares, save_path)


def get_seqs_from_path(path):
    """Read a text file of whitespace-separated integers, one sequence per line.

    The path is echoed to stdout before reading.

    Returns:
        list of lists of ``int``; a blank line yields an empty list.
    """
    print('path inside eval',path)
    # The context manager closes the file; the original left the handle open.
    with open(path) as f:
        return [[int(ind) for ind in line.split()] for line in f]


def get_act_dif(aut_path, gen_path, save_path):
    """Load authentic and generated sequences from files and report their activity difference.

    Args:
        aut_path: file with the authentic sequences.
        gen_path: file with the generated sequences.
        save_path: path prefix for the log and figure written downstream.
    """
    aut_seqs = get_seqs_from_path(aut_path)
    gen_seqs = get_seqs_from_path(gen_path)
    # The sequences can only be echoed after loading; printing them first
    # raised UnboundLocalError on every call.
    print('aut_seqs',aut_seqs)
    print('gen_seqs',gen_seqs)
    save_act_difference(gen_seqs, aut_seqs, save_path)

In [13]:
import editdistance


def write(item, save_path):
    """Append ``item`` as one line to ``<save_path>dif_log.txt``."""
    log_file = save_path + 'dif_log.txt'
    with open(log_file, 'a') as handle:
        handle.write('%s\n' % item)


def get_variance(seqs):
    """Return the mean normalised pairwise edit distance of ``seqs``, halved.

    For every ordered pair ``(a, b)``, including ``a`` with itself, the edit
    distance is divided by ``len(a) + len(b)``; a pair of two empty sequences
    contributes 0. The sum is divided by ``2 * n * n``.

    Args:
        seqs: list of sequences accepted by ``editdistance.eval``.

    Returns:
        the variance measure as a float; ``0.0`` for an empty ``seqs``
        (previously a ``ZeroDivisionError``).
    """
    num_seq = len(seqs)
    if num_seq == 0:
        return 0.0

    # A running total replaces the two n x n matrices that were only ever summed.
    variance = 0
    for left in seqs:
        for right in seqs:
            combined_len = len(left) + len(right)
            if combined_len == 0:
                # Two empty sequences: distance and normaliser are both zero.
                continue
            variance += editdistance.eval(left, right) / combined_len

    return variance / (2 * num_seq * num_seq)


def save_variance_dif(gen_seqs, aut_seqs, save_path):
    """Log the variance measure of both sequence sets and their difference.

    Both variances are also printed; the difference
    (authentic minus generated) is only written to the log.
    """
    gen_variance = get_variance(gen_seqs)
    aut_variance = get_variance(aut_seqs)

    summary = [
        'aut_variance: ' + str(aut_variance),
        'syn_variance: ' + str(gen_variance),
    ]
    difference = 'variance difference: ' + str(aut_variance - gen_variance)
    for entry in summary + [difference, '\n']:
        write(entry, save_path)
    for entry in summary:
        print(entry)


def get_seq_from_path(path):
    """Read a text file of whitespace-separated integers, one sequence per line.

    Returns:
        list of lists of ``int``; a blank line yields an empty list.
    """
    # The context manager closes the file; the original left the handle open.
    with open(path) as f:
        return [[int(ind) for ind in line.split()] for line in f]


def get_variance_dif(aut_path, gen_path, save_path):
    """Load authentic and generated sequences from files and report their variance difference."""
    # Keyword arguments keep the authentic file being read first.
    save_variance_dif(
        aut_seqs=get_seq_from_path(aut_path),
        gen_seqs=get_seq_from_path(gen_path),
        save_path=save_path,
    )

In [14]:
import random
import numpy as np
import torch
import torch.nn.functional as F
from matplotlib import pyplot as plt



def generate_random_data(bs, vocab_size, seq_len):
    """Return ``bs`` random token lists of length ``seq_len + 1``.

    Tokens are drawn with replacement from ``0 .. vocab_size`` inclusive.
    """
    token_pool = range(0, vocab_size + 1)
    return [random.choices(token_pool, k=seq_len + 1) for _ in range(bs)]


def gen_data_from_rand(size, g_model, ntokens, device, result_file, save_path, seq_len):
    """Sample ``size`` sequences from the generator, starting from random token noise.

    For each sample a random token sequence is fed through ``g_model``, the
    scores are discretised with a hard Gumbel-softmax, and the chosen token
    ids are collected up to (not including) the first ``ntokens`` id, which
    acts as the end marker. Kept ids are shifted by +1. All samples are
    appended to ``<save_path><result_file>.txt``, one sequence per line.

    Args:
        size: number of sequences to generate.
        g_model: generator exposing ``generate_square_subsequent_mask`` and ``__call__``.
        ntokens: id of the end marker; ids ``0 .. ntokens`` are decoded.
        device: device the noise and mask are moved to.
        result_file: file name (without extension) of the output file.
        save_path: path prefix of the output file.
        seq_len: sequences are generated over ``seq_len + 1`` positions.

    Returns:
        list of generated sequences (lists of ``int``).
    """
    gen_list = []
    print('gen_data_from_rand')
    print('size',size)
    print('seq_len',seq_len)
    # Only plain Python lists leave this function, so no autograd graph is needed.
    with torch.no_grad():
        for _ in range(size):
            noise = generate_random_data(1, ntokens, seq_len)
            noise = torch.tensor(noise, dtype=torch.int64).to(device)
            noise = torch.transpose(noise, 0, 1)
            src_mask = g_model.generate_square_subsequent_mask(noise.size(0)).to(device)
            g_output = g_model(noise, src_mask).permute(1, 0, 2)
            out = F.gumbel_softmax(g_output, tau=1, hard=True)
            # The straight-through output is not guaranteed to be bit-exactly 1.0 at
            # the hot index, so pick it by argmax instead of comparing against 1.
            hot_ids = out[0].argmax(dim=-1).tolist()

            sub_samp = []
            for tok in hot_ids[:seq_len + 1]:
                if tok > ntokens:
                    # Outside the decoded id range 0..ntokens; skipped, as before.
                    continue
                if tok == ntokens:
                    break
                sub_samp.append(tok + 1)
            gen_list.append(sub_samp)

    with open(save_path + result_file + '.txt', 'a') as f:
        f.writelines(' '.join(str(token) for token in seq) + '\n' for seq in gen_list)
    return gen_list


def write_log(save_path, log, file_name):
    """Append every entry of ``log`` as its own line to ``<save_path><file_name>``."""
    with open(save_path + file_name, 'a') as handle:
        handle.writelines('%s\n' % entry for entry in log)


def plot_loss(save_path, log, file_name, type):
    """Plot a loss history and save the figure to ``<save_path><file_name>``.

    A 2-D ``log`` is read as rows of ``(train, val)`` values and drawn as two
    curves; any other shape is drawn as a single curve labelled ``type``.
    ``type`` is also used as the y-axis label.
    """
    print("loss plot fun",save_path, log, file_name, type)
    fig, ax = plt.subplots()
    losses = np.array(log)
    if losses.ndim == 2:
        for column, label in enumerate(('train loss', 'val loss')):
            ax.plot(losses[:, column], label=label)
    else:
        ax.plot(losses, label=type)
    ax.set_xlabel('epochs')
    ax.set_ylabel(type)
    ax.legend()
    fig.savefig(save_path + file_name)

def eval_result(save_path, gen_list, test_list):
    """Run the length, activity and variance comparisons of generated vs. test sequences.

    Each comparison writes its own results under ``save_path``; the inputs
    are also echoed to stdout.
    """
    print(save_path)
    save_len_difference(gen_list, test_list, save_path)
    print(save_path)
    print("eval_result func")
    for seqs in (gen_list, test_list):
        print(seqs,"-------------------------------")
    save_act_difference(gen_list, test_list, save_path)
    save_variance_dif(gen_list, test_list, save_path)

def get_pad_mask(output, batch_size, seq_len, vocab_size, padding_ind, device):
    """Build multiplicative and additive masks that force padding after the first pad token.

    For each sample, the first position whose ``padding_ind`` entry equals 1
    is located (``seq_len`` if there is none). Positions before it get an
    all-ones row in the multiplicative mask and an all-zeros row in the
    additive mask; positions from it onward get an all-zeros multiplicative
    row and an additive one-hot row with the last entry set.

    Args:
        output: array-like indexed as ``[sample][position][token]``; converted with ``tolist()``.
        batch_size: number of samples to process.
        seq_len: number of positions per sample.
        vocab_size: width of every mask row.
        padding_ind: token index that marks padding in ``output``.
        device: device the masks are moved to.

    Returns:
        ``(pad_mask_mul, pad_mask_add)``: int64 tensors of shape
        ``(batch_size, seq_len, vocab_size)``.
    """
    rows = output.tolist()
    first_pad = [
        next((pos for pos in range(seq_len) if rows[i][pos][padding_ind] == 1), seq_len)
        for i in range(batch_size)
    ]

    keep_row = [1] * vocab_size
    zero_row = [0] * vocab_size
    pad_row = [0] * (vocab_size - 1) + [1]

    mul_mask = [
        [keep_row if pos < cut else zero_row for pos in range(seq_len)]
        for cut in first_pad
    ]
    add_mask = [
        [zero_row if pos < cut else pad_row for pos in range(seq_len)]
        for cut in first_pad
    ]

    pad_mask_mul = torch.tensor(mul_mask, dtype=torch.int64).to(device)
    pad_mask_add = torch.tensor(add_mask, dtype=torch.int64).to(device)
    return pad_mask_mul, pad_mask_add


def pad_after_end_token(g_output_t, pad_mask_mul, pad_mask_add):
    """Apply the two padding masks, then swap the first two dimensions.

    Computes ``g_output_t * pad_mask_mul + pad_mask_add`` and returns the
    result permuted with ``(1, 0, 2)``.
    """
    masked = g_output_t * pad_mask_mul + pad_mask_add
    return masked.permute(1, 0, 2)


def get_act_distribution(g_output, aut_seqs):
    """Sum both inputs over their first two dimensions.

    Returns:
        ``(generated_totals, authentic_totals)``, each reduced by two
        successive ``sum(0)`` calls.
    """
    generated_totals = g_output.sum(0).sum(0)
    authentic_totals = aut_seqs.sum(0).sum(0)
    return generated_totals, authentic_totals


def reverse_torch_to_list(seqs, vocab_num):
    """Cut every sequence at the first ``vocab_num`` token and shift kept tokens by +1."""
    result = []
    for seq in seqs:
        kept = []
        for tok in seq:
            if tok == vocab_num:
                break
            kept.append(tok + 1)
        result.append(kept)
    return result


def remove_end_token(seqs, vocab_num):
    """Cut every sequence at the first ``vocab_num + 1`` or ``0`` token.

    The accumulated result is printed after each processed sequence.
    """
    print('inside remove end token')
    stop_token = vocab_num + 1
    result = []
    for seq in seqs:
        kept = []
        for tok in seq:
            if tok == stop_token or tok == 0:
                break
            kept.append(tok)
        result.append(kept)
        print(result)

    return result


def write_generated_seqs(save_path, model, gen_seqs):
    """Append sequences to ``<save_path>result_<model>.txt``, space-separated, one per line."""
    target = save_path + 'result_' + model + '.txt'
    with open(target, 'a') as handle:
        for seq in gen_seqs:
            handle.write(' '.join(str(token) for token in seq) + '\n')

In [21]:
import datetime
import random
import torch
import time
import numpy as np
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader
from datetime import datetime, timedelta
import random


class GANs:
    """Training driver for a Transformer-encoder generator/discriminator pair.

    The class owns the hyper-parameters, builds both networks inside ``train``,
    alternates generator and discriminator updates, and records per-epoch
    metrics in ``gen_losses``, ``dis_losses``, ``d_accuracies`` and
    ``gen_accuracies`` (one entry per logged epoch, all four lists aligned).
    """

    def __init__(self, res_path, save_path, model, config, gen_num):
        """GAN Model and the variants

        Parameters:
            res_path  : path handed to the data-preparation helpers in ``run``
            save_path : prefix under which logs, checkpoints and stats are written
            model     : variant name; ``"gan"`` enables the activity-distribution loss
            config    : mapping holding the keys listed below
            gen_num   : forwarded to ``gen_data_from_rand`` at every checkpoint

        Keys read from ``config``:
            'seq_len'   : the longest sequence length in data (defaults to 25 when absent)
            'vocab_num' : the size of vocabulary
            'emb_size'  : embedding dimension
            'n_hid'     : the dimension of the feedforward network model in nn.TransformerEncoder
            'n_layer'   : the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
            'n_head_g'  : the number of heads in the multi-head-attention models of generator
            'n_head_d'  : the number of heads in the multi-head-attention models of discriminator
            'drop_out'  : the dropout value
            'gd_ratio'  : k value: the generator updates k times and discriminator updates 1 time
            'lr_gen'    : generator learning rate
            'lr_dis'    : discriminator learning rate
            'batch_size': mini-batch size of the data loaders
            'epochs'    : total epochs
            'seed'      : seed for the RNGs and the dataset split
            'device'    : torch device the models and tensors are moved to
            'train_size', 'valid_size', 'test_size' : lengths passed to ``random_split``

        """
        self.res_path = res_path
        self.save_path = save_path
        self.gen_num = gen_num
        self.mode = model

        # Previously hard-coded to 25 regardless of the configuration; the
        # default keeps older configs without a 'seq_len' entry working.
        self.seq_len = config.get('seq_len', 25)
        self.vocab_num = config['vocab_num']
        self.emb_size = config['emb_size']
        self.n_hid = config['n_hid']
        self.n_layer = config['n_layer']
        self.n_head_g = config['n_head_g']
        self.n_head_d = config['n_head_d']
        self.drop_out = config['drop_out']
        self.gd_ratio = config['gd_ratio']
        self.lr_gen = config['lr_gen']
        self.lr_dis = config['lr_dis']
        self.batch_size = config['batch_size']
        self.epochs = config['epochs']
        self.seed = config['seed']
        self.device = config['device']
        self.test_size = config['test_size']
        self.valid_size = config['valid_size']
        self.train_size = config['train_size']
        # one extra index on top of the vocabulary; the last index is the padding index
        self.n_inp = self.vocab_num + 1
        self.pad_ind = self.vocab_num
        self.gen_losses = []
        self.dis_losses = []
        self.d_accuracies = []
        self.gen_accuracies = []
        # accuracy of the most recent generator() call; averaged per epoch in train()
        self._last_gen_accuracy = 0.0

    def get_training_metrics(self):
        """Return ``(gen_losses, dis_losses, d_accuracies, gen_accuracies)``."""
        return self.gen_losses, self.dis_losses, self.d_accuracies, self.gen_accuracies

    def _noise_batch(self, rand_set, batch_idx, batch):
        """Return the rows of ``rand_set`` that belong to mini-batch ``batch_idx``, transposed.

        The loader yields ``self.batch_size`` samples per step, so batch ``k``
        starts at row ``k * self.batch_size``. Slicing from ``batch_idx`` itself
        (the old behaviour) made successive batches share all but one row.
        """
        start = batch_idx * self.batch_size
        return torch.transpose(rand_set[start:start + batch], 0, 1)

    def train(self, target, aut_data):
        """Calibrate the loss scale, then run the adversarial training loop.

        Parameters:
            target   : one-hot authentic data; wrapped with ``load_nar_data`` and split for training
            aut_data : authentic data in its original format; its test split is the
                       reference handed to ``eval_result``

        Side effects: writes the parameter log, saves both models and a
        discriminator-accuracy plot every 50 epochs, appends evaluation results
        under ``<save_path>stats/`` and fills the metric lists on ``self``.
        """
        random.seed(self.seed)
        np.random.seed(self.seed)
        # also seed torch so weight init, dropout and Gumbel noise are repeatable
        torch.manual_seed(self.seed)

        # initialize generator
        g_model = Generator(self.n_inp, self.emb_size, self.n_head_g, self.n_hid, self.n_layer, self.pad_ind, self.drop_out).to(self.device)
        # the optimizer of generator
        gd_optimizer = torch.optim.Adam(g_model.parameters(), lr=self.lr_gen, betas=(0.5, 0.999))

        # initialize discriminator
        emsize_d = self.vocab_num + 1
        dis_output_dim = 1
        d_model = Discriminator(dis_output_dim, emsize_d, self.n_head_d, self.n_hid, self.n_layer, self.drop_out).to(self.device)
        d_criterion = nn.BCELoss()
        d_optimizer = torch.optim.Adam(d_model.parameters(), lr=self.lr_dis, betas=(0.5, 0.999))

        # record the parameters
        para_list = [self.vocab_num, self.emb_size, self.n_head_g, self.n_hid, self.n_layer, self.pad_ind, self.drop_out, self.n_head_d,  self.batch_size, self.epochs, self.lr_dis]
        write_log(self.save_path, para_list, 'parameter_log.txt')

        # load the one-hot format of training data
        dataset = load_nar_data(target)
        train_data, _, _ = torch.utils.data.random_split(dataset, (self.train_size, self.valid_size, self.test_size), generator=torch.Generator().manual_seed(self.seed))
        train_dataloader = DataLoader(train_data, batch_size=self.batch_size, drop_last=False, shuffle=False, num_workers=1)

        # load the original format of test data for activity loss calculation
        dataset_2 = load_nar_data(aut_data)
        _, _, test_data = torch.utils.data.random_split(dataset_2, (self.train_size, self.valid_size, self.test_size), generator=torch.Generator().manual_seed(self.seed))
        test_dataloader = DataLoader(test_data, batch_size=self.batch_size, drop_last=False, shuffle=False, num_workers=1)
        # NOTE(review): only the FIRST batch (batch_size sequences) of the test split
        # becomes the evaluation reference - confirm this is intended.
        test_seqs = next(iter(test_dataloader)).tolist()
        test_list = reverse_torch_to_list(test_seqs, self.vocab_num)

        # run pre epochs if add activity loss
        pre_epoch = 25

        # log the discriminator's accuracies
        d_acc = []

        # only the generator-loss scale is used below; the activity scale is discarded
        _, mean_gen_loss = self.get_pre_exp_loss(pre_epoch, g_model, d_model, train_dataloader)

        n_batches = len(train_dataloader)

        for big_epoch in range(1, self.epochs + 1):
            g_model.train()
            d_model.train()

            # the discriminator is trained (and metrics are logged) every gd_ratio-th epoch
            update_dis = big_epoch % self.gd_ratio == 0

            # plain floats: summing graph-attached tensors would keep autograd history alive
            dis_total_loss = 0.0
            gen_total_loss = 0.0
            gen_acc_total = 0.0
            acc_i = 0

            # generate random sequences for generator input
            rand_set = generate_random_data(self.train_size, self.vocab_num, self.seq_len)
            rand_set = torch.tensor(rand_set, dtype=torch.int64).to(self.device)

            for i, item in enumerate(train_dataloader):
                # update generator
                dis_data_pos = item.to(self.device)
                batch = dis_data_pos.size()[0]

                # [LENGTH, BATCH_SIZE, VOCAB]
                dis_data_pos = dis_data_pos.permute(1, 0, 2)
                real_labels = torch.ones(batch, 1).to(self.device)
                random_data = self._noise_batch(rand_set, i, batch)

                # generate sequences from random_data
                gd_optimizer.zero_grad()
                gen_loss, g_output_t = self.generator(random_data, g_model, d_model, batch, d_criterion, real_labels)
                gen_acc_total += self._last_gen_accuracy
                g_output_t_act, g_authentic_act = get_act_distribution(g_output_t, dis_data_pos)
                act_loss = self.get_act_loss(g_output_t_act, g_authentic_act, batch)

                # rescale the adversarial loss by the value measured during pre-calibration
                gen_loss = gen_loss / mean_gen_loss
                # back-propagate the generator
                g_loss = act_loss + gen_loss
                g_loss.backward()
                gd_optimizer.step()
                gen_total_loss += g_loss.item()

                # update discriminator
                if update_dis:
                    d_optimizer.zero_grad()
                    dis_loss, gd_acc_neg, gd_acc_pos = self.discrminator(dis_data_pos, g_output_t, g_model, d_model, d_criterion, batch)

                    # back-propagate the discriminator
                    dis_loss.backward()
                    d_optimizer.step()
                    dis_total_loss += dis_loss.item()
                    acc_i += (gd_acc_neg + gd_acc_pos) / 2

            if update_dis:
                acc = acc_i / n_batches
                print(":::::::::::::::::::::accuracy disc", acc)

                d_acc.append(acc.detach().cpu())

                gen_total_loss = gen_total_loss / n_batches
                dis_total_loss = dis_total_loss / n_batches

                # Store the averages for plotting; all four lists grow together
                self.gen_losses.append(gen_total_loss)
                self.dis_losses.append(dis_total_loss)
                self.d_accuracies.append(acc.item())
                self.gen_accuracies.append(gen_acc_total / n_batches)
                # d_acc_real / d_acc_fake are the values of the last mini-batch of the epoch
                print('ad epoch {:3d} |  g_loss {:5.4f} | d_loss {:5.4f} | d_acc_real {:5.2f} | d_acc_fake {:5.2f} | d_acc {:5.2f}'
                        .format(big_epoch, gen_total_loss, dis_total_loss, gd_acc_pos, gd_acc_neg, acc))

            # generate and evaluate samples every 50 epochs
            if big_epoch % 50 == 0:
                torch.save(g_model, self.save_path + str(big_epoch)+'g_model.pt')
                torch.save(d_model, self.save_path + str(big_epoch)+'d_model.pt')
                plot_loss(self.save_path, d_acc, str(big_epoch)+'d_acc.png', 'd_acc')
                g_model.eval()

                # generate synthetic sequences using the generator and save the sequences
                with torch.no_grad():
                    gen_list = gen_data_from_rand(self.gen_num, g_model, self.vocab_num, self.device, str(big_epoch) + '_result_trans', self.save_path, self.seq_len)
                # evaluate and record the results
                with open(self.save_path +'stats/' + 'dif_log.txt', 'a') as filehandle:
                    filehandle.write('%s\n' % big_epoch)
                eval_result(self.save_path +'stats/', gen_list, test_list)

    def get_act_loss(self, g_output_t_act, g_authentic_act, batch_size):
        """get the additional activity distribution loss between generated sequences and real sequences

        For mode ``"gan"`` this is the MSE between the two activity vectors
        divided by ``batch_size``. For every other mode a zero tensor is
        returned (not the int ``0``), so callers can always use ``.item()``
        and add it to another loss tensor.
        """
        if self.mode == "gan":
            act_loss_criterion = nn.MSELoss()
            act_loss = act_loss_criterion(g_output_t_act.float(), g_authentic_act.float()) / (batch_size)
        else:
            act_loss = torch.zeros((), dtype=torch.float, device=g_output_t_act.device)

        # return the activity distribution loss
        return act_loss

    def generator(self, data, g_model, d_model, batch, d_criterion, real_labels):
        """Transformer encoder-based Generator

        Runs ``g_model`` on ``data``, samples one-hot tokens with a
        straight-through Gumbel-softmax, pads what follows the end token and
        scores the result with ``d_model`` against ``real_labels``.

        Returns:
            (gen_loss, g_output_t) - the adversarial loss and the padded samples.

        The fraction of samples the discriminator scored above 0.5 is kept in
        ``self._last_gen_accuracy``; ``train`` averages it per epoch. It is no
        longer appended to ``gen_accuracies`` on every call, because that list
        must stay aligned with the per-epoch loss lists.
        """
        mask_len = data.size()[0]
        src_mask = g_model.generate_square_subsequent_mask(mask_len).to(self.device)
        g_output = g_model(data, src_mask)
        g_output_st = g_output.permute(1, 0, 2)

        # use straight-through Gumbel-softmax to obtain gradient from discriminator
        g_output_t = F.gumbel_softmax(g_output_st, tau=1, hard=True)

        # the tokens generated after the end token will be padded
        pad_mask_mul, pad_mask_add = get_pad_mask(g_output_t, batch, self.seq_len + 1, self.vocab_num + 1, self.pad_ind, self.device)
        g_output_t = pad_after_end_token(g_output_t, pad_mask_mul, pad_mask_add)

        # generator loss is given by discriminator's prediction
        d_predict = d_model(g_output_t, src_mask)
        gen_loss = d_criterion(d_predict, real_labels)
        self._last_gen_accuracy = (d_predict > 0.5).float().mean().item()

        # return the generator loss, and the generated sequences
        return gen_loss, g_output_t

    def discrminator(self, dis_data_pos, g_output_t, g_model, d_model, d_criterion, batch):
        """Transformer encoder-based Discriminator

        Scores the authentic batch and the (detached) generated batch and sums
        both losses.

        Returns:
            (dis_loss, gd_acc_neg, gd_acc_pos) - the total loss and the share of
            correctly classified generated / authentic samples.
        """
        mask_len = dis_data_pos.size()[0]
        src_mask = g_model.generate_square_subsequent_mask(mask_len).to(self.device)
        dis_label_pos, dis_label_neg = prepare_dis_label(batch)
        dis_label_neg = dis_label_neg.to(self.device)
        dis_label_pos = dis_label_pos.to(self.device)

        dis_predict_pos = d_model(dis_data_pos, src_mask)
        # detach: the discriminator step must not back-propagate into the generator
        dis_predict_neg = d_model(g_output_t.detach(), src_mask)
        dis_loss_pos = d_criterion(dis_predict_pos, dis_label_pos.reshape(-1, 1))
        dis_loss_neg = d_criterion(dis_predict_neg, dis_label_neg.reshape(-1, 1))

        predict_neg = (dis_predict_neg.flatten().round())
        gd_acc_neg = (predict_neg == dis_label_neg.flatten()).sum() / batch
        predict_pos = (dis_predict_pos.flatten().round())
        gd_acc_pos = (predict_pos == dis_label_pos.flatten()).sum() / batch

        dis_loss = dis_loss_pos + dis_loss_neg

        # return the discriminator loss, the accuracy of negative samples and positive samples
        return dis_loss, gd_acc_neg, gd_acc_pos

    def get_pre_exp_loss(self, pre_epoch, g_model, d_model, train_dataloader):
        """Calculate the expectation loss values.
        Generator loss and activity distribution loss are expected in the same scale when added together.
        total_loss = weight*act_loss + gen_loss
        where:
        weight -> mean(gen_loss)/mean(act_loss)

        No parameters are updated here: the networks are only evaluated, so the
        forward passes run under ``torch.no_grad()``.

        NOTE(review): the accumulated losses are divided by ``pre_epoch`` only,
        not by the number of mini-batches, so the returned values are per-epoch
        sums rather than per-batch means. ``train`` divides the generator loss by
        this value; left unchanged because it determines the loss weighting the
        model was trained with - confirm whether a true mean was intended.
        """
        total_act_loss = 0
        total_gen_loss = 0
        d_criterion = nn.BCELoss()
        for epoch in range(pre_epoch):
            print('pre epoch' + str(epoch))
            g_model.train()
            d_model.train()
            rand_set = generate_random_data(self.train_size, self.vocab_num, self.seq_len)
            rand_set = torch.tensor(rand_set, dtype=torch.int64).to(self.device)
            with torch.no_grad():
                for i, item in enumerate(train_dataloader):
                    dis_data_pos = item.to(self.device)
                    batch = dis_data_pos.size()[0]
                    real_labels = torch.ones(batch, 1).to(self.device)
                    data = self._noise_batch(rand_set, i, batch)
                    gen_loss, pre_g_output_t = self.generator(data, g_model, d_model, batch, d_criterion, real_labels)
                    pre_g_output_t_act, g_authentic_act = get_act_distribution(pre_g_output_t, dis_data_pos)
                    act_loss = self.get_act_loss(pre_g_output_t_act, g_authentic_act, batch)
                    total_act_loss += act_loss.item()
                    total_gen_loss += gen_loss.item()
        mean_act = total_act_loss / pre_epoch
        mean_gen = total_gen_loss / pre_epoch

        # return the mean value of the activity distribution loss, and the generator loss
        return mean_act, mean_gen

    def run(self):
        """Prepare the one-hot and original-format data from ``res_path`` and start training."""
        aut_onehot_data = prepare_onehot_aut_data(self.res_path, self.vocab_num, self.seq_len)
        aut_data = prepare_nar_data(self.res_path, self.seq_len, self.vocab_num)
        self.train(aut_onehot_data, aut_data)

In [None]:
import os
import torch
import numpy as np
import random
from datetime import datetime
import csv

# Seed NumPy, Python's `random` and PyTorch with one shared value so that
# repeated runs of the notebook are reproducible.
seed = 88
for _seed_fn in (np.random.seed, random.seed, torch.manual_seed):
    _seed_fn(seed)


def get_config():
    """
    Build the configuration dictionary handed to ``GANs``.

    It bundles the dataset split sizes, the Transformer hyper-parameters of the
    generator and discriminator, the optimisation settings, the module-level
    ``seed`` and the torch device (CUDA when available, otherwise CPU).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return dict(
        train_size=6917,
        test_size=865,
        valid_size=865,
        seq_len=25,      # the longest sequence length in data
        vocab_num=24,
        emb_size=8,      # embedding dimension
        n_hid=32,        # the dimension of the feedforward network model in nn.TransformerEncoder
        n_layer=2,       # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
        n_head_g=4,      # the number of heads in the multi-head-attention models of generator
        n_head_d=1,      # the number of heads in the multi-head-attention models of discriminator
        drop_out=0.1,    # the dropout value
        batch_size=32,
        gd_ratio=2,      # k value: the generator updates k times and discriminator updates 1 time
        lr_gen=0.001,    # generator learning rate
        lr_dis=0.001,    # discriminator learning rate
        epochs=600,      # total epochs
        seed=seed,
        device=device,
    )
def run():
    """Train the GAN on the configured dataset and export the per-epoch metrics.

    A timestamped directory ``result/<MM-DD-HH-MM>_<model>/`` (with a ``stats/``
    sub-directory) is created, a ``GANs`` instance is trained, and the recorded
    metrics are written to ``training_metrics.csv`` inside that directory.

    Compared with the earlier version, the raw metric lists are no longer
    dumped to stdout (they can hold many thousands of values), the metrics are
    fetched once, and the ``Epoch`` column holds the real epoch number: metrics
    are only logged every ``gd_ratio``-th epoch, so row k belongs to epoch
    ``k * gd_ratio``.
    """
    # Define your dataset and model details directly
    seq_path = 'Dataset/bpi25.txt'
    model_name = 'gan'
    gen_num = 1000

    # Generate a timestamped directory to save the results
    save_time = datetime.now().strftime("%m-%d-%H-%M")
    save_path = f'result/{save_time}_{model_name}/'
    save_path_res = save_path + 'stats/'
    print(save_path_res)
    # creates save_path as well, since it is the parent of the stats directory
    os.makedirs(save_path_res, exist_ok=True)

    config = get_config()
    gan = GANs(seq_path, save_path, model_name, config, gen_num)
    gan.run()

    gen_losses, dis_losses, d_accuracies, gen_accuracies = gan.get_training_metrics()
    metrics_path = save_path + 'training_metrics.csv'
    with open(metrics_path, 'w', newline='') as csvfile:
        metric_writer = csv.writer(csvfile, delimiter=',',
                                   quotechar='"', quoting=csv.QUOTE_MINIMAL)
        # Write headers
        metric_writer.writerow(['Epoch', 'Generator Loss', 'Discriminator Loss', 'Discriminator Accuracy', 'Generator Accuracy'])

        # Write one row per logged epoch; zip stops at the shortest list, so a
        # length mismatch between the metric lists cannot raise an IndexError
        logged = zip(gen_losses, dis_losses, d_accuracies, gen_accuracies)
        for row_idx, (g_loss, d_loss, d_acc, g_acc) in enumerate(logged, start=1):
            metric_writer.writerow([row_idx * config['gd_ratio'], g_loss, d_loss, d_acc, g_acc])


# Script-style entry point: inside a notebook kernel `__name__` is "__main__",
# so executing this cell starts run() defined above.
if __name__ == '__main__':
    run()


Output hidden; open in https://colab.research.google.com to view.

In [None]:
import matplotlib.pyplot as plt

# Plot generator and discriminator accuracy against the logged epoch.
# NOTE(review): `data` is not defined in the visible cells - presumably a table
# loaded from training_metrics.csv with these column names; confirm it is
# created by an earlier cell so that Restart & Run All works.
fig, ax = plt.subplots(figsize=(14, 7))

# One line per metric; the legend label is identical to the column name.
for column, marker in (('Generator Accuracy', 'o'), ('Discriminator Accuracy', 'x')):
    ax.plot(data['Epoch'], data[column], label=column, marker=marker)

# Labeling
ax.set_title('Generator and Discriminator Accuracy over Epochs')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
ax.grid(True)

plt.show()
