In [1]:
import uuid

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture


In [None]:
# Load the raw transfer log; the CSV's unnamed leading column is exposed as `record_id`.
df_raw = pd.read_csv('og-transfer.csv').rename(columns={'Unnamed: 0': 'record_id'})

# Parse both timestamps, order the log by request time, and renumber the records.
df_raw['request_time'] = pd.to_datetime(df_raw['request_time'])
df_raw['complete_time'] = pd.to_datetime(df_raw['complete_time'])
df_raw = df_raw.sort_values('request_time').reset_index(drop=True)
df_raw['record_id'] = np.arange(len(df_raw))

# Transfer duration in seconds (completion minus request).
df_raw['duration'] = df_raw['complete_time'].sub(df_raw['request_time']).dt.total_seconds()

# Per-row slack: the part of `st_files` not covered by the four outcome counters.
# Kept as a plain array so it can be resampled onto the synthetic rows later.
st_files_diff = (
    df_raw['st_files']
    .sub(df_raw[['st_successful', 'st_failed', 'st_expired', 'st_canceled']].sum(axis=1))
    .to_numpy()
)

# Modelling frame: identifiers, timestamps and flag columns are left out of the GMM input.
model_df = df_raw.drop(
    columns=[
        'record_id', 'grp_uuid', 'request_time', 'complete_time',
        'encrypt_data', 'grp_delete', 'st_skipped_errors',
    ]
)

# Categorical columns are not modelled; they are resampled from real rows and
# stitched back onto the synthetic data.
categorical_cols = ['user_id', 'grp_status', 'src_host_ep_id', 'dst_host_ep_id']

# Everything else in the modelling frame is a continuous transfer metric for the GMM.
numeric_cols = model_df.columns[~model_df.columns.isin(categorical_cols)].tolist()

# log1p-compress the non-negative metrics, then standardise them.
X_log = model_df[numeric_cols].clip(lower=0).apply(np.log1p)
scaler = StandardScaler().fit(X_log)
X_scaled = scaler.transform(X_log)

# Four-component, full-covariance mixture; the fixed seed keeps the fit reproducible.
gmm = GaussianMixture(n_components=4, covariance_type='full', random_state=42)
gmm.fit(X_scaled)


0,1,2
,n_components,4
,covariance_type,'full'
,tol,0.001
,reg_covar,1e-06
,max_iter,100
,n_init,1
,init_params,'kmeans'
,weights_init,
,means_init,
,precisions_init,


In [3]:
# Number of synthetic rows, plus a seeded generator for the slack resampling further down.
num_samples = 1000
rng = np.random.default_rng(42)

# Sample the fitted mixture, then invert the standardisation and the log1p transform.
synthetic_scaled, _ = gmm.sample(num_samples)
synthetic_log = pd.DataFrame(scaler.inverse_transform(synthetic_scaled), columns=numeric_cols)
synthetic_numeric = synthetic_log.apply(np.expm1).clip(lower=0)

# Bootstrap real rows and blend them with the mixture draws:
# `blend_weight` goes to the GMM sample, the remainder to the real record.
sampled_rows = df_raw.sample(n=num_samples, replace=True, random_state=43).reset_index(drop=True)
blend_weight = 0.3
gmm_values = synthetic_numeric[numeric_cols].to_numpy()
real_values = sampled_rows[numeric_cols].to_numpy()
synthetic_numeric = pd.DataFrame(
    blend_weight * gmm_values + (1 - blend_weight) * real_values,
    columns=numeric_cols,
)

# These two counters are taken verbatim from the bootstrapped rows.
synthetic_numeric['st_dirs'] = sampled_rows['st_dirs'].values
synthetic_numeric['st_failed'] = sampled_rows['st_failed'].values

# Byte counts get a second blend on top of the one above: 10% of the
# already-blended value, 90% of the real record.
synthetic_numeric['st_bytes_xfered'] = (
    synthetic_numeric['st_bytes_xfered'].mul(0.1)
    .add(sampled_rows['st_bytes_xfered'].mul(0.9))
)

# Keep every metric inside the range observed in the real data.
real_min, real_max = model_df[numeric_cols].min(), model_df[numeric_cols].max()
synthetic_numeric = synthetic_numeric.clip(lower=real_min, upper=real_max, axis=1)

# Rescale selected columns so their mean matches the real mean; columns whose
# synthetic mean is not positive are left alone.
mean_align_cols = ['duration', 'st_successful', 'st_expired', 'st_canceled', 'st_bytes_xfered', 'st_faults']
real_means = model_df[mean_align_cols].mean()
for col, target_mean in real_means.items():
    synth_mean = synthetic_numeric[col].mean()
    if synth_mean > 0:
        synthetic_numeric[col] = synthetic_numeric[col] * (target_mean / synth_mean)

# The rescaling can push values past the observed range again, so clip once more.
synthetic_numeric = synthetic_numeric.clip(lower=real_min, upper=real_max, axis=1)

# Count-like columns become non-negative integers.
int_cols = ['st_files', 'st_dirs', 'st_successful', 'st_failed', 'st_expired', 'st_canceled', 'st_faults']
rounded_counts = synthetic_numeric[int_cols].round().astype(int).clip(lower=0)
synthetic_numeric[int_cols] = rounded_counts

# Rebuild `st_files` as (sum of outcome counters) + (slack resampled from the real data).
# NOTE(review): the clip below is independent of the component sum, so for rows that hit
# either bound the "outcomes + slack" relationship may no longer hold -- confirm this is intended.
component_sum = (
    synthetic_numeric[['st_successful', 'st_failed', 'st_expired', 'st_canceled']]
    .sum(axis=1)
    .to_numpy()
)
diff_sample = rng.choice(st_files_diff, size=num_samples, replace=True)
synthetic_numeric['st_files'] = component_sum + diff_sample
synthetic_numeric['st_files'] = (
    synthetic_numeric['st_files'].clip(lower=0, upper=real_max['st_files']).astype(int)
)

# Bytes are whole, non-negative 64-bit integers; durations are at least one second.
synthetic_numeric['st_bytes_xfered'] = (
    synthetic_numeric['st_bytes_xfered'].round().clip(lower=0).astype('int64')
)
synthetic_numeric['duration'] = synthetic_numeric['duration'].clip(lower=1)


In [4]:
# Completion time = the bootstrapped request time plus the synthetic duration,
# rounded up to whole seconds.
duration_seconds = np.ceil(synthetic_numeric['duration']).astype(int)
request_times = sampled_rows['request_time']
complete_times = request_times + pd.to_timedelta(duration_seconds, unit='s')

# Final column layout of the exported file.
columns_order = [
    'Unnamed: 0', 'grp_uuid', 'user_id', 'request_time', 'grp_status', 'encrypt_data',
    'grp_delete', 'st_files', 'st_dirs', 'st_successful', 'st_failed', 'st_expired',
    'st_canceled', 'st_bytes_xfered', 'st_faults', 'st_skipped_errors', 'src_host_ep_id',
    'dst_host_ep_id', 'complete_time',
]

# Columns carried over unchanged from the bootstrapped real rows ...
carried_over = sampled_rows[[
    'user_id', 'request_time', 'grp_status', 'encrypt_data', 'grp_delete',
    'st_skipped_errors', 'src_host_ep_id', 'dst_host_ep_id',
]]
# ... and the generated transfer metrics.
generated = synthetic_numeric[[
    'st_files', 'st_dirs', 'st_successful', 'st_failed', 'st_expired',
    'st_canceled', 'st_bytes_xfered', 'st_faults',
]]

# Both frames share the same 0..n-1 index, so a column-wise concat lines them up row by row.
# Every row gets a fresh random UUID; rows are then ordered by request time.
synthetic_dataset = (
    pd.concat([carried_over, generated], axis=1)
    .assign(
        grp_uuid=[str(uuid.uuid4()) for _ in range(num_samples)],
        complete_time=complete_times,
    )
    .sort_values('request_time')
    .reset_index(drop=True)
)

# Materialise the positional index as an explicit leading 'Unnamed: 0' column,
# apply the final layout, and write the CSV without the pandas index.
synthetic_dataset.insert(0, 'Unnamed: 0', synthetic_dataset.index)
synthetic_dataset = synthetic_dataset[columns_order]
synthetic_dataset.to_csv('synth-transfer.csv', index=False)
synthetic_dataset.head()


Unnamed: 0.1,Unnamed: 0,grp_uuid,user_id,request_time,grp_status,encrypt_data,grp_delete,st_files,st_dirs,st_successful,st_failed,st_expired,st_canceled,st_bytes_xfered,st_faults,st_skipped_errors,src_host_ep_id,dst_host_ep_id,complete_time
0,0,0e746fdd-f78b-4a07-8db1-b2d9b7b9e2f1,1,2010-07-07 18:38:18.356061,2,False,False,4,0,4,0,0,0,236006123,0,0,3,2,2010-07-07 18:39:14.356061
1,1,41f23433-f3b7-42a7-b0c2-5b4b8738a78a,1,2010-07-07 18:38:18.356061,2,False,False,5,0,4,0,1,0,236006631,113,0,3,2,2010-07-07 18:51:20.356061
2,2,56b45453-c9d8-4bd7-816d-b5d3a7805d53,1,2010-07-07 19:04:20.249963,2,False,False,4261,0,4261,0,0,0,236006122702,124,0,3,2,2010-07-07 20:22:31.249963
3,3,3abadf8c-5c06-488c-a65c-14ebb4ba65a4,1,2010-07-08 15:24:40.780741,2,False,False,9,0,9,0,0,0,236006123,1,0,3,2,2010-07-08 15:28:43.780741
4,4,6761ebc4-7c65-4f8a-94c0-483cc9040d33,1,2010-07-08 15:24:40.780741,2,False,False,6,0,4,0,1,1,236006123,682,0,3,2,2010-07-08 18:28:32.780741


In [5]:
# Real data: a copy of the raw frame with the duration recomputed under the report's column name.
real_numeric = df_raw.assign(
    duration_seconds=lambda frame: (frame['complete_time'] - frame['request_time']).dt.total_seconds()
)

# Synthetic data: make sure both timestamps are datetimes, then derive the same duration column.
synth_numeric = synthetic_dataset.assign(
    request_time=lambda frame: pd.to_datetime(frame['request_time']),
    complete_time=lambda frame: pd.to_datetime(frame['complete_time']),
    duration_seconds=lambda frame: (frame['complete_time'] - frame['request_time']).dt.total_seconds(),
)

# Metrics compared between the real and the synthetic data.
numeric_cols_report = [
    'duration_seconds',
    'st_files',
    'st_dirs',
    'st_successful',
    'st_failed',
    'st_expired',
    'st_canceled',
    'st_bytes_xfered',
    'st_faults',
]

# Side-by-side mean and population standard deviation (ddof=0) per metric.
real_report = real_numeric[numeric_cols_report]
synth_report = synth_numeric[numeric_cols_report]
comparison = pd.DataFrame({
    'real_mean': real_report.mean(),
    'synthetic_mean': synth_report.mean(),
    'real_std': real_report.std(ddof=0),
    'synthetic_std': synth_report.std(ddof=0),
})

# Relative difference of the synthetic statistic versus the real one, in percent.
for stat in ('mean', 'std'):
    real_stat = comparison[f'real_{stat}']
    synth_stat = comparison[f'synthetic_{stat}']
    comparison[f'{stat}_diff_pct'] = ((synth_stat - real_stat) / real_stat) * 100

comparison.round(2)


Unnamed: 0,real_mean,synthetic_mean,real_std,synthetic_std,mean_diff_pct,std_diff_pct
duration_seconds,15305.18,15305.69,32051.21,30104.81,0.0,-6.07
st_files,18256.51,18256.37,107930.6,83320.12,-0.0,-22.8
st_dirs,0.16,0.16,0.38,0.39,4.43,4.42
st_successful,10164.0,10163.98,70957.69,52657.63,-0.0,-25.79
st_failed,0.01,0.0,0.08,0.06,-33.33,-18.27
st_expired,1573.03,1572.96,31223.49,31199.51,-0.0,-0.08
st_canceled,6519.63,6519.58,54103.11,44008.31,-0.0,-18.66
st_bytes_xfered,167524200000.0,167524200000.0,1719448000000.0,968010000000.0,-0.0,-43.7
st_faults,3476.77,3476.76,23348.6,20424.94,-0.0,-12.52
