In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_pickle(f"dataset/3p6-selected.pkl", compression='zip')

In [3]:
train_df = df[df["time"] != 303]
train_df = train_df.drop(columns=["x","y","z","px","py","pz"], axis=1)

In [4]:
test_df = df[df["time"] == 303]
test_df = test_df.drop(columns=["x","y","z","px","py","pz"], axis=1)

In [5]:
train_df.shape, test_df.shape

((253430, 5), (50686, 5))

In [6]:
df[['x', 'y', 'z']].drop_duplicates().count()[0]

50686

In [7]:
import sdv

In [8]:
print(sdv.__version__)

1.0.0


In [9]:
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()

In [44]:
metadata.detect_from_dataframe(data=train_df)
python_dict = metadata.to_dict()

In [45]:
python_dict = metadata.to_dict()
python_dict

{'columns': {'vx': {'sdtype': 'numerical'},
  'vy': {'sdtype': 'numerical'},
  'vz': {'sdtype': 'numerical'},
  'time': {'sdtype': 'numerical'},
  'distance': {'sdtype': 'numerical'}},
 'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1'}

In [40]:
# metadata.update_column(
#     column_name='xyz_id',
#     sdtype='id')
# metadata.set_primary_key(column_name='xyz_id')

In [41]:
metadata.validate()

In [10]:
from sdv.single_table import CopulaGANSynthesizer

In [47]:
synthesizer = CopulaGANSynthesizer(metadata)
synthesizer.fit(train_df)

In [48]:
synthetic_data = synthesizer.sample(num_rows=10)

In [49]:
synthetic_data[(synthetic_data["time"]==302) & (synthetic_data["distance"]==74)]

Unnamed: 0,vx,vy,vz,time,distance


In [48]:
train_df[(train_df["xyz_id"]==30164)]

Unnamed: 0,vx,vy,vz,time,distance,xyz_id
1872348,0.47,0.01,0.06,301,82.0,30164
3291556,0.5,-0.02,-0.01,305,82.0,30164
9728678,0.48,0.01,0.07,300,82.0,30164
20727540,0.48,0.01,0.06,302,82.0,30164
30155136,0.49,-0.02,-0.02,304,82.0,30164


In [49]:
train_df[(train_df["time"]==302) & (train_df["distance"]==74)]

Unnamed: 0,vx,vy,vz,time,distance,xyz_id
20680297,0.49,-0.00,0.04,302,74.0,22048
20680307,0.48,0.01,0.03,302,74.0,29968
20680358,0.50,-0.01,0.05,302,74.0,19654
20680373,0.47,0.01,0.01,302,74.0,31534
20680439,0.47,0.01,0.00,302,74.0,33100
...,...,...,...,...,...,...
20730085,0.53,-0.01,0.01,302,74.0,18357
20730151,0.53,-0.02,0.00,302,74.0,19923
20730218,0.54,-0.01,-0.00,302,74.0,22281
20730225,0.50,0.02,-0.00,302,74.0,27825


In [50]:
synthesizer.save(
    filepath=f"temp/3p6-learned-1.pkl"
)

In [11]:
synthesizer_saved = CopulaGANSynthesizer.load(
    filepath=f"temp/3p6-learned-1.pkl"
)

In [12]:
from sdv.sampling import Condition

In [62]:
generate_test_data = Condition(
    num_rows=20,
    column_values={'time': 301, 'distance': 82}
)


In [63]:
synthetic_data = synthesizer_saved.sample_from_conditions(
    conditions=[generate_test_data],
    output_file_path=f"result/test1.csv"
)

Sampling conditions: 100%|██████████| 20/20 [00:00<00:00, 20.06it/s]


In [61]:
train_df[(train_df["time"]==301) & (train_df["distance"]==82)]

Unnamed: 0,vx,vy,vz,time,distance
1824916,0.49,0.01,0.05,301,82.0
1824917,0.49,0.01,0.06,301,82.0
1824918,0.49,0.01,0.06,301,82.0
1824919,0.49,0.01,0.07,301,82.0
1824988,0.48,0.01,0.04,301,82.0
...,...,...,...,...,...
1874977,0.49,-0.03,0.09,301,82.0
1875020,0.52,-0.02,0.01,301,82.0
1875086,0.52,-0.03,-0.00,301,82.0
1875152,0.52,-0.03,0.00,301,82.0


In [66]:
test_df

Unnamed: 0,vx,vy,vz,time,distance
25697802,0.48,0.01,0.03,303,150.0
25697803,0.48,0.01,0.03,303,146.0
25697804,0.47,0.01,0.03,303,143.0
25697805,0.47,0.01,0.03,303,140.0
25697806,0.47,0.02,0.03,303,137.0
...,...,...,...,...,...
25748483,0.47,-0.01,0.06,303,141.0
25748484,0.46,-0.01,0.06,303,144.0
25748485,0.46,-0.01,0.06,303,147.0
25748486,0.46,-0.01,0.05,303,151.0


In [69]:
synthetic_data

Unnamed: 0,vx,vy,vz,time,distance
0,0.28,-0.02,0.04,301,82
1,0.42,0.01,0.04,301,82
2,0.62,-0.02,0.01,301,82
3,0.49,0.04,0.04,301,82
4,0.5,0.01,0.01,301,82
5,0.23,-0.07,0.04,301,82
6,0.48,0.06,0.07,301,82
7,0.48,0.03,0.06,301,82
8,0.5,-0.04,0.05,301,82
9,0.49,-0.03,0.02,301,82


In [14]:
data = []
# try:
for i in range(100):
    idx = i+25697802
    generate_test_data = Condition(
    num_rows=1,
    column_values={'time': 301, 'distance': test_df.loc[idx]["distance"]})
    synthetic_data = synthesizer_saved.sample_from_conditions(
    conditions=[generate_test_data], max_tries_per_batch=2000)
    synthetic_data["x"] = df.loc[idx]["x"]
    synthetic_data["y"] = df.loc[idx]["y"]
    synthetic_data["z"] = df.loc[idx]["z"]
    synthetic_data["id"] = idx
    data.append(synthetic_data)



# except:
#     # print(data)
#     pass

print("last id", idx)
df_result = pd.concat(data, ignore_index=True)
df_result.columns=["vx","vy","vz","time","distance","x","y","z","id"]
df_result.to_csv(f"result/total.csv")


Sampling conditions: 100%|██████████| 1/1 [00:17<00:00, 17.15s/it]
Sampling conditions: 100%|██████████| 1/1 [00:14<00:00, 14.32s/it]
Sampling conditions: 100%|██████████| 1/1 [00:02<00:00,  2.44s/it]
Sampling conditions: 100%|██████████| 1/1 [00:00<00:00,  1.08it/s]
Sampling conditions: 100%|██████████| 1/1 [00:00<00:00,  1.50it/s]
Sampling conditions: 100%|██████████| 1/1 [00:01<00:00,  1.06s/it]
Sampling conditions: 100%|██████████| 1/1 [00:06<00:00,  6.84s/it]
Sampling conditions: 100%|██████████| 1/1 [00:16<00:00, 16.73s/it]
Sampling conditions: 100%|██████████| 1/1 [00:05<00:00,  5.30s/it]
Sampling conditions: 100%|██████████| 1/1 [00:15<00:00, 15.39s/it]
Sampling conditions: 100%|██████████| 1/1 [00:00<00:00,  1.98it/s]
Sampling conditions: 100%|██████████| 1/1 [00:03<00:00,  3.27s/it]
Sampling conditions: 100%|██████████| 1/1 [00:07<00:00,  7.44s/it]
Sampling conditions: 100%|██████████| 1/1 [00:01<00:00,  1.74s/it]
Sampling conditions: 100%|██████████| 1/1 [00:01<00:00,  1.20s

ValueError: Unable to sample any rows for the given conditions. Try increasing `max_tries_per_batch` (currently: 2000). Note that increasing this value will also increase the sampling time.

In [22]:
df_result = pd.concat(data, ignore_index=True)
df_result.columns=["vx","vy","vz","time","distance","x","y","z","id"]
df_result.to_csv(f"result/total.csv", index=False)

In [20]:
orig_data = df.loc[25697802:25697861]

In [23]:
orig_data.to_csv(f"result/total-orig.csv", index=False)

In [24]:
data1 = df_result.drop(["x","y","z","id","time","distance"], axis=1)
data2 = orig_data.drop(["x","y","z","px","py","pz","time","distance"], axis=1)

In [25]:
data1.shape, data2.shape

((60, 3), (60, 3))

In [31]:
metadata.detect_from_dataframe(data=data2)
python_dict = metadata.to_dict()

In [29]:
from sdmetrics.reports import utils

In [37]:
fig = utils.get_column_pair_plot(
    real_data=data2,
    synthetic_data=data1,
    column_names=['vx','vy'],
    metadata=python_dict
)
fig.show()

In [87]:
generate_test_data = Condition(
    num_rows=1,
    column_values={'time': 301, 'distance': 150})
synthetic_data = synthesizer_saved.sample_from_conditions(
    conditions=[generate_test_data],max_tries_per_batch=1000)
synthetic_data["x"] = 2
synthetic_data["y"] = 2
synthetic_data["z"] = 6
synthetic_data["id"] = 6

Sampling conditions: 100%|██████████| 1/1 [00:00<00:00,  1.68it/s]


In [88]:
synthetic_data.head()

Unnamed: 0,vx,vy,vz,time,distance,x,y,z,id
0,0.5,0.03,-0.04,301,150,2,2,6,6
