In [1]:
import pandas as pd
import numpy as np

In [33]:
df = pd.read_pickle(f"dataset/3p6-selected.pkl", compression='zip')

In [29]:
df.groupby(["xyz_id"])["distance"].nunique()

xyz_id
0        1
1        1
2        1
3        1
4        1
        ..
50681    1
50682    1
50683    1
50684    1
50685    1
Name: distance, Length: 50686, dtype: int64

In [31]:
df['xyz_id'] = df.groupby(['x', 'y', 'z']).ngroup()

In [34]:
df.head()

Unnamed: 0,x,y,z,vx,vy,vz,px,py,pz,time,distance
1824696,-117.0,87.0,-33.0,0.48,0.02,0.04,-0.57,-0.11,-2.17,301,150.0
1824697,-113.0,87.0,-33.0,0.48,0.02,0.04,-0.5,-0.4,-2.1,301,146.0
1824698,-109.0,87.0,-33.0,0.48,0.02,0.04,-0.45,-0.58,-2.01,301,143.0
1824699,-105.0,87.0,-33.0,0.48,0.02,0.04,-0.42,-0.67,-1.91,301,140.0
1824700,-101.0,87.0,-33.0,0.48,0.02,0.05,-0.36,-0.69,-1.8,301,137.0


In [35]:
df.shape

(304116, 11)

In [12]:
df.time.unique()

array([301, 305, 300, 302, 303, 304])

In [36]:
train_df = df[df["time"] != 303]
train_df = train_df.drop(columns=["x","y","z","px","py","pz"], axis=1)

In [37]:
test_df = df[df["time"] == 303]
test_df = test_df.drop(columns=["x","y","z","px","py","pz"], axis=1)

In [38]:
train_df.shape, test_df.shape

((253430, 5), (50686, 5))

In [39]:
df[['x', 'y', 'z']].drop_duplicates().count()[0]

50686

In [38]:
centroid = np.array([0, 0, 0])

# calculate the Euclidean distance between each point and the centroid
df['distance_from_centroid'] = np.sqrt(
    (df['x'] - centroid[0])**2 +
    (df['y'] - centroid[1])**2 +
    (df['z'] - centroid[2])**2
)


In [39]:
df['distance'].nunique(), df['distance_from_centroid'].nunique()

(155, 10387)

In [40]:
train_df.head()

Unnamed: 0,vx,vy,vz,time,distance
1824696,0.48,0.02,0.04,301,150.0
1824697,0.48,0.02,0.04,301,146.0
1824698,0.48,0.02,0.04,301,143.0
1824699,0.48,0.02,0.04,301,140.0
1824700,0.48,0.02,0.05,301,137.0


In [None]:
# set constraint for time

In [41]:
import sdv

In [42]:
print(sdv.__version__)

1.0.0


In [43]:
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()

In [44]:
metadata.detect_from_dataframe(data=train_df)

In [45]:
python_dict = metadata.to_dict()
python_dict

{'columns': {'vx': {'sdtype': 'numerical'},
  'vy': {'sdtype': 'numerical'},
  'vz': {'sdtype': 'numerical'},
  'time': {'sdtype': 'numerical'},
  'distance': {'sdtype': 'numerical'}},
 'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1'}

In [40]:
# metadata.update_column(
#     column_name='xyz_id',
#     sdtype='id')
# metadata.set_primary_key(column_name='xyz_id')

In [41]:
metadata.validate()

In [46]:
from sdv.single_table import CopulaGANSynthesizer

In [47]:
synthesizer = CopulaGANSynthesizer(metadata)
synthesizer.fit(train_df)

In [48]:
synthetic_data = synthesizer.sample(num_rows=10)

In [49]:
synthetic_data[(synthetic_data["time"]==302) & (synthetic_data["distance"]==74)]

Unnamed: 0,vx,vy,vz,time,distance


In [48]:
train_df[(train_df["xyz_id"]==30164)]

Unnamed: 0,vx,vy,vz,time,distance,xyz_id
1872348,0.47,0.01,0.06,301,82.0,30164
3291556,0.5,-0.02,-0.01,305,82.0,30164
9728678,0.48,0.01,0.07,300,82.0,30164
20727540,0.48,0.01,0.06,302,82.0,30164
30155136,0.49,-0.02,-0.02,304,82.0,30164


In [49]:
train_df[(train_df["time"]==302) & (train_df["distance"]==74)]

Unnamed: 0,vx,vy,vz,time,distance,xyz_id
20680297,0.49,-0.00,0.04,302,74.0,22048
20680307,0.48,0.01,0.03,302,74.0,29968
20680358,0.50,-0.01,0.05,302,74.0,19654
20680373,0.47,0.01,0.01,302,74.0,31534
20680439,0.47,0.01,0.00,302,74.0,33100
...,...,...,...,...,...,...
20730085,0.53,-0.01,0.01,302,74.0,18357
20730151,0.53,-0.02,0.00,302,74.0,19923
20730218,0.54,-0.01,-0.00,302,74.0,22281
20730225,0.50,0.02,-0.00,302,74.0,27825


In [50]:
synthesizer.save(
    filepath=f"temp/3p6-learned-1.pkl"
)

In [51]:
synthesizer_saved = CopulaGANSynthesizer.load(
    filepath=f"temp/3p6-learned-1.pkl"
)

In [52]:
from sdv.sampling import Condition

In [62]:
generate_test_data = Condition(
    num_rows=20,
    column_values={'time': 301, 'distance': 82}
)


In [63]:
synthetic_data = synthesizer_saved.sample_from_conditions(
    conditions=[generate_test_data],
    output_file_path=f"result/test1.csv"
)

Sampling conditions: 100%|██████████| 20/20 [00:00<00:00, 20.06it/s]


In [61]:
train_df[(train_df["time"]==301) & (train_df["distance"]==82)]

Unnamed: 0,vx,vy,vz,time,distance
1824916,0.49,0.01,0.05,301,82.0
1824917,0.49,0.01,0.06,301,82.0
1824918,0.49,0.01,0.06,301,82.0
1824919,0.49,0.01,0.07,301,82.0
1824988,0.48,0.01,0.04,301,82.0
...,...,...,...,...,...
1874977,0.49,-0.03,0.09,301,82.0
1875020,0.52,-0.02,0.01,301,82.0
1875086,0.52,-0.03,-0.00,301,82.0
1875152,0.52,-0.03,0.00,301,82.0


In [66]:
test_df

Unnamed: 0,vx,vy,vz,time,distance
25697802,0.48,0.01,0.03,303,150.0
25697803,0.48,0.01,0.03,303,146.0
25697804,0.47,0.01,0.03,303,143.0
25697805,0.47,0.01,0.03,303,140.0
25697806,0.47,0.02,0.03,303,137.0
...,...,...,...,...,...
25748483,0.47,-0.01,0.06,303,141.0
25748484,0.46,-0.01,0.06,303,144.0
25748485,0.46,-0.01,0.06,303,147.0
25748486,0.46,-0.01,0.05,303,151.0


In [69]:
synthetic_data

Unnamed: 0,vx,vy,vz,time,distance
0,0.28,-0.02,0.04,301,82
1,0.42,0.01,0.04,301,82
2,0.62,-0.02,0.01,301,82
3,0.49,0.04,0.04,301,82
4,0.5,0.01,0.01,301,82
5,0.23,-0.07,0.04,301,82
6,0.48,0.06,0.07,301,82
7,0.48,0.03,0.06,301,82
8,0.5,-0.04,0.05,301,82
9,0.49,-0.03,0.02,301,82


In [93]:
data = []
# try:
for i in range(3):
    idx = i+25697802
    generate_test_data = Condition(
    num_rows=1,
    column_values={'time': 301, 'distance': test_df.loc[idx]["distance"]})
    synthetic_data = synthesizer_saved.sample_from_conditions(
    conditions=[generate_test_data], max_tries_per_batch=2500)
    synthetic_data["x"] = df.loc[idx]["x"]
    synthetic_data["y"] = df.loc[idx]["y"]
    synthetic_data["z"] = df.loc[idx]["z"]
    synthetic_data["id"] = idx
    data.append(synthetic_data)
# except:
#     # print(data)
#     pass

print(data)
df_result = pd.concat(data, ignore_index=True)
df_result.columns=["vx","vy","vz","time","distance","x","y","z","id"]
df_result.to_csv(f"result/total.csv")


Sampling conditions: 100%|██████████| 1/1 [00:10<00:00, 10.38s/it]
Sampling conditions: 100%|██████████| 1/1 [00:03<00:00,  3.76s/it]
Sampling conditions: 100%|██████████| 1/1 [00:12<00:00, 12.45s/it]

[     vx    vy    vz  time  distance      x     y     z        id
0  0.48  0.08 -0.01   301     150.0 -117.0  87.0 -33.0  25697802,     vx    vy    vz  time  distance      x     y     z        id
0  0.5 -0.02  0.01   301     146.0 -113.0  87.0 -33.0  25697803,      vx    vy    vz  time  distance      x     y     z        id
0  0.48  0.06  0.09   301     143.0 -109.0  87.0 -33.0  25697804]





In [94]:
df.tes

Unnamed: 0,vx,vy,vz,time,distance,x,y,z,id
0,0.48,0.08,-0.01,301,150.0,-117.0,87.0,-33.0,25697802
1,0.5,-0.02,0.01,301,146.0,-113.0,87.0,-33.0,25697803
2,0.48,0.06,0.09,301,143.0,-109.0,87.0,-33.0,25697804


In [87]:
generate_test_data = Condition(
    num_rows=1,
    column_values={'time': 301, 'distance': 150})
synthetic_data = synthesizer_saved.sample_from_conditions(
    conditions=[generate_test_data],max_tries_per_batch=1000)
synthetic_data["x"] = 2
synthetic_data["y"] = 2
synthetic_data["z"] = 6
synthetic_data["id"] = 6

Sampling conditions: 100%|██████████| 1/1 [00:00<00:00,  1.68it/s]


In [88]:
synthetic_data.head()

Unnamed: 0,vx,vy,vz,time,distance,x,y,z,id
0,0.5,0.03,-0.04,301,150,2,2,6,6
