In [1]:
import importlib
import json
import os

import numpy as np
import pandas as pd

import experiment_runner
import generate_synthetic_data
import neural_network

# Reload the local modules BEFORE pulling names out of them. importlib.reload()
# re-executes a module but does not rebind names that were previously copied out
# with `from module import Name`, so reloading after the from-imports would leave
# this notebook using the stale, pre-reload classes.
# neural_network is reloaded first — presumably experiment_runner imports it;
# TODO confirm the dependency order between these modules.
importlib.reload(neural_network)
importlib.reload(generate_synthetic_data)
importlib.reload(experiment_runner)
from generate_synthetic_data import GenerateSyntheticData
from experiment_runner import ExperimentRunner
from sklearn.model_selection import train_test_split

# Import preprocessor
from preprocess_data import PreprocessData

### Steps
1. Import the files
2. Transform all their features
3. Use the experiment runner and the generate synthetic data class to get info for the experiments


In [2]:
# SECURITY: credentials must not be committed in the notebook. The key is read
# from the GOOGLE_API_KEY environment variable instead. The key that was
# previously hardcoded in this cell has been exposed and should be revoked and
# rotated.
# NOTE(review): the variable name assumes a Google API key (based on the old
# key's format) — rename it if the generator talks to a different service.
API_KEY = os.environ.get("GOOGLE_API_KEY", "")
if not API_KEY:
    print("WARNING: GOOGLE_API_KEY is not set; export it before running the experiments.")

### Car.csv

### TO-DO
- A single run-through of an experiment should work now
    - Troubleshoot preprocessing method
    - Clarify benchmarking method (or how we pass data to it) so that it's taken on out-of-sample data    

In [3]:
# Experiment sweep for car.csv. For every combination of (subset size,
# synthetic-row fraction, indicator on/off, trial) this cell:
#   1. splits the real data into train/test,
#   2. samples an example subset from the TRAIN rows and asks the generator for
#      synthetic rows based on it,
#   3. trains on real-train + synthetic rows and benchmarks on the held-out rows,
#   4. appends one record to `results` and saves the JSON file immediately.

# Define the dataset
dataset = pd.read_csv('car.csv')
target_column = 'Selling_Price'

# These are in fractions
subset_sizes = [0.05, 0.1] # What happens when you double the number of examples given
# Number of rows to generate, as a fraction of len(dataset); the fraction and
# len(dataset) are both handed to the generator's predict()
row_sizes = [0.01, 0.1, 0.5, 1]
# Number of experiments to run
num_trials = 5
# One seeded RandomState drives every split and sample so the real-data side of a
# run is repeatable. The generator and the network training are not seeded here.
random_seed = 42
rng = np.random.RandomState(random_seed)
# Where the results are written
file_path = "results.json"

# Initialize synthetic data generator and experiment runner
synthetic_data_generator = GenerateSyntheticData(API_KEY)
# Initialize the preprocessor based on the given dataset
preprocessor = PreprocessData(dataset, target_column)


def save_results(records, path):
    """Write the list of result dicts to `path` as indented JSON.

    The records are serialized before the file is opened, so a value that cannot
    be serialized raises without truncating a previously saved file.
    """
    payload = json.dumps(records, indent=4)
    with open(path, "w") as json_file:
        json_file.write(payload)


# Loop through subset sizes
results = []
for prop in subset_sizes:
    # Same row count that dataset.sample(frac=prop) would have produced.
    subset_rows = int(round(prop * len(dataset)))
    for n in row_sizes:
        for s in range(2): # This is whether we drop indicator
            for i in range(num_trials):
                print("Current experiment")
                print(f"Subset size: {prop}")
                print(f"Row percentage: {n}")
                print(f"Drop indicator (0 is false): {s}")
                print(f"Trial # {i}")

                # Split first so that the benchmark rows are truly out-of-sample:
                # the examples shown to the generator come from the train rows only.
                train_data, test_data = train_test_split(dataset, random_state=rng)
                subset = train_data.sample(
                    n=min(subset_rows, len(train_data)), random_state=rng
                )
                # Generate synthetic data using subset + additional information
                synthetic_data = synthetic_data_generator.predict(n, len(dataset), subset)

                # Tag real (0) vs synthetic (1) rows on copies. `dataset` itself is
                # never mutated, so the `source` column cannot leak into later
                # subsets or into the generator's examples.
                combined_data = pd.concat([
                    train_data.assign(source=0),
                    synthetic_data.assign(source=1),
                ])

                # Preprocess data
                combined_df_processed = preprocessor.preprocess(combined_data)
                test_data_processed = preprocessor.preprocess(test_data.assign(source=0))

                if s != 0:
                    # If we don't check source column
                    combined_df_processed = combined_df_processed.drop(columns=['source'])
                    test_data_processed = test_data_processed.drop(columns=['source'])

                # Initialize an experiment runner. It is named `runner` so that it
                # does not shadow the imported `experiment_runner` module.
                runner = ExperimentRunner(combined_df_processed, target_column)
                # Compute subset characteristics (dimensions, variance, skewness).
                # Neither frame carries the constant `source` tag at this point.
                subset_characteristics = runner.compute_characteristics(subset)
                # Compute generated data characteristics
                generated_characteristics = runner.compute_characteristics(synthetic_data)

                # Train the network and benchmark
                success = runner.train_network(combined_df_processed)
                if success:
                    # Benchmark network
                    mse = runner.benchmark_network(test_data_processed)
                    error = None
                else:
                    # Keep failed trials in the results instead of dropping them.
                    mse = None
                    error = "Trial failed because gradient blowup"

                # Save results
                results.append({
                    "source": prop,
                    "generated_rows" : n,
                    "subset_id" : i,
                    "indicators" : s,
                    "target_column": target_column,
                    "subset_characteristics": subset_characteristics.to_dict(),
                    "generated_characteristics" : generated_characteristics.to_dict(),
                    "mse": mse,
                    "error": error
                })
                # Persist after every trial so an interrupted sweep keeps its results.
                save_results(results, file_path)

# Write the list of dictionaries to a JSON file (also covers an empty sweep)
save_results(results, file_path)

Current experiment
Subset size: 0.05
Row percentage: 0.01
Drop indicator (0 is false): 0
Trial # 0


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 49.8359
Epoch [20/100], Loss: 48.4358
Epoch [30/100], Loss: 47.0938
Epoch [40/100], Loss: 45.8054
Epoch [50/100], Loss: 44.5666
Epoch [60/100], Loss: 43.3743
Epoch [70/100], Loss: 42.2254
Epoch [80/100], Loss: 41.1180
Epoch [90/100], Loss: 40.0507
Epoch [100/100], Loss: 39.0220
Current experiment
Subset size: 0.05
Row percentage: 0.01
Drop indicator (0 is false): 0
Trial # 1


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 46.5554
Epoch [20/100], Loss: 45.2375
Epoch [30/100], Loss: 43.9608
Epoch [40/100], Loss: 42.7220
Epoch [50/100], Loss: 41.5184
Epoch [60/100], Loss: 40.3482
Epoch [70/100], Loss: 39.2103
Epoch [80/100], Loss: 38.1041
Epoch [90/100], Loss: 37.0301
Epoch [100/100], Loss: 35.9897
Current experiment
Subset size: 0.05
Row percentage: 0.01
Drop indicator (0 is false): 0
Trial # 2


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 46.0803
Epoch [20/100], Loss: 44.7571
Epoch [30/100], Loss: 43.4690
Epoch [40/100], Loss: 42.2121
Epoch [50/100], Loss: 40.9831
Epoch [60/100], Loss: 39.7801
Epoch [70/100], Loss: 38.6024
Epoch [80/100], Loss: 37.4499
Epoch [90/100], Loss: 36.3232
Epoch [100/100], Loss: 35.2247
Current experiment
Subset size: 0.05
Row percentage: 0.01
Drop indicator (0 is false): 0
Trial # 3


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 49.3818
Epoch [20/100], Loss: 47.8391
Epoch [30/100], Loss: 46.3522
Epoch [40/100], Loss: 44.9172
Epoch [50/100], Loss: 43.5307
Epoch [60/100], Loss: 42.1906
Epoch [70/100], Loss: 40.8950
Epoch [80/100], Loss: 39.6433
Epoch [90/100], Loss: 38.4360
Epoch [100/100], Loss: 37.2744
Current experiment
Subset size: 0.05
Row percentage: 0.01
Drop indicator (0 is false): 0
Trial # 4


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 49.9341
Epoch [20/100], Loss: 48.3429
Epoch [30/100], Loss: 46.8116
Epoch [40/100], Loss: 45.3349
Epoch [50/100], Loss: 43.9087
Epoch [60/100], Loss: 42.5297
Epoch [70/100], Loss: 41.1956
Epoch [80/100], Loss: 39.9045
Epoch [90/100], Loss: 38.6561
Epoch [100/100], Loss: 37.4503
Current experiment
Subset size: 0.05
Row percentage: 0.01
Drop indicator (0 is false): 1
Trial # 0


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 43.3315
Epoch [20/100], Loss: 42.0591
Epoch [30/100], Loss: 40.8270
Epoch [40/100], Loss: 39.6326
Epoch [50/100], Loss: 38.4739
Epoch [60/100], Loss: 37.3498
Epoch [70/100], Loss: 36.2593
Epoch [80/100], Loss: 35.2032
Epoch [90/100], Loss: 34.1822
Epoch [100/100], Loss: 33.1971
Current experiment
Subset size: 0.05
Row percentage: 0.01
Drop indicator (0 is false): 1
Trial # 1


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 48.0536
Epoch [20/100], Loss: 46.7890
Epoch [30/100], Loss: 45.5680
Epoch [40/100], Loss: 44.3871
Epoch [50/100], Loss: 43.2418
Epoch [60/100], Loss: 42.1290
Epoch [70/100], Loss: 41.0464
Epoch [80/100], Loss: 39.9924
Epoch [90/100], Loss: 38.9666
Epoch [100/100], Loss: 37.9682
Current experiment
Subset size: 0.05
Row percentage: 0.01
Drop indicator (0 is false): 1
Trial # 2


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 39.2844
Epoch [20/100], Loss: 37.9955
Epoch [30/100], Loss: 36.7513
Epoch [40/100], Loss: 35.5482
Epoch [50/100], Loss: 34.3827
Epoch [60/100], Loss: 33.2520
Epoch [70/100], Loss: 32.1542
Epoch [80/100], Loss: 31.0877
Epoch [90/100], Loss: 30.0513
Epoch [100/100], Loss: 29.0448
Current experiment
Subset size: 0.05
Row percentage: 0.01
Drop indicator (0 is false): 1
Trial # 3


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 48.2062
Epoch [20/100], Loss: 46.8786
Epoch [30/100], Loss: 45.5977
Epoch [40/100], Loss: 44.3600
Epoch [50/100], Loss: 43.1628
Epoch [60/100], Loss: 42.0032
Epoch [70/100], Loss: 40.8790
Epoch [80/100], Loss: 39.7892
Epoch [90/100], Loss: 38.7338
Epoch [100/100], Loss: 37.7130
Current experiment
Subset size: 0.05
Row percentage: 0.01
Drop indicator (0 is false): 1
Trial # 4


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 45.2886
Epoch [20/100], Loss: 44.0865
Epoch [30/100], Loss: 42.9146
Epoch [40/100], Loss: 41.7719
Epoch [50/100], Loss: 40.6578
Epoch [60/100], Loss: 39.5720
Epoch [70/100], Loss: 38.5148
Epoch [80/100], Loss: 37.4875
Epoch [90/100], Loss: 36.4920
Epoch [100/100], Loss: 35.5302
Current experiment
Subset size: 0.05
Row percentage: 0.1
Drop indicator (0 is false): 0
Trial # 0


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 45.8377
Epoch [20/100], Loss: 44.4479
Epoch [30/100], Loss: 43.0974
Epoch [40/100], Loss: 41.7831
Epoch [50/100], Loss: 40.5024
Epoch [60/100], Loss: 39.2533
Epoch [70/100], Loss: 38.0343
Epoch [80/100], Loss: 36.8440
Epoch [90/100], Loss: 35.6822
Epoch [100/100], Loss: 34.5495
Current experiment
Subset size: 0.05
Row percentage: 0.1
Drop indicator (0 is false): 0
Trial # 1


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 37.4114
Epoch [20/100], Loss: 36.2625
Epoch [30/100], Loss: 35.1523
Epoch [40/100], Loss: 34.0775
Epoch [50/100], Loss: 33.0345
Epoch [60/100], Loss: 32.0208
Epoch [70/100], Loss: 31.0350
Epoch [80/100], Loss: 30.0754
Epoch [90/100], Loss: 29.1408
Epoch [100/100], Loss: 28.2309
Current experiment
Subset size: 0.05
Row percentage: 0.1
Drop indicator (0 is false): 0
Trial # 2


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 43.1414
Epoch [20/100], Loss: 41.8656
Epoch [30/100], Loss: 40.6239
Epoch [40/100], Loss: 39.4129
Epoch [50/100], Loss: 38.2302
Epoch [60/100], Loss: 37.0733
Epoch [70/100], Loss: 35.9409
Epoch [80/100], Loss: 34.8315
Epoch [90/100], Loss: 33.7447
Epoch [100/100], Loss: 32.6804
Current experiment
Subset size: 0.05
Row percentage: 0.1
Drop indicator (0 is false): 0
Trial # 3


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 44.8980
Epoch [20/100], Loss: 43.5623
Epoch [30/100], Loss: 42.2608
Epoch [40/100], Loss: 40.9912
Epoch [50/100], Loss: 39.7519
Epoch [60/100], Loss: 38.5420
Epoch [70/100], Loss: 37.3605
Epoch [80/100], Loss: 36.2071
Epoch [90/100], Loss: 35.0827
Epoch [100/100], Loss: 33.9887
Current experiment
Subset size: 0.05
Row percentage: 0.1
Drop indicator (0 is false): 0
Trial # 4


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 35.8067
Epoch [20/100], Loss: 34.5010
Epoch [30/100], Loss: 33.2466
Epoch [40/100], Loss: 32.0390
Epoch [50/100], Loss: 30.8747
Epoch [60/100], Loss: 29.7507
Epoch [70/100], Loss: 28.6647
Epoch [80/100], Loss: 27.6149
Epoch [90/100], Loss: 26.6003
Epoch [100/100], Loss: 25.6206
Current experiment
Subset size: 0.05
Row percentage: 0.1
Drop indicator (0 is false): 1
Trial # 0


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 47.0979
Epoch [20/100], Loss: 45.6297
Epoch [30/100], Loss: 44.1998
Epoch [40/100], Loss: 42.8048
Epoch [50/100], Loss: 41.4421
Epoch [60/100], Loss: 40.1097
Epoch [70/100], Loss: 38.8061
Epoch [80/100], Loss: 37.5312
Epoch [90/100], Loss: 36.2846
Epoch [100/100], Loss: 35.0675
Current experiment
Subset size: 0.05
Row percentage: 0.1
Drop indicator (0 is false): 1
Trial # 1


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 39.9591
Epoch [20/100], Loss: 38.6613
Epoch [30/100], Loss: 37.4032
Epoch [40/100], Loss: 36.1811
Epoch [50/100], Loss: 34.9922
Epoch [60/100], Loss: 33.8338
Epoch [70/100], Loss: 32.7047
Epoch [80/100], Loss: 31.6039
Epoch [90/100], Loss: 30.5313
Epoch [100/100], Loss: 29.4871
Current experiment
Subset size: 0.05
Row percentage: 0.1
Drop indicator (0 is false): 1
Trial # 2


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 39.0491
Epoch [20/100], Loss: 37.5917
Epoch [30/100], Loss: 36.1843
Epoch [40/100], Loss: 34.8223
Epoch [50/100], Loss: 33.5021
Epoch [60/100], Loss: 32.2212
Epoch [70/100], Loss: 30.9772
Epoch [80/100], Loss: 29.7692
Epoch [90/100], Loss: 28.5964
Epoch [100/100], Loss: 27.4600
Current experiment
Subset size: 0.05
Row percentage: 0.1
Drop indicator (0 is false): 1
Trial # 3


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 45.0365
Epoch [20/100], Loss: 43.6742
Epoch [30/100], Loss: 42.3448
Epoch [40/100], Loss: 41.0469
Epoch [50/100], Loss: 39.7787
Epoch [60/100], Loss: 38.5400
Epoch [70/100], Loss: 37.3312
Epoch [80/100], Loss: 36.1540
Epoch [90/100], Loss: 35.0102
Epoch [100/100], Loss: 33.9025
Current experiment
Subset size: 0.05
Row percentage: 0.1
Drop indicator (0 is false): 1
Trial # 4


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 37.7583
Epoch [20/100], Loss: 36.2207
Epoch [30/100], Loss: 34.7582
Epoch [40/100], Loss: 33.3651
Epoch [50/100], Loss: 32.0361
Epoch [60/100], Loss: 30.7667
Epoch [70/100], Loss: 29.5530
Epoch [80/100], Loss: 28.3921
Epoch [90/100], Loss: 27.2819
Epoch [100/100], Loss: 26.2212
Current experiment
Subset size: 0.05
Row percentage: 0.5
Drop indicator (0 is false): 0
Trial # 0
NO VALID JSON (AH)


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  inputs = (inputs - inputs.mean(axis=0)) / inputs.std(axis=0)
  return F.mse_loss(input, target, reduction=self.reduction)


Loss is NaN. Stopping training.
Current experiment
Subset size: 0.05
Row percentage: 0.5
Drop indicator (0 is false): 0
Trial # 1


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 41.2898
Epoch [20/100], Loss: 39.9505
Epoch [30/100], Loss: 38.6604
Epoch [40/100], Loss: 37.4162
Epoch [50/100], Loss: 36.2145
Epoch [60/100], Loss: 35.0531
Epoch [70/100], Loss: 33.9304
Epoch [80/100], Loss: 32.8457
Epoch [90/100], Loss: 31.7981
Epoch [100/100], Loss: 30.7874
Current experiment
Subset size: 0.05
Row percentage: 0.5
Drop indicator (0 is false): 0
Trial # 2
NO VALID JSON (AH)


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  inputs = (inputs - inputs.mean(axis=0)) / inputs.std(axis=0)
  return F.mse_loss(input, target, reduction=self.reduction)


Loss is NaN. Stopping training.
Current experiment
Subset size: 0.05
Row percentage: 0.5
Drop indicator (0 is false): 0
Trial # 3


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 49.6514
Epoch [20/100], Loss: 48.0755
Epoch [30/100], Loss: 46.5523
Epoch [40/100], Loss: 45.0765
Epoch [50/100], Loss: 43.6441
Epoch [60/100], Loss: 42.2514
Epoch [70/100], Loss: 40.8957
Epoch [80/100], Loss: 39.5744
Epoch [90/100], Loss: 38.2869
Epoch [100/100], Loss: 37.0321
Current experiment
Subset size: 0.05
Row percentage: 0.5
Drop indicator (0 is false): 0
Trial # 4


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 53.5192
Epoch [20/100], Loss: 51.8564
Epoch [30/100], Loss: 50.2392
Epoch [40/100], Loss: 48.6646
Epoch [50/100], Loss: 47.1303
Epoch [60/100], Loss: 45.6343
Epoch [70/100], Loss: 44.1760
Epoch [80/100], Loss: 42.7557
Epoch [90/100], Loss: 41.3747
Epoch [100/100], Loss: 40.0353
Current experiment
Subset size: 0.05
Row percentage: 0.5
Drop indicator (0 is false): 1
Trial # 0


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 48.7685
Epoch [20/100], Loss: 47.2517
Epoch [30/100], Loss: 45.7782
Epoch [40/100], Loss: 44.3433
Epoch [50/100], Loss: 42.9436
Epoch [60/100], Loss: 41.5760
Epoch [70/100], Loss: 40.2385
Epoch [80/100], Loss: 38.9307
Epoch [90/100], Loss: 37.6516
Epoch [100/100], Loss: 36.4027
Current experiment
Subset size: 0.05
Row percentage: 0.5
Drop indicator (0 is false): 1
Trial # 1


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 53.6945
Epoch [20/100], Loss: 52.2549
Epoch [30/100], Loss: 50.8601
Epoch [40/100], Loss: 49.5054
Epoch [50/100], Loss: 48.1865
Epoch [60/100], Loss: 46.9001
Epoch [70/100], Loss: 45.6435
Epoch [80/100], Loss: 44.4143
Epoch [90/100], Loss: 43.2109
Epoch [100/100], Loss: 42.0322
Current experiment
Subset size: 0.05
Row percentage: 0.5
Drop indicator (0 is false): 1
Trial # 2
NO VALID JSON (AH)


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 48.3316
Epoch [20/100], Loss: 46.9704
Epoch [30/100], Loss: 45.6515
Epoch [40/100], Loss: 44.3730
Epoch [50/100], Loss: 43.1335
Epoch [60/100], Loss: 41.9322
Epoch [70/100], Loss: 40.7685
Epoch [80/100], Loss: 39.6431
Epoch [90/100], Loss: 38.5581
Epoch [100/100], Loss: 37.5159
Current experiment
Subset size: 0.05
Row percentage: 0.5
Drop indicator (0 is false): 1
Trial # 3
NO VALID JSON (AH)


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 53.1250
Epoch [20/100], Loss: 51.7265
Epoch [30/100], Loss: 50.3615
Epoch [40/100], Loss: 49.0268
Epoch [50/100], Loss: 47.7196
Epoch [60/100], Loss: 46.4384
Epoch [70/100], Loss: 45.1820
Epoch [80/100], Loss: 43.9505
Epoch [90/100], Loss: 42.7443
Epoch [100/100], Loss: 41.5645
Current experiment
Subset size: 0.05
Row percentage: 0.5
Drop indicator (0 is false): 1
Trial # 4


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 43.2836
Epoch [20/100], Loss: 41.7997
Epoch [30/100], Loss: 40.3543
Epoch [40/100], Loss: 38.9446
Epoch [50/100], Loss: 37.5688
Epoch [60/100], Loss: 36.2259
Epoch [70/100], Loss: 34.9159
Epoch [80/100], Loss: 33.6396
Epoch [90/100], Loss: 32.3980
Epoch [100/100], Loss: 31.1942
Current experiment
Subset size: 0.05
Row percentage: 1
Drop indicator (0 is false): 0
Trial # 0
NO VALID JSON (AH)


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  inputs = (inputs - inputs.mean(axis=0)) / inputs.std(axis=0)
  return F.mse_loss(input, target, reduction=self.reduction)


Loss is NaN. Stopping training.
Current experiment
Subset size: 0.05
Row percentage: 1
Drop indicator (0 is false): 0
Trial # 1
NO VALID JSON (AH)


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  inputs = (inputs - inputs.mean(axis=0)) / inputs.std(axis=0)
  return F.mse_loss(input, target, reduction=self.reduction)


Loss is NaN. Stopping training.
Current experiment
Subset size: 0.05
Row percentage: 1
Drop indicator (0 is false): 0
Trial # 2


In [10]:
# Make the characteristics JSON-serializable. Entries that are already plain
# dicts (the experiment cell stores them via .to_dict()) are left untouched, so
# this cell is safe under Run All and safe to re-run.
for result in results:
    for key in ("subset_characteristics", "generated_characteristics"):
        value = result[key]
        if hasattr(value, "to_dict"):
            result[key] = value.to_dict()

In [12]:
# Specify the file path
file_path = "results.json"

# Serialize first, then write: if `results` holds a value json cannot encode,
# the error is raised before the file is opened, so an existing results.json is
# not truncated.
results_json = json.dumps(results, indent=4)
with open(file_path, "w") as json_file:
    json_file.write(results_json)

In [16]:
# Sanity check: evaluates to True when the processed training frame has no missing values.
combined_df_processed.notna().all().all()

np.True_

## Ideas for visualizations
- GENERAL IDEA: See if there's a relationship between the proportion of "realness" ((prop * len(df)) / (n + prop * len(df))), the preservation of characteristics (average of the difference between subset characteristics and generated characteristics), and MSE
    - Are there certain characteristics (mean, variance, etc.) that are preserved better on average by feature? What properties about the feature make it the case?
- Coolest graph would be something like proportion of "realness" on the x-axis and MSE on the y-axis, with two lines: one with indicators and one without
    - Could be super cool to see something besides "indicators always beat out without indicators" - something unintuitive would be sick
- Write down any other ideas!
- Just a bunch of facet grids could be cool

### Indicator ideas

Weights on how much the model uses the indicators as information
- also try with and without indicators and compare performance
- maybe we can compare embedding care 