In [1]:
import pandas as pd
import numpy as np
import importlib
import json

import experiment_runner
import generate_synthetic_data
import neural_network
from generate_synthetic_data import GenerateSyntheticData
from experiment_runner import ExperimentRunner
importlib.reload(experiment_runner)
importlib.reload(generate_synthetic_data)
importlib.reload(neural_network)
from sklearn.model_selection import train_test_split

# Import preprocessor
from preprocess_data import PreprocessData

### Steps
1. Import the files
2. Transform all their features
3. Use the experiment runner and the generate synthetic data class to get info for the experiments


In [2]:
API_KEY = "AIzaSyC14K1XMc5q5wOmHKzuLH5zWb_BED8SrkQ"

### Car.csv

### TO-DO
- A single run-through an experiment should work now
    - Troubleshoot preprocessing method
    - Clarify benchmarking method (or how we pass data to it) so that it's taken on out-of-sample data    

In [3]:
# Define the dataset
dataset = pd.read_csv('car.csv') 
target_column = 'Selling_Price' 

# These are in fractions
subset_sizes = [0.1] 
# Number of rows to generate
# TODO: added 0.01 for testing, change later!!
row_sizes =  [0.01, 0.5, 1, 5, 10] # This should be len(dataset) * array number
# Number of experiments to run
num_trials = 5

# Initialize synthetic data generator and experiment runner
synthetic_data_generator = GenerateSyntheticData(API_KEY)
# Initialize the preprocessor based on the given dataset
preprocessor = PreprocessData(dataset, target_column)

# Loop through subset sizes
results = []
for prop in subset_sizes:
    for n in row_sizes:
        for s in range(2): # This is whether we drop indicator
            for i in range(num_trials): 
                print("Current experiment")
                print(f"Subset size: {prop}")
                print(f"Row percentage: {n}")
                print(f"Drop indicator (0 is false): {s}")
                print(f"Trial # {i}")
                # Take subset of the data
                subset = dataset.sample(frac=prop)
                # Generate synthetic data using subset + additional information
                synthetic_data = synthetic_data_generator.predict(n,len(dataset),subset)
                # Combine real and synthetic data for training
                dataset['source'] = 0
                synthetic_data['source'] = 1

                # Create a train test split 
                # Validate the columns in the generated data... --> make sure the one hot encoding is not different
                train_data, test_data = train_test_split(dataset)
                combined_data = pd.concat([train_data, synthetic_data])

                # Preprocess data
                combined_df_processed = preprocessor.preprocess(combined_data)
                test_data_processed = preprocessor.preprocess(test_data)
                
                if s != 0:
                    # If we don't check source column
                    combined_df_processed = combined_df_processed.drop(columns=['source'])
                    test_data_processed = test_data_processed.drop(columns=['source'])

                # Initialize an experiment runner
                experiment_runner = ExperimentRunner(combined_df_processed, target_column)
                # Compute subset characteristics (dimensions, variance, skewness)
                subset_characteristics = experiment_runner.compute_characteristics(subset)
                # Compute generated data characteristics
                generated_characteristics = experiment_runner.compute_characteristics(synthetic_data)
                # Train the network and benchmark
                success = experiment_runner.train_network(combined_df_processed)
                if not success:
                    mse = "Trial failed because gradient blowup"
                else:
                    # Benchmark network
                    mse = experiment_runner.benchmark_network(test_data_processed)
                    # Save results
                    results.append({
                        "source": prop,
                        "generated_rows" : n,
                        "subset_id" : i,
                        "indicators" : s,
                        "target_column": target_column,
                        "subset_characteristics": subset_characteristics,
                        "generated_characteristics" : generated_characteristics,
                        "mse": mse
                    })

# Write results to a json
file_path = "results.json"

# Write the list of dictionaries to a JSON file
with open(file_path, "w") as json_file:
    json.dump(results, json_file, indent=4)

Current experiment
Subset size: 0.1
Row percentage: 0.01
Drop indicator (0 is false): 0
Trial # 0


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 33.3049
Epoch [20/100], Loss: 27.3188
Epoch [30/100], Loss: 25.9794
Epoch [40/100], Loss: 25.6717
Epoch [50/100], Loss: 25.5565
Epoch [60/100], Loss: 25.5060
Epoch [70/100], Loss: 25.4813
Epoch [80/100], Loss: 25.4681
Epoch [90/100], Loss: 25.4603
Epoch [100/100], Loss: 25.4551
Current experiment
Subset size: 0.1
Row percentage: 0.01
Drop indicator (0 is false): 0
Trial # 1


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 36.4596
Epoch [20/100], Loss: 29.9154
Epoch [30/100], Loss: 28.3614
Epoch [40/100], Loss: 28.0556
Epoch [50/100], Loss: 27.9779
Epoch [60/100], Loss: 27.9470
Epoch [70/100], Loss: 27.9302
Epoch [80/100], Loss: 27.9188
Epoch [90/100], Loss: 27.9108
Epoch [100/100], Loss: 27.9046
Current experiment
Subset size: 0.1
Row percentage: 0.01
Drop indicator (0 is false): 0
Trial # 2


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 38.3105
Epoch [20/100], Loss: 30.8920
Epoch [30/100], Loss: 29.4419
Epoch [40/100], Loss: 29.2344
Epoch [50/100], Loss: 29.1839
Epoch [60/100], Loss: 29.1623
Epoch [70/100], Loss: 29.1499
Epoch [80/100], Loss: 29.1416
Epoch [90/100], Loss: 29.1354
Epoch [100/100], Loss: 29.1306
Current experiment
Subset size: 0.1
Row percentage: 0.01
Drop indicator (0 is false): 0
Trial # 3


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 35.1396
Epoch [20/100], Loss: 28.7027
Epoch [30/100], Loss: 27.2042
Epoch [40/100], Loss: 26.8968
Epoch [50/100], Loss: 26.8180
Epoch [60/100], Loss: 26.7912
Epoch [70/100], Loss: 26.7789
Epoch [80/100], Loss: 26.7715
Epoch [90/100], Loss: 26.7659
Epoch [100/100], Loss: 26.7614
Current experiment
Subset size: 0.1
Row percentage: 0.01
Drop indicator (0 is false): 0
Trial # 4


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 37.0163
Epoch [20/100], Loss: 30.9977
Epoch [30/100], Loss: 29.7149
Epoch [40/100], Loss: 29.5015
Epoch [50/100], Loss: 29.4400
Epoch [60/100], Loss: 29.4112
Epoch [70/100], Loss: 29.3948
Epoch [80/100], Loss: 29.3842
Epoch [90/100], Loss: 29.3767
Epoch [100/100], Loss: 29.3710
Current experiment
Subset size: 0.1
Row percentage: 0.01
Drop indicator (0 is false): 1
Trial # 0


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 39.4555
Epoch [20/100], Loss: 31.1957
Epoch [30/100], Loss: 28.9819
Epoch [40/100], Loss: 28.5556
Epoch [50/100], Loss: 28.4459
Epoch [60/100], Loss: 28.4054
Epoch [70/100], Loss: 28.3864
Epoch [80/100], Loss: 28.3753
Epoch [90/100], Loss: 28.3678
Epoch [100/100], Loss: 28.3623
Current experiment
Subset size: 0.1
Row percentage: 0.01
Drop indicator (0 is false): 1
Trial # 1


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 35.1014
Epoch [20/100], Loss: 29.1350
Epoch [30/100], Loss: 27.9846
Epoch [40/100], Loss: 27.8183
Epoch [50/100], Loss: 27.7772
Epoch [60/100], Loss: 27.7607
Epoch [70/100], Loss: 27.7519
Epoch [80/100], Loss: 27.7464
Epoch [90/100], Loss: 27.7425
Epoch [100/100], Loss: 27.7395
Current experiment
Subset size: 0.1
Row percentage: 0.01
Drop indicator (0 is false): 1
Trial # 2


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 34.6759
Epoch [20/100], Loss: 28.2536
Epoch [30/100], Loss: 26.2182
Epoch [40/100], Loss: 25.7125
Epoch [50/100], Loss: 25.5536
Epoch [60/100], Loss: 25.4905
Epoch [70/100], Loss: 25.4596
Epoch [80/100], Loss: 25.4422
Epoch [90/100], Loss: 25.4310
Epoch [100/100], Loss: 25.4230
Current experiment
Subset size: 0.1
Row percentage: 0.01
Drop indicator (0 is false): 1
Trial # 3


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 36.3605
Epoch [20/100], Loss: 28.1455
Epoch [30/100], Loss: 26.0580
Epoch [40/100], Loss: 25.7568
Epoch [50/100], Loss: 25.6921
Epoch [60/100], Loss: 25.6672
Epoch [70/100], Loss: 25.6543
Epoch [80/100], Loss: 25.6462
Epoch [90/100], Loss: 25.6403
Epoch [100/100], Loss: 25.6354
Current experiment
Subset size: 0.1
Row percentage: 0.01
Drop indicator (0 is false): 1
Trial # 4


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 39.7721
Epoch [20/100], Loss: 31.9545
Epoch [30/100], Loss: 29.9064
Epoch [40/100], Loss: 29.5061
Epoch [50/100], Loss: 29.4095
Epoch [60/100], Loss: 29.3743
Epoch [70/100], Loss: 29.3574
Epoch [80/100], Loss: 29.3474
Epoch [90/100], Loss: 29.3409
Epoch [100/100], Loss: 29.3364
Current experiment
Subset size: 0.1
Row percentage: 0.5
Drop indicator (0 is false): 0
Trial # 0


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 42.2434
Epoch [20/100], Loss: 32.1422
Epoch [30/100], Loss: 30.1058
Epoch [40/100], Loss: 29.8882
Epoch [50/100], Loss: 29.8391
Epoch [60/100], Loss: 29.8179
Epoch [70/100], Loss: 29.8060
Epoch [80/100], Loss: 29.7982
Epoch [90/100], Loss: 29.7925
Epoch [100/100], Loss: 29.7880
Current experiment
Subset size: 0.1
Row percentage: 0.5
Drop indicator (0 is false): 0
Trial # 1


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 35.5224
Epoch [20/100], Loss: 28.1836
Epoch [30/100], Loss: 25.6420
Epoch [40/100], Loss: 25.0645
Epoch [50/100], Loss: 24.9312
Epoch [60/100], Loss: 24.8881
Epoch [70/100], Loss: 24.8683
Epoch [80/100], Loss: 24.8564
Epoch [90/100], Loss: 24.8482
Epoch [100/100], Loss: 24.8421
Current experiment
Subset size: 0.1
Row percentage: 0.5
Drop indicator (0 is false): 0
Trial # 2


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 37.0402
Epoch [20/100], Loss: 28.7791
Epoch [30/100], Loss: 26.6634
Epoch [40/100], Loss: 26.1113
Epoch [50/100], Loss: 25.9522
Epoch [60/100], Loss: 25.9006
Epoch [70/100], Loss: 25.8789
Epoch [80/100], Loss: 25.8667
Epoch [90/100], Loss: 25.8582
Epoch [100/100], Loss: 25.8516
Current experiment
Subset size: 0.1
Row percentage: 0.5
Drop indicator (0 is false): 0
Trial # 3


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 31.8656
Epoch [20/100], Loss: 24.9157
Epoch [30/100], Loss: 23.0739
Epoch [40/100], Loss: 22.6476
Epoch [50/100], Loss: 22.5031
Epoch [60/100], Loss: 22.4353
Epoch [70/100], Loss: 22.3969
Epoch [80/100], Loss: 22.3719
Epoch [90/100], Loss: 22.3539
Epoch [100/100], Loss: 22.3401
Current experiment
Subset size: 0.1
Row percentage: 0.5
Drop indicator (0 is false): 0
Trial # 4


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: 34.4513
Epoch [20/100], Loss: 27.5839
Epoch [30/100], Loss: 25.7544
Epoch [40/100], Loss: 25.3906
Epoch [50/100], Loss: 25.2814
Epoch [60/100], Loss: 25.2327
Epoch [70/100], Loss: 25.2057
Epoch [80/100], Loss: 25.1891
Epoch [90/100], Loss: 25.1776
Epoch [100/100], Loss: 25.1695
Current experiment
Subset size: 0.1
Row percentage: 0.5
Drop indicator (0 is false): 1
Trial # 0


  'skew': skew(col_data),
  'kurtosis': kurtosis(col_data),
  inputs = (inputs - inputs.mean(axis=0)) / inputs.std(axis=0)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/100], Loss: nan
Epoch [20/100], Loss: nan
Epoch [30/100], Loss: nan
Epoch [40/100], Loss: nan
Epoch [50/100], Loss: nan
Epoch [60/100], Loss: nan
Epoch [70/100], Loss: nan
Epoch [80/100], Loss: nan
Epoch [90/100], Loss: nan
Epoch [100/100], Loss: nan


ValueError: Input contains NaN.

In [16]:
False ==combined_df_processed.isna().any().any()

np.True_

## Ideas for visualizations
- GENERAL IDEA: See if there's a relationship between proportion of "realness" ((prop * len(df)) / (n + prop * len(df))) and preservation of characteristics (average of difference between subset characteristics and generated characteristics) and mse
    - Are there certain characteristics (mean, variance, etc.) that are preserved better on average by feature? What properties about the feature make it the case?
- Coolest graph would be something like proportion of "realness" on x-axis and mse and then two lines corresponding to with and without-indicators
    - Could be super cool to see something besides "indicators always beat out without indicators" - something unintuitive would be sick
- Write down any other ideas!
- Just a bunch of facet grids could be cool

### Indicator ideas

Weights on how much the model uses the indicators as information
- also try with and without indicators and compare performance
- maybe we can compare embedding care 