In [1]:
import pandas as pd
import numpy as np
from modules import utils, model, train
import matplotlib.pyplot as plt
import seaborn as sns
from timeit import default_timer as timer

In [2]:
pd.core.common.random_state(None)
seed = 42

In [3]:
data = utils.get_cleaned_students_data()
data = data.drop(data[data["curricular_units_2nd_sem_grade"] == 0].index)

target_column = 'target'

training_data, validation_data = \
          utils.stratified_train_test_split(data,
                                            target_column = target_column,
                                            train_size=0.5,
                                            random_state = seed
                                            )

print(f"Data Shape: {data.shape}\nTraining Shape: {training_data.shape}\nValidation Shape: {validation_data.shape}")

Data Shape: (3554, 37)
Training Shape: (1777, 37)
Validation Shape: (1777, 37)


In [4]:
# fig, ax = plt.subplots(1,3, figsize=(10,4), sharex=True, sharey=True)
# orig_label = data[target_column].value_counts(normalize = True).reset_index()
# tr_label = training_data[target_column].value_counts(normalize = True).reset_index()
# val_label = validation_data[target_column].value_counts(normalize = True).reset_index()
# sns.barplot(data = orig_label, x = "target", y = "proportion", ax = ax[0])
# ax[0].set_title("original")
# sns.barplot(data = tr_label, x = "target", y = "proportion",  ax = ax[1], label = "train")
# ax[1].set_title("train")
# sns.barplot(data = val_label, x = "target", y = "proportion", ax = ax[2], label = "validation")
# ax[2].set_title("validation")
# plt.suptitle("Percentage plots of Target column")
# plt.show()

In [5]:
def grid_search(grid_range, features):
    if len(features) != 2:
        raise ValueError("Features should be of length 2!!")
    
    b1_min = grid_range[0][0]
    b1_max = grid_range[0][1]
    b2_min = grid_range[1][0]
    b2_max = grid_range[1][1]
    b1_step = (b1_max - b1_min)/10
    b2_step = (b2_max - b2_min)/10
    min_delta = 999999999.9
    test_count = 1

    nogan = model.NoGANSynth(training_data[features])        
    for b1 in np.arange(b1_min, b1_max, b1_step):
        for b2 in np.arange(b2_min, b2_max, b2_step):
            bins = [b2, b2]
            _, results = \
                    train.nogan_synth(nogan, training_data[features], 
                                        validation_data[features], bins = bins,
                                        n_nodes = 1000, verbose = False, 
                                        random_seed = seed)
             
            if test_count % 10 == 0:
                print(f"Test {test_count} completed!!!")
            test_count += 1
        ks_stat = results["synth_comparison"]["ks_stat"]
        target_ks_stat = results["train_comparison"]["ks_stat"]
        delta = np.sqrt((ks_stat-target_ks_stat)**2)
        if delta < min_delta:
            best_b1 = b1
            best_b2 = b2
            min_delta = delta
            best_ks_stat = ks_stat
            best_target_ks_stat = target_ks_stat
                    
    return best_b1, best_b2, best_ks_stat, best_target_ks_stat, delta

In [6]:
from tqdm import tqdm
def grid_search_single(column_name, start, stop, step):    
    nogan = model.NoGANSynth(training_data[[column_name]])
    test_count = 1
    min_delta = 999999999.9 
    for p in tqdm(np.arange(start, stop+1, step)):
        bins = [p]
        _, results = \
                train.nogan_synth(nogan, training_data[[column_name]], 
                                validation_data[[column_name]], bins = bins,
                                n_nodes = 1000, verbose = False, 
                                random_seed = seed)
        test_count += 1
        ks_stat = results["synth_comparison"]["ks_stat"]
        target_ks_stat = results["train_comparison"]["ks_stat"]
        delta = np.sqrt((ks_stat-target_ks_stat)**2)  
        if delta < min_delta:
            best_p = p
            min_delta = delta
            best_ks_stat = ks_stat
            best_target_ks_stat = target_ks_stat
    print(f"best_bin: {best_p}\nbest_ks_stat: {best_ks_stat}\nbest_target_ks_stat: {best_target_ks_stat}\nmin_delta: {min_delta}")

In [7]:
grid_search_single("course", 1, 500, 10)

100%|██████████| 50/50 [03:36<00:00,  4.33s/it]

best_bin: 131
best_ks_stat: 0.12492965672481715
best_target_ks_stat: 0.033764772087788386
min_delta: 0.09116488463702876





In [8]:
from typing import List
def hyperparameter_tuning(features:List, b1:int = 100, b2:int = 100, 
                          step_b1:int = 100, step_b2:int = 100, 
                          iter:int = 3):
  final_min_delta = 999999999.9
  start_time = timer()
  for level in range(iter):
      step_b1 /= 2
      step_b2 /= 2
      b1_min = max(0, b1 - step_b1)
      b1_max = b1 + step_b1
      b2_min = b2 - step_b2
      b2_max = b2 + step_b2
      grid_range = [(b1_min, b1_max),(b2_min, b2_max)]
      (b1, b2, ks_stat, target_ks_stat, min_delta) = \
                              grid_search(grid_range, features)
      print(f"b1: {b1}, b2: {b2}, ks_stat: {ks_stat}, target_ks_stat: {target_ks_stat}, min_delta: {min_delta}")
                              
      if min_delta < final_min_delta:
        final_min_delta = min_delta
        final_best_b1 = b1
        final_best_b2 = b2
        final_ks_stat = ks_stat
        final_target_ks_stat =  target_ks_stat
  end_time = timer()
  print(f"final_b1: {final_best_b1}\nfinal_b2: {final_best_b2}\nfinal_ks_stat: {final_ks_stat}\nfinal_target_ks_stat: {final_target_ks_stat}\nfinal_min_delta: {min_delta}\nTotal Time: {end_time-start_time:.4f}")

In [9]:
features = [
  'course',
  'unemployment_rate',
    ]

hyperparameter_tuning(features = features, b1 = 100, b2 = 100, 
                      step_b1 = 100, step_b2 = 100, 
                      iter = 3 )

Test 10 completed!!!
Test 20 completed!!!
Test 30 completed!!!
Test 40 completed!!!
Test 50 completed!!!
Test 60 completed!!!
Test 70 completed!!!
Test 80 completed!!!
Test 90 completed!!!
Test 100 completed!!!
b1: 50.0, b2: 140.0, ks_stat: 0.22172200337647718, target_ks_stat: 0.03545301069217782, min_delta: 0.18626899268429936
Test 10 completed!!!
Test 20 completed!!!
Test 30 completed!!!
Test 40 completed!!!
Test 50 completed!!!
Test 60 completed!!!
Test 70 completed!!!
Test 80 completed!!!
Test 90 completed!!!
Test 100 completed!!!
b1: 25.0, b2: 160.0, ks_stat: 0.22172200337647718, target_ks_stat: 0.03545301069217782, min_delta: 0.18626899268429936
Test 10 completed!!!
Test 20 completed!!!
Test 30 completed!!!
Test 40 completed!!!
Test 50 completed!!!
Test 60 completed!!!
Test 70 completed!!!
Test 80 completed!!!
Test 90 completed!!!
Test 100 completed!!!
b1: 12.5, b2: 170.0, ks_stat: 0.22172200337647718, target_ks_stat: 0.03545301069217782, min_delta: 0.18626899268429936
final_b1: 

In [10]:
# b1 = 100
# b2 = 100
# step_b1 = 100
# step_b2 = 100
# features = [
#   'course',
#   'unemployment_rate'
#     ]
# final_min_delta = 999999999.9
# start_time = timer()
# for level in range(3):
#     step_b1 /= 2
#     step_b2 /= 2
#     b1_min = max(0, b1 - step_b1)
#     b1_max = b1 + step_b1
#     b2_min = b2 - step_b2
#     b2_max = b2 + step_b2
#     grid_range = [(b1_min, b1_max),(b2_min, b2_max)]
#     (b1, b2, ks_stat, target_ks_stat, min_delta) = \
#                             hyperparameter_search(grid_range, features)
#     print(f"b1: {b1}, b2: {b2}, ks_stat: {ks_stat}, target_ks_stat: {target_ks_stat}, min_delta: {min_delta}")
                            
#     if min_delta < final_min_delta:
#       final_min_delta = min_delta
#       final_best_b1 = b1
#       final_best_b2 = b2
#       final_ks_stat = ks_stat
#       final_target_ks_stat =  target_ks_stat
# end_time = timer()
# print(f"final_b1: {final_best_b1}\nfinal_b2: {final_best_b2}\nfinal_ks_stat: {final_ks_stat}\nfinal_target_ks_stat: {final_target_ks_stat}\nfinal_min_delta: {min_delta}\nTotal Time: {end_time-start_time:.4f}")

In [11]:
# def hyperparmeter_search(training_data,validation_data):
#     opt_bins = {}
#     for col in training_data.columns:
#         nogan = model.NoGANSynth(training_data[[col]])
#         p_min = 1
#         p_max = 100
#         p_step = (p_max - p_min)/10
#         min_delta = 999999999.9
#         print(f"Column: {col}")
#         for p in np.arange(p_min, p_max, p_step):
#             bins = [p]
#             #print(f"Bin Length: {len(bins)}, Col Count: {len(training_data.columns)}")
#             synth_data, results = \
#                         train.nogan_synth(nogan, training_data[[col]], 
#                                           validation_data[[col]], bins = bins,
#                                           n_nodes = 1000, verbose = False, 
#                                           random_seed = seed)
#             ks_stat = results["synth_comparison"]["ks_stat"]
#             target_ks_stat = results["train_comparison"]["ks_stat"]
#             delta = np.sqrt((ks_stat-target_ks_stat)**2)
#             if delta < min_delta:
#                 best_bin_val = p
#                 min_delta = delta
#                 best_ks_stat = ks_stat
#                 best_target_ks_stat = target_ks_stat
#         opt_bins[col] = {"bin_val":best_bin_val, "ks_stat":best_ks_stat,
#                          "target_ks_stat":best_target_ks_stat, "delta":delta}  
        
#     return opt_bins
#         # for i in range(3):
            
#         #     nogan = model.NoGANSynth(training_data[col])
    

In [12]:
# results = hyperparmeter_search(training_data,validation_data)

# results

In [13]:
# results_df = pd.DataFrame(results)

# results_df.head()

In [14]:
# [ round(bin) for bin in results_df.iloc[0,:]]

In [15]:
# col = "curricular_units_2nd_sem_approved"
# bin = [3]
# len(bin), len(training_data[[col]].columns)


In [16]:
# bins = [40] * len(features)
# nogan = model.NoGANSynth(training_data)
# synth_data, results = train.nogan_synth(nogan, training_data, 
#                                         validation_data, bins = bins,
#                                         n_nodes = 1000, verbose = False, 
#                                         random_seed = seed)

# ks_stat = results["synth_comparison"]["ks_stat"]
# target_ks_stat = results["train_comparison"]["ks_stat"]

# print(f"ks_stat: {ks_stat:.5f}\ntarget_ks_stat: {target_ks_stat:.5f}\nRoot Squared Diff {np.sqrt((ks_stat-target_ks_stat)**2):.5f}")