In [None]:
#Importing Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.simplefilter("ignore", UserWarning)

!pip install DataSynthesizer

from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network

!pip install gower
import gower

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting DataSynthesizer
  Downloading DataSynthesizer-0.1.11-py2.py3-none-any.whl (24 kB)
Installing collected packages: DataSynthesizer
Successfully installed DataSynthesizer-0.1.11
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gower
  Downloading gower-0.1.2-py3-none-any.whl (5.2 kB)
Installing collected packages: gower
Successfully installed gower-0.1.2


In [None]:
#Load the original churn data and, in the same step, discard the columns
#this analysis treats as unnecessary.
unused_columns = ['RowNumber', 'CustomerId', 'Surname']
df = pd.read_csv("churn.csv").drop(columns=unused_columns)

In [None]:
# Hold out 25% of df as the test partition; the fixed random_state keeps the split repeatable
partitions = train_test_split(df, random_state=123, test_size=0.25)
train, test = partitions

# Persist both partitions with to_csv's default options (train first, then test)
for partition, csv_name in zip(partitions, ('train.csv', 'test.csv')):
  partition.to_csv(csv_name)

In [None]:
# Classification with Original Data (Benchmark)

#LightGBM
from lightgbm import LGBMClassifier

#Separate the 'Exited' target from the predictors and one-hot encode the categorical predictors
y = df[['Exited']]
X = pd.get_dummies(df.drop(columns=['Exited']), columns=['Geography', 'Gender'])

#Same test_size and random_state as the earlier train/test split of df
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size=0.25)

#Fit on the training partition, then score the test partition with the second predict_proba column
model = LGBMClassifier()
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]

#ROC AUC is the reported metric because the dataset is imbalanced
benchmark_auc = roc_auc_score(y_test, y_pred)
print("ROC score with original data: ", round(benchmark_auc, 3))

ROC score with original data:  0.871


In [None]:
# Synthetic Data Generation

#Partition the training rows by the value of the 'Exited' target
exited = train['Exited']
train0 = train[exited == 0]
train1 = train[exited == 1]

#Write each partition to its own CSV (to_csv defaults)
for class_rows, class_csv in ((train0, 'train0.csv'), (train1, 'train1.csv')):
  class_rows.to_csv(class_csv)

#Attribute schema for DataSynthesizer, one row per column: (name, datatype, is_categorical)
#(Note: Binary features are treated as categorical)
_attribute_schema = [
    ('CreditScore', 'Integer', False),
    ('Geography', 'String', True),
    ('Gender', 'String', True),
    ('Age', 'Integer', False),
    ('Tenure', 'Integer', False),
    ('Balance', 'Float', False),
    ('NumOfProducts', 'Integer', False),
    ('HasCrCard', 'Integer', True),
    ('IsActiveMember', 'Integer', True),
    ('EstimatedSalary', 'Float', False),
    ('Exited', 'Integer', True),
]

#Column name -> DataSynthesizer datatype label
attribute_to_datatype = {name: datatype for name, datatype, _ in _attribute_schema}

#Column name -> whether the column is treated as categorical
attribute_is_categorical = {name: is_categorical for name, _, is_categorical in _attribute_schema}


#DataDescriber creates the bayesian network maximizing mutual information
describer = DataDescriber()

#Empty results table; one row is appended per noise setting
tradeoff_columns = ['Noise', 'Overall_Similarity', 'Dissimilar_Records', 'ROC']
tradeoff = pd.DataFrame(columns=tradeoff_columns)

#Iterating over noise parameters
#For every epsilon value: synthesize each target class separately, combine the two
#samples, measure how close the synthetic rows are to the real training rows
#(Gower distance), train LightGBM on the synthetic rows, score it on the real
#test set, and append one summary row to `tradeoff`.

#LightGBM (imported once, outside the loop)
from lightgbm import LGBMClassifier

#A real record counts as "similar" when at least one synthetic record lies
#within this Gower distance of it
SIMILARITY_THRESHOLD = 0.03

#Number of real training rows; locates the real-vs-synthetic quadrant of the
#Gower matrix without hard-coding the dataset size
n_real = len(train)

for n in [0, 100, 50, 10, 5, 1, 0.5, 0.1]:

  #Generating synthetic data for target 0 and target 1
  #Using correlated attribute model to capture interaction effects amongst features (2 parent network)
  for label, class_rows in ((0, train0), (1, train1)):
    description_file = 'Description' + str(label) + '.csv'
    #Per-class intermediates use their own file name: 'Synthetic_Data0.csv' and
    #'Synthetic_Data1.csv' are also the names of the combined outputs for
    #n == 0 and n == 1, which would otherwise be overwritten by later iterations
    class_sample_file = 'Synthetic_Class' + str(label) + '.csv'

    describer.describe_dataset_in_correlated_attribute_mode(
        dataset_file='train' + str(label) + '.csv',
        epsilon=n,
        k=2,
        attribute_to_datatype=attribute_to_datatype,
        attribute_to_is_categorical=attribute_is_categorical)

    #Metadata description
    describer.save_dataset_description_to_file(description_file)

    #DataGenerator uses the metadata descriptions to generate samples of synthetic data
    #(as many synthetic rows as the class has real rows)
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(class_rows.shape[0], description_file)
    generator.save_synthetic_data(class_sample_file)

  synthetic0 = pd.read_csv('Synthetic_Class0.csv')
  synthetic1 = pd.read_csv('Synthetic_Class1.csv')

  # Combining the synthetic data samples
  synthetic_df = pd.concat([synthetic0, synthetic1])
  synthetic_df.to_csv('Synthetic_Data'+str(n)+'.csv')

  #Reload the combined file and keep only the columns present in the real data
  synthetic = pd.read_csv('Synthetic_Data'+str(n)+'.csv')
  features = df.columns
  sdf = synthetic[features]

  #Calculating gower distance between real and synthetic data
  #Rows of distance_matrix are real training records, columns are synthetic records
  real_synthetic = pd.concat([train, sdf])
  distance_matrix = gower.gower_matrix(real_synthetic)[:n_real, n_real:]

  #Share of real records that have at least one synthetic record closer than the threshold
  similarity = float((distance_matrix < SIMILARITY_THRESHOLD).any(axis=1).mean())

  # Classification with Synthetic Data
  #Splitting features and target
  X_train = sdf.drop(columns = ['Exited'])
  y_train = sdf[['Exited']]

  X_test = test.drop(columns = ['Exited'])
  y_test = test[['Exited']]

  #Creating dummies for categorical features
  X_train = pd.get_dummies(X_train, columns = ['Geography', 'Gender'])
  X_test = pd.get_dummies(X_test, columns = ['Geography', 'Gender'])
  #Align the test columns with the training columns: a category missing from the
  #synthetic sample would otherwise give the two frames different dummy columns
  X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

  #Model fit and predict
  model = LGBMClassifier()
  model.fit(X_train, y_train)
  y_pred = model.predict_proba(X_test)[:, 1]

  #Using ROC to measure performance, since dataset is imbalanced
  roc = round(roc_auc_score(y_test, y_pred), 3)
  print("ROC score with synthetic data - noise "+str(n)+": ", roc)

  #Noise 0 means no differential privacy. Noise greater than 0 is inversely proportional to epsilon
  noise = 0 if n == 0 else 1/n

  #Adding summary results
  details = [noise, np.mean(distance_matrix), 1-similarity, roc]
  tradeoff.loc[len(tradeoff)] = details

#Saving tradeoff file
tradeoff.to_csv('tradeoff.csv')

Adding ROOT NumOfProducts
Adding attribute Balance
Adding attribute Geography
Adding attribute EstimatedSalary
Adding attribute CreditScore
Adding attribute Age
Adding attribute Tenure
Adding attribute IsActiveMember
Adding attribute HasCrCard
Adding attribute Gender
Adding attribute Exited
Adding ROOT NumOfProducts
Adding attribute EstimatedSalary
Adding attribute CreditScore
Adding attribute Age
Adding attribute Balance
Adding attribute Tenure
Adding attribute Geography
Adding attribute Gender
Adding attribute IsActiveMember
Adding attribute HasCrCard
Adding attribute Exited
ROC score with synthetic data - noise 0:  0.865
Adding ROOT NumOfProducts
Adding attribute Balance
Adding attribute Geography
Adding attribute EstimatedSalary
Adding attribute CreditScore
Adding attribute Age
Adding attribute Tenure
Adding attribute IsActiveMember
Adding attribute HasCrCard
Adding attribute Gender
Adding attribute Exited
Adding ROOT NumOfProducts
Adding attribute EstimatedSalary
Adding attribute 