## 필요 패키지

In [1]:
import numpy as np 
import pandas as pd 
from dython.nominal import compute_associations

from sklearn import svm,tree
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from scipy.stats import wasserstein_distance
from scipy.spatial import distance

from tabulate import tabulate
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import model_selection
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import warnings
warnings.filterwarnings('ignore')

import glob
import importlib

In [2]:
def supervised_model_training(x_train, y_train, x_test, y_test, model_name):
    if model_name == 'lr':
        model  = LogisticRegression(random_state=42,max_iter=100) 
    elif model_name == 'svm':
        model  = svm.SVC(random_state=42,probability=True)
    elif model_name == 'dt':
        model  = tree.DecisionTreeClassifier(random_state=42)
    elif model_name  == "mlp":
        model = MLPClassifier(random_state=42,max_iter=100)
    elif model_name == 'rf':      
        model = RandomForestClassifier(random_state=42)
    elif model_name == 'ada':
        model = AdaBoostClassifier(random_state = 42)
    # elif model_name == 'xgb':
    #     model = XGBClassifier(random_state = 42, objective='multi:softmax') # target 수를 보고 결정
    # elif model_name == 'lgb':
    #     model = LGBMClassifier(random_state = 42, objective='multiclass')
    elif model_name == 'xgb':
        model = XGBClassifier(random_state = 42, objective='binary:logistic')
    elif model_name == 'lgb':
        model = LGBMClassifier(random_state = 42, objective='binary')
    
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    if len(np.unique(y_train))>2:
        predict = model.predict_proba(x_test)        
        acc = metrics.accuracy_score(y_test,pred)*100
        auc = metrics.roc_auc_score(y_test, predict,average="macro",multi_class="ovr")
        f1_score = metrics.precision_recall_fscore_support(y_test, pred,average="weighted", zero_division=1)[2]
        return [acc, auc,f1_score] 
    else:
        predict = model.predict_proba(x_test)[:,1]    
        acc = metrics.accuracy_score(y_test,pred)*100
        auc = metrics.roc_auc_score(y_test, predict)
        f1_score = metrics.precision_recall_fscore_support(y_test,pred)[2].mean()
        return [acc, auc,f1_score] 

In [3]:
def get_utility_metrics(real_train_path, real_test_path, fake_paths, discrete_columns, target_column, scaler="MinMax", classifiers = ["lr","dt","mlp","rf","ada","xgb","lgb"], test_ratio=.20):
    print('fake data 개수 :', len(fake_paths))
    label_encoder_dict = {}

    real_train_df = pd.read_csv(real_train_path)
    real_test_df = pd.read_csv(real_test_path)
    data_dim = real_train_df.shape[1]

    col_order = discrete_columns
    real_train_df = real_train_df.reindex(columns=col_order + list(real_train_df.columns.difference(col_order)))
    real_test_df = real_test_df.reindex(columns=col_order + list(real_test_df.columns.difference(col_order)))

    ## 범주형 변수 변환
    for col in real_train_df.columns:
        if (real_train_df[col]).dtypes == 'O':
            le = LabelEncoder()
            le = le.fit(real_train_df[col])
            label_encoder_dict[col] = le
            real_train_df[col] = le.transform(real_train_df[col])
            real_test_df[col] = le.transform(real_test_df[col])
        else:
            pass

    ## real_train_df 데이터 분리
    train_data_real = real_train_df.drop([target_column], axis=1, inplace=False) # 피처(독립변수)
    train_target_real = real_train_df[target_column].astype(int) # 레이블(종속변수) -> 0,1,2의 정수형으로 맞춰줘야 xgb 에러가 안 남

    ## real 평가 데이터
    test_data_real = real_test_df.drop([target_column], axis=1, inplace=False) # 피처(독립변수)
    test_target_real = real_test_df[target_column].astype(int)

    if scaler=="MinMax":
        scaler_real = MinMaxScaler()
    else:
        scaler_real = StandardScaler()
    
    ## scaling
    scaler_real.fit(train_data_real)
    X_train_real_scaled = scaler_real.transform(train_data_real)
    X_test_real_scaled = scaler_real.transform(test_data_real)
    
    ## 분류 모델 적용
    all_real_results = []
    for classifier in classifiers:
        print(" real data classifer :", classifier)
        real_results = supervised_model_training(X_train_real_scaled, train_target_real, X_test_real_scaled, test_target_real, classifier)
        all_real_results.append(real_results)
    
    print('## Real data ML Utility finish ##')
    print()
      
    all_fake_results_avg = []
    
    ## fake data 불러오기
    for fake_path in fake_paths:
        print(" fake_path :", fake_path)
        fake_train_df = pd.read_csv(fake_path)
        fake_train_df = fake_train_df.reindex(columns=col_order + list(fake_train_df.columns.difference(col_order)))
        for col in fake_train_df.columns:
            if (fake_train_df[col]).dtypes == 'O':
                le = label_encoder_dict[col]
                fake_train_df[col] = le.transform(fake_train_df[col])
        train_data_fake = fake_train_df.drop([target_column], axis=1, inplace=False) # 피처(독립변수)
        train_target_fake = fake_train_df[target_column].astype(int) # 레이블(종속변수) -> 0,1,2의 정수형으로 맞춰줘야 xgb 에러가 안 남
        print(len(np.unique(train_target_fake)))

        if scaler=="MinMax":
          scaler_fake = MinMaxScaler()
        else:
          scaler_fake = StandardScaler()
        
        scaler_fake.fit(train_data_fake)
        
        X_train_fake_scaled = scaler_fake.transform(train_data_fake)
        
        ## 분류 모델 적용
        all_fake_results = []
        for classifier in classifiers:
            fake_results = supervised_model_training(X_train_fake_scaled, train_target_fake, X_test_real_scaled, test_target_real, classifier)
            all_fake_results.append(fake_results)

        all_fake_results_avg.append(all_fake_results)
    
    diff_results = np.abs(np.array(all_real_results)- np.array(all_fake_results_avg).mean(axis=0))

    final_result_df = pd.DataFrame(diff_results, columns=["Acc","AUC","f1_score"])

    return final_result_df, all_real_results, all_fake_results_avg

----

# 1. Machine Learning Efficiacy

In [7]:
dataset = "shoppers"
real_train_path = f"CHECK_DATASETS/{dataset}/trn_{dataset}_final.csv"
real_test_path = f"CHECK_DATASETS/{dataset}/tst_{dataset}_final.csv"
fake_file_root = f"FAKE_DATASETS/{dataset}"
discrete_columns = ['SpecialDay', 'Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend', 'Revenue']
classifiers_list = ["lr","mlp","rf", "ada"]
target_column = 'Revenue'

In [12]:
fake_paths = glob.glob(fake_file_root+'/'+'*')
fake_paths

['FAKE_DATASETS/shoppers\\Shoppers_fake_shoppers_1.csv']

In [9]:
metric = ["Acc","AUC","f1_score"]
classifiers_list = ["lr","mlp","rf", "ada"]
final_result_df, real_result, fake_result = get_utility_metrics(real_train_path, real_test_path, fake_paths, discrete_columns, target_column, "MinMax", classifiers_list, test_ratio = 0.20)

"""실제와 fake 간의 차이"""
final_result_df.index = classifiers_list

"""실제 데이터"""
real_df = pd.DataFrame(real_result, columns = metric)
real_df.index = classifiers_list
real_df = real_df.reset_index().rename({'index' : 'model'}, axis=1)

"""fake 데이터"""
fake_df = pd.DataFrame()
for idx, fdf in enumerate(fake_result):
    fdf = pd.DataFrame(fdf, columns = metric)
    fdf.index = classifiers_list
    fdf = fdf.reset_index().rename({'index' : 'model'}, axis=1)
    fdf['file_order'] = idx
    fake_df = pd.concat([fake_df, fdf], axis=0)
fake_df = fake_df.reset_index(drop=True)
final_result_df

fake data 개수 : 22
 real data classifer : lr
 real data classifer : mlp
 real data classifer : rf
 real data classifer : ada
## Real data ML Utility finish ##

 fake_path : F


FileNotFoundError: [Errno 2] No such file or directory: 'F'

In [21]:
final_result_df.mean(axis=0)

Acc         0.941876
AUC         0.034919
f1_score    0.017061
dtype: float64