In [None]:
import pandas as pd
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

## **인코더 학습**

In [177]:
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
import torch.optim as optim
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim, num_classes):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, latent_dim),
            nn.ReLU(True)
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, input_dim),
            nn.Sigmoid()
        )
        # Classification Layer
        self.classifier = nn.Linear(latent_dim, num_classes)

    def forward(self, x):
        z = self.encoder(x)
        x_recon = self.decoder(z)
        class_pred = self.classifier(z)
        return x_recon, z, class_pred



def target_class_center_loss(z, labels, target_label=1):

    
    mask = labels == 0
    z_false = z[mask]
    # 선택된 데이터의 평균 계산
    z_false_mean = z_false.mean(dim=0)

    mask = labels == 1
    z_true = z[mask]
    # 선택된 데이터의 평균 계산
    z_true_mean = z_true.mean(dim=0)
   
    loss_false = ((z_false - z_false_mean) ** 2).sum()/2
    loss_true = ((z_true - z_true_mean) ** 2).sum()/2
    loss = loss_true + loss_false
    return loss
def mape_loss(output, target, eps=1e-2):

    return 100 * torch.mean(torch.abs((target - output) / (target + eps)))
def train_model(model, dataloader, num_epochs):
    criterion_recon = nn.MSELoss()
    criterion_ce = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(num_epochs):
        for data, labels in dataloader:
            optimizer.zero_grad()
            x_recon, z, class_pred = model(data)

            
            loss_recon = criterion_recon(x_recon, data)*1000
            loss_ce = criterion_ce(class_pred, labels)
            
            # Calculate the target class center loss for label 1
            loss_target_class_center = target_class_center_loss(z, labels, target_label=1)
      
            # Combine the losses
            loss = loss_recon + loss_ce + loss_target_class_center

            loss.backward()
            optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    return model


# Assuming 'df' is your DataFrame
# Setup your dataloader here as shown before

from torch.utils.data import Dataset, DataLoader
import torch

class CustomDataset(Dataset):
    def __init__(self, dataframe):
        # Assuming 'is_converted' is the label column
        self.data = torch.tensor(dataframe.drop(columns=['is_converted']).values, dtype=torch.float32)
        self.labels = torch.tensor(dataframe['is_converted'].values, dtype=torch.long)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]



In [178]:

input_dim = train.shape[1] - 1  # Adjust based on your dataset
latent_dim = 32
num_classes = 2  # Adjust based on your dataset

# Initialize the model
model = Autoencoder(input_dim, latent_dim, num_classes)

# Initialize the dataset and dataloader
dataset = CustomDataset(imbalanced_train)  # Make sure to replace 'train' with 'df'
dataloader = DataLoader(dataset, batch_size=5000, shuffle=True)


# Train the model
num_epochs = 100
trained_encoder = train_model(model, dataloader, num_epochs)

Epoch [1/100], Loss: 618.5135
Epoch [2/100], Loss: 446.0736
Epoch [3/100], Loss: 341.5865
Epoch [4/100], Loss: 358.9991
Epoch [5/100], Loss: 265.0999
Epoch [6/100], Loss: 294.7734
Epoch [7/100], Loss: 253.7092
Epoch [8/100], Loss: 237.8191
Epoch [9/100], Loss: 259.3097
Epoch [10/100], Loss: 249.6816
Epoch [11/100], Loss: 288.4279
Epoch [12/100], Loss: 232.7352
Epoch [13/100], Loss: 236.3727
Epoch [14/100], Loss: 242.5010
Epoch [15/100], Loss: 220.8013
Epoch [16/100], Loss: 225.6420
Epoch [17/100], Loss: 217.5265
Epoch [18/100], Loss: 228.5418
Epoch [19/100], Loss: 236.7931
Epoch [20/100], Loss: 228.0318
Epoch [21/100], Loss: 211.6195
Epoch [22/100], Loss: 207.7432
Epoch [23/100], Loss: 210.2309
Epoch [24/100], Loss: 206.4343
Epoch [25/100], Loss: 203.6395
Epoch [26/100], Loss: 212.6441
Epoch [27/100], Loss: 199.8744
Epoch [28/100], Loss: 200.7134
Epoch [29/100], Loss: 209.4112
Epoch [30/100], Loss: 193.5911
Epoch [31/100], Loss: 193.4958
Epoch [32/100], Loss: 183.3506
Epoch [33/100], L

In [152]:
from sklearn.neighbors import KernelDensity
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
def extract_middle_percent(data, start, last):
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data)
    kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(data_scaled)
    
    log_prob = kde.score_samples(data_scaled)
    prob = np.exp(log_prob)
    threshold_low, threshold_high = np.percentile(prob, [start, last])
    mask = np.logical_and(prob >= threshold_low, prob <= threshold_high) #######
    data_keep = data[mask]
    # plt.figure(figsize=(10, 6))

    # # Original KDE plot
    # sns.kdeplot(data.ravel(), shade=True, label='Original KDE')
    if len(data_keep) > 0 :
      return data_keep,  data[~mask]
    else:  
      print("No middle 50% found, returning original data")
      return np.array([])

  #  각 feature 안의 값을 복원추출하는 함수

In [153]:
train_true = train[train['is_converted']==1]
train_false = train[train['is_converted']==0]
dataset = CustomDataset(train_true)  # Make sure to replace 'train' with 'df'
dataloader = DataLoader(dataset, batch_size=train_true.shape[0], shuffle=True)
for data, label in dataloader:
    x_recon, z, class_pred = trained_encoder(data)

z,x_recon

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], grad_fn=<ReluBackward0>),
 tensor([[0.2559, 0.0641, 0.0152,  ..., 0.0094, 0.0132, 0.0120],
         [0.2468, 0.0520, 0.0104,  ..., 0.0063, 0.0089, 0.0081],
         [0.2419, 0.0473, 0.0087,  ..., 0.0052, 0.0075, 0.0068],
         ...,
         [0.2454, 0.0499, 0.0097,  ..., 0.0058, 0.0083, 0.0075],
         [0.2484, 0.0546, 0.0113,  ..., 0.0069, 0.0097, 0.0088],
         [0.2401, 0.0438, 0.0076,  ..., 0.0045, 0.0065, 0.0059]],
        grad_fn=<SigmoidBackward0>))

In [154]:
train_true = train[train['is_converted']==1]
train_false = train[train['is_converted']==0]


# Initialize the dataset and dataloader
dataset = CustomDataset(train_true)  # Make sure to replace 'train' with 'df'
dataloader = DataLoader(dataset, batch_size=train_true.shape[0], shuffle=True)
for data, label in dataloader:
    x_recon, z, class_pred = trained_encoder(data)
data_keep, data_sample = extract_middle_percent(z.detach().numpy(),25,100)  


# Initialize the dataset and dataloader
dataset = CustomDataset(train_false)  # Make sure to replace 'train' with 'df'
dataloader = DataLoader(dataset, batch_size=train_false.shape[0], shuffle=True)
for data, label in dataloader:
    x_recon, z, class_pred = trained_encoder(data)

data_keep, data_drop = extract_middle_percent(z.detach().numpy(),50,100)  


In [155]:
import torch

# data_sample과 data_keep을 텐서로 변환 (이미 텐서라면 이 단계는 생략 가능)
data_sample = torch.tensor(data_sample)
data_keep = torch.tensor(data_keep)

# Step 1: data_sample과 data_keep의 평균 구하기
data_true_mean = data_sample.mean(dim=0)
data_false_mean = data_keep.mean(dim=0)

valid_count = 0  # 유효한 행의 개수를 세기 위한 변수
print(f"데이터 생성 전 기준개수: {data_keep.shape[0]}")
print(f"데이터 생성 전 개수: {data_sample.shape[0]}")
while(data_sample.shape[0]<data_keep.shape[0]):
    # 각 열별로 랜덤 인덱스 선택
    random_indices = torch.randint(0, data_sample.shape[0], size=(1, data_sample.shape[1]))
    new_row = torch.gather(data_sample, 0, random_indices)

    # Step 2: 가장 외곽에 있는 행과의 임계거리 계산
    distances = torch.norm(data_sample - data_true_mean, dim=1)
    threshold_distance = distances.max()

    # Step 3: 새로운 행과 data_sample 평균과의 거리 계산
    distance_true = torch.norm(new_row - data_true_mean)
    distance_false = torch.norm(new_row - data_false_mean)

    # 새로운 행의 유효성 판단
    if distance_true <= threshold_distance and distance_true <= distance_false:
        valid_count += 1
        # 조건을 충족하는 경우 new_row를 data_sample에 추가
        data_sample = torch.cat((data_sample, new_row), dim=0)
    
print(f"100번 중 유효한 행의 수: {valid_count}")
print(f"데이터 생성 후 개수: {data_sample.shape[0]}")

데이터 생성 전 기준개수: 27225
데이터 생성 전 개수: 1213
100번 중 유효한 행의 수: 26012
데이터 생성 후 개수: 27225


In [156]:

decoded_data = trained_encoder.decoder(data_sample)

# 디코딩된 데이터를 numpy 배열로 변환 (필요한 경우)
decoded_data_np = decoded_data.detach().numpy()  # GPU에서 실행하는 경우 .cpu()를 추가해야 할 수 있음

column_names = X.columns

# 디코딩된 데이터를 데이터프레임으로 변환하고 열 이름 적용
true_df = pd.DataFrame(decoded_data_np, columns=column_names)
true_df['is_converted']=1

decoded_data = trained_encoder.decoder(data_keep)

# 디코딩된 데이터를 numpy 배열로 변환 (필요한 경우)
decoded_data_np = decoded_data.detach().numpy()  # GPU에서 실행하는 경우 .cpu()를 추가해야 할 수 있음

column_names = X.columns

# 디코딩된 데이터를 데이터프레임으로 변환하고 열 이름 적용
false_df = pd.DataFrame(decoded_data_np, columns=column_names)
false_df['is_converted']=0

In [164]:
balanced_train = pd.concat([true_df, false_df], ignore_index=True).sample(frac=1, random_state=42)

from sklearn.model_selection import train_test_split
X = balanced_train[balanced_train.columns.drop('is_converted')]
Y = balanced_train['is_converted']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=42, shuffle=True)
# 예측에 필요한 데이터 분리
x_test = test.drop(["is_converted"], axis=1)

In [172]:
y_test.value_counts()

is_converted
0    1370
1    1353
Name: count, dtype: int64

In [166]:

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [167]:
import time
import optuna
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
# XGBoost와 CatBoost 라이브러리를 임포트합니다.
import xgboost as xgb
import catboost as cb
def objective(trial):
    # classifier_name = trial.suggest_categorical('classifier', ['DecisionTree', 'LGBM', 'XGBoost', 'CatBoost'])
    classifier_name = 'XGBoost'
    if classifier_name == 'DecisionTree':
        param = {
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
            'max_features': trial.suggest_int('max_features', 1, 30)
        }
        model = DecisionTreeClassifier(**param)
    elif classifier_name == 'LGBM':
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 300, 500),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
            'num_leaves': trial.suggest_int('num_leaves', 31, 128),
            'max_depth': trial.suggest_int('max_depth', 3, 12),

        }
        model = LGBMClassifier(**param)
    elif classifier_name == 'XGBoost':
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1),
            'random_state':trial.suggest_int('random_state',42,42),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        }
        model = xgb.XGBClassifier(**param)
    elif classifier_name == 'CatBoost':
        param = {
            'iterations': trial.suggest_int('iterations', 100, 500),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
            'depth': trial.suggest_int('depth', 4, 10),
            'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
            'border_count': trial.suggest_int('border_count', 1, 255),
            'loss_function': 'Logloss',
        }
        model = cb.CatBoostClassifier(**param, verbose=False)


    model.fit(X_train, y_train)
    
    test_pred = model.predict(x_test)
    print(sum(test_pred))
    y_pred = model.predict(X_test)
    score = f1_score(y_test, y_pred, average='binary')
    return score

In [168]:
# Optuna Study 생성 및 최적화
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=50)

# 최적화 결과 출력
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-02-24 20:48:26,606] A new study created in memory with name: no-name-336a2cc2-5332-42df-915a-52cf16d8bd6b
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
[I 2024-02-24 20:48:30,803] Trial 0 finished with value: 0.997779422649889 and parameters: {'n_estimators': 250, 'learning_rate': 0.2536999076681772, 'max_depth': 8, 'min_child_weight': 6, 'subsample': 0.5780093202212182, 'random_state': 42, 'colsample_bytree': 0.5779972601681014}. Best is trial 0 with value: 0.997779422649889.


5271


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
[I 2024-02-24 20:48:32,820] Trial 1 finished with value: 0.997039230199852 and parameters: {'n_estimators': 123, 'learning_rate': 0.19030368381735815, 'max_depth': 7, 'min_child_weight': 8, 'subsample': 0.5102922471479012, 'random_state': 42, 'colsample_bytree': 0.9849549260809971}. Best is trial 0 with value: 0.997779422649889.


5271


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
[I 2024-02-24 20:48:39,210] Trial 2 finished with value: 0.9970348406226834 and parameters: {'n_estimators': 433, 'learning_rate': 0.020589728197687916, 'max_depth': 4, 'min_child_weight': 2, 'subsample': 0.6521211214797689, 'random_state': 42, 'colsample_bytree': 0.762378215816119}. Best is trial 0 with value: 0.997779422649889.


5271


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
[I 2024-02-24 20:48:43,641] Trial 3 finished with value: 0.9974064468321601 and parameters: {'n_estimators': 273, 'learning_rate': 0.02692655251486473, 'max_depth': 7, 'min_child_weight': 2, 'subsample': 0.6460723242676091, 'random_state': 42, 'colsample_bytree': 0.6831809216468459}. Best is trial 0 with value: 0.997779422649889.


5271


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
[I 2024-02-24 20:48:46,893] Trial 4 finished with value: 0.997039230199852 and parameters: {'n_estimators': 282, 'learning_rate': 0.14447746112718687, 'max_depth': 4, 'min_child_weight': 6, 'subsample': 0.7962072844310213, 'random_state': 42, 'colsample_bytree': 0.5232252063599989}. Best is trial 0 with value: 0.997779422649889.


5271


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
[I 2024-02-24 20:48:50,310] Trial 5 finished with value: 0.9962908011869436 and parameters: {'n_estimators': 343, 'learning_rate': 0.0178601378893971, 'max_depth': 3, 'min_child_weight': 10, 'subsample': 0.9828160165372797, 'random_state': 42, 'colsample_bytree': 0.9041986740582306}. Best is trial 0 with value: 0.997779422649889.


5271


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
[I 2024-02-24 20:48:53,743] Trial 6 finished with value: 0.9970348406226834 and parameters: {'n_estimators': 222, 'learning_rate': 0.013940346079873234, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.5610191174223894, 'random_state': 42, 'colsample_bytree': 0.7475884550556351}. Best is trial 0 with value: 0.997779422649889.


5271


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
[W 2024-02-24 20:48:54,055] Trial 7 failed with parameters: {'n_estimators': 113, 'learning_rate': 0.22038218939289875, 'max_depth': 5, 'min_child_weight': 7, 'subsample': 0.6558555380447055, 'random_state': 42, 'colsample_bytree': 0.7600340105889054} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\kuils\anaconda3\envs\env\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\kuils\AppData\Local\Temp\ipykernel_31324\2742280551.py", line 52, in objective
    model.fit(X_train, y_train)
  File "c:\Users\kuils\anaconda3\envs\env\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\kuils\anaconda3\envs\env\lib\site-packages\xgboost\sklearn.py", line 1519, in fit
    self._Booster = train(
  File "c:\Users\kuils\anaconda3\envs\env\lib\site-packages\xg

KeyboardInterrupt: 