### Imports

In [None]:
!pip install category_encoders
!pip install catboost

In [2]:
import random
import pandas as pd
import numpy as np
from datetime import datetime
import os
import json
import re

from sklearn.linear_model import (
    LogisticRegression
)

from sklearn.ensemble import (
    ExtraTreesClassifier,
    RandomForestClassifier
)

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score
)
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

import category_encoders as ce

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier


import warnings
warnings.filterwarnings(action='ignore')

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



### Data Load

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os

# Derive the data directory from the current working directory,
# then print the working directory followed by the data directory.
data_path = os.path.join(os.getcwd(), 'drive', 'MyDrive', 'Aimers_6th', 'Data')
print(os.getcwd(), data_path, sep='\n')

/content
/content/drive/MyDrive/Aimers_6th/Data


In [5]:
# Run configuration: data locations, cross-validation setup, and the
# model / categorical-encoding choice used by the functions below.
config = {
    'root': data_path,
    'train_path': f'{data_path}/train.csv',
    'test_path': f'{data_path}/test.csv',
    'submit_path': f'{data_path}/sample_submission.csv',
    'seed_list': [42],
    'k_fold': 5,
    # Model key: one of 'cbt', 'logistic', 'et', 'rf', 'lgb', 'xgb'.
    'model': 'lgb',
    # Encoding: None (intended for lgb, xgb, cbt), 'target', 'one-hot',
    # 'ordinal' or 'catboost'.
    'encoding': None,
}

### Hyperparameters

In [6]:
# Our best parameters, keyed by the model name used in config['model'].
# Every entry is passed as keyword arguments to the matching estimator.
params = {
    'logistic': {
        'n_jobs': 1,
        'random_state': config['seed_list'][0],
        'max_iter': 300,
        'penalty': 'l2'
    },

    'et': {
        'n_jobs': 1,
        'random_state': config['seed_list'][0],
        'n_estimators': 300,
        'max_depth': 6,
        'min_samples_leaf': 2,
        # scikit-learn forests reject `max_samples` unless bootstrapping is on
        # (ExtraTrees defaults to bootstrap=False), so enable it explicitly.
        'bootstrap': True,
        'max_samples': 0.5

    },

    'rf': {
        'n_jobs': 1,
        'random_state': config['seed_list'][0],
        'n_estimators': 300,
        'max_depth': 6,
        'min_samples_leaf': 2,
        # 'max_samples': 0.5,
        'bootstrap': False,

    },

    'lgb': {
        'random_state': config['seed_list'][0],
        'objective': 'binary',
        'n_jobs': 1,
        'verbosity': -1,
        'early_stopping_rounds': 10,
        'n_estimators': 300,
        'learning_rate': 0.1,
        'max_depth': 6,
        'reg_lambda': 1,
        # NOTE(review): LightGBM documents that row subsampling is only applied
        # when `subsample_freq` > 0; with the default this value is presumably
        # inactive -- confirm whether that is intended before changing it.
        'subsample': 0.5,
        'deterministic': True,
    },

    'xgb': {
        'random_state': config['seed_list'][0],
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'n_jobs': 1,
        'early_stopping_rounds': 10,
        'learning_rate': 0.1,
        'n_estimators': 300,
        'max_depth': 6,
        'reg_lambda': 1,
        'subsample': 0.5,
        'enable_categorical': True,
        'tree_method': 'hist'

    },

    # CatBoost names its seed `random_seed` (the other estimators use `random_state`).
    'cbt': {
        'random_seed': config['seed_list'][0],
        'objective': 'Logloss',
        'eval_metric': 'AUC',
        'auto_class_weights': 'Balanced',
        'verbose': 100,
        'early_stopping_rounds': 10,
        'learning_rate': 0.1,
        'n_estimators': 300,
        'max_depth': 6,
        'l2_leaf_reg': 1,
        'min_data_in_leaf': 2,
        'subsample': 0.5,
        'task_type': 'CPU',
        'allow_writing_files': False
    }
}

### Functions

In [171]:
def set_seed(seed: int):
    """Seed the global Python and NumPy RNGs and export ``PYTHONHASHSEED``.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed`` and
            stored (as a string) in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


def read_data(config):
    """Read the train, test and sample-submission CSV files named in ``config``.

    The ``ID`` column is dropped from the train and test frames; the
    submission frame is returned exactly as read. The row count of each
    frame is printed.

    Args:
        config: Mapping providing ``'train_path'``, ``'test_path'`` and
            ``'submit_path'``.

    Returns:
        Tuple ``(df_train, df_test, df_sub)``.
    """
    def _load(path_key, drop_id):
        frame = pd.read_csv(config[path_key])
        return frame.drop(columns=['ID']) if drop_id else frame

    df_train = _load('train_path', True)    # train data
    df_test = _load('test_path', True)      # test data
    df_sub = _load('submit_path', False)    # submission template (keeps ID)

    print(f'train data 수: {len(df_train)}')
    print(f'test data 수: {len(df_test)}')
    print(f'submission data 수: {len(df_sub)}')
    return df_train, df_test, df_sub

def get_clf_eval(y_test, y_proba=None, fold_no=None):
    """Print binary-classification metrics and return the ROC-AUC.

    Hard predictions are obtained by thresholding ``y_proba`` at 0.5 and are
    used for accuracy, precision, recall and F1. ROC-AUC is computed from the
    probabilities themselves.

    Args:
        y_test: True binary labels (pandas Series, NumPy array or list).
        y_proba: Predicted positive-class probabilities, same length as
            ``y_test``. Required despite the ``None`` default, which is kept
            only for signature compatibility.
        fold_no: Optional fold number to prefix the printed line with.

    Returns:
        The ROC-AUC score.

    Raises:
        ValueError: If ``y_proba`` is ``None``.
    """
    if y_proba is None:
        raise ValueError('y_proba (positive-class probabilities) must be provided')

    # np.asarray accepts Series, arrays and lists alike (the previous `.values`
    # access only worked for pandas objects).
    y_true = np.asarray(y_test)
    y_score = np.asarray(y_proba)

    # Hard predictions at a 0.5 threshold.
    y_pred = (y_score >= 0.5).astype(int)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_score)  # ROC-AUC uses the raw probabilities

    fold_info = f'Fold #{fold_no}' if fold_no is not None else ''
    print(f'{fold_info} ACC: {accuracy:.4f}, PRE: {precision:.4f}, REC: {recall:.4f}, F1: {f1:.4f}, ROC-AUC: {roc_auc:.4f}')
    return roc_auc


# Encode the categorical features with the method selected in config['encoding'].
def category_encoding(df_train, df_test, base_num_features, base_cat_features, config):
    """Build model-ready train/test frames with encoded categorical features.

    The numeric features (plus the target, for train) are copied as-is and the
    categorical features are appended according to ``config['encoding']``:

    * ``None``      -- categorical columns are appended unchanged.
    * ``'target'``, ``'ordinal'``, ``'catboost'`` -- the matching
      ``category_encoders`` encoder is fitted on the train frame and applied
      to both frames, keeping the original column names.
    * ``'one-hot'`` -- one-hot columns are appended; characters outside
      Korean letters, ASCII letters, digits and ``_`` in the generated column
      names are replaced by ``_``.

    NOTE(review): encoders are fitted on the whole train frame, and ``main``
    calls this before the cross-validation split in ``model_kfold``, so
    target-based encodings see validation-fold labels -- validation scores
    may be optimistic.

    Args:
        df_train: Train frame containing the features and the target column.
        df_test: Test frame containing the features.
        base_num_features: Numeric feature names.
        base_cat_features: Categorical feature names.
        config: Mapping with an ``'encoding'`` entry.

    Returns:
        ``(new_df_train, new_df_test, base_cat_features, base_features)`` where
        the two feature lists are recomputed from ``new_df_train``.

    Raises:
        ValueError: If ``config['encoding']`` is not a supported value
            (previously this silently dropped every categorical feature).
    """
    target = '임신 성공 여부'
    encoding = config['encoding']

    supported_encodings = (None, 'target', 'ordinal', 'catboost', 'one-hot')
    if encoding not in supported_encodings:
        raise ValueError(
            f'Unsupported encoding {encoding!r}; expected one of {supported_encodings}'
        )

    new_df_train = pd.concat([df_train[base_num_features].copy(), df_train[target]], axis=1)
    new_df_test = df_test[base_num_features].copy()

    print(f'category 변수 인코딩: {config["encoding"]}')

    if encoding is None:
        # Keep the raw categorical columns.
        new_df_train = pd.concat([new_df_train, df_train[base_cat_features]], axis=1)
        new_df_test = pd.concat([new_df_test, df_test[base_cat_features]], axis=1)

    elif encoding == 'one-hot':
        encoder = ce.OneHotEncoder(cols=base_cat_features, use_cat_names=True)
        encoder.fit(df_train[base_cat_features], df_train[target])
        result_tr = encoder.transform(df_train[base_cat_features])
        result_te = encoder.transform(df_test[base_cat_features])

        # Sanitize generated column names (category values may contain
        # characters outside the allowed set).
        result_tr.columns = result_tr.columns.str.replace(r'[^ㄱ-ㅎ가-힣A-Za-z0-9_]', '_', regex=True)
        result_te.columns = result_te.columns.str.replace(r'[^ㄱ-ㅎ가-힣A-Za-z0-9_]', '_', regex=True)

        new_df_train = pd.concat([new_df_train, result_tr], axis=1)
        new_df_test = pd.concat([new_df_test, result_te], axis=1)

    else:
        # Encoders that replace each categorical column in place.
        encoder_classes = {
            'target': ce.TargetEncoder,
            'ordinal': ce.OrdinalEncoder,
            'catboost': ce.CatBoostEncoder,
        }
        encoder = encoder_classes[encoding](cols=base_cat_features)
        encoder.fit(df_train[base_cat_features], df_train[target])
        new_df_train[base_cat_features] = encoder.transform(df_train[base_cat_features])
        new_df_test[base_cat_features] = encoder.transform(df_test[base_cat_features])

    # Recompute the feature lists from the encoded train frame.
    _, base_cat_features, base_features = make_feature_lists(new_df_train)

    return new_df_train, new_df_test, base_cat_features, base_features

# Conversion helper: turn the features whose value is 1 into one combined string.
def multi_hot_to_combined_label(row):
    """Join the index labels of ``row`` whose value equals 1 with single spaces.

    Args:
        row: pandas Series of multi-hot flags indexed by string labels.

    Returns:
        Space-separated labels of the entries equal to 1 (empty string if none).
    """
    return " ".join(label for label, flag in row.items() if flag == 1)

def feature_engineering(df_input, is_train=False):
    """Return a cleaned copy of ``df_input``.

    Exact duplicate rows are removed from the train frame only (first
    occurrence kept). Test rows are never dropped: each test row must keep
    its position so that predictions line up one-to-one with the submission
    file. Row counts before and after are printed for both splits.

    Args:
        df_input: Input frame; it is not modified.
        is_train: ``True`` for the train frame, ``False`` for the test frame.

    Returns:
        A new DataFrame.
    """
    df = df_input.copy()
    split_name = 'train' if is_train else 'test'

    print(f'중복 제거 전 {split_name} data 수: {df.shape[0]}')

    # De-duplicate the train split only; see the docstring for why test rows
    # must be preserved.
    if is_train:
        df = df.drop_duplicates(keep='first')

    print(f'중복 제거 후 {split_name} data 수: {df.shape[0]}')

    # # Disabled experiment: collapse the multi-hot infertility-cause columns into one feature.

    # infertility_columns = [column for column in df.columns if '불임 원인' in column]

    # # Use dot() to concatenate the names of the columns holding 1 (space-separated).
    # df['불임 원인 종합'] = df[infertility_columns].dot(pd.Index([f'{col.replace("불임 원인", "").replace(" ", "")} ' for col in infertility_columns])).str.strip()

    # # Keep Korean characters only (regular expression).
    # df['불임 원인 종합'] = df['불임 원인 종합'].apply(lambda x: re.sub(r'[^가-힣 ]', '', x))

    # # Use split() and join() to turn the spaces into ', '.
    # df['불임 원인 종합'] = df['불임 원인 종합'].apply(lambda x: ', '.join(x.split()))

    # df = df.drop(columns=infertility_columns)

    return df

def make_feature_lists(df):
    """Split the columns of ``df`` into numeric, categorical and all features.

    The target column ``'임신 성공 여부'`` and a fixed set of excluded columns
    are left out. A column counts as categorical when its dtype compares equal
    to ``'object'`` or ``'category'``; every other column is numeric. The size
    of each list is printed.

    Args:
        df: Frame whose columns are classified.

    Returns:
        ``(base_num_features, base_cat_features, base_features)``, each in the
        column order of ``df``.
    """
    target = '임신 성공 여부'

    # Columns that are never used as features.
    removal_features = {
            'ID', '불임 원인 - 여성 요인', '불임 원인 - 정자 면역학적 요인', '불임 원인 - 자궁경부 문제', '불임 원인 - 정자 형태', '남성 주 불임 원인', '여성 주 불임 원인'
    }

    # Decide the kind of every non-target column once, in column order.
    is_categorical = {
        feat: df[feat].dtype in ['object', 'category']
        for feat in df.columns
        if feat != target
    }

    base_features = [feat for feat in is_categorical if feat not in removal_features]
    base_cat_features = [feat for feat in base_features if is_categorical[feat]]
    base_num_features = [feat for feat in base_features if not is_categorical[feat]]

    print(f'numeric feature 수: {len(base_num_features)}')
    print(f'category feature 수: {len(base_cat_features)}')
    print(f'전체 feature 수: {len(base_features)}')

    return base_num_features, base_cat_features, base_features

def filling_missing_values(df_input, base_cat_features, base_num_features, config):
    """Return a copy of ``df_input`` with missing values filled.

    Categorical features: missing entries become ``'알 수 없음'`` and the column
    is converted to strings; when ``config['encoding']`` is ``None`` it is then
    cast to the pandas ``category`` dtype. Numeric features: missing entries
    become ``-1``.

    Args:
        df_input: Input frame; it is not modified.
        base_cat_features: Names of the categorical columns to fill.
        base_num_features: Names of the numeric columns to fill.
        config: Mapping with an ``'encoding'`` entry.

    Returns:
        A new DataFrame with the listed columns filled.
    """
    df = df_input.copy()

    for base_cat_feat in base_cat_features:
        column = df[base_cat_feat]
        # Fill BEFORE converting to str: `astype(str)` turns NaN into the
        # literal 'nan', after which fillna() has nothing left to fill.
        # Going through object dtype also lets this work on columns that are
        # already categorical (a new category cannot be filled in directly).
        filled = column.astype(object).where(column.notna(), '알 수 없음').astype(str)

        if config['encoding'] is None:
            filled = filled.astype('category')

        df[base_cat_feat] = filled

    # Fill missing values for numerical features with -1.
    for base_num_feat in base_num_features:
        df[base_num_feat] = df[base_num_feat].fillna(-1)

    return df


def model_kfold(df, config, params, base_features, base_cat_features):
    """Train one model per stratified fold and report train/validation metrics.

    For every fold a fresh estimator of type ``config['model']`` is built from
    ``params[config['model']]`` and fitted on the training part. The
    LightGBM, XGBoost and CatBoost fits additionally receive the validation
    part as ``eval_set``. Metrics for both parts are printed via
    ``get_clf_eval`` and the validation ROC-AUC of every fold is collected.

    Args:
        df: Frame holding the features and the target column ``'임신 성공 여부'``.
        config: Mapping with ``'k_fold'``, ``'seed'`` and ``'model'`` entries.
        params: Mapping from model name to estimator keyword arguments.
        base_features: Feature column names used for training.
        base_cat_features: Categorical feature names, handed to LightGBM
            (``categorical_feature``) and CatBoost (``cat_features``).

    Returns:
        ``(models, avg_roc_auc, var_roc_auc)``: the fitted models (one per
        fold) and the mean and variance of the validation ROC-AUC scores.

    Raises:
        ValueError: If ``config['model']`` is not a supported model name
            (previously this surfaced later as an unbound-variable error).
    """
    target = '임신 성공 여부'
    model_name = config['model']

    model_classes = {
        'logistic': LogisticRegression,
        'et': ExtraTreesClassifier,
        'rf': RandomForestClassifier,
        'lgb': LGBMClassifier,
        'xgb': XGBClassifier,
        'cbt': CatBoostClassifier,
    }
    if model_name not in model_classes:
        raise ValueError(
            f'Unsupported model {model_name!r}; expected one of {sorted(model_classes)}'
        )

    skf = StratifiedKFold(n_splits=config['k_fold'], shuffle=True, random_state=config['seed'])
    models = []          # trained models
    roc_auc_scores = []  # validation ROC-AUC per fold

    model_params = params[model_name]

    # Slice the feature matrix and target once instead of once per fold.
    X_all, y_all = df[base_features], df[target]

    for k_fold, (train_idx, valid_idx) in enumerate(skf.split(X_all, y_all)):
        print(f'Fold #{k_fold + 1}')
        X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
        X_valid, y_valid = X_all.iloc[valid_idx], y_all.iloc[valid_idx]

        # Model-specific fit arguments; the scikit-learn estimators take none.
        if model_name == 'lgb':
            fit_kwargs = {
                'eval_set': [(X_valid, y_valid)],
                'eval_metric': 'auc',
                'categorical_feature': base_cat_features,  # specify categorical features
            }
        elif model_name == 'xgb':
            fit_kwargs = {
                'eval_set': [(X_valid, y_valid)],
                'verbose': 100,
            }
        elif model_name == 'cbt':
            fit_kwargs = {
                'eval_set': [(X_valid, y_valid)],
                'cat_features': base_cat_features,  # specify categorical features
            }
        else:
            fit_kwargs = {}

        model = model_classes[model_name](**model_params)
        model.fit(X_train, y_train, **fit_kwargs)

        # save the trained model
        models.append(model)

        # evaluate the model
        # --- train-set
        print('[Train] ', end='')
        y_prob = model.predict_proba(X_train)[:, 1]
        _ = get_clf_eval(y_train, y_prob, k_fold + 1)

        # --- valid-set
        print('[Valid] ', end='')
        y_prob = model.predict_proba(X_valid)[:, 1]
        # Named so it does not shadow the imported `roc_auc_score` metric.
        valid_roc_auc = get_clf_eval(y_valid, y_prob, k_fold + 1)

        roc_auc_scores.append(valid_roc_auc)

    avg_roc_auc = np.mean(roc_auc_scores)
    var_roc_auc = np.var(roc_auc_scores)
    print(f'Avg. roc-auc of validset: {avg_roc_auc}')
    print(f'Var. roc-auc of validset: {var_roc_auc}')

    return models, avg_roc_auc, var_roc_auc

def kfold_submission(df_test, df_sub, models, config):
    """Average the models' test predictions and write the run artifacts.

    Creates (if missing) the ``FeatureImportance``, ``Submission`` and ``Json``
    directories under ``config['root']`` and writes, using a
    ``YYMMDD_HHMM_<model>_<encoding>`` file tag:

    * the submission CSV, with the averaged positive-class probability stored
      in the ``probability`` column of ``df_sub`` (``df_sub`` is modified);
    * a feature-importance CSV with one column per model plus their average
      and rank;
    * the ``config`` mapping serialized as JSON.

    Args:
        df_test: Test frame; feature lists are derived from it and missing
            values are filled before prediction.
        df_sub: Submission template frame.
        models: Fitted models exposing ``predict_proba``.
        config: Mapping with ``'root'``, ``'model'`` and ``'encoding'`` entries
            (also consumed by ``filling_missing_values``).
    """
    root = config['root']
    feat_importance_path = f"{root}/FeatureImportance"
    submission_path = f"{root}/Submission"
    json_path = f"{root}/Json"

    # Make sure every output directory exists.
    for directory in (feat_importance_path, submission_path, json_path):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Timestamp used in the file names: two-digit year, month, day, hour, minute.
    submission_time = datetime.now().strftime('%Y%m%d_%H%M')[2:]
    run_tag = f"{submission_time}_{config['model']}_{config['encoding']}"
    target = 'probability'

    # Derive the feature lists from the test frame and fill missing values.
    base_num_features, base_cat_features, base_features = make_feature_lists(df_test)
    df_test = filling_missing_values(df_test, base_cat_features, base_num_features, config)
    X_test = df_test[base_features]

    # One row per feature; one column per model is added below.
    df_feature_importance_all = pd.DataFrame({'features': base_features})

    model_kind = config['model']
    n_models = len(models)
    y_probs = 0

    for i, model in enumerate(models):
        # Each model contributes an equal share of the averaged probability.
        y_probs += model.predict_proba(X_test)[:, 1] / n_models

        # Record the importance of the current model.
        column = f'model_{i}'
        if model_kind in ['logistic']:
            df_feature_importance_all[column] = model.coef_.squeeze()
        elif model_kind in ['et', 'rf', 'lgb', 'xgb']:
            df_feature_importance_all[column] = model.feature_importances_
        elif model_kind == 'cbt':
            df_feature_importance_all[column] = model.get_feature_importance()

    df_sub[target] = y_probs

    # Submission file.
    df_sub.to_csv(f"{submission_path}/{run_tag}_submission.csv", index=False)

    # Average importance across models (every column after 'features') and its rank.
    per_model_importance = df_feature_importance_all.iloc[:, 1:]
    df_feature_importance_all['average'] = per_model_importance.mean(axis=1).values
    df_feature_importance_all['rank'] = df_feature_importance_all['average'].rank(ascending=False)

    # Feature-importance file.
    df_feature_importance_all.to_csv(f'{feat_importance_path}/feat_import_{run_tag}.csv', index=False)

    # Run configuration as JSON (serialized before the file is opened).
    json_data = json.dumps(config, indent=4)

    with open(f'{json_path}/{run_tag}.json', 'w') as file:
        file.write(json_data)

### Main Pipeline (Pre-processing, Training, Submission)

In [172]:
def main(config, params):
    """Run the full pipeline once per seed and write a single submission.

    For every seed in ``config['seed_list']`` the data is read, cleaned,
    missing-value filled, encoded, and a set of cross-validated models is
    trained. The models of all seeds are pooled and averaged by
    ``kfold_submission``.

    Side effects on the arguments: ``config`` gains ``'seed'``,
    ``'avg_roc_auc'``, ``'var_roc_auc'`` (values of the last seed) and
    ``'model_param'``; the seed entry of ``params[config['model']]`` is
    overwritten with the current seed.

    Args:
        config: Run configuration mapping.
        params: Mapping from model name to estimator keyword arguments.

    Raises:
        ValueError: If ``config['seed_list']`` is empty.
    """
    if not config['seed_list']:
        raise ValueError("config['seed_list'] must contain at least one seed")

    model_params = params[config['model']]
    # The CatBoost entry of `params` names its seed 'random_seed'; every other
    # entry uses 'random_state'.
    seed_key = 'random_seed' if config['model'] == 'cbt' else 'random_state'

    models = []

    for seed in config['seed_list']:
        config['seed'] = seed
        # Write the seed into the selected model's own parameters; setting a
        # top-level key on `params` never reached the estimator.
        model_params[seed_key] = seed

        # set seed
        set_seed(config['seed'])

        # read data set
        df_train, df_test, df_sub = read_data(config)

        # feature engineering (train, test)
        df_train = feature_engineering(df_train, is_train=True)
        df_test = feature_engineering(df_test, is_train=False)

        # build the feature lists (train)
        base_num_features, base_cat_features, base_features = make_feature_lists(df_train)

        # Handle missing values. The test frame gets the same treatment so the
        # encoders see identically prepared train and test inputs.
        df_train = filling_missing_values(df_train, base_cat_features, base_num_features, config)
        df_test = filling_missing_values(df_test, base_cat_features, base_num_features, config)

        # encoding
        df_train, df_test, base_cat_features, base_features = category_encoding(df_train, df_test, base_num_features, base_cat_features, config)

        # check model performance
        fold_models, avg_roc_auc, var_roc_auc = model_kfold(df_train, config, params, base_features, base_cat_features)

        config['avg_roc_auc'] = avg_roc_auc
        config['var_roc_auc'] = var_roc_auc

        models.extend(fold_models)

    config['model_param'] = params[config['model']]

    # submission
    kfold_submission(df_test, df_sub, models, config)

In [None]:
main(config, params)