In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib import rc
rc('font', family='NanumGothic')
plt.rcParams['axes.unicode_minus'] = False

## 데이터셋 : `dataset.csv` $ \rightarrow $ `X`, `y`

In [2]:
# Path of the training CSV, read relative to the notebook's working directory.
file = 'dataset.csv'

# Load the raw training frame and report its (rows, columns) shape.
dataset = pd.read_csv(file)
print(dataset.shape)
# dataset.head().transpose()
# Preview the integer codes pandas assigns to the categorical column; this is
# display-only, nothing is stored here.
dataset['환매일종가위치'].astype('category').cat.codes

(1750, 40)


0       2
1       2
2       2
3       2
4       2
       ..
1745    2
1746    2
1747    2
1748    2
1749    2
Length: 1750, dtype: int8

In [3]:
# --- Column groups that are not used as model inputs -----------------------
cols_drop_info = ['종목코드', '기준가1', '녹인가1', '환매일 종가', '평가기준가']
# Columns with only one unique value.
cols_drop_duplicate = ['평가구분', '상환조건달성', '녹인발생차수']
cols_drop_future = ['상환구분', '상환실현차수']
cols_drop_dt = ['발행일', '상환일', '평가시작일', '평가종료일', '환매결정일', '녹인발생일']

# Day-count columns; considered ambiguous to use as features.
cols_cnt = ['녹인일수', '녹인일수_전', '영업일수', '상환일수']

# --- Candidate feature groups ----------------------------------------------
cols_feature = ['차수', '기초자산개수', '녹인발생차수_차이']
cols_cat = ['환매일종가위치']  # categorical variable
cols_dummy = ['환매일종가위치_code']
cols_pct100 = [
    '상환조건(%)', '하한 수준(%)', '상환조건감소량(%)_prev', '상환조건감소량(%)_next',
    '환매일 수준(%)', '녹인대비상환수준(%)', '환매대비상환수준(%)', '환매대비상환수준(%)_next',
]
cols_pct = [
    '녹인비율', '녹인비율_전', 'H총증감률', 'H평균증감률', 'H일평균증감률', 'H이전대비증감률', '상환비율',
]

# Parse the date columns into datetime dtype.
for col in cols_drop_dt:
    dataset[col] = pd.to_datetime(dataset[col])

# Integer category code (stored as float) for the categorical column.
category_codes = dataset['환매일종가위치'].astype('category').cat.codes
dataset['환매일종가위치_code'] = category_codes.astype(float)

# Rescale ratio columns to percent; each new "<name>(%)" column is also
# registered in cols_pct100.
for col in cols_pct:
    col_new = f"{col}(%)"
    dataset[col_new] = dataset[col].mul(100)
    cols_pct100.append(col_new)

# Everything that should be excluded from the feature matrix.
cols_drop = [
    *cols_drop_info,
    *cols_drop_duplicate,
    *cols_drop_future,
    *cols_drop_dt,
    *cols_cat,
]


In [4]:
# Columns considered for modelling. '상환비율(%)' is listed here but is dropped
# again when X / X_test are built further down.
# NOTE(review): per the recorded output of the constant-feature check below,
# '녹인발생차수_차이', '녹인비율(%)' and '녹인비율_전(%)' each take a single value
# in the 차수 == 5 training subset — confirm they are worth keeping.
features = [
'기초자산개수',
 '녹인발생차수_차이',
 '상환조건(%)',
 '상환조건감소량(%)_next',
 '환매일 수준(%)',
 '녹인대비상환수준(%)',
 '환매대비상환수준(%)_next',
 '녹인비율(%)',
 '녹인비율_전(%)',
 'H총증감률(%)',
 'H이전대비증감률(%)',
 '상환비율(%)'
 ]

In [5]:
# Keep only rows with 차수 == 5.
# .copy() makes the result an independent frame: a later cell assigns to
# dataset['label'], and doing that on a filtered slice triggers pandas'
# SettingWithCopyWarning (silenced here by warnings.filterwarnings('ignore')).
dataset = dataset[dataset['차수'] == 5].copy()
dataset.shape

(266, 48)

In [6]:
# Print every selected feature that holds a single distinct value (NaN counts
# as a value) in the current frame.
for feature in features:
    if dataset[feature].nunique(dropna=False) == 1:
        print(feature)

녹인발생차수_차이
녹인비율(%)
녹인비율_전(%)


In [7]:
# Class balance of the label as stored in the CSV (before it is re-coded in
# the next cell).
dataset['label'].value_counts()

label
1.0    173
0.0     93
Name: count, dtype: int64

In [8]:
# Re-code the label: original value 1 -> 0, anything else -> 1.
# The original values are kept in 'label_raw' and the re-coded label is always
# derived from that column, so re-running this cell gives the same result
# instead of flipping the labels back.
if 'label_raw' not in dataset.columns:
    dataset['label_raw'] = dataset['label']
dataset['label'] = dataset['label_raw'].apply(lambda x: 0 if x == 1 else 1)

dataset['label'].value_counts()

label
0    173
1     93
Name: count, dtype: int64

In [9]:
# Feature / target column names.
col_X = features
col_y = 'label'

# Features plus label in one frame.
df = dataset[[*col_X, col_y]]

# Model inputs exclude '상환비율(%)'; the target is the re-coded label.
X = dataset[col_X].drop('상환비율(%)', axis=1)
y = dataset[col_y]

df.shape, X.shape, y.shape

((266, 13), (266, 11), (266,))

## 테스트셋 : `dataset_test.csv` $\rightarrow$ `X_test`, `y_test`

In [10]:
# Load the separate test CSV and preview its first rows.
# NOTE(review): pandas is already imported in the first cell; this import is
# redundant.
import pandas as pd 
test_data = pd.read_csv('dataset_test.csv')
test_data.head()

Unnamed: 0,종목코드,차수,평가종료일,상환조건(%),평가시작일,환매결정일,발행일,상환일,상환실현차수,녹인발생일,...,상환비율,녹인발생차수,녹인발생차수_차이,상환조건감소량(%)_prev,상환조건감소량(%)_next,녹인대비상환수준(%),환매일 종가,환매일 수준(%),환매대비상환수준(%),환매대비상환수준(%)_next
0,KR6DS0000428,3,2022-07-13,85.0,2022-01-14,2022-06-28,2021-01-18,2024-01-17,6,2022-03-15,...,0.0,3.0,0.0,5.0,0.0,30.0,7893.759766,69.729595,15.270405,15.270405
1,KR6DS0000428,4,2023-01-13,85.0,2022-07-14,2022-12-29,2021-01-18,2024-01-17,6,2022-03-15,...,0.0,3.0,1.0,-0.0,5.0,25.0,6695.569824,59.145374,25.854626,20.854626
2,KR6DS0000428,5,2023-07-13,80.0,2023-01-14,2023-06-28,2021-01-18,2024-01-17,6,2022-03-15,...,0.0,3.0,2.0,5.0,5.0,20.0,6521.220215,57.605255,22.394745,17.394745
3,KR6HN0000H91,3,2022-07-07,85.0,2022-01-08,2022-06-22,2021-01-08,2024-01-09,6,2022-03-09,...,0.0,3.0,0.0,-0.0,5.0,15.0,7335.0,67.937237,17.062763,12.062763
4,KR6HN0000H91,4,2023-01-06,80.0,2022-07-08,2022-12-22,2021-01-08,2024-01-09,6,2022-03-09,...,0.0,3.0,1.0,5.0,5.0,10.0,6716.319824,62.206981,17.793019,12.793019


In [11]:
# Apply to the test frame the same column preparation used for the training
# frame. The work is done on `test_data` directly: the global `dataset` is no
# longer rebound to the test frame, so the training frame stays available and
# re-running the training cells cannot silently pick up test data.

# --- Column groups that are not used as model inputs -----------------------
cols_drop_info = ['종목코드', '기준가1', '녹인가1', '환매일 종가', '평가기준가']
cols_drop_duplicate = ['평가구분', '상환조건달성'] + ['녹인발생차수']  # only one unique value
cols_drop_future = ['상환구분', '상환실현차수']

cols_drop_dt = ['발행일', '상환일', '평가시작일', '평가종료일', '환매결정일', '녹인발생일']
# Parse the date columns into datetime dtype.
for col in cols_drop_dt:
    test_data[col] = pd.to_datetime(test_data[col])

# Day-count columns; considered ambiguous to use as features.
cols_cnt = [
    '녹인일수', '녹인일수_전', '영업일수', '상환일수'
]

# --- Candidate feature groups ----------------------------------------------
cols_feature = ['차수', '기초자산개수', '녹인발생차수_차이']
cols_cat = ['환매일종가위치']  # categorical variable
cols_dummy = ['환매일종가위치_code']
# NOTE(review): the codes come from the categories present in this frame only,
# so a category may get a different code than in the training frame. The
# column is not part of `features`, so the models below are unaffected.
test_data['환매일종가위치_code'] = test_data['환매일종가위치'].astype('category').cat.codes.astype(float)
cols_pct100 = [
    '상환조건(%)', '하한 수준(%)', '상환조건감소량(%)_prev', '상환조건감소량(%)_next',
    '환매일 수준(%)', '녹인대비상환수준(%)', '환매대비상환수준(%)', '환매대비상환수준(%)_next'
]
cols_pct = [
    '녹인비율', '녹인비율_전', 'H총증감률', 'H평균증감률', 'H일평균증감률', 'H이전대비증감률', '상환비율'
]
# Rescale ratio columns to percent; each new "<name>(%)" column is also
# registered in cols_pct100.
for col in cols_pct:
    col_new = col + "(%)"
    test_data[col_new] = test_data[col] * 100
    cols_pct100.append(col_new)

# Everything that should be excluded from the feature matrix.
cols_drop = cols_drop_info + cols_drop_duplicate + cols_drop_future + cols_drop_dt + cols_cat

In [12]:
# Keep only rows with 차수 == 5.
# .copy() makes the result an independent frame: the next cell assigns to
# test_data['label'], and doing that on a filtered slice triggers pandas'
# SettingWithCopyWarning (silenced here by warnings.filterwarnings('ignore')).
test_data = test_data[test_data['차수'] == 5].copy()
test_data.shape

(41, 46)

In [13]:
# Re-code the label: original value 1 -> 0, anything else -> 1.
# The original values are kept in 'label_raw' and the re-coded label is always
# derived from that column, so re-running this cell gives the same result
# instead of flipping the labels back.
if 'label_raw' not in test_data.columns:
    test_data['label_raw'] = test_data['label']
test_data['label'] = test_data['label_raw'].apply(lambda x: 0 if x == 1 else 1)

test_data['label'].value_counts()

label
1    41
Name: count, dtype: int64

In [14]:
# Feature / target column names (same as for the training frame).
col_X = features
col_y = 'label'

# Features plus label in one frame.
test = test_data[[*col_X, col_y]]

# Model inputs exclude '상환비율(%)'; the target is the re-coded label.
X_test = test_data[col_X].drop('상환비율(%)', axis=1)
y_test = test_data[col_y]

test.shape, X_test.shape, y_test.shape

((41, 13), (41, 11), (41,))

## 전처리

### 데이터셋 분리

In [15]:
# Hold out 20% of the rows as a validation split, stratified on the label so
# both splits keep the same class ratio.
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


### 스케일링

In [16]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the validation and test splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)

X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

### 샘플링

In [17]:
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Both resamplers are applied to the scaled training split only; the
# validation and test splits are left as they are.

# Random oversampling
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train_scaled, y_train)

# SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train_scaled, y_train)

In [22]:
# Size of the scaled training split before any resampling.
X_train_scaled.shape, y_train.shape

((212, 11), (212,))

In [23]:
# Class balance of the training split before any resampling.
y_train.value_counts()

label
0    138
1     74
Name: count, dtype: int64

In [18]:
# Size of the training split after random oversampling.
X_ros.shape, y_ros.shape

((276, 11), (276,))

In [19]:
# Class balance after random oversampling.
y_ros.value_counts()

label
0    138
1    138
Name: count, dtype: int64

In [20]:
# Size of the training split after SMOTE.
X_smote.shape, y_smote.shape

((276, 11), (276,))

In [21]:
# Class balance after SMOTE.
y_smote.value_counts()

label
0    138
1    138
Name: count, dtype: int64

## 모델링

In [53]:
# 분류 모델 성능평가
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import roc_auc_score

def eval(model, x, y):
    """Print and return binary-classification metrics for a fitted model.

    NOTE(review): this name shadows Python's built-in ``eval``; it is kept
    because other cells call the function by this name.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        x: Feature matrix to score.
        y: True labels aligned with ``x``.

    Returns:
        dict with keys ``predicted``, ``predicted_proba`` (second column of
        ``predict_proba``), ``confusion_matrix`` and ``metrics``. Each metric
        is wrapped in a one-element list so the ``metrics`` dict can be passed
        straight to ``pd.DataFrame``. ``roc_auc`` is ``None`` when it cannot be
        computed.
    """
    # Hard predictions and the metrics derived from them.
    y_pred = model.predict(x)

    cf_matrix = confusion_matrix(y, y_pred)
    print(cf_matrix)

    acc = accuracy_score(y, y_pred)
    print("Accuracy : %.3f" % acc)

    prec = precision_score(y, y_pred)
    print("Precision : %.3f" % prec)

    rec = recall_score(y, y_pred)
    print("Recall : %.3f" % rec)

    f1 = f1_score(y, y_pred)
    print("F1 : %.3f" % f1)

    # Probability of the second class column, used for the ranking metric.
    y_pred_proba = model.predict_proba(x)[:, 1]

    # ROC AUC is undefined when only one class is present in y, so that case is
    # handled explicitly. Only ValueError is caught as a fallback; any other
    # error (and KeyboardInterrupt) propagates instead of being swallowed.
    if len(set(y)) < 2:
        roc_auc = None
    else:
        try:
            roc_auc = roc_auc_score(y, y_pred_proba)
        except ValueError:
            roc_auc = None
    print("ROC_AUC :", roc_auc)

    return {
        'predicted' : y_pred,
        'predicted_proba' : y_pred_proba,
        'confusion_matrix' : cf_matrix,
        'metrics' : {
            'accuracy' : [acc],
            'precision' : [prec],
            'recall' : [rec],
            'f1' : [f1],
            'roc_auc' : [roc_auc]
        }
    }

In [43]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
def train_lr(x, y):
    """Fit a default-configured LogisticRegression on (x, y) and return it."""
    return LogisticRegression().fit(x, y)

# Decision Tree
from sklearn.tree import DecisionTreeClassifier
def train_dt(x, y):
    """Fit a DecisionTreeClassifier (seed 0) on (x, y) and return it."""
    return DecisionTreeClassifier(random_state=0).fit(x, y)

# SVC
from sklearn.svm import SVC
def train_svc(x, y):
    """Fit an SVC (seed 0) with probability estimates enabled and return it."""
    classifier = SVC(probability=True, random_state=0)
    return classifier.fit(x, y)

# Random Forest
from sklearn.ensemble import RandomForestClassifier
def train_rf(x, y, random_state=0):
    """Fit a RandomForestClassifier on (x, y) and return it.

    Args:
        x: Training feature matrix.
        y: Training labels.
        random_state: Seed passed to the estimator. Defaults to 0, matching
            the seeded decision-tree and SVC trainers in this notebook, so
            repeated runs give the same forest.

    Returns:
        The fitted RandomForestClassifier.
    """
    rf_model = RandomForestClassifier(random_state=random_state)
    rf_model.fit(x, y)
    return rf_model

# XGB
from xgboost import XGBClassifier
def train_xgb(x, y, random_state=0):
    """Fit an XGBClassifier on (x, y) and return it.

    Args:
        x: Training feature matrix.
        y: Training labels.
        random_state: Seed passed to the estimator. Defaults to 0, matching
            the seeded decision-tree and SVC trainers in this notebook.

    Returns:
        The fitted XGBClassifier.
    """
    xgb_model = XGBClassifier(random_state=random_state)
    xgb_model.fit(x, y)
    return xgb_model

# LGBM
from lightgbm import LGBMClassifier
def train_lgbm(x, y, random_state=0):
    """Fit an LGBMClassifier on (x, y) and return it.

    Args:
        x: Training feature matrix.
        y: Training labels.
        random_state: Seed passed to the estimator. Defaults to 0, matching
            the seeded decision-tree and SVC trainers in this notebook.

    Returns:
        The fitted LGBMClassifier.
    """
    lgbm_model = LGBMClassifier(random_state=random_state)
    lgbm_model.fit(x, y)
    return lgbm_model

In [44]:
def get_model(x, y, method = 'lr'):
    """Train and return the model family selected by ``method``.

    Args:
        x: Training feature matrix.
        y: Training labels.
        method: One of 'lr', 'dt', 'svc', 'rf', 'xgb', 'lgbm'.

    Returns:
        The fitted estimator produced by the matching ``train_*`` function.

    Raises:
        ValueError: If ``method`` is not one of the supported keys. An error
            string used to be returned here, which only failed later when the
            caller tried to call ``.predict`` on it.
    """
    if method == 'lr':
        return train_lr(x, y)
    if method == 'dt':
        return train_dt(x, y)
    if method == 'svc':
        return train_svc(x, y)
    if method == 'rf':
        return train_rf(x, y)
    if method == 'xgb':
        return train_xgb(x, y)
    if method == 'lgbm':
        return train_lgbm(x, y)
    raise ValueError(
        'method는 lr, dt, svc, rf, xgb, lgbm 중 하나여야 함: %r' % (method,)
    )

In [74]:
# Train one model family on three variants of the training data and score each
# on the validation and test splits:
# - X_train_scaled : standard scaling
# - X_ros          : standard scaling + random oversampling
# - X_smote        : standard scaling + SMOTE
#
# `method` must be one of: lr, dt, svc, rf, xgb, lgbm


def _metrics_row(result, method, split, preprocessing):
    """Turn the 'metrics' dict returned by `eval` into a labelled one-row frame."""
    row = pd.DataFrame(result['metrics'])
    row['model'] = method
    row['data'] = split
    row['preprocessing'] = preprocessing
    return row


def _fit_and_score(title, x_fit, y_fit, method, preprocessing, rows):
    """Fit `method` on (x_fit, y_fit), print val/test reports, append metric rows.

    Returns the fitted model and the raw `eval` results for the validation and
    test splits, in that order.
    """
    print(title)
    fitted = get_model(x_fit, y_fit, method=method)

    print("## Validation(2015)")
    val_result = eval(fitted, X_val_scaled, y_val)
    rows.append(_metrics_row(val_result, method, 'val', preprocessing))

    print("## Test(2021)")
    test_result = eval(fitted, X_test_scaled, y_test)
    rows.append(_metrics_row(test_result, method, 'test', preprocessing))

    print("##################################")
    return fitted, val_result, test_result


# One metrics row per (preprocessing, split); concatenated in the next cell.
res = []

method = 'lr'
print('모델명:', method, '\n')

model, res_val, res_test = _fit_and_score(
    "# 스케일링", X_train_scaled, y_train, method, 'scaling', res
)
model_ros, res_val_ros, res_test_ros = _fit_and_score(
    "# 스케일링 + 랜덤오버샘플링", X_ros, y_ros, method, 'RandomOverSampling', res
)
model_smote, res_val_smote, res_test_smote = _fit_and_score(
    "# 스케일링 + SMOTE", X_smote, y_smote, method, 'SMOTE', res
)

모델명: lr 

# 스케일링
## Validation(2015)
[[32  3]
 [ 3 16]]
Accuracy : 0.889
Precision : 0.842
Recall : 0.842
F1 : 0.842
ROC_AUC : 0.9458646616541353
## Test(2021)
[[ 0  0]
 [36  5]]
Accuracy : 0.122
Precision : 1.000
Recall : 0.122
F1 : 0.217
ROC_AUC : None
##################################
# 스케일링 + 랜덤오버샘플링
## Validation(2015)
[[28  7]
 [ 2 17]]
Accuracy : 0.833
Precision : 0.708
Recall : 0.895
F1 : 0.791
ROC_AUC : 0.9488721804511278
## Test(2021)
[[ 0  0]
 [36  5]]
Accuracy : 0.122
Precision : 1.000
Recall : 0.122
F1 : 0.217
ROC_AUC : None
##################################
# 스케일링 + SMOTE
## Validation(2015)
[[28  7]
 [ 2 17]]
Accuracy : 0.833
Precision : 0.708
Recall : 0.895
F1 : 0.791
ROC_AUC : 0.930827067669173
## Test(2021)
[[ 0  0]
 [33  8]]
Accuracy : 0.195
Precision : 1.000
Recall : 0.195
F1 : 0.327
ROC_AUC : None
##################################


In [75]:
# Stack the one-row metric frames into a single table. ignore_index=True gives
# the rows a fresh 0..n-1 index; without it every row keeps index 0 from its
# one-row source frame, so label-based lookups would be ambiguous.
results = pd.concat(res, axis=0, ignore_index=True)
results[['model', 'data', 'preprocessing', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc']]

Unnamed: 0,model,data,preprocessing,accuracy,precision,recall,f1,roc_auc
0,lr,val,scaling,0.888889,0.842105,0.842105,0.842105,0.945865
0,lr,test,scaling,0.121951,1.0,0.121951,0.217391,
0,lr,val,RandomOverSampling,0.833333,0.708333,0.894737,0.790698,0.948872
0,lr,test,RandomOverSampling,0.121951,1.0,0.121951,0.217391,
0,lr,val,SMOTE,0.833333,0.708333,0.894737,0.790698,0.930827
0,lr,test,SMOTE,0.195122,1.0,0.195122,0.326531,


In [77]:
# Distribution of '녹인발생차수_차이' in the training split.
X_train['녹인발생차수_차이'].value_counts()

녹인발생차수_차이
3.0    212
Name: count, dtype: int64

In [78]:
# Distribution of '녹인발생차수_차이' in the validation split.
X_val['녹인발생차수_차이'].value_counts()

녹인발생차수_차이
3.0    54
Name: count, dtype: int64

In [79]:
# Distribution of '녹인발생차수_차이' in the test split.
# NOTE(review): the recorded outputs show a single value (3.0) in train and
# validation but only 1.0 / 2.0 here, i.e. the test rows take values never
# seen during training — worth confirming this is intended.
X_test['녹인발생차수_차이'].value_counts()

녹인발생차수_차이
2.0    22
1.0    19
Name: count, dtype: int64

In [76]:
# Pairwise correlations between training features. Columns that are constant
# in the training split show up as all-NaN rows/columns in the recorded output.
X_train.corr()

Unnamed: 0,기초자산개수,녹인발생차수_차이,상환조건(%),상환조건감소량(%)_next,환매일 수준(%),녹인대비상환수준(%),환매대비상환수준(%)_next,녹인비율(%),녹인비율_전(%),H총증감률(%),H이전대비증감률(%)
기초자산개수,1.0,,-0.107004,0.104491,-0.005108,0.034965,-0.107807,,,-0.10341,0.02056
녹인발생차수_차이,,,,,,,,,,,
상환조건(%),-0.107004,,1.0,-0.198573,0.255935,0.676147,0.537269,,,-0.007319,0.334367
상환조건감소량(%)_next,0.104491,,-0.198573,1.0,-0.06026,-0.551123,-0.479823,,,0.047622,-0.125916
환매일 수준(%),-0.005108,,0.255935,-0.06026,1.0,0.185736,-0.586319,,,0.415979,0.563836
녹인대비상환수준(%),0.034965,,0.676147,-0.551123,0.185736,1.0,0.518514,,,-0.105141,0.307645
환매대비상환수준(%)_next,-0.107807,,0.537269,-0.479823,-0.586319,0.518514,1.0,,,-0.347544,-0.169134
녹인비율(%),,,,,,,,,,,
녹인비율_전(%),,,,,,,,,,,
H총증감률(%),-0.10341,,-0.007319,0.047622,0.415979,-0.105141,-0.347544,,,1.0,0.047392
