In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib import rc
rc('font', family='NanumGothic')
plt.rcParams['axes.unicode_minus'] = False

## 데이터셋 : `dataset.csv` $ \rightarrow $ `X`, `y`

In [2]:
file = 'dataset.csv'

dataset = pd.read_csv(file)
print(dataset.shape)
# dataset.head().transpose()
dataset['환매일종가위치'].astype('category').cat.codes

(1750, 40)


0       2
1       2
2       2
3       2
4       2
       ..
1745    2
1746    2
1747    2
1748    2
1749    2
Length: 1750, dtype: int8

In [3]:
cols_drop_info = ['종목코드', '기준가1', '녹인가1', '환매일 종가', '평가기준가']
cols_drop_duplicate = ['평가구분', '상환조건달성'] + ['녹인발생차수'] # unique 값이 1개
cols_drop_future = ['상환구분', '상환실현차수']

cols_drop_dt = ['발행일', '상환일', '평가시작일', '평가종료일', '환매결정일', '녹인발생일']
# 데이터 타입 변경
for col in cols_drop_dt:
    dataset[col] = pd.to_datetime(dataset[col])


# 피처로 쓰기 애매함
cols_cnt = [
    '녹인일수', '녹인일수_전', '영업일수', '상환일수'
]

cols_feature = ['차수', '기초자산개수', '녹인발생차수_차이']
cols_cat = ['환매일종가위치'] # 범주형 변수
cols_dummy = ['환매일종가위치_code']
dataset['환매일종가위치_code'] = dataset['환매일종가위치'].astype('category').cat.codes.astype(float)
cols_pct100 = [
    '상환조건(%)', '하한 수준(%)', '상환조건감소량(%)_prev', '상환조건감소량(%)_next',
    '환매일 수준(%)', '녹인대비상환수준(%)', '환매대비상환수준(%)', '환매대비상환수준(%)_next'
]
cols_pct = [
    '녹인비율', '녹인비율_전', 'H총증감률', 'H평균증감률', 'H일평균증감률', 'H이전대비증감률', '상환비율'
]
# 비율 단위 변경
for col in cols_pct:
    col_new = col+"(%)"
    dataset[col_new] = dataset[col]*100
    cols_pct100.append(col_new)

#
cols_drop = cols_drop_info + cols_drop_duplicate + cols_drop_future + cols_drop_dt + cols_cat


In [4]:
features = [
'기초자산개수',
 '녹인발생차수_차이',
 '상환조건(%)',
 '상환조건감소량(%)_next',
 '환매일 수준(%)',
 '녹인대비상환수준(%)',
 '환매대비상환수준(%)_next',
 '녹인비율(%)',
 '녹인비율_전(%)',
 'H총증감률(%)',
 'H이전대비증감률(%)',
 '상환비율(%)'
 ]

In [5]:
# col_X = cols_feature + cols_pct100
col_X = features
col_y = 'label'

df = dataset[col_X + [col_y]] # train + test(2015)

X = dataset[col_X]
y = dataset[col_y]

df.shape, X.shape, y.shape

((1750, 13), (1750, 12), (1750,))

## 테스트셋 : `dataset_test.csv` $\rightarrow$ `X_test`, `y_test`

In [6]:
##### test_set import #### 
import pandas as pd 
test_data = pd.read_csv('dataset_test.csv')
test_data.head()

Unnamed: 0,종목코드,차수,평가종료일,상환조건(%),평가시작일,환매결정일,발행일,상환일,상환실현차수,녹인발생일,...,상환비율,녹인발생차수,녹인발생차수_차이,상환조건감소량(%)_prev,상환조건감소량(%)_next,녹인대비상환수준(%),환매일 종가,환매일 수준(%),환매대비상환수준(%),환매대비상환수준(%)_next
0,KR6DS0000428,3,2022-07-13,85.0,2022-01-14,2022-06-28,2021-01-18,2024-01-17,6,2022-03-15,...,0.0,3.0,0.0,5.0,0.0,30.0,7893.759766,69.729595,15.270405,15.270405
1,KR6DS0000428,4,2023-01-13,85.0,2022-07-14,2022-12-29,2021-01-18,2024-01-17,6,2022-03-15,...,0.0,3.0,1.0,-0.0,5.0,25.0,6695.569824,59.145374,25.854626,20.854626
2,KR6DS0000428,5,2023-07-13,80.0,2023-01-14,2023-06-28,2021-01-18,2024-01-17,6,2022-03-15,...,0.0,3.0,2.0,5.0,5.0,20.0,6521.220215,57.605255,22.394745,17.394745
3,KR6HN0000H91,3,2022-07-07,85.0,2022-01-08,2022-06-22,2021-01-08,2024-01-09,6,2022-03-09,...,0.0,3.0,0.0,-0.0,5.0,15.0,7335.0,67.937237,17.062763,12.062763
4,KR6HN0000H91,4,2023-01-06,80.0,2022-07-08,2022-12-22,2021-01-08,2024-01-09,6,2022-03-09,...,0.0,3.0,1.0,5.0,5.0,10.0,6716.319824,62.206981,17.793019,12.793019


In [7]:
dataset = test_data

cols_drop_info = ['종목코드', '기준가1', '녹인가1', '환매일 종가', '평가기준가']
cols_drop_duplicate = ['평가구분', '상환조건달성'] + ['녹인발생차수'] # unique 값이 1개
cols_drop_future = ['상환구분', '상환실현차수']

cols_drop_dt = ['발행일', '상환일', '평가시작일', '평가종료일', '환매결정일', '녹인발생일']
# 데이터 타입 변경
for col in cols_drop_dt:
    dataset[col] = pd.to_datetime(dataset[col])


# 피처로 쓰기 애매함
cols_cnt = [
    '녹인일수', '녹인일수_전', '영업일수', '상환일수'
]

cols_feature = ['차수', '기초자산개수', '녹인발생차수_차이']
cols_cat = ['환매일종가위치'] # 범주형 변수
cols_dummy = ['환매일종가위치_code']
dataset['환매일종가위치_code'] = dataset['환매일종가위치'].astype('category').cat.codes.astype(float)
cols_pct100 = [
    '상환조건(%)', '하한 수준(%)', '상환조건감소량(%)_prev', '상환조건감소량(%)_next',
    '환매일 수준(%)', '녹인대비상환수준(%)', '환매대비상환수준(%)', '환매대비상환수준(%)_next'
]
cols_pct = [
    '녹인비율', '녹인비율_전', 'H총증감률', 'H평균증감률', 'H일평균증감률', 'H이전대비증감률', '상환비율'
]
# 비율 단위 변경
for col in cols_pct:
    col_new = col+"(%)"
    dataset[col_new] = dataset[col]*100
    cols_pct100.append(col_new)

#
cols_drop = cols_drop_info + cols_drop_duplicate + cols_drop_future + cols_drop_dt + cols_cat

test_data = dataset

In [8]:
col_X = features
col_y = 'label'

test = test_data[col_X + [col_y]]

X_test = test_data[col_X]
y_test = test_data[col_y]

test.shape, X_test.shape, y_test.shape

((104, 13), (104, 12), (104,))

## 모델링

In [9]:
# train test split

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.2, random_state=42, 
    stratify= y
)


In [10]:
# scaling

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit the scaler to the selected columns and transform them
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

## LR

In [11]:
# modeling

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# 예측
y_pred = model.predict(X_val_scaled)

# 모델 성능평가
from sklearn.metrics import confusion_matrix

cf_matrix = confusion_matrix(y_val, y_pred)
print(cf_matrix)

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print("Accuracy : %.3f" % accuracy_score(y_val, y_pred))
print("Precision : %.3f" % precision_score(y_val, y_pred))
print("Recall : %.3f" % recall_score(y_val, y_pred))
print("F1 : %.3f" % f1_score(y_val, y_pred))

[[256  13]
 [  9  72]]
Accuracy : 0.937
Precision : 0.847
Recall : 0.889
F1 : 0.867


In [13]:
y_test.value_counts()

label
0.0    104
Name: count, dtype: int64

In [12]:
# 예측
y_pred = model.predict(X_test_scaled)

# 모델 성능평가
from sklearn.metrics import confusion_matrix

cf_matrix = confusion_matrix(y_test, y_pred)
print(cf_matrix)

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print("Accuracy : %.3f" % accuracy_score(y_test, y_pred))
print("Precision : %.3f" % precision_score(y_test, y_pred))
print("Recall : %.3f" % recall_score(y_test, y_pred))
print("F1 : %.3f" % f1_score(y_test, y_pred))

[[96  8]
 [ 0  0]]
Accuracy : 0.923
Precision : 0.000
Recall : 0.000
F1 : 0.000


## DT

In [14]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=0)

# fit
dt.fit(X_train_scaled, y_train)

# 예측
y_pred = dt.predict(X_val_scaled)

# 모델 성능평가
from sklearn.metrics import confusion_matrix

cf_matrix = confusion_matrix(y_val, y_pred)
print(cf_matrix)


# score print
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
print("Accuracy : %.3f" % accuracy_score(y_val, y_pred))
print("Precision : %.3f" % precision_score(y_val, y_pred))
print("Recall : %.3f" % recall_score(y_val, y_pred))
print("F1 : %.3f" % f1_score(y_val, y_pred))

[[264   5]
 [  7  74]]
Accuracy : 0.966
Precision : 0.937
Recall : 0.914
F1 : 0.925


In [15]:
# 예측
y_pred = dt.predict(X_test_scaled)

# 모델 성능평가
from sklearn.metrics import confusion_matrix

cf_matrix = confusion_matrix(y_test, y_pred)
print(cf_matrix)

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print("Accuracy : %.3f" % accuracy_score(y_test, y_pred))
print("Precision : %.3f" % precision_score(y_test, y_pred))
print("Recall : %.3f" % recall_score(y_test, y_pred))
print("F1 : %.3f" % f1_score(y_test, y_pred))

[[93 11]
 [ 0  0]]
Accuracy : 0.894
Precision : 0.000
Recall : 0.000
F1 : 0.000


## SVC

In [16]:
from sklearn.svm import SVC
svc = SVC(random_state=0, probability=True)

# fit
svc.fit(X_train_scaled, y_train)

# 예측
y_pred = svc.predict(X_val_scaled)

# 모델 성능평가
from sklearn.metrics import confusion_matrix

cf_matrix = confusion_matrix(y_val, y_pred)
print(cf_matrix)


# score print
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
print("Accuracy : %.3f" % accuracy_score(y_val, y_pred))
print("Precision : %.3f" % precision_score(y_val, y_pred))
print("Recall : %.3f" % recall_score(y_val, y_pred))
print("F1 : %.3f" % f1_score(y_val, y_pred))

[[256  13]
 [  7  74]]
Accuracy : 0.943
Precision : 0.851
Recall : 0.914
F1 : 0.881


In [17]:
# 예측
y_pred = svc.predict(X_test_scaled)

# 모델 성능평가
from sklearn.metrics import confusion_matrix

cf_matrix = confusion_matrix(y_test, y_pred)
print(cf_matrix)

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print("Accuracy : %.3f" % accuracy_score(y_test, y_pred))
print("Precision : %.3f" % precision_score(y_test, y_pred))
print("Recall : %.3f" % recall_score(y_test, y_pred))
print("F1 : %.3f" % f1_score(y_test, y_pred))

[[104]]
Accuracy : 1.000
Precision : 0.000
Recall : 0.000
F1 : 0.000


## RF

In [18]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()

# fit
rf_model.fit(X_train_scaled, y_train)

# 예측
y_pred = rf_model.predict(X_val_scaled)

# 모델 성능평가
from sklearn.metrics import confusion_matrix

cf_matrix = confusion_matrix(y_val, y_pred)
print(cf_matrix)


# score print
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
print("Accuracy : %.3f" % accuracy_score(y_val, y_pred))
print("Precision : %.3f" % precision_score(y_val, y_pred))
print("Recall : %.3f" % recall_score(y_val, y_pred))
print("F1 : %.3f" % f1_score(y_val, y_pred))

[[265   4]
 [  7  74]]
Accuracy : 0.969
Precision : 0.949
Recall : 0.914
F1 : 0.931


In [19]:
# 예측
y_pred = rf_model.predict(X_test_scaled)

# 모델 성능평가
from sklearn.metrics import confusion_matrix

cf_matrix = confusion_matrix(y_test, y_pred)
print(cf_matrix)

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print("Accuracy : %.3f" % accuracy_score(y_test, y_pred))
print("Precision : %.3f" % precision_score(y_test, y_pred))
print("Recall : %.3f" % recall_score(y_test, y_pred))
print("F1 : %.3f" % f1_score(y_test, y_pred))

[[93 11]
 [ 0  0]]
Accuracy : 0.894
Precision : 0.000
Recall : 0.000
F1 : 0.000


## XGB

In [21]:
from xgboost import XGBClassifier
xgb_model = XGBClassifier()

# fit
xgb_model.fit(X_train_scaled, y_train)

# 예측
y_pred = xgb_model.predict(X_val_scaled)


# 모델 성능평가
from sklearn.metrics import confusion_matrix
cf_matrix = confusion_matrix(y_val, y_pred)
print(cf_matrix)

# score print
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
print("Accuracy : %.3f" % accuracy_score(y_val, y_pred))
print("Precision : %.3f" % precision_score(y_val, y_pred))
print("Recall : %.3f" % recall_score(y_val, y_pred))
print("F1 : %.3f" % f1_score(y_val, y_pred))

[[267   2]
 [  5  76]]
Accuracy : 0.980
Precision : 0.974
Recall : 0.938
F1 : 0.956


In [22]:
# 예측
y_pred = xgb_model.predict(X_test_scaled)

# 모델 성능평가
from sklearn.metrics import confusion_matrix

cf_matrix = confusion_matrix(y_test, y_pred)
print(cf_matrix)

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print("Accuracy : %.3f" % accuracy_score(y_test, y_pred))
print("Precision : %.3f" % precision_score(y_test, y_pred))
print("Recall : %.3f" % recall_score(y_test, y_pred))
print("F1 : %.3f" % f1_score(y_test, y_pred))

[[93 11]
 [ 0  0]]
Accuracy : 0.894
Precision : 0.000
Recall : 0.000
F1 : 0.000


## LGBM

In [23]:
from lightgbm import LGBMClassifier
lgbm_model = LGBMClassifier()

# fit
lgbm_model.fit(X_train_scaled, y_train)

# 예측
y_pred = lgbm_model.predict(X_val_scaled)


# 모델 성능평가
from sklearn.metrics import confusion_matrix
cf_matrix = confusion_matrix(y_val, y_pred)
print(cf_matrix)

# score print
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
print("Accuracy : %.3f" % accuracy_score(y_val, y_pred))
print("Precision : %.3f" % precision_score(y_val, y_pred))
print("Recall : %.3f" % recall_score(y_val, y_pred))
print("F1 : %.3f" % f1_score(y_val, y_pred))

[LightGBM] [Info] Number of positive: 323, number of negative: 1077
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000688 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1242
[LightGBM] [Info] Number of data points in the train set: 1400, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.230714 -> initscore=-1.204282
[LightGBM] [Info] Start training from score -1.204282
[[268   1]
 [  5  76]]
Accuracy : 0.983
Precision : 0.987
Recall : 0.938
F1 : 0.962


In [24]:
# 예측
y_pred = lgbm_model.predict(X_test_scaled)

# 모델 성능평가
from sklearn.metrics import confusion_matrix

cf_matrix = confusion_matrix(y_test, y_pred)
print(cf_matrix)

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

print("Accuracy : %.3f" % accuracy_score(y_test, y_pred))
print("Precision : %.3f" % precision_score(y_test, y_pred))
print("Recall : %.3f" % recall_score(y_test, y_pred))
print("F1 : %.3f" % f1_score(y_test, y_pred))

[[87 17]
 [ 0  0]]
Accuracy : 0.837
Precision : 0.000
Recall : 0.000
F1 : 0.000
