## Import

In [8]:
import os
import time
import glob
import random
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
from lightgbm import LGBMClassifier

# sklearn
from sklearn.metrics import log_loss, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold,StratifiedShuffleSplit, train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# optuna
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner


# tensorflow
import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint

## Data Load / Preprocessing

In [9]:
# Load the train and test CSVs; the same four columns are discarded from both
# frames right after reading.
train = pd.read_csv('./data/train.csv').drop(columns=['id', 'father', 'mother', 'gender'])
test = pd.read_csv('./data/test.csv').drop(columns=['id', 'father', 'mother', 'gender'])

In [10]:
TARLE = LabelEncoder()  # encoder for the target (last column of `train`)
SNPLE = LabelEncoder()  # one encoder shared by every SNP column
SNPCOL = [f'SNP_{x:02d}' for x in range(1, 16)]

# Pool the values of all SNP columns (column after column) so that a single
# encoder maps a given value to the same integer in every column.
snp_data = train[SNPCOL].to_numpy().ravel(order='F').tolist()
SNPLE.fit(snp_data)

# Assign the encoded target by column name so the column is replaced and takes
# the encoder's integer dtype. `train.iloc[:, -1] = ...` writes in place on
# pandas >= 2.0 and can leave the column as object dtype, which scikit-learn
# classifiers reject as an unknown label type.
target_col = train.columns[-1]
train[target_col] = TARLE.fit_transform(train[target_col])

# Encode the SNP columns of both frames with the encoder fitted on `train`.
# NOTE(review): a value that occurs only in `test` makes transform() raise.
for col in SNPCOL:
    train[col] = SNPLE.transform(train[col])
    test[col] = SNPLE.transform(test[col])

## Stratified shuffle split

In [11]:
# Hold out 20% of the rows of `train` for evaluation using one seeded
# stratified shuffle split.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# NOTE(review): stratification uses the 'trait' column, while the model target
# below is the last column of `train` -- confirm 'trait' is the intended key.
# n_splits=1, so take the single (train, test) index pair directly.
train_idx, test_idx = next(split.split(train, train['trait']))

# The splitter yields positional indices, so rows are selected with .iloc;
# label-based .loc only matches while `train` keeps its default RangeIndex.
x_train = train.iloc[train_idx, :-1]
x_test = train.iloc[test_idx, :-1]
y_train = train.iloc[train_idx, -1]
y_test = train.iloc[test_idx, -1]

# Left disabled: Keras one-hot encoding of the targets.
# y_train = to_categorical(y_train)
# y_test = to_categorical(y_test)

In [12]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print classification metrics for predicted labels.

    Prints, in order: the confusion matrix, accuracy, and the macro-averaged
    precision, recall and F1 score.

    Args:
        y_test: true labels.
        pred: predicted labels; it is passed straight to the metric functions,
            so it has to be supplied despite the ``None`` default.
        pred_proba: accepted but not used by this function.
    """
    # Confusion matrix and accuracy take no averaging argument.
    print('오차행렬 \n', confusion_matrix(y_test, pred))
    print('정확도 :', accuracy_score(y_test, pred))

    # Precision, recall and F1 are all macro-averaged over the classes.
    macro_metrics = (
        ('정밀도 : ', precision_score),
        ('재현율 :', recall_score),
        ('f1 score :', f1_score),
    )
    for label, metric in macro_metrics:
        print(label, metric(y_test, pred, average='macro'))

In [14]:
# Baseline: a random forest with default hyper-parameters (fixed seed), fitted
# on the training split and used to predict the hold-out split.
rf = RandomForestClassifier(random_state=0).fit(x_train, y_train)
pred = rf.predict(x_test)

In [15]:
# Print the hold-out metrics for the predictions currently stored in `pred`.
get_clf_eval(y_test, pred)

오차행렬 
 [[14  0  0]
 [ 0 20  1]
 [ 0  3 15]]
정확도 : 0.9245283018867925
정밀도 :  0.9356884057971014
재현율 : 0.9285714285714285
f1 score : 0.9304812834224597


In [21]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the exhaustive search: 5 * 2 * 6 * 2 * 5 * 4 = 2400
# combinations, each evaluated with 5-fold cross-validation.
params = dict(
    n_estimators=list(range(300, 701, 100)),
    max_features=['sqrt', 'log2'],
    max_depth=list(range(4, 15, 2)),
    criterion=['gini', 'entropy'],
    min_samples_leaf=list(range(6, 15, 2)),
    min_samples_split=[4, 8, 16, 32],
)

# Note: this rebinds `rf`, replacing the baseline model object.
rf = RandomForestClassifier(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(estimator=rf, param_grid=params, cv=5, n_jobs=-1)
grid_cv.fit(x_train, y_train)

In [24]:
# Keep the best hyper-parameter combination found by the grid search.
best_param = grid_cv.best_params_
# Mean cross-validated score of that combination (shown as the cell output).
grid_cv.best_score_

0.9569105691056912

In [25]:
# Refit a forest with the grid-search winner on the training split, then
# report its metrics on the hold-out split (this rebinds `pred`).
rf_1 = RandomForestClassifier(random_state=0, **best_param).fit(x_train, y_train)
pred = rf_1.predict(x_test)
get_clf_eval(y_test, pred)

오차행렬 
 [[14  0  0]
 [ 0 20  1]
 [ 0  7 11]]
정확도 : 0.8490566037735849
정밀도 :  0.8858024691358025
재현율 : 0.8544973544973544
f1 score : 0.8555555555555556


In [None]:
# Count how many predicted classes agree with an earlier submission file.
# NOTE(review): `submit` is not defined in any cell visible above -- make sure
# the cell that builds it has run before this one.
submit_9906 = pd.read_csv('./trial 14 (99.06%).csv')

# A row-by-row comparison is only meaningful when both files have the same
# number of rows, so fail loudly instead of comparing a prefix.
if len(submit) != len(submit_9906):
    raise ValueError(
        f'submissions have different lengths: {len(submit)} vs {len(submit_9906)}'
    )

# Vectorised, positional comparison of the two `class` columns; kept as a
# plain list of booleans, one per row.
check = (submit['class'].to_numpy() == submit_9906['class'].to_numpy()).tolist()
pd.DataFrame(check).value_counts()

True     171
False      4
dtype: int64

In [None]:
# submit.to_csv('./trial 17.csv', index=False)