In [32]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

import optuna
# import optuna.integration.lightgbm as oplgb
# import optuna.integration.xgboost as opxgb

In [16]:
!ls ./data

[31mtestset.csv[m[m  [31mtrainset.csv[m[m


In [17]:
# Const variable

DATA_PATH = 'data'
SEED = 222

# KNN
neighbors = 5

In [28]:
train_data = pd.read_csv(f'{DATA_PATH}/trainset.csv',
                         header=None)
test_data = pd.read_csv(f'{DATA_PATH}/testset.csv',
                        header=None)

train_data[0]

0       HI
1       PH
2       GR
3       PH
4       EL
        ..
4275    EL
4276    PH
4277    EL
4278    PH
4279    EL
Name: 0, Length: 4280, dtype: object

In [5]:
# Train Data 처리
# TODO:
# 1. data 전처리 [x]
#   label -> int 처리
# 1. Train dataset 나누기 (train/validation)
# 2. KMeans, KNN, DecisionTree, RandomForest
# 3. KFold
# 4. HyperParameter Tuning

In [6]:
# 데이터 전처리
# train_data[0], uniques = pd.factorize(train_data[0])
# classify_values = {}

# for i, v in enumerate(uniques):
#     classify_values[i] = v
# classify_values
# train_data[1]

0       188
1       174
2       175
3       176
4       182
       ... 
4275    153
4276    179
4277    171
4278    175
4279    180
Name: 1, Length: 4280, dtype: int64

In [19]:
X = train_data.iloc[:, 1:]
y = train_data.iloc[:, 0]
train_data
X

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,22,23,24,25,26,27,28,29,30,31
0,188,128,95,114,143,108,88,103,113,85,...,100,78,70,79,84,66,70,75,76,63
1,174,112,88,104,119,92,74,79,88,74,...,82,65,70,75,89,73,67,71,89,73
2,175,138,106,105,135,109,75,95,113,96,...,110,98,67,88,119,98,75,91,110,94
3,176,111,80,106,131,96,76,99,104,85,...,96,78,78,91,96,78,82,104,112,85
4,182,144,111,100,151,119,67,106,114,90,...,108,88,71,103,113,92,68,107,118,92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4275,153,120,95,83,123,104,53,83,100,85,...,100,78,52,67,84,78,52,71,84,78
4276,179,110,82,109,135,97,83,95,97,75,...,86,68,80,94,94,76,80,89,94,72
4277,171,138,106,106,152,122,76,112,122,99,...,112,92,70,100,117,92,66,109,122,92
4278,175,109,81,105,123,81,71,79,85,67,...,90,68,75,88,97,75,75,88,97,72


In [20]:
# X
y

0       HI
1       PH
2       GR
3       PH
4       EL
        ..
4275    EL
4276    PH
4277    EL
4278    PH
4279    EL
Name: 0, Length: 4280, dtype: object

In [21]:
# Define model

# KNN
knn = KNeighborsClassifier()
# DecisionTree
dt = DecisionTreeClassifier(random_state=SEED)
# SVM
svm = svm.SVC(random_state=SEED)
# RandomForest
rf = RandomForestClassifier(random_state=SEED,)

# LightGBM, Xgboost 는 optuna로 진행

models = [knn, dt, svm, rf]

In [22]:
# Train model w. KFold

# KFold
kf = KFold(n_splits=5, random_state=SEED, shuffle=True)

mean_scores = []

# TODO: apply hyperparameter tuning
knn_params = {
    ''
}
dt_params = {
    
}
svm_params = {
    
}
rf_params = {
    
}


for model in models:
    print(f"{model} training\n")
    scores = cross_val_score(model, X, y, n_jobs=-1, scoring='accuracy', cv=kf)
    print(f"mean score: {np.mean(scores)}\n")
    mean_scores.append(np.mean(scores))
print(mean_scores)

KNeighborsClassifier() training

mean score: 0.9004672897196262

DecisionTreeClassifier(random_state=222) training

mean score: 0.8464953271028038

SVC(random_state=222) training

mean score: 0.8794392523364486

RandomForestClassifier(random_state=222) training

mean score: 0.9088785046728972

[0.9004672897196262, 0.8464953271028038, 0.8794392523364486, 0.9088785046728972]


In [59]:
# LightGBM, Xgboost w. optuna

# LightGBM
lightgbm = lgb.LGBMClassifier(random_state=SEED)
# Xgboost
xgbboost = xgb.XGBClassifier(random_state=SEED)

def objective(trial):
    
    classifier = trial.suggest_categorical('classifier', ['KNeighbor', 'DecisionTree', 'SVM', 
                                                          'RandomForest', 'LightGBM', 'Xgboost'])
    
    # KFold
    kf = KFold(n_splits=8, random_state=SEED, shuffle=True)
    
    if classifier == 'KNeighbor':
        # KNN params
        knn_n_neighbors = trial.suggest_int('n_neighbors', 3, 10)
        knn_weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
        knn_algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
        knn_leaf_size = trial.suggest_int('leaf_size', 10, 40)
        knn_p = trial.suggest_int('p', 1, 2)
        
        model = KNeighborsClassifier(
            n_neighbors=knn_n_neighbors,
            weights=knn_weights,
            algorithm=knn_algorithm,
            leaf_size=knn_leaf_size,
            p=knn_p
        )

    elif classifier == 'DecisionTree':
        # DecisionTree params
        dt_criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
        dt_splitter = trial.suggest_categorical('splitter', ['best', 'random'])
        
        model = DecisionTreeClassifier(
            random_state=SEED,
            criterion=dt_criterion,
            splitter=dt_splitter
        )

    elif classifier == 'SVM':
        # SVM params
        svm_C = trial.suggest_categorical('svm_C', [0.1, 1, 10, 100, 1000])
        svm_degree = trial.suggest_categorical('svm_degree', [0, 1, 2, 3, 4, 5, 6])
        
        model = SVC(
            random_state=SEED,
            C=svm_C,
            degree=svm_degree
        )

    elif classifier == 'RandomForest':
        # RandomForest params
        rf_max_depth = trial.suggest_categorical('rf_max_depth', [80, 90, 100, 110])
        rf_max_features = trial.suggest_categorical('rf_max_features', [2, 3])
        rf_min_samples_leaf = trial.suggest_categorical('rf_min_sample_leaf', [8, 10, 12])
        rf_n_estimators = trial.suggest_categorical('rf_n_estimators', [100, 200, 300, 1000])
        
        model = RandomForestClassifier(
            random_state=SEED,
            max_depth=rf_max_depth,
            max_features=rf_max_features,
            min_samples_leaf=rf_min_samples_leaf,
            n_estimators=rf_n_estimators
        )

    elif classifier == 'LightGBM':
        # LightGBM params
        lgbm_max_depth = trial.suggest_int('lgbm_max_depth', 20, 200)
        lgbm_learning_rate = trial.suggest_categorical('lgbm_learning_rate', [0.01, 0.05, 0.1])
        lgbm_num_leaves = trial.suggest_categorical('lgbm_num_leaves', [80, 100, 150, 200])
#         lgbm_boosting_type = trial.suggest_categorical('lgbm_boosting_type', ['gbdt', 'dart', 'goss', 'rf'])
        lgbm_subsample = trial.suggest_categorical('lgbm_subsample', [1, 0.8, 0.7, 0.5])
        
        model = lgb.LGBMClassifier(
            random_state=SEED,
            max_depth=lgbm_max_depth,
            num_leaves=lgbm_num_leaves,
            learning_rate=lgbm_learning_rate,
            subsample=lgbm_subsample,
#             boosting_type=lgbm_boosting_type
        )

    elif classifier == 'Xgboost':
        # Xgboost params
        xgb_max_depth = trial.suggest_int('xgb_max_depth', 10, 200)
        xgb_learning_rate = trial.suggest_categorical('xgb_learning_rate', [0.01, 0.05, 0.1])
#         xgb_booster = trial.suggest_categorical('xgb_boosting_type', ['gbdt', 'dart', 'goss', 'rf'])
        
        model = xgb.XGBClassifier(
            random_state=SEED,
            max_depth=xgb_max_depth,
            learning_rate=xgb_learning_rate,
#             booster=xgb_booster
        )
        model
    else:
        return
        
    return cross_val_score(model, X, y, n_jobs=-1, scoring='accuracy', cv=kf).mean()

In [60]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=400)

trial = study.best_trial
print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2020-10-17 10:51:44,260][0m A new study created in memory with name: no-name-0b6ef24e-9289-458c-8dc2-55de9ab7df00[0m
[32m[I 2020-10-17 10:51:49,313][0m Trial 0 finished with value: 0.8336448598130841 and parameters: {'classifier': 'DecisionTree', 'criterion': 'entropy', 'splitter': 'random'}. Best is trial 0 with value: 0.8336448598130841.[0m
[32m[I 2020-10-17 10:51:49,516][0m Trial 1 finished with value: 0.8336448598130841 and parameters: {'classifier': 'DecisionTree', 'criterion': 'entropy', 'splitter': 'random'}. Best is trial 0 with value: 0.8336448598130841.[0m
[32m[I 2020-10-17 10:51:50,273][0m Trial 2 finished with value: 0.9032710280373832 and parameters: {'classifier': 'SVM', 'svm_C': 1000, 'svm_degree': 2}. Best is trial 2 with value: 0.9032710280373832.[0m
[32m[I 2020-10-17 10:51:50,635][0m Trial 3 finished with value: 0.9063084112149533 and parameters: {'classifier': 'KNeighbor', 'n_neighbors': 4, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size'

[32m[I 2020-10-17 10:54:45,864][0m Trial 32 finished with value: 0.9212616822429905 and parameters: {'classifier': 'LightGBM', 'lgbm_max_depth': 73, 'lgbm_learning_rate': 0.1, 'lgbm_num_leaves': 200, 'lgbm_subsample': 0.8}. Best is trial 8 with value: 0.9212616822429905.[0m
[32m[I 2020-10-17 10:54:51,616][0m Trial 33 finished with value: 0.9212616822429905 and parameters: {'classifier': 'LightGBM', 'lgbm_max_depth': 82, 'lgbm_learning_rate': 0.1, 'lgbm_num_leaves': 150, 'lgbm_subsample': 1}. Best is trial 8 with value: 0.9212616822429905.[0m
[32m[I 2020-10-17 10:54:57,487][0m Trial 34 finished with value: 0.9212616822429905 and parameters: {'classifier': 'LightGBM', 'lgbm_max_depth': 86, 'lgbm_learning_rate': 0.1, 'lgbm_num_leaves': 150, 'lgbm_subsample': 1}. Best is trial 8 with value: 0.9212616822429905.[0m
[32m[I 2020-10-17 10:55:02,528][0m Trial 35 finished with value: 0.9212616822429905 and parameters: {'classifier': 'LightGBM', 'lgbm_max_depth': 87, 'lgbm_learning_rate

[32m[I 2020-10-17 10:57:20,508][0m Trial 63 finished with value: 0.9212616822429905 and parameters: {'classifier': 'LightGBM', 'lgbm_max_depth': 77, 'lgbm_learning_rate': 0.1, 'lgbm_num_leaves': 150, 'lgbm_subsample': 1}. Best is trial 8 with value: 0.9212616822429905.[0m
[32m[I 2020-10-17 10:57:27,730][0m Trial 64 finished with value: 0.9212616822429905 and parameters: {'classifier': 'LightGBM', 'lgbm_max_depth': 173, 'lgbm_learning_rate': 0.1, 'lgbm_num_leaves': 200, 'lgbm_subsample': 0.7}. Best is trial 8 with value: 0.9212616822429905.[0m
[32m[I 2020-10-17 10:57:31,823][0m Trial 65 finished with value: 0.9203271028037383 and parameters: {'classifier': 'LightGBM', 'lgbm_max_depth': 51, 'lgbm_learning_rate': 0.1, 'lgbm_num_leaves': 80, 'lgbm_subsample': 0.8}. Best is trial 8 with value: 0.9212616822429905.[0m
[32m[I 2020-10-17 10:57:32,248][0m Trial 66 finished with value: 0.902803738317757 and parameters: {'classifier': 'KNeighbor', 'n_neighbors': 7, 'weights': 'uniform',

[32m[I 2020-10-17 11:00:05,690][0m Trial 94 finished with value: 0.9212616822429905 and parameters: {'classifier': 'LightGBM', 'lgbm_max_depth': 193, 'lgbm_learning_rate': 0.1, 'lgbm_num_leaves': 200, 'lgbm_subsample': 0.5}. Best is trial 8 with value: 0.9212616822429905.[0m
[32m[I 2020-10-17 11:00:11,150][0m Trial 95 finished with value: 0.9212616822429905 and parameters: {'classifier': 'LightGBM', 'lgbm_max_depth': 80, 'lgbm_learning_rate': 0.1, 'lgbm_num_leaves': 200, 'lgbm_subsample': 0.7}. Best is trial 8 with value: 0.9212616822429905.[0m
[32m[I 2020-10-17 11:00:14,300][0m Trial 96 finished with value: 0.9203271028037383 and parameters: {'classifier': 'LightGBM', 'lgbm_max_depth': 62, 'lgbm_learning_rate': 0.1, 'lgbm_num_leaves': 80, 'lgbm_subsample': 0.7}. Best is trial 8 with value: 0.9212616822429905.[0m
[32m[I 2020-10-17 11:00:16,042][0m Trial 97 finished with value: 0.8906542056074767 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 90, 'rf_max_featu

[32m[I 2020-10-17 11:02:14,698][0m Trial 124 finished with value: 0.9212616822429905 and parameters: {'classifier': 'LightGBM', 'lgbm_max_depth': 135, 'lgbm_learning_rate': 0.1, 'lgbm_num_leaves': 150, 'lgbm_subsample': 1}. Best is trial 8 with value: 0.9212616822429905.[0m
[32m[I 2020-10-17 11:02:19,430][0m Trial 125 finished with value: 0.9212616822429905 and parameters: {'classifier': 'LightGBM', 'lgbm_max_depth': 157, 'lgbm_learning_rate': 0.1, 'lgbm_num_leaves': 150, 'lgbm_subsample': 0.7}. Best is trial 8 with value: 0.9212616822429905.[0m
[32m[I 2020-10-17 11:02:33,523][0m Trial 126 finished with value: 0.9091121495327102 and parameters: {'classifier': 'Xgboost', 'xgb_max_depth': 88, 'xgb_learning_rate': 0.05}. Best is trial 8 with value: 0.9212616822429905.[0m
[32m[I 2020-10-17 11:02:34,324][0m Trial 127 finished with value: 0.8880841121495326 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 110, 'rf_max_features': 2, 'rf_min_sample_leaf': 12, 'rf_n_est

KeyboardInterrupt: 

In [54]:
# Test Data Laebeling

kf = KFold(n_splits=8, random_state=SEED, shuffle=True)

model = lgb.LGBMClassifier(random_state=SEED, max_depth=81, learning_rate=0.1, num_leaves=200, subsample=0.7)

for train_index, test_index in kf.split(X):
    train_X, test_X = X.iloc[train_index], X.iloc[test_index]
    train_y, test_y = y.iloc[train_index], y.iloc[train_index]

    model.fit(train_X, train_y)

pr = model.predict(test_data)
pr

array(['HI', 'EL', 'HI', ..., 'PH', 'PH', 'CO'], dtype=object)

In [57]:
result = pd.DataFrame(pr)
result

Unnamed: 0,0
0,HI
1,EL
2,HI
3,PH
4,EL
...,...
1828,HI
1829,EL
1830,PH
1831,PH


In [58]:
result.to_csv(f'./data/lgbm_dep_21_subsample_1.csv', index=False)