# Reference Codes for Competition #1 #

In [20]:
# Data Wrangling
import pandas as pd
import numpy as np

# Hyperparameter Optimization
import optuna

# Modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Evaluation
from sklearn.metrics import accuracy_score

# Etc
import pickle
import warnings
warnings.filterwarnings('ignore')

In [21]:
X = pd.read_csv('X_train.csv')
y = pd.read_csv('y_train.csv').gender

In [22]:
X

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f51,f52,f53,f54,f55,f56,f57,f58,f59,f60
0,1.065708,-1.108290,0.513177,0.458800,-0.747350,-0.428893,-0.539247,-0.731433,0.076911,0.021530,...,-0.113168,0.222601,0.002654,-0.155161,0.207087,0.142654,-0.088473,0.046094,0.084016,-0.112096
1,-1.118512,-0.283965,-0.807377,-0.227990,0.499381,0.517660,0.216804,-0.090451,0.025904,-0.353093,...,0.018257,0.148791,0.020114,-0.091960,0.096535,0.002544,0.067031,-0.033075,-0.102020,-0.230362
2,-1.621164,0.359342,-0.009075,0.264630,0.613557,0.340367,0.289711,-0.087876,0.157243,0.587353,...,-0.001830,0.121247,-0.187377,-0.045809,-0.031548,-0.013707,0.100588,-0.000918,-0.004412,-0.111499
3,0.109638,-0.011408,-0.158151,-0.152665,0.118723,0.108297,0.011834,0.964645,0.056170,-0.055947,...,0.225539,0.005191,-0.231734,0.000957,0.099028,0.065950,0.062079,-0.017500,-0.031968,0.026953
4,-0.947410,-0.471158,0.001532,-0.062487,0.147821,0.392181,0.105615,0.027790,-0.491372,-0.168841,...,0.053036,0.028679,0.028065,0.174117,0.116474,-0.036161,0.043683,-0.032832,-0.004160,-0.047245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,-1.464698,0.756092,-0.190608,0.345714,-0.504831,-0.024720,-0.562073,0.120787,0.344116,-0.068355,...,-0.027943,0.045253,-0.110824,0.165699,-0.003497,0.000216,0.127658,-0.027552,-0.003847,-0.081093
3496,-0.637022,-1.259685,-0.407535,-0.201414,-0.656503,-0.238687,-0.388249,0.293272,0.295465,0.261209,...,0.012746,0.158836,-0.051198,-0.014381,0.062800,-0.012179,0.035542,0.019090,-0.031491,0.039452
3497,-1.917122,0.586596,0.176672,-0.022940,0.887222,0.212659,0.248100,-0.033524,0.439044,0.662337,...,-0.081378,0.040186,-0.095817,-0.072336,-0.003542,-0.015690,-0.126858,0.018467,0.007164,0.129033
3498,-1.613797,0.677287,0.720803,-0.153151,0.455676,-0.145414,0.168945,-0.336686,-0.307792,1.001812,...,-0.095184,-0.097667,0.106476,-0.039247,0.025189,-0.016424,0.038734,0.026754,0.005283,-0.042267


In [23]:
# 학습데이터(70%)와 평가데이터(30%) 생성
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

<font color="#CC3D3D"><p>
## DecisionTree

In [24]:
# 해당 estimator에 설정 가능한 파라미터와 디폴트 값을 나열
DecisionTreeClassifier().get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

####  max_depth
- 트리의 최대 깊이 지정
- Default = 10
- 지정된 값까지 tree깊이가 늘어나거나 노드가 가지는 데이터 수가 min_samples_split 보다 작아질때까지 계속 분할

#### min_samples_split
- 노드를 분할하기 위한 최소한의 샘플 데이터수 → 과적합을 제어하는데 사용
- Default = 2 → 작게 설정할 수록 분할 노드가 많아져 과적합 가능성 증가  

#### min_samples_leaf
- 리프노드가 되기 위해 필요한 최소한의 샘플 데이터수
- min_samples_split과 함께 과적합 제어 용도
- 불균형 데이터의 경우 특정 클래스의 데이터가 극도로 작을 수 있으므로 작게 설정 필요  

#### max_features
- 최적의 분할을 위해 고려할 최대 feature 개수
- Default = 'auto' (결정트리에서는 default가 none이었음)
- int형으로 지정 →피처 갯수 / float형으로 지정 →비중
- sqrt 또는 auto : 전체 피처 중 √(피처개수) 만큼 선정
- log : 전체 피처 중 log2(전체 피처 개수) 만큼 선정  

#### max_leaf_nodes 
- 리프노드의 최대 개수  

In [102]:
def objective_dt(trial):
    max_depth = trial.suggest_int('max_depth', 2, 20)  # 변경된 부분
    min_samples_split = trial.suggest_float('min_samples_split', 0.01, 0.5, log=True)
    min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.01, 0.2, log=True)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 20)  # 변경된 부분
    dt_splitter = trial.suggest_categorical('splitter', ['best', 'random'])
    
    classifier_obj = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
        class_weight='balanced',
        splitter=dt_splitter,
        random_state=0)
    
    score = cross_val_score(classifier_obj, X_train, y_train, scoring = 'accuracy', cv=5, n_jobs=-1)
    roc_auc = score.mean()
    return roc_auc 

study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize") 
study.optimize(objective_dt, n_trials=42)
    
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[32m[I 2023-04-11 16:05:31,081][0m A new study created in memory with name: no-name-7a70ae27-b868-43fd-a0c2-1b3e9c35342d[0m
[32m[I 2023-04-11 16:05:31,251][0m Trial 0 finished with value: 0.603265306122449 and parameters: {'max_depth': 12, 'min_samples_split': 0.02971263961346909, 'min_samples_leaf': 0.03567057575132749, 'max_leaf_nodes': 18, 'splitter': 'random'}. Best is trial 0 with value: 0.603265306122449.[0m
[32m[I 2023-04-11 16:05:31,470][0m Trial 1 finished with value: 0.643673469387755 and parameters: {'max_depth': 14, 'min_samples_split': 0.25298749738635706, 'min_samples_leaf': 0.015061193000286317, 'max_leaf_nodes': 12, 'splitter': 'best'}. Best is trial 1 with value: 0.643673469387755.[0m
[32m[I 2023-04-11 16:05:31,706][0m Trial 2 finished with value: 0.6302040816326531 and parameters: {'max_depth': 5, 'min_samples_split': 0.015280201666902271, 'min_samples_leaf': 0.019312256807168915, 'max_leaf_nodes': 20, 'splitter': 'best'}. Best is trial 1 with value: 0.6436

[32m[I 2023-04-11 16:05:38,040][0m Trial 28 finished with value: 0.6379591836734694 and parameters: {'max_depth': 2, 'min_samples_split': 0.3669398384082225, 'min_samples_leaf': 0.06380217166818775, 'max_leaf_nodes': 7, 'splitter': 'best'}. Best is trial 24 with value: 0.6485714285714286.[0m
[32m[I 2023-04-11 16:05:38,245][0m Trial 29 finished with value: 0.5787755102040817 and parameters: {'max_depth': 7, 'min_samples_split': 0.21637533592634803, 'min_samples_leaf': 0.03951891174196232, 'max_leaf_nodes': 4, 'splitter': 'random'}. Best is trial 24 with value: 0.6485714285714286.[0m
[32m[I 2023-04-11 16:05:38,495][0m Trial 30 finished with value: 0.6485714285714286 and parameters: {'max_depth': 4, 'min_samples_split': 0.3572243855446932, 'min_samples_leaf': 0.04141376152249869, 'max_leaf_nodes': 7, 'splitter': 'best'}. Best is trial 24 with value: 0.6485714285714286.[0m
[32m[I 2023-04-11 16:05:38,713][0m Trial 31 finished with value: 0.6485714285714286 and parameters: {'max_d

Best score: 0.6514285714285715
Best parameters: {'max_depth': 4, 'min_samples_split': 0.33915445416669665, 'min_samples_leaf': 0.03910066841898059, 'max_leaf_nodes': 7, 'splitter': 'best'}


In [103]:
model = DecisionTreeClassifier(**study.best_params, random_state=100)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.6685714285714286

# 0.6752380952380952

max_depth = trial.suggest_int('max_depth', 2, 32)
    min_samples_split = trial.suggest_float('min_samples_split', 0.01, 0.5, log=True)
    min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.01, 0.2, log=True)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 2, 32)
    dt_splitter = trial.suggest_categorical('splitter', ['best', 'random'])
    
    classifier_obj = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
        class_weight='balanced',
        splitter=dt_splitter,
        random_state=0)

In [26]:
# Insert HPO codes
#

<font color="#CC3D3D"><p>
## k-NN

In [27]:
KNeighborsClassifier().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

#### n_neighbors : 검색할 이웃수, k
- default = 5


#### weights : 예측에 사용되는 가중치 함수
- 'unjiform' : 균일한 가중치
- 'distance' : 거리의 역수로 가중치 부여


#### algorithm : 가장 가까운 이웃을 계산하는 데 사용하는 알고리즘
- 'atuo', 'ball_tree', 'kd_tree', 'brute'


#### p : Minkowski 메트릭에 대한 검정력 매개변수
- default = 2
- p=1 : 맨하튼거리
- p=2 : 유클리드 거리


#### metric : 거리 계산에 사용할 미터법
- default = 'minkowski'

In [112]:
def objective_knn(trial):
    knn_n_neighbors = trial.suggest_int('n_neighbors', 1, 300, step=1)  
    knn_weights = trial.suggest_categorical('weights', ['uniform','distance'])
    knn_algorithms = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])  
    knn_leaf_size = trial.suggest_int('leaf_size', 3, 7, step = 1) 
    knn_p = trial.suggest_int('p', 1, 3, step=1)  # 변경된 부분
    knn_metrics = trial.suggest_categorical('metric', ['minkowski', 'euclidean', 'manhattan', 'chebyshev'])  # 변경된 부분
    
    classifier_obj = KNeighborsClassifier(
        n_neighbors=knn_n_neighbors, 
        weights=knn_weights,
        algorithm=knn_algorithms,
        leaf_size=knn_leaf_size,
        p=knn_p,
        metric=knn_metrics
    )
    score = cross_val_score(classifier_obj, X_train, y_train, scoring = 'accuracy', cv=6, n_jobs=-1, error_score='raise')
    roc_auc = score.mean()
    return roc_auc

study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize") # 모르면 노터치, seed 값만 고정하면 항상 같은 값 !
study.optimize(objective_knn, n_trials=50)
    
print("Best score:", study.best_value)
print("Best parameters:", study.best_params)

[32m[I 2023-04-11 16:14:03,673][0m A new study created in memory with name: no-name-f0d7341a-9bb0-4003-b3e9-2985a058aa9a[0m
[32m[I 2023-04-11 16:14:04,317][0m Trial 0 finished with value: 0.6922491490483723 and parameters: {'n_neighbors': 164, 'weights': 'distance', 'algorithm': 'auto', 'leaf_size': 7, 'p': 1, 'metric': 'euclidean'}. Best is trial 0 with value: 0.6922491490483723.[0m
[32m[I 2023-04-11 16:14:06,674][0m Trial 1 finished with value: 0.6698028029467696 and parameters: {'n_neighbors': 33, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 5, 'p': 3, 'metric': 'minkowski'}. Best is trial 0 with value: 0.6922491490483723.[0m
[32m[I 2023-04-11 16:14:07,268][0m Trial 2 finished with value: 0.5991937932467201 and parameters: {'n_neighbors': 2, 'weights': 'distance', 'algorithm': 'kd_tree', 'leaf_size': 4, 'p': 1, 'metric': 'euclidean'}. Best is trial 0 with value: 0.6922491490483723.[0m
[32m[I 2023-04-11 16:14:07,853][0m Trial 3 finished with value: 0.6848

[32m[I 2023-04-11 16:14:32,649][0m Trial 31 finished with value: 0.6930711363600045 and parameters: {'n_neighbors': 164, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 5, 'p': 2, 'metric': 'euclidean'}. Best is trial 22 with value: 0.6938851335155088.[0m
[32m[I 2023-04-11 16:14:33,524][0m Trial 32 finished with value: 0.6885816673857806 and parameters: {'n_neighbors': 169, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 5, 'p': 2, 'metric': 'euclidean'}. Best is trial 22 with value: 0.6938851335155088.[0m
[32m[I 2023-04-11 16:14:34,418][0m Trial 33 finished with value: 0.6914371494318999 and parameters: {'n_neighbors': 159, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 5, 'p': 2, 'metric': 'euclidean'}. Best is trial 22 with value: 0.6938851335155088.[0m
[32m[I 2023-04-11 16:14:35,250][0m Trial 34 finished with value: 0.6857221902615978 and parameters: {'n_neighbors': 102, 'weights': 'uniform', 'algorithm': 'kd_tree', 'leaf_size': 5, 'p': 2, 'metri

Best score: 0.6938851335155088
Best parameters: {'n_neighbors': 161, 'weights': 'uniform', 'algorithm': 'auto', 'leaf_size': 5, 'p': 1, 'metric': 'euclidean'}


In [113]:
model = KNeighborsClassifier(**study.best_params)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.700952380952381

## 0.7028571428571428

knn_n_neighbors = trial.suggest_int('n_neighbors', 1, 300, step=1)  
    knn_weights = trial.suggest_categorical('weights', ['uniform','distance'])
    knn_algorithms = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])  
    knn_leaf_size = trial.suggest_int('leaf_size', 3, 7, step = 1) 
    knn_p = trial.suggest_int('p', 1, 3, step=1)  # 변경된 부분
    knn_metrics = trial.suggest_categorical('metric', ['minkowski', 'euclidean', 'manhattan', 'chebyshev'])  # 변경된 부분
    
    classifier_obj = KNeighborsClassifier(
        n_neighbors=knn_n_neighbors, 
        weights=knn_weights,
        algorithm=knn_algorithms,
        leaf_size=knn_leaf_size,
        p=knn_p,
        metric=knn_metrics
    )

In [49]:
# Insert HPO codes
#

In [111]:
# 실제 제출할 때 모델명은 "대표학생이름-차수.pickle"로 해야 함!!!

with open(file='이수인-3.pickle', mode='wb') as f:
    pickle.dump(model, f)

# End