### Cross Validation Task

### 약물 A, B, C, X, Y
##### 다중 분류(Multiclass Classification)
- 의학 연구원으로서 동일한 질병을 앓고 있는 일련의 환자에 대한 데이터를 수집했다.
- 치료 과정 동안 각 환자는 5가지 약물, 즉 약물 A, 약물 B, 약물 C, 약물 X 및 약물 Y 중 하나에 반응했다.
-  미래에 동일한 질병을 앓는 환자에게 어떤 약물이 적합할 수 있는지 알아보기 위한 모델을 구축한다.

feature
- Age: 환자의 나이
- Sex: 환자의 성별
- BP: 혈압
- Cholesterol: 콜레스테롤 수치
- Na_to_K: 나트륨 대 칼륨 비율

target
- Drug: 의약품, 환자에게 효과가 있던 약

In [1]:
import pandas as pd
# Load the patient / drug-response table from a path relative to the working directory.
drug_df = pd.read_csv('./datasets/drugs.csv')
# Bare expression so the notebook renders the loaded frame.
drug_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [2]:
# Replace the string labels of each categorical feature with integer codes.
category_codes = {
    'Sex': {'F': 0, 'M': 1},
    'BP': {'LOW': 0, 'NORMAL': 1, 'HIGH': 2},
    'Cholesterol': {'LOW': 0, 'NORMAL': 1, 'HIGH': 2},
}

# `replace` leaves any value that has no entry in the mapping untouched.
for column_name, label_to_code in category_codes.items():
    drug_df[column_name] = drug_df[column_name].replace(label_to_code)

drug_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,2,2,25.355,drugY
1,47,1,0,2,13.093,drugC
2,47,1,0,2,10.114,drugC
3,28,0,1,2,7.798,drugX
4,61,0,0,2,18.043,drugY
...,...,...,...,...,...,...
195,56,0,0,2,11.567,drugC
196,16,1,0,2,12.006,drugC
197,52,1,1,2,9.894,drugX
198,23,1,1,1,14.020,drugX


In [3]:
# Count the missing values in each column.
drug_df.isna().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

In [4]:
# Pull Na_to_K out into its own frame; reset_index() additionally stores the
# row labels in an 'index' column next to the values.
na_to_k_df = drug_df[['Na_to_K']].reset_index()
na_to_k_df

Unnamed: 0,index,Na_to_K
0,0,25.355
1,1,13.093
2,2,10.114
3,3,7.798
4,4,18.043
...,...,...
195,195,11.567
196,196,12.006
197,197,9.894
198,198,14.020


In [5]:
# Standardize the Na_to_K feature (not the target) to z-scores: the scaler
# subtracts the column mean and divides by its standard deviation.
from sklearn.preprocessing import StandardScaler

std = StandardScaler()
# Double brackets pass a 2-D frame (n_samples, 1) rather than a 1-D Series.
std_na_to_k = std.fit_transform(na_to_k_df[['Na_to_K']])
std_na_to_k

array([[ 1.28652212],
       [-0.4151454 ],
       [-0.82855818],
       [-1.14996267],
       [ 0.27179427],
       [-1.03769314],
       [ 0.02643885],
       [-0.70046821],
       [-0.12676951],
       [ 0.45567206],
       [-0.59916196],
       [ 0.43221897],
       [-0.09832049],
       [ 0.674105  ],
       [-0.46926791],
       [-0.0788919 ],
       [-0.64245998],
       [-0.29316156],
       [-1.21935052],
       [ 1.37242427],
       [ 0.42236589],
       [ 1.36451406],
       [ 2.00995979],
       [-0.14550423],
       [ 2.41490725],
       [ 0.37809645],
       [ 1.9819271 ],
       [-0.93028076],
       [ 0.91765633],
       [ 0.25902691],
       [-1.01784822],
       [-0.90446848],
       [-0.70366006],
       [ 2.19147839],
       [-0.27081868],
       [-1.2211546 ],
       [-0.92139911],
       [-0.29787994],
       [-0.88476233],
       [-0.97149714],
       [ 0.43527203],
       [-0.25610845],
       [-0.04086736],
       [-0.53074555],
       [-0.5258884 ],
       [-1

In [6]:
# Overwrite the raw Na_to_K values in the helper frame with their standardized scores.
na_to_k_df['Na_to_K'] = std_na_to_k
na_to_k_df

Unnamed: 0,index,Na_to_K
0,0,1.286522
1,1,-0.415145
2,2,-0.828558
3,3,-1.149963
4,4,0.271794
...,...,...
195,195,-0.626917
196,196,-0.565995
197,197,-0.859089
198,198,-0.286500


In [7]:
# Keep only the rows whose standardized Na_to_K lies inside [-1.96, 1.96];
# everything outside that band is treated as an outlier and dropped.
z_scores = na_to_k_df['Na_to_K']
cond1 = z_scores.ge(-1.96)
cond2 = z_scores.le(1.96)
cond = cond1 & cond2

# na_to_k_df was built with reset_index(), so its labels are 0..n-1 and can be
# used as positions into drug_df.
kept_row_positions = na_to_k_df.loc[cond].index
drug_df = drug_df.iloc[kept_row_positions].reset_index(drop=True)
drug_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,2,2,25.355,drugY
1,47,1,0,2,13.093,drugC
2,47,1,0,2,10.114,drugC
3,28,0,1,2,7.798,drugX
4,61,0,0,2,18.043,drugY
...,...,...,...,...,...,...
183,56,0,0,2,11.567,drugC
184,16,1,0,2,12.006,drugC
185,52,1,1,2,9.894,drugX
186,23,1,1,1,14.020,drugX


In [8]:
# Work on a copy so the encoded values do not overwrite drug_df.
drug_enc_df = drug_df.copy()

In [9]:
# Render the copy before any of its columns are re-encoded.
drug_enc_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,2,2,25.355,drugY
1,47,1,0,2,13.093,drugC
2,47,1,0,2,10.114,drugC
3,28,0,1,2,7.798,drugX
4,61,0,0,2,18.043,drugY
...,...,...,...,...,...,...
183,56,0,0,2,11.567,drugC
184,16,1,0,2,12.006,drugC
185,52,1,1,2,9.894,drugX
186,23,1,1,1,14.020,drugX


In [10]:
from sklearn.preprocessing import LabelEncoder

# One encoder per column, so each fitted label mapping stays available on its
# own object after this cell has run.
drug_encoder = LabelEncoder()
gender_encoder = LabelEncoder()
blood_pressure_encoder = LabelEncoder()
choleserol_encoder = LabelEncoder()

# Fit every encoder on the values in drug_df and keep the resulting code arrays.
# NOTE(review): Sex, BP and Cholesterol appear to be integer-coded already by
# the earlier replace() step; if so, these three encoders only re-index existing
# codes -- confirm whether they are still needed.
targets = drug_encoder.fit_transform(drug_df['Drug'].tolist())
genders = gender_encoder.fit_transform(drug_df['Sex'].tolist())
blood_pressures = blood_pressure_encoder.fit_transform(drug_df['BP'].tolist())
choleserols = choleserol_encoder.fit_transform(drug_df['Cholesterol'].tolist())

# Write the encoded arrays into the working copy; drug_df itself is not modified.
encoded_columns = {
    'Drug': targets,
    'Sex': genders,
    'BP': blood_pressures,
    'Cholesterol': choleserols,
}
for encoded_name, encoded_values in encoded_columns.items():
    drug_enc_df[encoded_name] = encoded_values

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Select features and target by column name rather than by position, so the
# split stays correct if the column order of drug_enc_df ever changes.
features, targets = drug_enc_df.drop(columns='Drug'), drug_enc_df['Drug']

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
# NOTE(review): the split is not stratified; if the drug classes are imbalanced,
# consider passing stratify=targets -- confirm against the class counts.
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=124
)

# Seed the tree as well: scikit-learn permutes the candidate features at every
# split, so without a fixed random_state equally good splits can be chosen
# differently between runs and the reported scores are not reproducible.
dtc = DecisionTreeClassifier(random_state=124)

# Hyperparameter grid explored by the grid search (3 x 3 = 9 candidates).
parameters = {'max_depth': [2, 3, 4], 'min_samples_split': [15, 16, 17]}

In [12]:
# Exhaustive search over every combination in `parameters`, scored with
# 5-fold cross-validation.
g_dtc = GridSearchCV(
    estimator=dtc,
    param_grid=parameters,
    cv=5,
    refit=True,  # retrain the best candidate on all data passed to fit()
    return_train_score=True,  # also record training-fold scores in cv_results_
    n_jobs=-1,  # run the candidates on all available CPU cores
)

In [13]:
# Run the search on the training split only; X_test / y_test are not passed here.
g_dtc.fit(X_train, y_train)

In [14]:
# Raw dictionary of per-candidate timings, parameters and scores from the search.
g_dtc.cv_results_

{'mean_fit_time': array([0.00440021, 0.00439963, 0.00420136, 0.00280027, 0.00239987,
        0.00260134, 0.00240002, 0.00220032, 0.00219994]),
 'std_fit_time': array([0.00048992, 0.00048941, 0.00074898, 0.00074853, 0.00080016,
        0.00049112, 0.00049   , 0.00039926, 0.00039981]),
 'mean_score_time': array([0.00179954, 0.00200109, 0.00139923, 0.00100045, 0.00180149,
        0.0012001 , 0.00160017, 0.0013999 , 0.00140162]),
 'std_score_time': array([4.00401312e-04, 2.62650287e-06, 4.89825395e-04, 1.57717089e-06,
        4.00548095e-04, 3.99638090e-04, 4.90193622e-04, 4.88986999e-04,
        4.88656089e-04]),
 'param_max_depth': masked_array(data=[2, 2, 2, 3, 3, 3, 4, 4, 4],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[15, 16, 17, 15, 16, 17, 15, 16, 17],
              mask=[False, False, False, False, False, False, False, False,

In [15]:
# Tabulate the search results: one row per parameter combination.
result_df = pd.DataFrame(g_dtc.cv_results_)
result_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.0044,0.00049,0.0018,0.0004,2,15,"{'max_depth': 2, 'min_samples_split': 15}",0.866667,0.8,0.8,...,0.82,0.026667,7,0.816667,0.833333,0.833333,0.825,0.825,0.826667,0.006236
1,0.0044,0.000489,0.002001,3e-06,2,16,"{'max_depth': 2, 'min_samples_split': 16}",0.866667,0.8,0.8,...,0.82,0.026667,7,0.816667,0.833333,0.833333,0.825,0.825,0.826667,0.006236
2,0.004201,0.000749,0.001399,0.00049,2,17,"{'max_depth': 2, 'min_samples_split': 17}",0.866667,0.8,0.8,...,0.82,0.026667,7,0.816667,0.833333,0.833333,0.825,0.825,0.826667,0.006236
3,0.0028,0.000749,0.001,2e-06,3,15,"{'max_depth': 3, 'min_samples_split': 15}",0.8,0.9,0.9,...,0.873333,0.04899,4,0.916667,0.916667,0.908333,0.933333,0.908333,0.916667,0.009129
4,0.0024,0.0008,0.001801,0.000401,3,16,"{'max_depth': 3, 'min_samples_split': 16}",0.8,0.9,0.9,...,0.873333,0.04899,4,0.916667,0.916667,0.908333,0.933333,0.908333,0.916667,0.009129
5,0.002601,0.000491,0.0012,0.0004,3,17,"{'max_depth': 3, 'min_samples_split': 17}",0.8,0.9,0.9,...,0.873333,0.04899,4,0.916667,0.916667,0.908333,0.933333,0.908333,0.916667,0.009129
6,0.0024,0.00049,0.0016,0.00049,4,15,"{'max_depth': 4, 'min_samples_split': 15}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7,0.0022,0.000399,0.0014,0.000489,4,16,"{'max_depth': 4, 'min_samples_split': 16}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
8,0.0022,0.0004,0.001402,0.000489,4,17,"{'max_depth': 4, 'min_samples_split': 17}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [16]:
# Best parameter combination, then its mean cross-validated score, one per line.
for best_value in (g_dtc.best_params_, g_dtc.best_score_):
    print(best_value)

{'max_depth': 4, 'min_samples_split': 15}
0.9933333333333334


In [17]:
# The estimator refit with the best parameters (available because refit=True).
g_dtc.best_estimator_

In [18]:
# Score the refit best model on the held-out test split.
dtc = g_dtc.best_estimator_
prediction = dtc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=prediction)

0.9736842105263158