# 데이터셋 분할과 모델검증

## 특성치, 레이블 나누기

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('../data/Fvote.csv')
data.head()

Unnamed: 0,edu,income,age,score_gov,score_progress,score_intention,vote,parties,gender_female,gender_male,region_Chungcheung,region_Honam,region_Others,region_Sudo,region_Youngnam
0,3,3,3,2,2,4.0,1,2,0,1,0,0,0,0,1
1,2,3,3,2,4,3.0,0,3,0,1,0,0,1,0,0
2,1,2,4,1,3,2.8,1,4,0,1,0,1,0,0,0
3,2,1,3,5,4,2.6,1,1,1,0,0,0,0,1,0
4,1,2,4,4,3,2.4,1,1,0,1,0,0,0,1,0


## 특성치 분리

In [3]:
X = data[['gender_female', 'gender_male', 'region_Chungcheung', 'region_Honam', 'region_Others', 'region_Sudo', 'region_Youngnam', 'edu', 'income', 'age', 'score_gov', 'score_progress', 'score_intention']]

X2 = data[data.columns[1:14]]  # y레이블인 vote와 parties가 중간에 끼어있어서 못씀

X3 = data.loc[:, 'gender_female':'score_intention']  # y레이블인 vote와 parties가 중간에 끼어있어서 못씀

In [5]:
X

Unnamed: 0,gender_female,gender_male,region_Chungcheung,region_Honam,region_Others,region_Sudo,region_Youngnam,edu,income,age,score_gov,score_progress,score_intention
0,0,1,0,0,0,0,1,3,3,3,2,2,4.0
1,0,1,0,0,1,0,0,2,3,3,2,4,3.0
2,0,1,0,1,0,0,0,1,2,4,1,3,2.8
3,1,0,0,0,0,1,0,2,1,3,5,4,2.6
4,0,1,0,0,0,1,0,1,2,4,4,3,2.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,0,1,0,0,0,0,1,1,4,4,3,3,1.8
207,0,1,0,0,0,1,0,2,1,2,3,4,2.6
208,0,1,1,0,0,0,0,2,1,2,3,3,2.6
209,0,1,0,0,0,1,0,2,3,4,3,2,4.0


In [4]:
y = data[['vote']]

In [6]:
y

Unnamed: 0,vote
0,1
1,0
2,1
3,1
4,1
...,...
206,1
207,1
208,1
209,1


## Train-Test 데이터셋 분할 및 검증

In [7]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)  # 분류모델이라 stratify 넣어줌

In [10]:
X_train.shape

(158, 13)

In [11]:
X_test.shape

(53, 13)

## 모델 적용

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
model = LogisticRegression()

## 교차검증을 적용한 학습
- 랜덤없는 교차검증 기법 cross_val_score
- 랜덤있는 교차검증 기법 k-fold

## 랜덤없는 교차검증 cross_val_score
- train 데이터를 n개의 그룹으로 나누고 그중 하나의 그룹만 제외하고 총 n번 훈련하여 과대 혹은 과소 적합을 막음
- cross_val_score(model, X, y, cv=n)으로 사용하여 n번 교차검증한다는 의미

In [14]:
from sklearn.model_selection import cross_val_score

In [17]:
scores = cross_val_score(model, X_train, y_train.values.reshape(-1), cv=5)
scores

array([0.71875   , 0.6875    , 0.8125    , 0.58064516, 0.80645161])

In [18]:
scores.mean()

0.7211693548387096

## 랜덤있는 교차검증 K-Fold
- 데이터의 순서에 의한 편향을 제거하기 위해 데이터를 shuffle하고 cv를 kfold 모델을 넣어줌

In [19]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [24]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train, y_train.values.reshape(-1), cv=kfold)
scores

array([0.71875   , 0.6875    , 0.625     , 0.70967742, 0.77419355])

In [25]:
scores.mean()

0.7030241935483872

## Train-Validity-Test 분할 및 검증
- train 데이터로 학습을 하는 도중에는 test 데이터를 이용하면 안됨
- train 데이터 중 일부를 학습 도중 검증용으로 사용하기 위해 분리

In [26]:
from sklearn.model_selection import train_test_split

In [28]:
# 일단 테스트 데이터 떨궈냄
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, random_state=42)
# 남은걸로 train, valid 분리
X_train, X_valid, y_train, y_valid = train_test_split(X_train_val, y_train_val, random_state=42)

In [31]:
X_train.shape

(118, 13)

In [32]:
X_valid.shape

(40, 13)

In [33]:
y_train.shape

(118, 1)

## 모델 학습

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
model = LogisticRegression()

In [47]:
model.fit(X_train, y_train)
scores = cross_val_score(model, X_train, y_train.values.ravel(), cv=5)
scores

  y = column_or_1d(y, warn=True)


array([0.70833333, 0.54166667, 0.70833333, 0.65217391, 0.69565217])

In [48]:
scores.mean()

0.661231884057971

In [49]:
# 검증 데이터의 정확도 측정
model.score(X_valid, y_valid)

0.725

In [50]:
# 테스트 데이터의 정확도 측정
model.score(X_test, y_test)

0.7735849056603774