In [23]:
# 회귀모형 
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 
import seaborn as sns 

# 쓸데없는 알림 방지
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import itertools

#통계적 모형
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor 

#머신러닝
from sklearn import datasets
from sklearn import metrics
from sklearn import svm, neighbors, tree 

from sklearn.linear_model import Ridge, Lasso, LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score, mean_absolute_error #연속형일때 사용하는 경우 
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix #범주형(분류모델)

from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer,MinMaxScaler # scale

In [24]:
titanic_raw_data = pd.read_csv('C:/Users/scien/Videos/titanic/titanic_full_data.csv')

In [25]:
titanic_raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1285 entries, 0 to 1284
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1285 non-null   int64  
 1   Survived     1285 non-null   int64  
 2   Pclass       1285 non-null   int64  
 3   Name         1285 non-null   object 
 4   Sex          1285 non-null   object 
 5   Age          1030 non-null   float64
 6   SibSp        1285 non-null   int64  
 7   Parch        1285 non-null   int64  
 8   Ticket       1285 non-null   object 
 9   Fare         1284 non-null   float64
 10  Cabin        294 non-null    object 
 11  Embarked     1283 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 120.6+ KB


### 결측치, 문자열 데이터를 어떻게 전처리 해야하는가? 

- PassengerId (index)
- Name (지워버림)
- sex (원핫 인코딩으로 수치화)
- Ticket (지움)
- Cabin (빼줘)
- Embark (원핫 인코딩으로 수치화)

In [26]:
# Y 해당하는 survived 제외, 문자열 타입 Name, Cabin 제외 
columns = ['Survived','Pclass', 'Sex', 'Age', 'SibSp', 'Parch','Fare', 'Embarked']

In [27]:
# 입력변수 X를 만들자 
titanic_features = titanic_raw_data[columns]
titanic_features

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
1280,0,3,male,,0,0,8.0500,S
1281,1,1,female,39.0,0,0,108.9000,C
1282,0,3,male,38.5,0,0,7.2500,S
1283,0,3,male,,0,0,8.0500,S


In [28]:
# 문자열 자료를 이산형으로 만들기 
# 굳이 2개 다 쓸 필요가 없어 하나만 이용할 것임 drop_first = True
# 만일 female을 남기고 싶다면?
# columns=['male', 'female']

titanic_features[['feamle', 'male']] = pd.get_dummies(titanic_features['Sex'])

In [29]:
titanic_features.drop(columns=['Sex'], inplace=True)

In [30]:
titanic_features

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,feamle,male
0,0,3,22.0,1,0,7.2500,S,0,1
1,1,1,38.0,1,0,71.2833,C,1,0
2,1,3,26.0,0,0,7.9250,S,1,0
3,1,1,35.0,1,0,53.1000,S,1,0
4,0,3,35.0,0,0,8.0500,S,0,1
...,...,...,...,...,...,...,...,...,...
1280,0,3,,0,0,8.0500,S,0,1
1281,1,1,39.0,0,0,108.9000,C,1,0
1282,0,3,38.5,0,0,7.2500,S,0,1
1283,0,3,,0,0,8.0500,S,0,1


In [31]:
titanic_features['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [32]:
dummy_columns = pd.get_dummies(titanic_features['Embarked'], prefix='Embarked').columns

In [33]:
titanic_features[dummy_columns] = pd.get_dummies(titanic_features['Embarked'], prefix='Embarked')

In [34]:
titanic_features.drop(columns=['Embarked'], inplace=True)

In [35]:
# pd.cut(data, bins, label) n개의 구간별로 나누기 pd.cut

bins = [0, 10, 20, 30, 40, 50, 60, 70, 80] # 시작과 끝값을 지정
labels = ['child', 'teens', '20s', '30s', '40s', '50s', '60s', '70s']

pd.cut(titanic_features['Age'], bins=bins, labels=labels).value_counts()

# 결측치 삭제 필요 

20s      357
30s      209
teens    156
40s      128
child     85
50s       62
60s       27
70s        6
Name: Age, dtype: int64

In [36]:
# 나이의 결측치 
titanic_features.Age.isna().sum()

255

In [37]:
# 평균 30살 
round(titanic_features.Age.mean())

30

In [38]:
# 나이 결측치를 평균 나이로 채워주기 
titanic_features.Age.fillna(round(titanic_features.Age.mean()), inplace=True)

In [39]:
# Fare의 결측치 

titanic_features.loc[titanic_features.Fare.isna()]

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,feamle,male,Embarked_C,Embarked_Q,Embarked_S
1035,0,3,60.5,0,0,,0,1,0,0,1


In [40]:
# 3등실의 평균 Fare는?
titanic_features.loc[titanic_features.Pclass == 3]['Fare'].mean()

13.269605772005766

In [41]:
titanic_features.Fare.fillna(round(titanic_features.loc[titanic_features.Pclass == 3]['Fare'].mean(), 4), inplace=True)

In [42]:
# 모든 결측치를 채웠다 
titanic_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1285 entries, 0 to 1284
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    1285 non-null   int64  
 1   Pclass      1285 non-null   int64  
 2   Age         1285 non-null   float64
 3   SibSp       1285 non-null   int64  
 4   Parch       1285 non-null   int64  
 5   Fare        1285 non-null   float64
 6   feamle      1285 non-null   uint8  
 7   male        1285 non-null   uint8  
 8   Embarked_C  1285 non-null   uint8  
 9   Embarked_Q  1285 non-null   uint8  
 10  Embarked_S  1285 non-null   uint8  
dtypes: float64(2), int64(4), uint8(5)
memory usage: 66.6 KB


In [43]:
print(titanic_features.isna().sum())
print(titanic_target.isna().sum())

Survived      0
Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
feamle        0
male          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64


NameError: name 'titanic_target' is not defined

In [44]:
titanic_target = titanic_features[['Survived']]
titanic_target

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
1280,0
1281,1
1282,0
1283,0


In [45]:
titanic_features = titanic_features.iloc[:,1:]

In [46]:
train_x, test_x, train_y, test_y = train_test_split(titanic_features, titanic_target, train_size=0.8, random_state = 123)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(1028, 10) (257, 10) (1028, 1) (257, 1)



# 두 번째 실습 
- 변수 선택법을 적용해 최적의 변수를 파악 
- 스케일링 + 변수 선택법 + VIF 다중공산성 + 정규화 + 매개변수  
    - SVM을 이용한 최적의 모델을 찾고
- 로지스틱 회귀모형과 비교 

#### rbf 커널 사용시 minmax 스케일링 정확도가 높았기 때문에 min max 스케일링을 이용한다 

In [47]:
std_scaler = MinMaxScaler()
std_scaler_fit = std_scaler.fit(train_x)
train_x_scale = pd.DataFrame(std_scaler_fit.transform(train_x), index=train_x.index, columns=train_x.columns)
test_x_scale = pd.DataFrame(std_scaler_fit.transform(test_x), index = test_x.index, columns=test_x.columns)

In [48]:
#accuracy = best_score 
best_score = 0

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for c in [0.001, 0.01, 0.1, 1, 10, 100]:
        model = svm.SVC(kernel = 'rbf', C=c, gamma=gamma)
                                                    #CV cross value 교차검증 
        score = cross_val_score(model, train_x_scale, train_y, cv=10)
        score = np.mean(score)
    
        if score > best_score:
            best_score  = score 
            cvs_best_parameter_ = {'C': c, 'gamma' : gamma}

In [49]:
print('Accuracy: ',round(best_score,2)*100,'%')
print(cvs_best_parameter_)

Accuracy:  81.0 %
{'C': 10, 'gamma': 1}


### 다중공산성 확인

In [None]:
vif['vif factor'] = [variance_inflation_factor(train_x_scale.values, i) for i in range(0, train_x_scale.shape[1])]
vif['features'] = train_x_scale.columns
vif

### 다중 공산성 높은 것들 다 지우기 
- 대체 왜 VIF 프레임이 안만들어지는거지?? ;; 
- 이해가 안되네 
- range 문제였음
- male, female이 높게 나오긴 하는데 이거 지우면 볼게 없어짐
- 솔직히 다른 변수들 무의미 하니까 일단 넣고 진행

In [None]:
vif = pd.DataFrame()
vif_scaled = train_x_scale.drop(columns=['Embarked_C', 'Embarked_Q', 'Embarked_S'])
vif['vif factor'] = [variance_inflation_factor(vif_scaled.values, i) for i in range(0, vif_scaled.shape[1])]
vif['feature'] = vif_scaled.columns
vif

### 모델을 만들어보아요
1. 최적의 값 도출시키기

In [None]:
best_score = 0

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for c in [0.001, 0.01, 0.1, 1, 10, 100]:
        model = svm.SVC(kernel = 'rbf', C=c, gamma=gamma)
                                                    #CV cross value 교차검증 
        score = cross_val_score(model, train_x_scale, train_y, cv=10)
        score = np.mean(score)
    
        if score > best_score:
            best_score  = score 
            cvs_best_parameter_ = {'C': c, 'gamma' : gamma}

In [None]:
param_rbf = {
    'C' : [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma' : [0.001, 0.01, 0.1, 1, 10, 100]
}

grid_search = GridSearchCV(svm.SVC(kernel = 'rbf'), param_rbf, cv=10, return_train_score = True)
grid_search.fit(train_x_scale, train_y)

In [None]:
performance_matrix_MM = pd.DataFrame(columns = ['Grid_Search_rbf', 'Cross_value_rbf'], 
                                  index = ['best score', 'C value', 'Gamma value'])

performance_matrix_MM['Grid_Search_rbf']['best score'] = grid_search.best_score_
performance_matrix_MM['Grid_Search_rbf']['C value'] =grid_search.best_params_['C']
performance_matrix_MM['Grid_Search_rbf']['Gamma value'] =grid_search.best_params_['gamma']
performance_matrix_MM['Cross_value_rbf']['best score'] = best_score
performance_matrix_MM['Cross_value_rbf']['C value'] = cvs_best_parameter_['C']
performance_matrix_MM['Cross_value_rbf']['Gamma value'] = cvs_best_parameter_['gamma']
performance_matrix_MM

### 모델을 만들어 보아요 
- 최적의 값은 (C:10, Gamma:1) 
2. 이제 rbf 모델을 만들어 보아요 

In [None]:
y = train_y
x = vif_scaled

x_train_scaled, x_test_scaled, y_train_scaled, y_test_scaled = train_test_split(x,y, train_size=0.7, test_size=0.3, random_state=123)

In [None]:
x_train_scaled

In [None]:
vif1_model = svm.SVC(kernel='rbf', C=10, gamma=1).fit(x_train_scaled, y_train_scaled)
vif1_model

선택법을 결정할 차례
- 전진
- 후진 
- 단계적 