# 회귀모형 
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 
import seaborn as sns 

# 쓸데없는 알림 방지
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import itertools

#통계적 모형
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor 

#머신러닝
from sklearn import datasets
from sklearn import metrics
from sklearn import svm, neighbors, tree 

from sklearn.linear_model import Ridge, Lasso, LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score, mean_absolute_error #연속형일때 사용하는 경우 
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix #범주형(분류모델)

from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer,MinMaxScaler # scale

In [2]:
# Load the full Titanic passenger table from a local CSV file.
titanic_csv_path = 'C:/Users/scien/Videos/titanic/titanic_full_data.csv'
titanic_raw_data = pd.read_csv(titanic_csv_path)

In [3]:
# Print column names, non-null counts and dtypes to spot missing values.
titanic_raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1285 entries, 0 to 1284
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1285 non-null   int64  
 1   Survived     1285 non-null   int64  
 2   Pclass       1285 non-null   int64  
 3   Name         1285 non-null   object 
 4   Sex          1285 non-null   object 
 5   Age          1030 non-null   float64
 6   SibSp        1285 non-null   int64  
 7   Parch        1285 non-null   int64  
 8   Ticket       1285 non-null   object 
 9   Fare         1284 non-null   float64
 10  Cabin        294 non-null    object 
 11  Embarked     1283 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 120.6+ KB


### 결측치, 문자열 데이터를 어떻게 전처리 해야하는가? 

- PassengerId (인덱스 역할, 입력변수에서 제외)
- Name (제거)
- Sex (원핫 인코딩으로 수치화)
- Ticket (제거)
- Cabin (제거)
- Embarked (원핫 인코딩으로 수치화)

In [4]:
# Build the response variable Y from the 'Survived' column.
titanic_target = titanic_raw_data['Survived']

In [5]:
# Display the target series.
titanic_target

0       0
1       1
2       1
3       1
4       0
       ..
1280    0
1281    1
1282    0
1283    0
1284    1
Name: Survived, Length: 1285, dtype: int64

In [6]:
# Feature columns: the target (Survived) and the string columns Name and
# Cabin are excluded.
columns = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]

In [7]:
# Build the input matrix X from the selected feature columns.
# Take an explicit copy: this frame is modified in place afterwards (new
# dummy columns, dropped columns, filled values), and doing that on a plain
# column subset of titanic_raw_data triggers pandas' SettingWithCopy
# machinery instead of operating on an independent frame.
titanic_features = titanic_raw_data[columns].copy()
titanic_features

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
1280,3,male,,0,0,8.0500,S
1281,1,female,39.0,0,0,108.9000,C
1282,3,male,38.5,0,0,7.2500,S
1283,3,male,,0,0,8.0500,S


In [8]:
# Convert the string column 'Sex' into 0/1 indicator (dummy) columns.
# Original note: both indicators are not really needed, one would be enough
# (drop_first=True); this cell nevertheless stores both.
# Original note: what if we wanted to keep female? -> columns=['male', 'female']
# NOTE(review): 'feamle' looks like a typo for 'female'. It is kept because
# it is the runtime column label used by the rest of the notebook.
sex_dummies = pd.get_dummies(titanic_features['Sex'])
titanic_features[['feamle', 'male']] = sex_dummies

In [9]:
# The original string column is no longer needed once it has been encoded;
# remove it from the frame in place.
titanic_features.drop('Sex', axis='columns', inplace=True)

In [10]:
# Display the feature frame after encoding and dropping 'Sex'.
titanic_features

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,feamle,male
0,3,22.0,1,0,7.2500,S,0,1
1,1,38.0,1,0,71.2833,C,1,0
2,3,26.0,0,0,7.9250,S,1,0
3,1,35.0,1,0,53.1000,S,1,0
4,3,35.0,0,0,8.0500,S,0,1
...,...,...,...,...,...,...,...,...
1280,3,,0,0,8.0500,S,0,1
1281,1,39.0,0,0,108.9000,C,1,0
1282,3,38.5,0,0,7.2500,S,0,1
1283,3,,0,0,8.0500,S,0,1


In [11]:
# List the distinct embarkation codes (the recorded output includes NaN).
titanic_features['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [12]:
# Names of the indicator columns produced for 'Embarked' (prefixed 'Embarked_').
embarked_dummies = pd.get_dummies(titanic_features['Embarked'], prefix='Embarked')
dummy_columns = embarked_dummies.columns

In [13]:
# Attach the 'Embarked' indicator columns to the feature frame.
embarked_dummies = pd.get_dummies(titanic_features['Embarked'], prefix='Embarked')
titanic_features[dummy_columns] = embarked_dummies

In [14]:
# Remove the original string column now that its indicators exist.
titanic_features.drop('Embarked', axis='columns', inplace=True)

In [15]:
# pd.cut(data, bins, labels) splits values into the intervals given by bins.
bins = list(range(0, 81, 10))  # interval edges 0, 10, ..., 80 (start and end included)
labels = ['child', 'teens', '20s', '30s', '40s', '50s', '60s', '70s']

age_group = pd.cut(titanic_features['Age'], bins=bins, labels=labels)
age_group.value_counts()

# Original note: the missing values still need to be dealt with.

20s      357
30s      209
teens    156
40s      128
child     85
50s       62
60s       27
70s        6
Name: Age, dtype: int64

In [16]:
# Number of missing values in Age.
titanic_features['Age'].isna().sum()

255

In [17]:
# Mean age rounded to the nearest integer (30 in the recorded run).
round(titanic_features['Age'].mean())

30

In [18]:
# Fill missing ages with the rounded mean age.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the attribute-accessed Series: that chained
# in-place form acts on an intermediate object and is not guaranteed to
# update titanic_features (it has no effect under pandas Copy-on-Write).
mean_age = round(titanic_features['Age'].mean())
titanic_features['Age'] = titanic_features['Age'].fillna(mean_age)

In [19]:
# Re-check non-null counts after filling Age.
titanic_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1285 entries, 0 to 1284
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      1285 non-null   int64  
 1   Age         1285 non-null   float64
 2   SibSp       1285 non-null   int64  
 3   Parch       1285 non-null   int64  
 4   Fare        1284 non-null   float64
 5   feamle      1285 non-null   uint8  
 6   male        1285 non-null   uint8  
 7   Embarked_C  1285 non-null   uint8  
 8   Embarked_Q  1285 non-null   uint8  
 9   Embarked_S  1285 non-null   uint8  
dtypes: float64(2), int64(3), uint8(5)
memory usage: 56.6 KB


In [20]:
# Show the rows whose Fare is missing.
fare_missing = titanic_features['Fare'].isna()
titanic_features[fare_missing]

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,feamle,male,Embarked_C,Embarked_Q,Embarked_S
1035,3,60.5,0,0,,0,1,0,0,1


In [21]:
# What is the mean Fare of third-class passengers?
titanic_features.loc[titanic_features['Pclass'] == 3, 'Fare'].mean()

13.269605772005766

In [22]:
# Fill missing Fare values with the mean third-class fare, rounded to 4
# decimals. The same value is used for every missing Fare; in the recorded
# run the only missing row belongs to a third-class passenger.
# The result is assigned back instead of using the chained
# `titanic_features.Fare.fillna(..., inplace=True)`, which is not guaranteed
# to modify the frame (it has no effect under pandas Copy-on-Write). The
# mean is read with a single .loc[mask, column] lookup rather than chained
# indexing.
third_class_fare = titanic_features.loc[titanic_features['Pclass'] == 3, 'Fare'].mean()
titanic_features['Fare'] = titanic_features['Fare'].fillna(round(third_class_fare, 4))

In [23]:
# All missing values should be filled now; confirm via the non-null counts.
titanic_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1285 entries, 0 to 1284
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      1285 non-null   int64  
 1   Age         1285 non-null   float64
 2   SibSp       1285 non-null   int64  
 3   Parch       1285 non-null   int64  
 4   Fare        1285 non-null   float64
 5   feamle      1285 non-null   uint8  
 6   male        1285 non-null   uint8  
 7   Embarked_C  1285 non-null   uint8  
 8   Embarked_Q  1285 non-null   uint8  
 9   Embarked_S  1285 non-null   uint8  
dtypes: float64(2), int64(3), uint8(5)
memory usage: 56.6 KB


In [24]:
# Display the fully pre-processed feature frame.
titanic_features

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,feamle,male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.2500,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,26.0,0,0,7.9250,1,0,0,0,1
3,1,35.0,1,0,53.1000,1,0,0,0,1
4,3,35.0,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1280,3,30.0,0,0,8.0500,0,1,0,0,1
1281,1,39.0,0,0,108.9000,1,0,1,0,0
1282,3,38.5,0,0,7.2500,0,1,0,0,1
1283,3,30.0,0,0,8.0500,0,1,0,0,1


In [25]:
# Final check: missing-value counts per feature column, then for the target.
print(titanic_features.isna().sum(), titanic_target.isna().sum(), sep='\n')

Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
feamle        0
male          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64
0


In [26]:
# Split into 80% training / 20% test rows; random_state makes the split
# reproducible.
split_parts = train_test_split(
    titanic_features,
    titanic_target,
    train_size=0.8,
    random_state=123,
)
train_x, test_x, train_y, test_y = split_parts
print(*(part.shape for part in split_parts))

(1028, 10) (257, 10) (1028,) (257,)


### 모델을 적합하기 전에 스케일링이 필요하다. 

In [168]:
# StandardScaler: fit on the training split only, then apply the same
# fitted transform to both splits.
std_scaler = StandardScaler()
std_scaler_fit = std_scaler.fit(train_x)

# Wrap each transformed array in a DataFrame that keeps that split's own
# row index and column labels.
train_x_scale, test_x_scale = (
    pd.DataFrame(std_scaler_fit.transform(part), index=part.index, columns=part.columns)
    for part in (train_x, test_x)
)

### MinMaxScaler (가장 많이 사용하는 스케일링)

In [179]:
# MinMaxScaler: fit on the training split only, then transform both splits.
# (The variable keeps the name std_scaler even though it now holds a
# MinMaxScaler.)
std_scaler = MinMaxScaler()
std_scaler_fit = std_scaler.fit(train_x)

# Rebuild DataFrames with each split's own index and column labels.
train_x_scale, test_x_scale = (
    pd.DataFrame(std_scaler_fit.transform(part), index=part.index, columns=part.columns)
    for part in (train_x, test_x)
)

### RobustScaler

In [191]:
# RobustScaler: fit on the training split only, then transform both splits.
# (The variable keeps the name std_scaler even though it now holds a
# RobustScaler.)
std_scaler = RobustScaler()
std_scaler_fit = std_scaler.fit(train_x)

# Rebuild DataFrames with each split's own index and column labels.
train_x_scale, test_x_scale = (
    pd.DataFrame(std_scaler_fit.transform(part), index=part.index, columns=part.columns)
    for part in (train_x, test_x)
)

### R에서 사용하던 노멀라이저(Normalizer) 스케일링 — 주의: 열(feature) 단위가 아니라 행(sample) 단위로 각 행의 노름이 1이 되도록 변환한다

In [200]:
# Normalizer: same fit/transform pattern as the other scaler cells.
# NOTE(review): per the scikit-learn documentation, Normalizer rescales each
# sample (row) to unit norm rather than rescaling each feature (column), so
# it is not a drop-in alternative to the column-wise scalers.
std_scaler = Normalizer()
std_scaler_fit = std_scaler.fit(train_x)

# Rebuild DataFrames with each split's own index and column labels.
train_x_scale, test_x_scale = (
    pd.DataFrame(std_scaler_fit.transform(part), index=part.index, columns=part.columns)
    for part in (train_x, test_x)
)

### 그렇다면 linear 커널 모델에 가장 적합한 값은 무엇일까?
- C의 최적값은 무엇인가?

### cross_val_score로 C의 최적값 찾기

In [31]:
# Baseline: linear-kernel SVC with C=1, fitted on the scaled training data.
linear_svc = svm.SVC(C=1, kernel='linear')
model = linear_svc.fit(train_x_scale, train_y)

In [190]:
# best_score tracks the best mean cross-validation score seen so far
# (the original note labels it as accuracy).
best_score = 0

for c in (0.001, 0.01, 0.1, 1, 10, 100):
    model = svm.SVC(kernel='linear', C=c)
    # 10-fold cross-validation; keep the mean of the fold scores.
    score = cross_val_score(model, train_x_scale, train_y, cv=10).mean()

    # Only a strictly better score replaces the current best.
    if not score > best_score:
        continue
    best_score = score
    best_parameter = {'C': c}

KeyboardInterrupt: 

In [None]:
# Best mean cross-validation score found by the loop.
best_score

In [None]:
# C value that produced best_score.
best_parameter

### Grid search로 C의 최적값 도출해보기 

In [None]:
# Candidate values for C.
param = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# GridSearchCV performs the cross-validated search over `param` in a single
# call; return_train_score=True asks it to report training scores as well.
grid_search = GridSearchCV(
    estimator=svm.SVC(kernel='linear'),
    param_grid=param,
    cv=10,
    return_train_score=True,
)
grid_search.fit(train_x_scale, train_y)

In [None]:
# NOTE(review): this references the `score` method without calling it, so
# the cell shows the bound method rather than a number. Presumably a call
# such as grid_search.score(test_x_scale, test_y) was intended - confirm.
grid_search.score 

In [None]:
# Parameter setting selected by the grid search.
grid_search.best_params_

In [None]:
# Mean cross-validated score of the selected setting.
grid_search.best_score_

In [None]:
# Estimator refitted with the best parameters. The fitted attribute is
# named `best_estimator_` (with a trailing underscore); `best_estimator`
# does not exist and raises AttributeError.
grid_search.best_estimator_

### C = 0.01로 잡았을 때 결과가 가장 좋았다

### 모델 만들어서 fit하기 

In [38]:
# Fit the final linear-kernel SVC with the chosen C=0.01.
linear_svc = svm.SVC(C=0.01, kernel='linear')
model = linear_svc.fit(train_x_scale, train_y)

In [212]:
# Display the scaled training features the model was fitted on.
train_x_scale

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,feamle,male,Embarked_C,Embarked_Q,Embarked_S
399,0.064888,0.908429,0.000000,0.000000,0.410415,0.032444,0.000000,0.000000,0.0,0.032444
95,0.096037,0.960374,0.000000,0.000000,0.257700,0.000000,0.032012,0.000000,0.0,0.032012
807,0.150860,0.905162,0.000000,0.000000,0.390980,0.050287,0.000000,0.000000,0.0,0.050287
856,0.005851,0.263293,0.005851,0.005851,0.964627,0.005851,0.000000,0.000000,0.0,0.005851
550,0.008912,0.151502,0.000000,0.017824,0.988176,0.000000,0.008912,0.008912,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...
1238,0.089600,0.896005,0.029867,0.000000,0.431823,0.000000,0.029867,0.029867,0.0,0.000000
1147,0.096037,0.960374,0.000000,0.000000,0.257700,0.032012,0.000000,0.000000,0.0,0.032012
106,0.132774,0.929420,0.000000,0.000000,0.338574,0.044258,0.000000,0.000000,0.0,0.044258
1041,0.020113,0.844761,0.000000,0.000000,0.534009,0.000000,0.020113,0.000000,0.0,0.020113


In [213]:
# Display the training labels.
train_y

399     1
95      0
807     0
856     1
550     1
       ..
1238    0
1147    1
106     1
1041    0
1122    1
Name: Survived, Length: 1028, dtype: int64

In [39]:
# Coefficients of the fitted linear SVC (one per feature column).
model.coef_

array([[-0.04430173, -0.26744944, -0.03759856, -0.00448814,  0.12863182,
         0.03706804, -0.04878747,  0.00849699,  0.00612164, -0.02633805]])

In [40]:
# Pair every coefficient with the name of its feature column.
pd.DataFrame({
    'coef': model.coef_.ravel(),
    'features': train_x_scale.columns,
})

Unnamed: 0,coef,features
0,-0.044302,Pclass
1,-0.267449,Age
2,-0.037599,SibSp
3,-0.004488,Parch
4,0.128632,Fare
5,0.037068,feamle
6,-0.048787,male
7,0.008497,Embarked_C
8,0.006122,Embarked_Q
9,-0.026338,Embarked_S


In [41]:
# Predict the labels for the scaled test features.
pred_y = model.predict(test_x_scale)

In [42]:
# Side-by-side table of true labels (Y) and predictions (Yhat), re-indexed
# from 0 so the two columns line up positionally.
pd.DataFrame({
    'Y': test_y.values,
    'Yhat': pred_y,
})

Unnamed: 0,Y,Yhat
0,1,0
1,0,0
2,0,0
3,1,0
4,1,0
...,...,...
252,1,0
253,0,0
254,0,0
255,0,0


### 혼동행렬(confusion matrix)로 정확도 파악하기

In [43]:
# Confusion matrix of true test labels against predicted labels.
cmat = confusion_matrix(y_true=test_y, y_pred=pred_y)
cmat

array([[151,   0],
       [106,   0]], dtype=int64)

In [44]:
# Accuracy = correctly classified (diagonal of the 2x2 confusion matrix)
# divided by the total number of test samples.
accuracy = np.trace(cmat) / cmat.sum()
accuracy

0.5875486381322957

In [45]:
# Class balance of the test labels. In the recorded run class 0 makes up
# 151 of 257 rows, the same ratio as the accuracy above, because every
# prediction in the recorded confusion matrix is 0.
test_y.value_counts()

0    151
1    106
Name: Survived, dtype: int64

# 첫번째 실습 

- `rbf` 커널을 사용하는 경우 `C`와 함께 `gamma`에 대한 매개변수도 최적의 값을 도출해야 한다. 
- 어제 했던 `gridsearch` 혹은 모델의 스코어를 직접 비교하는 방식을 사용하여 매개변수 값을 찾아본다. 


In [169]:
# RBF-kernel SVC with hand-picked C and gamma, fitted on the scaled
# training data.
rbf_svc = svm.SVC(C=1, gamma=0.7, kernel='rbf')
model_rbf = rbf_svc.fit(train_x_scale, train_y)

In [170]:
# Display the fitted RBF model and its non-default parameters.
model_rbf

SVC(C=1, gamma=0.7)

#### cross_val_score(CVS)로 최적의 값 도출 

In [201]:
# best_score tracks the best mean cross-validation score seen so far
# (the original note labels it as accuracy).
best_score = 0

gamma_grid = [0.001, 0.01, 0.1, 1, 10, 100]
c_grid = [0.001, 0.01, 0.1, 1, 10, 100]

# itertools.product walks the grid with gamma in the outer position and C
# in the inner one.
for gamma, c in itertools.product(gamma_grid, c_grid):
    model = svm.SVC(kernel='rbf', C=c, gamma=gamma)
    # 10-fold cross-validation; keep the mean of the fold scores.
    score = cross_val_score(model, train_x_scale, train_y, cv=10).mean()

    # Only a strictly better score replaces the current best.
    if not score > best_score:
        continue
    best_score = score
    cvs_best_parameter_ = {'C': c, 'gamma': gamma}

In [202]:
# Best mean cross-validation score found by the manual C/gamma loop.
best_score

0.774357509994289

In [203]:
# C and gamma that produced best_score in the manual loop.
cvs_best_parameter_

{'C': 100, 'gamma': 10}

### grid_search로 최적의 값 도출 

In [204]:
# Both searched parameters use the same candidate values.
param_rbf = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

# Cross-validated grid search over C and gamma for the RBF kernel.
grid_search = GridSearchCV(
    estimator=svm.SVC(kernel='rbf'),
    param_grid=param_rbf,
    cv=10,
    return_train_score=True,
)
grid_search.fit(train_x_scale, train_y)

GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
             return_train_score=True)

In [205]:
# Mean cross-validated score of the best C/gamma combination.
grid_search.best_score_

0.774357509994289

In [206]:
# Best C/gamma combination found by the grid search.
grid_search.best_params_

{'C': 100, 'gamma': 10}

In [177]:
# Standard Scaler
performance_matrix_std = pd.DataFrame(columns = ['Grid_Search_rbf', 'Cross_value_rbf'], 
                                  index = ['best score', 'C value', 'Gamma value'])

performance_matrix_std['Grid_Search_rbf']['best score'] = grid_search.best_score_
performance_matrix_std['Grid_Search_rbf']['C value'] =grid_search.best_params_['C']
performance_matrix_std['Grid_Search_rbf']['Gamma value'] =grid_search.best_params_['gamma']
performance_matrix_std['Cross_value_rbf']['best score'] = best_score
performance_matrix_std['Cross_value_rbf']['C value'] = cvs_best_parameter_['C']
performance_matrix_std['Cross_value_rbf']['Gamma value'] = cvs_best_parameter_['gamma']

In [186]:
# MinMax Scaler 
performance_matrix_MM = pd.DataFrame(columns = ['Grid_Search_rbf', 'Cross_value_rbf'], 
                                  index = ['best score', 'C value', 'Gamma value'])

performance_matrix_MM['Grid_Search_rbf']['best score'] = grid_search.best_score_
performance_matrix_MM['Grid_Search_rbf']['C value'] =grid_search.best_params_['C']
performance_matrix_MM['Grid_Search_rbf']['Gamma value'] =grid_search.best_params_['gamma']
performance_matrix_MM['Cross_value_rbf']['best score'] = best_score
performance_matrix_MM['Cross_value_rbf']['C value'] = cvs_best_parameter_['C']
performance_matrix_MM['Cross_value_rbf']['Gamma value'] = cvs_best_parameter_['gamma']

In [198]:
# Robust Scaler 
performance_matrix_rb = pd.DataFrame(columns = ['Grid_Search_rbf', 'Cross_value_rbf'], 
                                  index = ['best score', 'C value', 'Gamma value'])

performance_matrix_rb['Grid_Search_rbf']['best score'] = grid_search.best_score_
performance_matrix_rb['Grid_Search_rbf']['C value'] =grid_search.best_params_['C']
performance_matrix_rb['Grid_Search_rbf']['Gamma value'] =grid_search.best_params_['gamma']
performance_matrix_rb['Cross_value_rbf']['best score'] = best_score
performance_matrix_rb['Cross_value_rbf']['C value'] = cvs_best_parameter_['C']
performance_matrix_rb['Cross_value_rbf']['Gamma value'] = cvs_best_parameter_['gamma']

In [207]:
# Normalizer  
performance_matrix_normal = pd.DataFrame(columns = ['Grid_Search_rbf', 'Cross_value_rbf'], 
                                  index = ['best score', 'C value', 'Gamma value'])

performance_matrix_normal['Grid_Search_rbf']['best score'] = grid_search.best_score_
performance_matrix_normal['Grid_Search_rbf']['C value'] =grid_search.best_params_['C']
performance_matrix_normal['Grid_Search_rbf']['Gamma value'] =grid_search.best_params_['gamma']
performance_matrix_normal['Cross_value_rbf']['best score'] = best_score
performance_matrix_normal['Cross_value_rbf']['C value'] = cvs_best_parameter_['C']
performance_matrix_normal['Cross_value_rbf']['Gamma value'] = cvs_best_parameter_['gamma']

In [209]:
# Results recorded for the StandardScaler run.
performance_matrix_std #standard

Unnamed: 0,Grid_Search_rbf,Cross_value_rbf
best score,0.795736,0.795736
C value,100.0,100.0
Gamma value,0.01,0.01


In [210]:
# Results recorded for the MinMaxScaler run.
performance_matrix_MM # min-max

Unnamed: 0,Grid_Search_rbf,Cross_value_rbf
best score,0.805473,0.805473
C value,10.0,10.0
Gamma value,1.0,1.0


In [211]:
# Results recorded for the RobustScaler run.
performance_matrix_rb # robust 

Unnamed: 0,Grid_Search_rbf,Cross_value_rbf
best score,0.797754,0.797754
C value,1.0,1.0
Gamma value,0.1,0.1


In [208]:
# Results recorded for the Normalizer run.
performance_matrix_normal # normalize

Unnamed: 0,Grid_Search_rbf,Cross_value_rbf
best score,0.774358,0.774358
C value,100.0,100.0
Gamma value,10.0,10.0
