In [123]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 데이터 로드

In [104]:
titanic_df = pd.read_csv('titanic_train.csv')
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [105]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# 데이터 전처리

## 결측치 처리
- Age
- Cabin
- Embarked

In [106]:
titanic_df['Age'].fillna(titanic_df['Age'].mean(), inplace=True)
titanic_df['Cabin'].fillna('N', inplace=True)
titanic_df['Embarked'].fillna('N', inplace=True)
print('데이터 세트 Null 값 개수', titanic_df.isnull().sum())

데이터 세트 Null 값 개수 PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_df['Age'].fillna(titanic_df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_df['Cabin'].fillna('N', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

## drop
- PassengerId
- Name
- Ticket

In [107]:
titanic_df = titanic_df.drop(['PassengerId','Name','Ticket'], axis=1)
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,N,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,N,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,N,S


## label encoding
Sex, Cabin, Embarked

In [108]:
# Cabin - 앞에 한글자만 가져오기기
titanic_df['Cabin'] = titanic_df['Cabin'].str[:1]
titanic_df.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,N,S
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,N,S


In [109]:
col_names = ['Sex','Cabin','Embarked']
for col in col_names:
    le = LabelEncoder()
    le.fit(titanic_df[col])
    titanic_df[col] = le.transform(titanic_df[col])
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,7,3
1,1,1,0,38.0,1,0,71.2833,2,0
2,1,3,0,26.0,0,0,7.925,7,3
3,1,1,0,35.0,1,0,53.1,2,3
4,0,3,1,35.0,0,0,8.05,7,3


## Age, Fare - 수치형
1. 그대로 
2. 스케일링
3. 비닝

In [110]:
X_titanic_df = titanic_df.drop('Survived', axis=1)
y_titanic_df = titanic_df['Survived']

# 데이터 분할

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11)

# 모델 학습

In [112]:
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression(solver='liblinear')


In [113]:
# 모델 학습
dt_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
lr_clf.fit(X_train, y_train)

# 예측
dt_pred = dt_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)
lr_pred = lr_clf.predict(X_test)

# 정확도
print("Decision Tree 정확도:", accuracy_score(y_test, dt_pred))
print("Random Forest 정확도:", accuracy_score(y_test, rf_pred))
print("Logistic Regression 정확도:", accuracy_score(y_test, lr_pred))

Decision Tree 정확도: 0.7877094972067039
Random Forest 정확도: 0.8547486033519553
Logistic Regression 정확도: 0.8659217877094972


## 교차 검증

In [114]:
# 위에거랑 비교하기 위해해
dt_clf_cv = DecisionTreeClassifier(random_state=11)
rf_clf_cv = RandomForestClassifier(random_state=11)
lr_clf_cv = LogisticRegression(solver='liblinear')

In [115]:
kfold = KFold(n_splits = 5) #Stratified 도 해보기기
dt_scores = []
rf_scores = []
lr_scores = []

for train_index, test_index in kfold.split(titanic_df):
    X_train, X_test = X_titanic_df.values[train_index], X_titanic_df.values[test_index]
    y_train, y_test = y_titanic_df.values[train_index], y_titanic_df.values[test_index]

    #DecisionTreeClassifier
    dt_clf_cv.fit(X_train, y_train)
    dt_pred = dt_clf_cv.predict(X_test)
    dt_scores.append(accuracy_score(y_test, dt_pred))

    #RandomForestClassifier
    rf_clf_cv.fit(X_train, y_train)
    rf_pred = rf_clf_cv.predict(X_test)
    rf_scores.append(accuracy_score(y_test, rf_pred))

    #LogisticRegression
    lr_clf_cv.fit(X_train, y_train)
    lr_pred = lr_clf_cv.predict(X_test)
    lr_scores.append(accuracy_score(y_test, lr_pred))

print("KFold 교차 검증 후 DecisionTreeClassifier 정확도:", dt_scores)
print("평균 정확도:", np.mean(dt_scores))
print("*" * 20)

print("KFold 교차 검증 후 RandomForestClassifier 정확도:", rf_scores)
print("평균 정확도:", np.mean(rf_scores))
print("*" * 20)

print("KFold 교차 검증 후 LogisticRegression 정확도:", lr_scores)
print("평균 정확도:", np.mean(lr_scores))
print("*" * 20)

KFold 교차 검증 후 DecisionTreeClassifier 정확도: [0.7541899441340782, 0.7808988764044944, 0.7865168539325843, 0.7696629213483146, 0.8202247191011236]
평균 정확도: 0.782298662984119
********************
KFold 교차 검증 후 RandomForestClassifier 정확도: [0.7932960893854749, 0.8089887640449438, 0.8370786516853933, 0.7752808988764045, 0.8595505617977528]
평균 정확도: 0.8148389931579938
********************
KFold 교차 검증 후 LogisticRegression 정확도: [0.7932960893854749, 0.7921348314606742, 0.7752808988764045, 0.7471910112359551, 0.8426966292134831]
평균 정확도: 0.7901198920343984
********************


In [116]:
# 함수로 만들기

def cross_validate_models(models, X_titanic_df, y_titanic_df):
    kfold = KFold(n_splits=5)
    results = {model_name: [] for model_name in models}

    for train_index, test_index in kfold.split(titanic_df):
        X_train, X_test = X_titanic_df.values[train_index], X_titanic_df.values[test_index]
        y_train, y_test = y_titanic_df.values[train_index], y_titanic_df.values[test_index]

        # 각 모델 학습 및 평가
        for model_name, model in models.items():
            model.fit(X_train, y_train)
            pred = model.predict(X_test)
            results[model_name].append(accuracy_score(y_test, pred))

    # 출력
    for model_name, scores in results.items():
        print(f"KFold 교차 검증 후 {model_name} 정확도: {scores}")
        print(f"평균 정확도: {np.mean(scores)}")
        print('*'*20)

# 사용
models = {
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=11),
    "RandomForestClassifier": RandomForestClassifier(random_state=11),
    "LogisticRegression": LogisticRegression(solver='liblinear')
}

# titanic_df가 X_titanic_df와 y_titanic_df로 분리된 상태에서
results = cross_validate_models(models, X_titanic_df, y_titanic_df)

KFold 교차 검증 후 DecisionTreeClassifier 정확도: [0.7541899441340782, 0.7808988764044944, 0.7865168539325843, 0.7696629213483146, 0.8202247191011236]
평균 정확도: 0.782298662984119
********************
KFold 교차 검증 후 RandomForestClassifier 정확도: [0.7932960893854749, 0.8089887640449438, 0.8370786516853933, 0.7752808988764045, 0.8595505617977528]
평균 정확도: 0.8148389931579938
********************
KFold 교차 검증 후 LogisticRegression 정확도: [0.7932960893854749, 0.7921348314606742, 0.7752808988764045, 0.7471910112359551, 0.8426966292134831]
평균 정확도: 0.7901198920343984
********************


## 파라미터 튜닝 + 교차검증
GridSearchCV(dt_clf_gcv, param_grid=p)

In [117]:
dt_clf_gcv = DecisionTreeClassifier(random_state=11)
rf_clf_gcv = RandomForestClassifier(random_state=11)
lr_clf_gcv = LogisticRegression(solver='liblinear')

In [118]:
# DecisionTreeClassifier (Logistic은 안됨)
parameters = {'max_depth':[1,2,3], 'min_samples_split':[2,3]}
grid_dt_clf = GridSearchCV(dt_clf_gcv, param_grid=parameters, scoring='accuracy', cv=5)
grid_dt_clf.fit(X_train, y_train)

score_df = pd.DataFrame(grid_dt_clf.cv_results_)

best_dt_clf = grid_dt_clf.best_estimator_
best_dt_pred = best_dt_clf.predict(X_test)
print('DecisionTree 파라미터 튜닝 후 정확도:', accuracy_score(y_test, best_dt_pred))

#RandomForestClassifier 
parameters = {'max_depth':[1,2,3], 'min_samples_split':[2,3]}
grid_rf_clf = GridSearchCV(rf_clf_gcv, param_grid=parameters, scoring='accuracy', cv=5)
grid_rf_clf.fit(X_train, y_train)

score_rf = pd.DataFrame(grid_rf_clf.cv_results_)

best_rf_clf = grid_rf_clf.best_estimator_
best_rf_pred = best_rf_clf.predict(X_test)
print('RandomForest 파라미터 튜닝 후 정확도:', accuracy_score(y_test, best_rf_pred))



DecisionTree 파라미터 튜닝 후 정확도: 0.8426966292134831
RandomForest 파라미터 튜닝 후 정확도: 0.848314606741573


In [119]:
# 파라미터 설정
parameters = {'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]}

# DecisionTreeClassifier 하이퍼파라미터 튜닝
dt_clf = DecisionTreeClassifier(random_state=11)
grid_dt_clf = GridSearchCV(dt_clf, param_grid=parameters, scoring='accuracy', cv=5)
grid_dt_clf.fit(X_train, y_train)

# 결과 저장 및 평가
dt_score_df = pd.DataFrame(grid_dt_clf.cv_results_)
best_dt_clf = grid_dt_clf.best_estimator_
best_dt_pred = best_dt_clf.predict(X_test)
dt_accuracy = accuracy_score(y_test, best_dt_pred)

print("DecisionTreeClassifier 최적 파라미터:", grid_dt_clf.best_params_)
print("DecisionTreeClassifier 정확도:", dt_accuracy)

# RandomForestClassifier 하이퍼파라미터 튜닝
rf_clf = RandomForestClassifier(random_state=11)
grid_rf_clf = GridSearchCV(rf_clf, param_grid=parameters, scoring='accuracy', cv=5)
grid_rf_clf.fit(X_train, y_train)

# 결과 저장 및 평가
rf_score_df = pd.DataFrame(grid_rf_clf.cv_results_)
best_rf_clf = grid_rf_clf.best_estimator_
best_rf_pred = best_rf_clf.predict(X_test)
rf_accuracy = accuracy_score(y_test, best_rf_pred)

print("RandomForestClassifier 최적 파라미터:", grid_rf_clf.best_params_)
print("RandomForestClassifier 정확도:", rf_accuracy)


DecisionTreeClassifier 최적 파라미터: {'max_depth': 3, 'min_samples_split': 2}
DecisionTreeClassifier 정확도: 0.8426966292134831
RandomForestClassifier 최적 파라미터: {'max_depth': 3, 'min_samples_split': 3}
RandomForestClassifier 정확도: 0.848314606741573


## 스케일링 StandardScaler

In [120]:
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11)

# StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# DecisionTreeClassifier
dt_scaled = DecisionTreeClassifier()
dt_scaled.fit(X_train_scaled, y_train)
dt_pred = dt_scaled.predict(X_test_scaled)
print('StandardScaler 스케일링 후 DecisionTree 정확도:', accuracy_score(y_test, dt_pred))

# RandomForestClassifier
rf_scaled = RandomForestClassifier()
rf_scaled.fit(X_train_scaled, y_train)
rf_pred = rf_scaled.predict(X_test_scaled)
print('StandardScaler 스케일링 후 RandomForest 정확도:', accuracy_score(y_test, rf_pred))

# LogisticRegression
lr_scaled = LogisticRegression()
lr_scaled.fit(X_train_scaled, y_train)
lr_pred = lr_scaled.predict(X_test_scaled)
print('StandardScaler 스케일링 후 LogisticRegression 정확도:', accuracy_score(y_test, lr_pred))

StandardScaler 스케일링 후 DecisionTree 정확도: 0.776536312849162
StandardScaler 스케일링 후 RandomForest 정확도: 0.8491620111731844
StandardScaler 스케일링 후 LogisticRegression 정확도: 0.8491620111731844


## 속성 선택

In [121]:
from sklearn.feature_selection import SelectFromModel

# 속성 선택 (Random Forest 모델로 특성 중요도 기준 상위 5개 속성 선택)
rf_selector = SelectFromModel(rf, threshold="mean", max_features=5)  # 중요도 기준 상위 5개 속성 선택
X_train_selected = rf_selector.transform(X_train_scaled)
X_test_selected = rf_selector.transform(X_test_scaled)

# 속성 선택 후 다시 모델 학습

# Decision Tree 모델 (속성 선택 후)
dt_selected = DecisionTreeClassifier(random_state=11)
dt_selected.fit(X_train_selected, y_train)
dt_pred_selected = dt_selected.predict(X_test_selected)
dt_selected_accuracy = accuracy_score(y_test, dt_pred_selected)

# Random Forest 모델 (속성 선택 후)
rf_selected = RandomForestClassifier(random_state=11)
rf_selected.fit(X_train_selected, y_train)
rf_pred_selected = rf_selected.predict(X_test_selected)
rf_selected_accuracy = accuracy_score(y_test, rf_pred_selected)

# Logistic Regression 모델 (속성 선택 후)
lr_selected = LogisticRegression(solver='liblinear', random_state=11)
lr_selected.fit(X_train_selected, y_train)
lr_pred_selected = lr_selected.predict(X_test_selected)
lr_selected_accuracy = accuracy_score(y_test, lr_pred_selected)

# 6. 성능 출력 (속성 선택 후)
print("속성 선택 후 Decision Tree 정확도:", dt_selected_accuracy)
print("속성 선택 후 Random Forest 정확도:", rf_selected_accuracy)
print("속성 선택 후 Logistic Regression 정확도:", lr_selected_accuracy)
print('*'*20)

속성 선택 후 Decision Tree 정확도: 0.7877094972067039
속성 선택 후 Random Forest 정확도: 0.8324022346368715
속성 선택 후 Logistic Regression 정확도: 0.8324022346368715
********************


## 스케일링 후 파라미터 튜닝 

In [122]:
# 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11)

# StandardScaler로 스케일링
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 파라미터 설정
parameters = {'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]}

# DecisionTreeClassifier 하이퍼파라미터 튜닝
dt_clf = DecisionTreeClassifier(random_state=11)
grid_dt_clf = GridSearchCV(dt_clf, param_grid=parameters, scoring='accuracy', cv=5)
grid_dt_clf.fit(X_train_scaled, y_train)

# 결과 저장 및 평가
dt_score_df = pd.DataFrame(grid_dt_clf.cv_results_)
best_dt_clf = grid_dt_clf.best_estimator_
best_dt_pred = best_dt_clf.predict(X_test_scaled)
dt_accuracy = accuracy_score(y_test, best_dt_pred)

print("StandardScaler + DecisionTreeClassifier 최적 파라미터:", grid_dt_clf.best_params_)
print("StandardScaler + DecisionTreeClassifier 정확도:", dt_accuracy)

# RandomForestClassifier 하이퍼파라미터 튜닝
rf_clf = RandomForestClassifier(random_state=11)
grid_rf_clf = GridSearchCV(rf_clf, param_grid=parameters, scoring='accuracy', cv=5)
grid_rf_clf.fit(X_train_scaled, y_train)

# 결과 저장 및 평가
rf_score_df = pd.DataFrame(grid_rf_clf.cv_results_)
best_rf_clf = grid_rf_clf.best_estimator_
best_rf_pred = best_rf_clf.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, best_rf_pred)

print("StandardScaler + RandomForestClassifier 최적 파라미터:", grid_rf_clf.best_params_)
print("StandardScaler + RandomForestClassifier 정확도:", rf_accuracy)

StandardScaler + DecisionTreeClassifier 최적 파라미터: {'max_depth': 3, 'min_samples_split': 2}
StandardScaler + DecisionTreeClassifier 정확도: 0.8715083798882681
StandardScaler + RandomForestClassifier 최적 파라미터: {'max_depth': 3, 'min_samples_split': 2}
StandardScaler + RandomForestClassifier 정확도: 0.8659217877094972
