## Importing the dataset and Data


PassengerID : 탑승객 고유 아이디  
Survived : 탑승객 생존 유무 (0: 사망, 1: 생존)  
Pclass : 객실의 등급  
Name : 이름  
Sex : 성별  
Age : 나이  
SibSp : 함께 탑승한 형제자매, 아내, 남편의 수  
Parch : 함께 탑승한 부모, 자식의 수  
Ticket : 티켓 번호  
Fare : 티켓의 요금  
Cabin : 객실번호  
Embarked : 배에 탑승한 항구 이름 ( C = Cherbourg, Q = Queenstown, S = Southampton)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Load the test and train CSV files from the current directory into DataFrames.
test = pd.read_csv("./test.csv")
train = pd.read_csv("./train.csv")

In [None]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
# Inspect the column labels of the train set
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [None]:
# Inspect the column labels of the test set
test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [None]:
# Print dtype and non-null count information for the train set
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Correlation analysis: pairwise correlation coefficients between columns.
# Select the numeric columns explicitly instead of relying on pandas to drop
# text columns (Name, Ticket, ...) silently; pandas 2.x raises on them.
train.select_dtypes(include='number').corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


## Missing Data

In [None]:
# Count the missing values in each column of the train DataFrame
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Count the missing values in each column of the test DataFrame
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## Data Preprocessing

### 1. Age : 호칭별 평균값으로 Age 결측값 채우기

In [None]:
# Group train passengers by honorific ('Mr', 'Miss', 'Mrs', 'Ms', 'Master',
# 'Dr', everything else as 'Other') and store each group's mean age, rounded
# to an int, for later imputation of missing ages.
age_dict_train = {}
train_titles = ['Mr', 'Miss', 'Mrs', 'Ms', 'Master', 'Dr']

# Match the honorific as a whole word followed by a period ("Mr." but not
# "Mrs."). A plain substring test would count every "Mrs" row as "Mr" as well.
train_title_hits = pd.DataFrame(
    {title: train['Name'].str.contains(rf'\b{title}\.') for title in train_titles}
)

for title in train_titles:
    # Mean age of the passengers carrying this honorific
    age = train.loc[train_title_hits[title], 'Age'].mean()
    # Store it in the dictionary
    age_dict_train[title] = int(round(age))

# Passengers carrying none of the listed honorifics share the 'Other' bucket
age_dict_train['Other'] = int(round(
    train.loc[~train_title_hits.any(axis=1), 'Age'].mean()))

In [None]:
age_dict_train

{'Mr': 33, 'Miss': 22, 'Mrs': 36, 'Ms': 28, 'Master': 5, 'Dr': 39, 'Other': 44}

In [None]:
# Group test passengers by honorific ('Mr', 'Miss', 'Mrs', 'Master', 'Dr',
# everything else as 'Other') and store each group's mean age, rounded to an
# int, for later imputation of missing ages.
age_dict_test = {}
test_titles = ['Mr', 'Miss', 'Mrs', 'Master', 'Dr']

# Match the honorific as a whole word followed by a period ("Mr." but not
# "Mrs."). A plain substring test would count every "Mrs" row as "Mr" as well.
test_title_hits = pd.DataFrame(
    {title: test['Name'].str.contains(rf'\b{title}\.')
     for title in test_titles + ['Ms']}
)

for title in test_titles:
    # Mean age of the passengers carrying this honorific
    age = test.loc[test_title_hits[title], 'Age'].mean()
    # Store it in the dictionary
    age_dict_test[title] = int(round(age))

# Per the original author's note the test data holds a single 'Ms' passenger,
# so no separate mean is computed; the train value is reused instead. Only a
# missing age is filled, so a known age is never overwritten.
test.loc[test_title_hits['Ms'] & test['Age'].isnull(), 'Age'] = age_dict_train['Ms']

# Passengers carrying none of the honorifics (including 'Ms') are 'Other'
age_dict_test['Other'] = int(round(
    test.loc[~test_title_hits.any(axis=1), 'Age'].mean()))

In [None]:
age_dict_test

{'Mr': 34, 'Miss': 22, 'Mrs': 39, 'Master': 7, 'Dr': 34, 'Other': 42}

In [None]:
# Fill missing train ages with the per-honorific mean from age_dict_train.
for title, age in age_dict_train.items():
    if title == 'Other':
        # 'Other' is not a literal honorific; it is handled after the loop
        continue

    # Rows with this exact honorific ("Mr." must not match "Mrs.") and no age
    missing = train['Name'].str.contains(rf'\b{title}\.') & train['Age'].isnull()

    # Fill those rows with the honorific's mean age
    train.loc[missing, 'Age'] = age

# Whatever is still missing matched none of the honorifics above
if 'Other' in age_dict_train:
    train.loc[train['Age'].isnull(), 'Age'] = age_dict_train['Other']

In [None]:
# Fill missing test ages with the per-honorific mean from age_dict_test.
for title, age in age_dict_test.items():
    if title == 'Other':
        # 'Other' is not a literal honorific; it is handled after the loop
        continue

    # Rows with this exact honorific ("Mr." must not match "Mrs.") and no age
    missing = test['Name'].str.contains(rf'\b{title}\.') & test['Age'].isnull()

    # Fill those rows with the honorific's mean age
    test.loc[missing, 'Age'] = age

# Whatever is still missing matched none of the honorifics in the dictionary
if 'Other' in age_dict_test:
    test.loc[test['Age'].isnull(), 'Age'] = age_dict_test['Other']

In [None]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

### 2. Fare: Pclass별 평균가격으로 결측값 채우기

In [None]:
test[test['Fare'].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [None]:
# Take the 10 most frequent Fare values among Pclass 3 test passengers and
# average those values. The columns are named explicitly because the label
# that reset_index() gives the value column differs between pandas versions
# ('index' on older releases, 'Fare' on 2.x).
fare3 = (
    test.loc[test['Pclass'] == 3, 'Fare']
    .value_counts()
    .nlargest(10)
    .rename_axis('Fare')
    .reset_index(name='count')
)
fare3['Fare'].mean()

7.75709

In [None]:
# Fill the missing Fare with a number, not a string, so the column stays
# numeric. NOTE(review): the cell above printed 7.75709 while this literal is
# 7.753825 -- confirm which value is intended.
test['Fare'] = test['Fare'].fillna(7.753825)

### 3. Embarked : 최빈값으로 결측값 채우기

In [None]:
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [None]:
# Most passengers boarded at port 'S', so fill missing Embarked values with 'S'
train['Embarked'] = train['Embarked'].fillna("S")

### 4. SibSp, Parch : Companion으로 묶어 처리하기

In [None]:
# Companion = SibSp + Parch, computed the same way for both frames
for frame in (train, test):
    frame['Companion'] = frame['SibSp'].add(frame['Parch'])

### 5. Cabin : LogisticRegression을 통해 결측값 채우기(후술)

In [None]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Companion        0
dtype: int64

In [None]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
Companion        0
dtype: int64

## Encoding categorical data

### 1. Sex : 남자는 0, 여자는 1로 LabelEncoding





In [None]:
# Encode Sex in both frames: male -> 0, female -> 1
sex_codes = {'male': 0, 'female': 1}
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(sex_codes)

### 2. Embarked : C는 0, Q는 1, S는 2로 LabelEncoding

In [None]:
# Encode Embarked in both frames: C -> 0, Q -> 1, S -> 2
embarked_codes = {'C' : 0, 'Q' : 1, 'S' : 2}
for frame in (train, test):
    frame['Embarked'] = frame['Embarked'].map(embarked_codes)

In [None]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Companion
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,,2,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,0,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,,2,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,2,1
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0000,,2,0
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0000,B42,2,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,22.0,1,2,W./C. 6607,23.4500,,2,3
889,890,1,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0000,C148,0,0


### 3. Cabin : LogisticRegression을 통해 결측값 채우기 + LabelEncoding

In [None]:
# Keep only the first run of ASCII letters in Cabin (the deck name)
for frame in (train, test):
    frame['Cabin_only'] = frame['Cabin'].str.extract(r'([A-Za-z]+)')

In [None]:
# Split the train rows by whether a deck letter is known. Rows with a deck
# become the fitting data; rows without one are the rows to predict.
cabin_features = ['Pclass', 'Sex', 'Companion', 'Embarked']
has_deck = train['Cabin_only'].notnull()

cabin_train = train.loc[has_deck, cabin_features + ['Cabin_only']].dropna()
cabin_test = train.loc[~has_deck, cabin_features].dropna()

# Feature matrix and deck-letter target for the classifier
X = cabin_train[cabin_features].values
y = cabin_train['Cabin_only'].values

In [None]:
# Predict the missing Cabin decks with LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows with a known deck to measure accuracy
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

logistic_reg = LogisticRegression()
logistic_reg.fit(X_train, y_train)

# The model was fitted on a plain array, so predict on a plain array too;
# passing the DataFrame itself triggers a feature-name mismatch warning.
y_pred_test = logistic_reg.predict(cabin_test.values)

accuracy = logistic_reg.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.2926829268292683


In [None]:
len(y_pred_test)

687

In [None]:
# Write the predicted decks into the train Cabin_only column.
# y_pred_test holds one prediction per row of cabin_test, in the same order,
# so assigning through cabin_test's index keeps every prediction on its own
# row. A single .loc assignment also avoids chained assignment
# (train['Cabin_only'].loc[i] = ...), which may write to a temporary copy.
train.loc[cabin_test.index, 'Cabin_only'] = y_pred_test


In [None]:
# Encode the deck letters as integers: A -> 0, B -> 1, ..., G -> 6, T -> 7
train['Cabin_only'] = train['Cabin_only'].map({'A' : 0, 'B' : 1, 'C' : 2, 'D' : 3, 'E': 4, 'F' : 5, 'G' : 6, 'T' : 7})

In [None]:
# Fill the missing decks in the test Cabin_only column.
# The predictions must come from the test rows' own features; the earlier
# y_pred_test array was produced for train rows and does not belong here.
cabin_features = ['Pclass', 'Sex', 'Companion', 'Embarked']
test_missing_deck = test['Cabin_only'].isnull()

# Skip the call when nothing is missing; predict() rejects an empty array
if test_missing_deck.any():
    test.loc[test_missing_deck, 'Cabin_only'] = logistic_reg.predict(
        test.loc[test_missing_deck, cabin_features].values
    )

In [None]:
# Encode the deck letters as integers: A -> 0, B -> 1, ..., G -> 6, T -> 7
test['Cabin_only'] = test['Cabin_only'].map({'A' : 0, 'B' : 1, 'C' : 2, 'D' : 3, 'E': 4, 'F' : 5, 'G' : 6, 'T' : 7})

## Standard Scaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize Fare (zero mean, unit variance). The scaler is fitted on the
# train column only and then applied to both train and test.
ss = StandardScaler()
ss.fit(train[['Fare']])
train_fare, test_fare = (ss.transform(frame[['Fare']]) for frame in (train, test))

In [None]:
# Store the scaled fares next to the raw Fare column
train['Fare2'], test['Fare2'] = train_fare, test_fare

In [None]:
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Companion,Cabin_only,Fare2
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,2,1,5,-0.502445
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,0,1,2,0.786845
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,2,0,5,-0.488854


In [None]:
# Select the columns that are fed to the model
data = train[['Age' ,'Pclass', 'Sex', 'Companion', 'Embarked', 'Fare2', 'Cabin_only', 'Survived']]
data_test = test[['Age' ,'Pclass', 'Sex', 'Companion', 'Embarked', 'Fare2', 'Cabin_only']]

# Features are every column but the last; the last column (Survived) is the target
X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values
# Number of folds; read by the block_size computation further down
k = 5

In [None]:
data

Unnamed: 0,Age,Pclass,Sex,Companion,Embarked,Fare2,Cabin_only,Survived
0,22.0,3,0,1,2,-0.502445,5,0
1,38.0,1,1,1,0,0.786845,2,1
2,26.0,3,1,0,2,-0.488854,5,1
3,35.0,1,1,1,2,0.420730,2,1
4,35.0,3,0,0,2,-0.486337,5,0
...,...,...,...,...,...,...,...,...
886,27.0,2,0,0,2,-0.386671,4,0
887,19.0,1,1,0,2,-0.044381,1,1
888,22.0,3,1,3,2,-0.176263,5,0
889,26.0,1,0,0,0,-0.044381,2,1


## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Split the data, holding out 20% of the rows with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

### 1. K-Fold

In [None]:
import math
# Rows per fold: number of rows divided by k, rounded up
block_size = math.ceil(X.shape[0]/k)
block_size

179

In [None]:
def kfold_train(estimator, X, y, k=5):
    """Run a sequential (unshuffled) k-fold evaluation and report the mean score.

    The rows are cut into consecutive blocks of ``ceil(n_rows / k)`` rows.
    Each block is held out once while ``estimator`` is refit on all other
    rows and then scored on the held-out block.

    Args:
        estimator: Object exposing ``fit(X, y)`` and ``score(X, y)``.
        X: Feature array supporting ``X.shape[0]``, slicing and indexing with
            a list of row positions (e.g. a NumPy array).
        y: Target array indexed the same way as ``X``.
        k: Number of folds.

    Returns:
        The mean of the per-fold scores. The value is also printed.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``X`` has no rows.
    """
    n_rows = X.shape[0]
    if k < 1 or n_rows == 0:
        raise ValueError("kfold_train needs k >= 1 and at least one row")

    # ceil(n_rows / k), derived from the arguments so that the fold size
    # always follows the k passed in rather than a module-level value.
    block_size = -(-n_rows // k)

    score_list = []
    for i in range(k):
        start = block_size * i
        stop = min(start + block_size, n_rows)
        if start >= stop:
            # Rounding up can leave trailing folds empty when k is large
            break

        # Every row outside [start, stop) is used for fitting
        train_idxs = list(range(start)) + list(range(stop, n_rows))
        train_X, train_y = X[train_idxs], y[train_idxs]
        test_X, test_y = X[start:stop], y[start:stop]

        # Use the estimator that was passed in, not a global model
        estimator.fit(train_X, train_y)
        score_list.append(estimator.score(test_X, test_y))

    avg_score = sum(score_list) / len(score_list)
    print(avg_score)
    return avg_score

In [None]:
train.shape

(891, 15)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Evaluate a RandomForestClassifier with default settings using kfold_train
model = RandomForestClassifier()
kfold_train(model, X, y, k=5)

0.8059888268156424


### 2. GridSearchCV


In [None]:
from sklearn.model_selection import GridSearchCV, KFold
# Hyper-parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [1, 5, 10, 15],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
}

# Create the RandomForestClassifier object
rf_clf = RandomForestClassifier()

# Define the KFold splitter: 5 shuffled folds with a fixed seed
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the GridSearchCV, scored by accuracy
grid_search = GridSearchCV(
    rf_clf, param_grid, cv=kfold, scoring='accuracy', n_jobs=-1)

# Fit
grid_search.fit(X, y)

# Print the best parameters
print("Best parameters: ", grid_search.best_params_)

# Print the best score
print("Best score: ", grid_search.best_score_)

Best parameters:  {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best score:  0.8372481325717155


## Model Fit

In [None]:
# Random forest configured with the best parameters printed by the grid search
rf_clf = RandomForestClassifier(max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=300)

# Train the model using the training data
rf_clf.fit(X_train, y_train)
# Predict with the model fitted just above, not the earlier untuned `model`.
# Pass a plain array to match how the model was fitted.
y_pred_test = rf_clf.predict(data_test.values)
# Evaluate the model on the held-out split
accuracy = rf_clf.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.8379888268156425


## Submission

In [None]:
# Two-column submission frame: the test PassengerId next to its prediction
submission = test[["PassengerId"]].assign(Survived=y_pred_test)

In [None]:
# Write the submission to Submission.csv without the index column
submission.to_csv('Submission.csv', index = False)