# kaggle Titanic
---
[kaggle 링크](https://www.kaggle.com/code/startupsci/titanic-data-science-solutions)

In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import my_utils as my

In [94]:
# Read the Kaggle Titanic train/test CSV files from the local data directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/kaggle_titanic/train.csv",
        "../data/kaggle_titanic/test.csv",
    )
)

# Sanity check: both objects should be DataFrames.
type(train_data), type(test_data)

(pandas.core.frame.DataFrame, pandas.core.frame.DataFrame)

In [95]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [96]:
# List the column names available in the training set.
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [97]:
# Preview the first rows of the raw training data.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


categorical data는 Survived, Sex, Embarked
<br>
Ordinal data는 Pclass
<br>
Continuous data는 Age, Fare
<br>
Discrete Data는 SibSp, Parch
<br>

In [98]:
# Count missing values per column; Age, Cabin and Embarked contain NaNs.
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [99]:
# Show dtypes and non-null counts for every training column.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [100]:
# Mean survival rate per Pclass value, highest first.
(
    train_data
    .groupby("Pclass", as_index=False)["Survived"]
    .mean()
    .sort_values("Survived", ascending=False)
)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [101]:
# Mean survival rate per Sex value, highest first.
(
    train_data
    .groupby("Sex", as_index=False)["Survived"]
    .mean()
    .sort_values("Survived", ascending=False)
)

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [102]:
# Mean survival rate per SibSp value, highest first.
(
    train_data
    .groupby("SibSp", as_index=False)["Survived"]
    .mean()
    .sort_values("Survived", ascending=False)
)

Unnamed: 0,SibSp,Survived
1,1,0.535885
2,2,0.464286
0,0,0.345395
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


In [103]:
# Mean survival rate per Parch value, highest first.
(
    train_data
    .groupby("Parch", as_index=False)["Survived"]
    .mean()
    .sort_values("Survived", ascending=False)
)

Unnamed: 0,Parch,Survived
3,3,0.6
1,1,0.550847
2,2,0.5
0,0,0.343658
5,5,0.2
4,4,0.0
6,6,0.0


Ticket과 Survival은 상관관계가 없어보여 drop
<br>
Cabin은 결측치가 많아 drop
<br>
PassengerId는 Survival에 기여하는 것이 없어보여 drop
<br>
Name은 Survival에 기여하는 것이 없어보여 drop

SibSp와 Parch를 더한 값으로 새로운 특징을 만들 수 있지만, 이 노트북에서는 해당 특징을 만들지 않고 두 칼럼을 그대로 drop함

In [104]:
# Drop the Ticket, Cabin, Name and PassengerId columns from both splits.
unused_columns = ["Ticket", "Cabin", "Name", "PassengerId"]
train_data = train_data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)

categorical data를 encoding

In [105]:
# Encode Sex as an integer flag (female -> 1, male -> 0) in both splits.
sex_codes = {"female": 1, "male": 0}
train_data["Sex"] = train_data["Sex"].map(sex_codes)
test_data["Sex"] = test_data["Sex"].map(sex_codes)

In [106]:
# Re-check missing values after dropping columns and encoding Sex.
train_data.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

Age를 나이대별로 나눠줌

In [107]:
# Bin Age into five ordinal bands with right-closed edges:
#   0: <= 16, 1: (16, 32], 2: (32, 48], 3: (48, 64], 4: > 64
# The previous version selected the "> 64" rows but never assigned to them,
# so those passengers kept their raw age instead of a band code.
# Missing ages stay NaN, so the column remains float.
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
train_data["Age"] = pd.cut(train_data["Age"], bins=age_edges, labels=False).astype(float)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,1.0,1,0,7.25,S
1,1,1,1,2.0,1,0,71.2833,C
2,1,3,1,1.0,0,0,7.925,S
3,1,1,1,2.0,1,0,53.1,S
4,0,3,0,2.0,0,0,8.05,S


In [108]:
# Bin Age into five ordinal bands with right-closed edges:
#   0: <= 16, 1: (16, 32], 2: (32, 48], 3: (48, 64], 4: > 64
# The previous version selected the "> 64" rows but never assigned to them,
# so those passengers kept their raw age instead of a band code.
# Missing ages stay NaN, so the column remains float.
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
test_data["Age"] = pd.cut(test_data["Age"], bins=age_edges, labels=False).astype(float)
test_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,2.0,0,0,7.8292,Q
1,3,1,2.0,1,0,7.0,S
2,2,0,3.0,0,0,9.6875,Q
3,3,0,1.0,0,0,8.6625,S
4,3,1,1.0,1,1,12.2875,S


In [109]:
# Remove the SibSp and Parch columns from both splits.
family_count_columns = ["SibSp", "Parch"]
train_data = train_data.drop(columns=family_count_columns)
test_data = test_data.drop(columns=family_count_columns)

In [110]:
# Preview the training frame after SibSp and Parch were removed.
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,0,1.0,7.25,S
1,1,1,1,2.0,71.2833,C
2,1,3,1,1.0,7.925,S
3,1,1,1,2.0,53.1,S
4,0,3,0,2.0,8.05,S


In [111]:
# Preview the test frame after SibSp and Parch were removed.
test_data.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked
0,3,0,2.0,7.8292,Q
1,3,1,2.0,7.0,S
2,2,0,3.0,9.6875,Q
3,3,0,1.0,8.6625,S
4,3,1,1.0,12.2875,S


embarked의 결측치를 최빈값으로 채움

In [112]:
# Most frequent Embarked value in the training set (mode() skips NaN by default).
freq_port = train_data["Embarked"].mode().iloc[0]
freq_port

'S'

In [113]:
# Fill missing Embarked values in both splits with the training-set mode.
for split in (train_data, test_data):
    split["Embarked"] = split["Embarked"].fillna(freq_port)

In [114]:
# Encode Embarked as an integer code (S -> 0, C -> 1, Q -> 2) in both splits.
port_codes = {"S": 0, "C": 1, "Q": 2}
train_data["Embarked"] = train_data["Embarked"].map(port_codes)
test_data["Embarked"] = test_data["Embarked"].map(port_codes)

In [115]:
# Preview the training frame after Embarked was encoded.
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,0,1.0,7.25,0
1,1,1,1,2.0,71.2833,1
2,1,3,1,1.0,7.925,0
3,1,1,1,2.0,53.1,0
4,0,3,0,2.0,8.05,0


In [116]:
# Confirm which training columns still contain missing values.
train_data.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
Embarked      0
dtype: int64

In [117]:
# Check which test columns still contain missing values.
test_data.isna().sum()

Pclass       0
Sex          0
Age         86
Fare         1
Embarked     0
dtype: int64

In [118]:
# Impute the missing test Fare with the test-set median.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection: the in-place form mutates an intermediate object, warns
# about chained assignment in recent pandas, and has no effect on test_data
# under Copy-on-Write. median() already skips NaN, so dropna() is not needed.
test_data["Fare"] = test_data["Fare"].fillna(test_data["Fare"].median())
test_data.isna().sum()

Pclass       0
Sex          0
Age         86
Fare         0
Embarked     0
dtype: int64

Fare를 대역별로 나눔

In [119]:
# Bin Fare into four ordinal bands with right-closed edges:
#   0: <= 7.91, 1: (7.91, 14.454], 2: (14.454, 31], 3: > 31
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
fare_band = pd.cut(train_data["Fare"], bins=fare_edges, labels=False)
train_data["Fare"] = fare_band.astype(int)

In [120]:
# Bin Fare into four ordinal bands with right-closed edges:
#   0: <= 7.91, 1: (7.91, 14.454], 2: (14.454, 31], 3: > 31
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
fare_band = pd.cut(test_data["Fare"], bins=fare_edges, labels=False)
test_data["Fare"] = fare_band.astype(int)

#### 학습

In [121]:
# Preview the encoded test features before building the model inputs.
test_data.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked
0,3,0,2.0,0,2
1,3,1,2.0,0,0
2,2,0,3.0,1,2
3,3,0,1.0,1,0
4,3,1,1.0,1,0


In [122]:
# Separate the Survived target from the training features; the test frame
# is copied as-is because it has no Survived column.
y_train = train_data["Survived"]
x_train = train_data.drop(columns="Survived")
x_test = test_data.copy()

# Shapes of the feature matrices and the target vector.
x_train.shape, y_train.shape, x_test.shape

((891, 5), (891,), (418, 5))

In [123]:
# Exclude Age from the model inputs: it still contains NaN in both splits
# (it was binned but never imputed), and SVC / KNeighborsClassifier cannot
# fit on NaN values.
# A non-in-place drop with errors="ignore" keeps this cell re-runnable; the
# previous in-place drop raised KeyError when executed a second time.
x_train = x_train.drop(columns="Age", errors="ignore")
x_test = x_test.drop(columns="Age", errors="ignore")

In [125]:
# Support vector classifier with default hyper-parameters.
svc = SVC().fit(x_train, y_train)
y_pred = svc.predict(x_test)

# Accuracy (in percent) measured on the training data itself, not on a
# held-out set.
score = round(100 * svc.score(x_train, y_train), 2)
score

80.92

In [126]:
# k-nearest-neighbours classifier using the 3 closest training points.
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
y_pred = knn.predict(x_test)

# Accuracy (in percent) measured on the training data itself, not on a
# held-out set.
score = round(100 * knn.score(x_train, y_train), 2)
score

80.58

In [127]:
# Decision tree classifier. random_state is fixed because the estimator
# permutes features randomly when choosing splits, so an unseeded tree can
# produce different test predictions from one run to the next.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)

# Accuracy (in percent) measured on the training data itself, not on a
# held-out set.
score = round(dt.score(x_train, y_train) * 100, 2)
score

81.26

In [128]:
# Random forest with 100 trees. random_state is fixed because bootstrap
# sampling and feature selection are random, so an unseeded forest produces
# different test predictions from one run to the next.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

# Accuracy (in percent) measured on the training data itself, not on a
# held-out set.
score = round(rf.score(x_train, y_train) * 100, 2)
score

81.26