## 타이타닉 데이터 분석

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [2]:
titanic = pd.read_csv('titanic.csv')

In [6]:
titanic.head(10) # 일단 대략적 정보 띄우기. 

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked,life,seat,port
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,S,live,1st,southampthon
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,S,live,1st,southampthon
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,S,dead,1st,southampthon
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,S,dead,1st,southampthon
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,S,dead,1st,southampthon
5,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,S,live,1st,southampthon
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,S,live,1st,southampthon
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,S,dead,1st,southampthon
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2,0,11769,51.4792,S,live,1st,southampthon
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,C,dead,1st,cherbourg


## 저번에 전처리를 한번 했었기 때문에  이번엔 머신러닝 위주로 
## 결측치 확인

In [9]:
# 레이블 분포 확인
titanic.life.value_counts()

dead    808
live    498
Name: life, dtype: int64

In [10]:
# 여러 특징 중 성별 분류 확인
titanic.sex.value_counts()

male      842
female    464
Name: sex, dtype: int64

In [12]:
# 여러 특성들 중 승선위치 분포 확인
titanic.port.value_counts()

southampthon    913
cherbourg       270
qeenstown       123
Name: port, dtype: int64

# 데이터 분석시 문자형 값보다는 숮자형 값을 더 잘 인식함
# 문자형 값 -> 숫자형값으로 변환하는 과정 필요

In [20]:
# 성별을 레이블인코딩으로 숫자형으로 변환 -> 파생변수
titanic['gender'] = titanic['sex'].apply(lambda x: 0 if x =='female' else 1)
titanic.iloc[:, [3, 13]].head(5)

Unnamed: 0,sex,gender
0,female,0
1,male,1
2,female,0
3,male,1
4,female,0


In [23]:
# 승선위치를 레이블인코딩으로 숫자형으로 변환 -> 파생변수
titanic['harbor'] = titanic['embarked'].apply(lambda x: 0 if x == 'C' else (1 if x == 'S' else 2) )

## 이렇게 쓰면 가독성이 떨어지죠? 성적 프로그램을 인라인으로 쓰면 어떻게되요? 가독성이 떨어짐. 따라서

### 분석에 필요한 컬럼을 뽑아 특성/ 레이블을 만듦.

In [28]:
data = titanic.iloc[:, [0,4,4,5,6,8,13,14]]
target = titanic.survived

In [None]:
from sklearn.tree import Decision

# 훈련 / 평가 데이터 분할

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [33]:
Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, train_size=0.7, random_state=2111041110)

In [35]:
# 의사결정나무

dtclf = DecisionTreeClassifier()
dtclf.fit(Xtrain, ytrain)
pred = dtclf.predict(Xtest)
accuracy_score(ytest, pred)

0.7755102040816326

In [37]:
# 로지스틱 회귀

lrclf = LogisticRegression()
lrclf.fit(Xtrain, ytrain)
pred = lrclf.predict(Xtest)
accuracy_score(ytest, pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8010204081632653

In [38]:
# 랜덤포레스트

rfclf = RandomForestClassifier()
rfclf.fit(Xtrain, ytrain)
pred = rfclf.predict(Xtest)
accuracy_score(ytest, pred)

0.8035714285714286

In [39]:
# 교차검증 1
dtclf = DecisionTreeClassifier(max_depth=3)
scores = cross_val_score(dtclf, data, target, cv=10, scoring = 'accuracy')
np.mean(scores)

0.7487081620669407

In [42]:
lrclf = LogisticRegression(max_iter=300)
scores = cross_val_score(lrclf, data, target, cv=10, scoring = 'accuracy')
np.mean(scores) # 0.75

0.7501820317087493

### 머신러닝 모델 평가

## 정확도만으로 모델의 성능을 평가 하는 것이 과연 옳은것인가? 
## 그것만으로는 모델의 성능을 평가하는 것이 무조건 옳지 않다. 타이타닉 데이터처럼 종속변류의 분류값이 편중이 되있다면 모델의 정확도를 정확하게 판단할 수 없다는 얘깁니다.

In [43]:
titanic.life.value_counts()

dead    808
live    498
Name: life, dtype: int64

In [44]:
# 성별에 따른 생존여부
titanic.groupby(['gender','life'])['life'].count()

gender  life
0       dead    127
        live    337
1       dead    681
        live    161
Name: life, dtype: int64

## ==> 여성의 생존율이 남성의 생존률보다 높기 때문에 간단한 조건문만으로 모델을 만들 수도 있음.