# 타이타닉 생존률 분석

- 연령대 기준 생존률
- plcass 기준 생존률
- 성별 기준 생존률

In [38]:
import seaborn  # load titanic data
import pandas  # data operation

titanic = seaborn.load_dataset('titanic')

In [39]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


---
## 상관계수 구하기

In [40]:
# 평균값, 최빈값, 중앙값 구하기
avg = titanic.mean(numeric_only=True)
mode = titanic.mode(numeric_only=True)
median = titanic.median(numeric_only=True)

value_status = pandas.concat([avg, mode, avg], axis=1)
value_status

Unnamed: 0,0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,1
survived,0.383838,,,,,,,,,0.383838
pclass,2.308642,,,,,,,,,2.308642
age,29.699118,,,,,,,,,29.699118
sibsp,0.523008,,,,,,,,,0.523008
parch,0.381594,,,,,,,,,0.381594
fare,32.204208,,,,,,,,,32.204208
adult_male,0.602694,,,,,,,,,0.602694
alone,0.602694,,,,,,,,,0.602694
0,,0.0,3.0,24.0,0.0,0.0,8.05,True,True,


In [41]:
# titanic numeric data 컬럼들 표준편차
titanic.std(numeric_only=True)

survived       0.486592
pclass         0.836071
age           14.526497
sibsp          1.102743
parch          0.806057
fare          49.693429
adult_male     0.489615
alone          0.489615
dtype: float64

In [42]:
# 표준편차값 구한뒤 상관계수 찾아보기
std_data = titanic.std(numeric_only=True)
std_data2 = pandas.DataFrame([list(std_data)],
                             columns=std_data.index)
std_data2.corr()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
survived,,,,,,,,
pclass,,,,,,,,
age,,,,,,,,
sibsp,,,,,,,,
parch,,,,,,,,
fare,,,,,,,,
adult_male,,,,,,,,
alone,,,,,,,,


In [43]:
# titanic.apply(lambda x: x / abs(x.max()))

---
## 전체 탑승객 생존률

In [44]:
# 전체 탑승객 수
passengers = titanic.shape[0]  # 891

# 생존자 수
srv = titanic['survived'].sum()  # 342

# 생존률
srv_rate = round(srv / passengers * 100, 1)  # 38.4%

---
## 연령대, pclass, 성별 기준 생존률

In [45]:
# 기존 데이터 유지를 위해 titanic2 따로 생성
titanic2 = titanic.copy()

# 10~70대까지 연령대 생성
generation = pandas.cut(titanic['age'],
                        bins=8,
                        labels=[ "0's", "10's", "20's", "30's",
                                "40's", "50's", "60's", "70's"])

# titanic2 age컬럼 뒤에 generation컬럼 추가
titanic2.insert(4, 'generation', generation)
titanic2.filter(items=['age', 'generation']).head()

Unnamed: 0,age,generation
0,22.0,20's
1,38.0,30's
2,26.0,20's
3,35.0,30's
4,35.0,30's


In [46]:
# 3개의 컬럼별 생존률 데이터를 dict 자료형으로 생성
srv_data = {}
for column in titanic2[['generation', 'pclass', 'sex']]:
    cnt = titanic2.groupby(column)['survived'].count()
    r = cnt.agg(lambda x: round(x / passengers * 100, 1))
    srv_data[column] = r

In [47]:
# dict 데이터를 DataFrame으로 변환
srv_data2 = pandas.concat([srv_data['generation'],
                           srv_data['pclass'],
                           srv_data['sex']],
                          keys=['generation', 'pclass', 'sex'],
                          names=['category', 'group'])
srv_data2

category    group 
generation  0's        7.2
            10's      12.9
            20's      25.8
            30's      17.4
            40's       9.7
            50's       4.7
            60's       1.9
            70's       0.6
pclass      1         24.2
            2         20.7
            3         55.1
sex         female    35.2
            male      64.8
Name: survived, dtype: float64

In [74]:
for column in ['generation', 'pclass', 'sex']:
    high, low = srv_data2[column].agg([max, min])
    print(column+'생존률 높은 구간----------')
    print(srv_data2[srv_data2==high], high)
    print(column+'생존률 낮은 구간----------')
    print(srv_data2[srv_data2==low], high)
    print()

generation생존률 높은 구간----------
category    group
generation  20's     25.8
Name: survived, dtype: float64 25.8
generation생존률 낮은 구간----------
category    group
generation  70's     0.6
Name: survived, dtype: float64 25.8

pclass생존률 높은 구간----------
category  group
pclass    3        55.1
Name: survived, dtype: float64 55.1
pclass생존률 낮은 구간----------
category  group
pclass    2        20.7
Name: survived, dtype: float64 55.1

sex생존률 높은 구간----------
category  group
sex       male     64.8
Name: survived, dtype: float64 64.8
sex생존률 낮은 구간----------
category  group 
sex       female    35.2
Name: survived, dtype: float64 64.8

