In [1]:
import pandas as pd
import seaborn as sns

In [2]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'taxis',
 'tips',
 'titanic']

In [3]:
titanic = sns.load_dataset('titanic') 
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [4]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


#### column 선택

In [5]:
titanic.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [6]:
cols = ['survived', 'pclass', 'sex', 'age']
data = titanic[cols]
data

Unnamed: 0,survived,pclass,sex,age
0,0,3,male,22.0
1,1,1,female,38.0
2,1,3,female,26.0
3,1,1,female,35.0
4,0,3,male,35.0
...,...,...,...,...
886,0,2,male,27.0
887,1,1,female,19.0
888,0,3,female,
889,1,1,male,26.0


In [7]:
data.isna().sum()

survived      0
pclass        0
sex           0
age         177
dtype: int64

##### 나이 결측치 채우기

In [8]:
age_mean = data.groupby(['pclass', 'sex'])[['age']].mean()
age_mean = age_mean.unstack()
age_mean

Unnamed: 0_level_0,age,age
sex,female,male
pclass,Unnamed: 1_level_2,Unnamed: 2_level_2
1,34.611765,41.281386
2,28.722973,30.740707
3,21.75,26.507589


In [9]:
age_mean['age', 'female'][1]

34.61176470588235

In [10]:
import numpy as np
for s in data['sex'].unique():
    for c in data['pclass'].unique():
        condition = (data['age'].isna())*(data['sex'] == s ) * (data['pclass'] == c)
        
        data['age']= np.where(condition, age_mean['age', s][c], data['age'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['age']= np.where(condition, age_mean['age', s][c], data['age'])


In [11]:
data.isna().sum()

survived    0
pclass      0
sex         0
age         0
dtype: int64

#### 1. 성별, 등석, 나이에 따른 생존율

In [12]:
list(range(10,90,10))

[10, 20, 30, 40, 50, 60, 70, 80]

In [13]:
# 나이 범주화
# data['age'] = 
age_cut = pd.cut(data['age'],
#                      bins = 8
                    range(0,90,10),
                    labels= [str(x)+'대' for x in range(0,80,10)]
                    )

In [14]:
age_cut.unique()

['20대', '30대', '50대', '0대', '10대', '60대', '40대', '70대']
Categories (8, object): ['0대' < '10대' < '20대' < '30대' < '40대' < '50대' < '60대' < '70대']

In [15]:
data.insert(3, 'age_cut', age_cut)

In [16]:
data

Unnamed: 0,survived,pclass,sex,age_cut,age
0,0,3,male,20대,22.00
1,1,1,female,30대,38.00
2,1,3,female,20대,26.00
3,1,1,female,30대,35.00
4,0,3,male,30대,35.00
...,...,...,...,...,...
886,0,2,male,20대,27.00
887,1,1,female,10대,19.00
888,0,3,female,20대,21.75
889,1,1,male,20대,26.00


In [17]:
for col in data.columns[1:-1]:
    print(data.groupby(col).mean()[['survived']],'\n')

        survived
pclass          
1       0.629630
2       0.472826
3       0.242363 

        survived
sex             
female  0.742038
male    0.188908 

         survived
age_cut          
0대       0.593750
10대      0.382609
20대      0.326087
30대      0.462428
40대      0.355140
50대      0.404762
60대      0.235294
70대      0.200000 



In [18]:
data.groupby(['pclass', 'sex']).mean()[['survived']].unstack()

Unnamed: 0_level_0,survived,survived
sex,female,male
pclass,Unnamed: 1_level_2,Unnamed: 2_level_2
1,0.968085,0.368852
2,0.921053,0.157407
3,0.5,0.135447


In [19]:
data.groupby(['age_cut', 'sex']).mean()[['survived']].unstack().fillna('-')

Unnamed: 0_level_0,survived,survived
sex,female,male
age_cut,Unnamed: 1_level_2,Unnamed: 2_level_2
0대,0.612903,0.575758
10대,0.73913,0.144928
20대,0.704,0.131687
30대,0.859375,0.229358
40대,0.677419,0.223684
50대,0.928571,0.142857
60대,1.0,0.071429
70대,-,0.2


In [20]:
data.groupby(['age_cut', 'pclass']).mean()[['survived']].unstack().fillna('-')

Unnamed: 0_level_0,survived,survived,survived
pclass,1,2,3
age_cut,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0대,0.666667,1.0,0.431818
10대,0.833333,0.5,0.253165
20대,0.725,0.428571,0.241509
30대,0.793103,0.403846,0.206349
40대,0.448276,0.526316,0.066667
50대,0.6,0.166667,0.0
60대,0.181818,0.333333,0.333333
70대,0.333333,-,0.0


In [26]:
data.groupby(['age_cut','pclass','sex']).mean()['survived'].unstack().unstack().fillna('-')

sex,female,female,female,male,male,male
pclass,1,2,3,1,2,3
age_cut,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0대,0.0,1.0,0.5,1.0,1.0,0.363636
10대,1.0,1.0,0.52,0.4,0.1,0.12963
20대,0.952381,0.9,0.554054,0.473684,0.0,0.120419
30대,1.0,0.941176,0.428571,0.52,0.142857,0.142857
40대,0.923077,0.9,0.0,0.311111,0.111111,0.090909
50대,1.0,0.666667,-,0.285714,0.0,0.0
60대,1.0,-,1.0,0.0,0.333333,0.0
70대,-,-,-,0.333333,-,0.0
