# Seaborn 실습 -----------------

In [74]:
import pandas as pd
import numpy as np
import seaborn

In [75]:
# Load seaborn's bundled "titanic" example dataset into a DataFrame.
# NOTE(review): load_dataset presumably fetches the data online when it is not
# cached locally — confirm network access is available on a fresh run.
df = seaborn.load_dataset('titanic')

In [76]:
# Summarize column names, non-null counts, and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [77]:
# Show the row index and the column labels, one per line.
print('\n'.join(str(labels) for labels in df.axes))

RangeIndex(start=0, stop=891, step=1)
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')


In [78]:
# Preview the first five rows of the raw data.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## 성별, 동석, 나이에 따른 생존율


### 데이터 정제

In [79]:
# Missing-value handling: count the nulls in every column before imputing.
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [80]:
# Missing-value handling
# Numeric column: replace missing ages with the mean age.
mean_age = df['age'].mean()
df['age'] = df['age'].fillna(mean_age)

# Text columns: replace missing values with the most frequent value of the column.
for port_col in ('embark_town', 'embarked'):
    most_common = df[port_col].mode().iloc[0]
    df[port_col] = df[port_col].fillna(most_common)

In [81]:
# Convert the categorical deck column to plain strings; missing entries become
# the text 'nan', which is then replaced by the placeholder label 'DN'.
deck_text = df['deck'].astype(str)
df['deck'] = deck_text.mask(deck_text == 'nan', 'DN')

In [82]:
# Re-count nulls per column to confirm that no missing values remain.
df.isna().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [83]:
# Re-check non-null counts and dtypes after imputation (deck is now an object column).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     891 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         891 non-null    object  
 12  embark_town  891 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(6)
memory usage: 86.4+ KB


In [84]:
# Data standardization: store age as an integer.
# The cast drops the fractional part rather than rounding, so ages below 1
# become 0 and the mean-imputed age loses its decimals.
df['age'] = df['age'].astype(int)

In [85]:
# Duplicate check: number of rows that exactly repeat an earlier row.
# NOTE(review): the frame has no passenger-id column, so rows flagged here may be
# distinct passengers who share every attribute — confirm that dropping them is intended.
df.duplicated().sum()

112

In [86]:
# Remove exact duplicate rows and renumber the index from 0.
df = df.drop_duplicates(ignore_index=True)

In [87]:
# Display the de-duplicated frame (pandas shows only the first and last rows).
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22,1,0,7.2500,S,Third,man,True,DN,Southampton,no,False
1,1,1,female,38,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26,0,0,7.9250,S,Third,woman,False,DN,Southampton,yes,True
3,1,1,female,35,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35,0,0,8.0500,S,Third,man,True,DN,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
774,0,3,female,39,0,5,29.1250,Q,Third,woman,False,DN,Queenstown,no,False
775,1,1,female,19,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
776,0,3,female,29,1,2,23.4500,S,Third,woman,False,DN,Southampton,no,False
777,1,1,male,26,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [88]:
# Data standardization: derive an age-group column (done in the following cell).

In [89]:
# Bin ages into four labelled groups: 0-20, 21-40, 41-60, 61-80.
# pd.cut intervals are right-closed and exclude the lowest edge by default,
# i.e. the first bin is (0, 20]. Ages were truncated to integers earlier, so
# infants have age == 0; include_lowest=True keeps them in the first bin
# instead of silently turning them into NaN.
age_cut = pd.cut(
    df['age'],
    bins=[0, 20, 40, 60, 80],
    labels=['child', 'young', 'elder', 'old'],
    include_lowest=True,
)

# DataFrame.insert raises ValueError if the column already exists, which would
# break a re-run of this cell; drop any previous copy first.
if 'age_cut' in df.columns:
    df = df.drop(columns='age_cut')

# Place the new column right after 'age' (position 4).
df.insert(4, 'age_cut', age_cut)

In [90]:
# Display the frame with the new age_cut column (pandas shows only the first and last rows).
df

Unnamed: 0,survived,pclass,sex,age,age_cut,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22,young,1,0,7.2500,S,Third,man,True,DN,Southampton,no,False
1,1,1,female,38,young,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26,young,0,0,7.9250,S,Third,woman,False,DN,Southampton,yes,True
3,1,1,female,35,young,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35,young,0,0,8.0500,S,Third,man,True,DN,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
774,0,3,female,39,young,0,5,29.1250,Q,Third,woman,False,DN,Queenstown,no,False
775,1,1,female,19,child,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
776,0,3,female,29,young,1,2,23.4500,S,Third,woman,False,DN,Southampton,no,False
777,1,1,male,26,young,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


### 데이터 추출

#### 값으로 구하기

In [91]:
# Group passengers by sex and pull each group out as its own DataFrame.
aliveRatio = df.groupby('sex')
maleDF, femaleDF = (aliveRatio.get_group(label) for label in ('male', 'female'))

In [92]:
# Survival rate (%) of male passengers.
# Counts per outcome (0 = died, 1 = survived), kept for inspection.
male = maleDF['survived'].value_counts()
# 'survived' holds 0/1, so its mean is the surviving fraction. Unlike indexing
# male.loc[0] and male.loc[1], this does not raise KeyError when one of the
# outcomes is absent from the group.
maleAliveRatio = maleDF['survived'].mean() * 100
maleAliveRatio

21.604938271604937

In [93]:
# Survival rate (%) of female passengers.
# Counts per outcome (0 = died, 1 = survived), kept for inspection.
female = femaleDF['survived'].value_counts()
# 'survived' holds 0/1, so its mean is the surviving fraction. Unlike indexing
# female.loc[0] and female.loc[1], this does not raise KeyError when one of the
# outcomes is absent from the group.
femaleAliveRatio = femaleDF['survived'].mean() * 100
femaleAliveRatio

74.06143344709898

#### 데이터 프레임 형태로 구하기

In [94]:
# Group passengers by outcome (survived: 0 or 1) for per-group summaries.
aliveDead = df.groupby('survived')

In [99]:
# Rows for passengers who survived (survived == 1), followed by summary
# statistics of that subset.
alive = aliveDead.get_group(1)
alive.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,322.0,322.0,322.0,322.0,322.0,322.0
mean,1.0,1.900621,28.425466,0.481366,0.487578,50.210638
std,0.0,0.855293,14.07531,0.715621,0.786273,68.074711
min,1.0,1.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,20.25,0.0,0.0,13.0
50%,1.0,2.0,29.0,0.0,0.0,26.25
75%,1.0,3.0,36.0,1.0,1.0,62.368725
max,1.0,3.0,80.0,4.0,5.0,512.3292


In [None]:
# Rows for passengers who did not survive (survived == 0), followed by summary
# statistics of that subset.
dead = aliveDead.get_group(0)
dead.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,457.0,457.0,457.0,457.0,457.0,457.0
mean,0.0,2.477024,30.673961,0.557987,0.369803,24.084253
std,0.0,0.772341,13.501171,1.142014,0.871708,33.448586
min,0.0,1.0,1.0,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.875
50%,0.0,3.0,29.0,0.0,0.0,13.0
75%,0.0,3.0,38.0,1.0,0.0,27.0
max,0.0,3.0,74.0,8.0,6.0,263.0
