### 함수

In [1]:
from IPython.display import Image
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load seaborn's bundled Titanic passenger dataset and preview its first five rows.
df = sns.load_dataset(name='titanic')
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# Count the passengers in each `who` category, most frequent first.
df['who'].value_counts(sort=True, ascending=False)

man      537
woman    271
child     83
Name: who, dtype: int64

In [4]:
def transform_who(x):
    """Translate one `who` label into Korean.

    Parameters
    ----------
    x : object
        A single label; 'man', 'woman' and 'child' are recognized.

    Returns
    -------
    object
        '남자' for 'man', '여자' for 'woman', '아이' for 'child'.
        Any other value (including missing values such as None/NaN) is
        returned unchanged.
    """
    if x == 'man':
        return '남자'
    elif x == 'woman':
        return '여자'
    elif x == 'child':
        return '아이'
    # Pass unrecognized or missing values through instead of silently
    # labeling them as '아이'.
    return x

In [5]:
# Translate every `who` label by calling transform_who once per element.
(
    df['who']
    .apply(transform_who)
)

0      남자
1      여자
2      여자
3      여자
4      남자
       ..
886    남자
887    여자
888    여자
889    남자
890    남자
Name: who, Length: 891, dtype: object

In [6]:
# Translate the `who` labels, then count how often each translated label occurs.
(
    df['who']
    .apply(transform_who)
    .value_counts()
)

남자    537
여자    271
아이     83
Name: who, dtype: int64

In [7]:
def transform_fare(x):
    """Return the 'fare' entry of `x` divided by its 'age' entry.

    `x` is any object indexable by the keys 'fare' and 'age'
    (for example a single DataFrame row).
    """
    fare = x['fare']
    age = x['age']
    return fare / age

In [8]:
# Call transform_fare once per row; axis='columns' is the named form of axis=1.
df.apply(transform_fare, axis='columns')

0      0.329545
1      1.875876
2      0.304808
3      1.517143
4      0.230000
         ...   
886    0.481481
887    1.578947
888         NaN
889    1.153846
890    0.242188
Length: 891, dtype: float64

In [9]:
# Label each passenger '생존' when survived == 1 and '사망' otherwise, then count the labels.
(
    df['survived']
    .apply(lambda flag: '생존' if flag == 1 else '사망')
    .value_counts()
)

사망    549
생존    342
Name: survived, dtype: int64

### Groupby

In [10]:
# Mean of every numeric/boolean column for each (sex, pclass) group.
# numeric_only=True is passed explicitly: the frame also holds string and
# categorical columns, and relying on them being dropped silently is
# deprecated (FutureWarning) and raises TypeError on pandas >= 2.0.
df.groupby(['sex', 'pclass']).mean(numeric_only=True)

  df.groupby(['sex', 'pclass']).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,survived,age,sibsp,parch,fare,adult_male,alone
sex,pclass,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
female,1,0.968085,34.611765,0.553191,0.457447,106.125798,0.0,0.361702
female,2,0.921053,28.722973,0.486842,0.605263,21.970121,0.0,0.421053
female,3,0.5,21.75,0.895833,0.798611,16.11881,0.0,0.416667
male,1,0.368852,41.281386,0.311475,0.278689,67.226127,0.97541,0.614754
male,2,0.157407,30.740707,0.342593,0.222222,19.741782,0.916667,0.666667
male,3,0.135447,26.507589,0.498559,0.224784,12.661633,0.919308,0.760807


In [11]:
# Mean of `survived` for each (sex, pclass) group, returned as a Series.
(
    df
    .groupby(['sex', 'pclass'])['survived']
    .mean()
)

sex     pclass
female  1         0.968085
        2         0.921053
        3         0.500000
male    1         0.368852
        2         0.157407
        3         0.135447
Name: survived, dtype: float64

In [12]:
# Same per-group mean, shown as a one-column DataFrame: selecting with a
# list of columns keeps the result two-dimensional.
df.groupby(['sex', 'pclass'])[['survived']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,survived
sex,pclass,Unnamed: 2_level_1
female,1,0.968085
female,2,0.921053
female,3,0.5
male,1,0.368852
male,2,0.157407
male,3,0.135447


In [13]:
# Per-group mean of `survived`, with the group keys moved back into ordinary columns.
(
    df
    .groupby(['sex', 'pclass'])['survived']
    .mean()
    .reset_index()
)

Unnamed: 0,sex,pclass,survived
0,female,1,0.968085
1,female,2,0.921053
2,female,3,0.5
3,male,1,0.368852
4,male,2,0.157407
5,male,3,0.135447


In [14]:
# Mean of `survived` and `age` for each (sex, pclass) group.
(
    df
    .groupby(['sex', 'pclass'])[['survived', 'age']]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,survived,age
sex,pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
female,1,0.968085,34.611765
female,2,0.921053,28.722973
female,3,0.5,21.75
male,1,0.368852,41.281386
male,2,0.157407,30.740707
male,3,0.135447,26.507589


In [15]:
# Both the mean and the sum of `survived` and `age` for each (sex, pclass) group.
(
    df
    .groupby(['sex', 'pclass'])[['survived', 'age']]
    .agg(['mean', 'sum'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,survived,survived,age,age
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,sum,mean,sum
sex,pclass,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
female,1,0.968085,91,34.611765,2942.0
female,2,0.921053,70,28.722973,2125.5
female,3,0.5,72,21.75,2218.5
male,1,0.368852,45,41.281386,4169.42
male,2,0.157407,17,30.740707,3043.33
male,3,0.135447,47,26.507589,6706.42


### 연습문제

In [16]:
# Work on an independent (deep) copy of `df`.
sample = df.copy(deep=True)

In [17]:
# Count the passengers in each ticket class, most frequent first.
sample['class'].value_counts(sort=True, ascending=False)

Third     491
First     216
Second    184
Name: class, dtype: int64

In [21]:
def classkor(x):
    """Translate one ticket-class label into Korean.

    Parameters
    ----------
    x : object
        A single label; 'First', 'Second' and 'Third' are recognized.

    Returns
    -------
    object
        '일등석' for 'First', '이등석' for 'Second', '삼등석' for 'Third'.
        Any other value (including missing values such as None/NaN) is
        returned unchanged.
    """
    if x == 'Third':
        return '삼등석'
    elif x == 'Second':
        return '이등석'
    elif x == 'First':
        return '일등석'
    # Pass unrecognized or missing values through instead of silently
    # labeling them as '일등석'.
    return x

In [26]:
# Translate the ticket-class labels with classkor, then count each translated label.
(
    sample['class']
    .apply(classkor)
    .value_counts()
)

삼등석    491
일등석    216
이등석    184
Name: class, dtype: int64

In [27]:
# Rebind `sample` to a fresh independent (deep) copy of `df`.
sample = df.copy(deep=True)

In [31]:
# Mean of `survived` for each passenger class.
(
    sample
    .groupby(['pclass'])['survived']
    .mean()
)

pclass
1    0.629630
2    0.472826
3    0.242363
Name: survived, dtype: float64

In [35]:
# Mean and variance of `survived` for each embarkation port.
# agg() with a list of functions already yields a DataFrame, one column per function.
(
    sample
    .groupby(['embarked'])['survived']
    .agg(['mean', 'var'])
)

Unnamed: 0_level_0,mean,var
embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,0.553571,0.24861
Q,0.38961,0.240943
S,0.336957,0.223764


In [36]:
# Mean and sum of `survived` for each (who, pclass) group.
# agg() with a list of functions already yields a DataFrame, one column per function.
(
    sample
    .groupby(['who', 'pclass'])['survived']
    .agg(['mean', 'sum'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,sum
who,pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
child,1,0.833333,5
child,2,1.0,19
child,3,0.431034,25
man,1,0.352941,42
man,2,0.080808,8
man,3,0.119122,38
woman,1,0.978022,89
woman,2,0.909091,60
woman,3,0.491228,56


In [40]:
# Replace each missing age with the mean age of passengers of the same sex.
# transform('mean') returns a Series on sample's own index, so fillna aligns
# row by row. The previous groupby(...).apply(...) form triggers the
# group_keys FutureWarning and, once group keys are prepended to the index,
# no longer aligns with `sample` on assignment.
sample['age'] = sample['age'].fillna(sample.groupby('sex')['age'].transform('mean'))
# Number of ages still missing after the fill.
print(sample['age'].isnull().sum())
print(f"age 평균: {sample['age'].mean():.2f}")

0
age 평균: 29.74


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  sample['age'] = sample.groupby(['sex'])['age'].apply(lambda x: x.fillna(x.mean()))


### Pivot table

In [41]:
# Mean of `survived` per `who` category, one row per category
# ('mean' is pivot_table's default aggregation, spelled out here).
df.pivot_table(
    values='survived',
    index='who',
    aggfunc='mean',
)

Unnamed: 0_level_0,survived
who,Unnamed: 1_level_1
child,0.590361
man,0.163873
woman,0.756458


In [42]:
# Mean of `survived` per `who` category, one column per category
# ('mean' is pivot_table's default aggregation, spelled out here).
df.pivot_table(
    values='survived',
    columns='who',
    aggfunc='mean',
)

who,child,man,woman
survived,0.590361,0.163873,0.756458


In [43]:
# Mean of `survived` for each (who, pclass) pair, both keys on the row index
# ('mean' is pivot_table's default aggregation, spelled out here).
df.pivot_table(
    values='survived',
    index=['who', 'pclass'],
    aggfunc='mean',
)

Unnamed: 0_level_0,Unnamed: 1_level_0,survived
who,pclass,Unnamed: 2_level_1
child,1,0.833333
child,2,1.0
child,3,0.431034
man,1,0.352941
man,2,0.080808
man,3,0.119122
woman,1,0.978022
woman,2,0.909091
woman,3,0.491228


In [44]:
# Mean of `survived` with `who` on the rows and `pclass` on the columns
# ('mean' is pivot_table's default aggregation, spelled out here).
df.pivot_table(
    values='survived',
    index='who',
    columns='pclass',
    aggfunc='mean',
)

pclass,1,2,3
who,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
child,0.833333,1.0,0.431034
man,0.352941,0.080808,0.119122
woman,0.978022,0.909091,0.491228


In [45]:
# Sum and mean of `survived` with `who` on the rows and `pclass` on the columns.
df.pivot_table(
    values='survived',
    index='who',
    columns='pclass',
    aggfunc=['sum', 'mean'],
)

Unnamed: 0_level_0,sum,sum,sum,mean,mean,mean
pclass,1,2,3,1,2,3
who,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
child,5,19,25,0.833333,1.0,0.431034
man,42,8,38,0.352941,0.080808,0.119122
woman,89,60,56,0.978022,0.909091,0.491228


### 연습문제

In [46]:
# Load seaborn's bundled tips dataset and preview its first five rows.
tips = sns.load_dataset(name='tips')
tips.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [48]:
# Mean tip with `smoker` on the rows and `day` on the columns.
# observed=False is pinned explicitly: relying on the implicit default for
# categorical keys is deprecated in newer pandas, and the default is
# scheduled to change.
tips.pivot_table(
    values='tip',
    index='smoker',
    columns='day',
    observed=False,
)

day,Thur,Fri,Sat,Sun
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Yes,3.03,2.714,2.875476,3.516842
No,2.673778,2.8125,3.102889,3.167895


In [49]:
# Mean and sum of total_bill with `day` on the rows and `time` on the columns.
# observed=False is pinned explicitly so unobserved (day, time) combinations
# are still reported; relying on the implicit default for categorical keys is
# deprecated in newer pandas, and the default is scheduled to change.
tips.pivot_table(
    values='total_bill',
    index='day',
    columns='time',
    aggfunc=['mean', 'sum'],
    observed=False,
)

Unnamed: 0_level_0,mean,mean,sum,sum
time,Lunch,Dinner,Lunch,Dinner
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Thur,17.664754,18.78,1077.55,18.78
Fri,12.845714,19.663333,89.92,235.96
Sat,,20.441379,0.0,1778.4
Sun,,21.41,0.0,1627.16
