In [2]:
# 라이브러리 불러오기
import seaborn as sns
import pandas as pd

# **1-1 개별 원소에 함수 매핑**
## 1-1-1 시리즈 객체에 apply()메소드를 적용하면 인자로 전달하는 매핑 함수에 시리즈의 모든 원소를 하나씩 입력하고 함수의 리턴값을 돌려받는다.
### Series객체.apply(매핑함수)

In [2]:
# Load the seaborn titanic dataset and build a DataFrame from its 'age' and 'fare' columns only.
titanic = sns.load_dataset('titanic')
df = titanic.loc[:, ['age','fare']]
df.head()

Unnamed: 0,age,fare
0,22.0,7.25
1,38.0,71.2833
2,26.0,7.925
3,35.0,53.1
4,35.0,8.05


In [3]:
df['ten']=10  # add a column 'ten' holding the number 10 in every row
print(df.head())

    age     fare  ten
0  22.0   7.2500   10
1  38.0  71.2833   10
2  26.0   7.9250   10
3  35.0  53.1000   10
4  35.0   8.0500   10


In [4]:
# User-defined helpers that the following cells pass to apply().
def add_10(n):
    """Return ``n`` increased by 10."""
    return n + 10


def add_two_obj(a, b):
    """Return the sum ``a + b``."""
    return a + b


# Sanity-check both helpers on plain integers (each result is printed on its own line).
for outcome in (add_10(10), add_two_obj(10, 10)):
    print(outcome)

20
20


In [5]:
# Apply the helper functions to every element of a Series with apply().
sr1=df['age'].apply(add_10) # age + 10
print(type(sr1))
print(sr1.head(),'\n')
sr2=df['fare'].apply(add_two_obj, b=10) # fare + 10; b=10 is forwarded to add_two_obj as a keyword argument
print(type(sr2))
print(sr2.head(),'\n')
sr3=df['age'].apply(lambda x: add_10(x)) # same result as sr1: the lambda receives each age value and hands it to add_10
print(type(sr3))
print(sr3.head())

<class 'pandas.core.series.Series'>
0    32.0
1    48.0
2    36.0
3    45.0
4    45.0
Name: age, dtype: float64 

<class 'pandas.core.series.Series'>
0    17.2500
1    81.2833
2    17.9250
3    63.1000
4    18.0500
Name: fare, dtype: float64 

<class 'pandas.core.series.Series'>
0    32.0
1    48.0
2    36.0
3    45.0
4    45.0
Name: age, dtype: float64


## 1-1-2 데이터프레임 원소에 함수 매핑
### DataFrame 객체.applymap(매핑함수)

In [6]:
print(df.head())
df_map=df.applymap(add_10) # map add_10 over every element of the DataFrame with applymap
# NOTE(review): pandas 2.1+ deprecates DataFrame.applymap in favor of DataFrame.map — confirm the target pandas version.
print(type(df_map))
df_map.head()

    age     fare  ten
0  22.0   7.2500   10
1  38.0  71.2833   10
2  26.0   7.9250   10
3  35.0  53.1000   10
4  35.0   8.0500   10
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,age,fare,ten
0,32.0,17.25,20
1,48.0,81.2833,20
2,36.0,17.925,20
3,45.0,63.1,20
4,45.0,18.05,20


## 1-2 시리즈 객체에 함수 매핑
### 1-2-1 데이터 프레임의 각 열에 함수 매핑
### DataFrame 객체.apply(매핑함수, axis=0)

In [7]:
print(df.head())
# User-defined function
def missing_value(series): # receives a Series
  """Return the result of ``series.isnull()``: True where an element is missing."""
  return series.isnull()   # boolean Series

# With axis=0 each column is passed to the function; the returned boolean Series form a DataFrame
result=df.apply(missing_value, axis=0)
print(result.head(),'\n')
print(type(result))

    age     fare  ten
0  22.0   7.2500   10
1  38.0  71.2833   10
2  26.0   7.9250   10
3  35.0  53.1000   10
4  35.0   8.0500   10
     age   fare    ten
0  False  False  False
1  False  False  False
2  False  False  False
3  False  False  False
4  False  False  False 

<class 'pandas.core.frame.DataFrame'>


In [8]:
print(df.head())

# User-defined function
def min_max(x):
  """Return the spread of ``x``: its maximum minus its minimum."""
  return x.max()-x.min()

# Each column is passed to the function; the one-value-per-column results come back as a Series
result=df.apply(min_max)
print(result,'\n')
print(type(result))

    age     fare  ten
0  22.0   7.2500   10
1  38.0  71.2833   10
2  26.0   7.9250   10
3  35.0  53.1000   10
4  35.0   8.0500   10
age      79.5800
fare    512.3292
ten       0.0000
dtype: float64 

<class 'pandas.core.series.Series'>


### 1-2-2 데이터 프레임의 각 행에 함수 매핑
### DataFrame 객체.apply(매핑함수, axis=1)

In [9]:
# Rebuild the working frame: 'age' and 'fare' from titanic plus a constant column 'ten'.
titanic=sns.load_dataset('titanic')
df=titanic.loc[:,['age','fare']]
df['ten']=10
print(df.head(),'\n')

    age     fare  ten
0  22.0   7.2500   10
1  38.0  71.2833   10
2  26.0   7.9250   10
3  35.0  53.1000   10
4  35.0   8.0500   10 



In [10]:
def add_two_obj(a, b):
    """Return the sum ``a + b``."""
    total = a + b
    return total


def _age_plus_ten(row):
    """Add the 'age' and 'ten' values of a single row."""
    return add_two_obj(row['age'], row['ten'])


# axis=1 hands each row to the function; the per-row results are stored in column 'add'.
df['add'] = df.apply(_age_plus_ten, axis=1)
print(df.head())

    age     fare  ten   add
0  22.0   7.2500   10  32.0
1  38.0  71.2833   10  48.0
2  26.0   7.9250   10  36.0
3  35.0  53.1000   10  45.0
4  35.0   8.0500   10  45.0


## 1-3 데이터 프레임 객체에 함수 매핑
### DataFrame객체.pipe(매핑함수)

In [11]:
def missing_value(x):
    """Return a boolean mask of ``x`` that is True wherever a value is NaN."""
    return x.isnull()


def missing_count(x):
    """Return the number of NaN values in each column of ``x``."""
    nan_mask = missing_value(x)
    return nan_mask.sum()


def total_number_missing(x):
    """Return the total number of NaN values across the whole of ``x``."""
    per_column = missing_count(x)
    return per_column.sum()

In [12]:
result_df=df.pipe(missing_value) # pipe() passes the whole DataFrame to the function; here a boolean DataFrame comes back
result_series=df.pipe(missing_count) # per-column NaN counts (a Series)
result_value=df.pipe(total_number_missing) # total NaN count (a single number)

print(result_df)
print(result_series)
print(result_value)

       age   fare    ten    add
0    False  False  False  False
1    False  False  False  False
2    False  False  False  False
3    False  False  False  False
4    False  False  False  False
..     ...    ...    ...    ...
886  False  False  False  False
887  False  False  False  False
888   True  False  False   True
889  False  False  False  False
890  False  False  False  False

[891 rows x 4 columns]
age     177
fare      0
ten       0
add     177
dtype: int64
354


# 2-1 열 순서 변경
### DataFrame 객체[재구성한 열 이름의 리스트]

In [13]:
# Take rows labelled 0 through 4 and the columns from 'survived' through 'age'.
titanic=sns.load_dataset('titanic')
df=titanic.loc[0:4,'survived':'age']
df.head()

Unnamed: 0,survived,pclass,sex,age
0,0,3,male,22.0
1,1,1,female,38.0
2,1,3,female,26.0
3,1,1,female,35.0
4,0,3,male,35.0


In [14]:
columns=list(df.columns.values) # column names as a plain Python list
print(columns)

['survived', 'pclass', 'sex', 'age']


In [15]:
columns_sorted=sorted(columns) # column names in alphabetical order
df_sorted=df[columns_sorted] # indexing with a list of names returns the columns in that order
print(df_sorted)

    age  pclass     sex  survived
0  22.0       3    male         0
1  38.0       1  female         1
2  26.0       3  female         1
3  35.0       1  female         1
4  35.0       3    male         0


In [16]:
columns_reversed=list(reversed(columns)) # original column order, reversed
df_reversed=df[columns_reversed]
print(df_reversed)

    age     sex  pclass  survived
0  22.0    male       3         0
1  38.0  female       1         1
2  26.0  female       3         1
3  35.0  female       1         1
4  35.0    male       3         0


In [17]:
columns_customed=['pclass','sex','age','survived'] # hand-picked column order
df_customed=df[columns_customed]
print(df_customed)

   pclass     sex   age  survived
0       3    male  22.0         0
1       1  female  38.0         1
2       3  female  26.0         1
3       1  female  35.0         1
4       3    male  35.0         0


## 2-2 열 분리

In [3]:
# Read the Excel workbook and print the dtype of each column before previewing the rows.
df=pd.read_excel('./주가데이터.xlsx')
print(df.dtypes,'\n')
df.head()

연월일     datetime64[ns]
당일종가             int64
전일종가             int64
시가               int64
고가               int64
저가               int64
거래량              int64
dtype: object 



Unnamed: 0,연월일,당일종가,전일종가,시가,고가,저가,거래량
0,2018-07-02,10100,600,10850,10900,10000,137977
1,2018-06-29,10700,300,10550,10900,9990,170253
2,2018-06-28,10400,500,10900,10950,10150,155769
3,2018-06-27,10900,100,10800,11050,10500,133548
4,2018-06-26,10800,350,10900,11000,10700,63039


In [5]:
df['연월일']=df['연월일'].astype('str') # convert the date column to strings so the .str accessor can be used
dates=df['연월일'].str.split("-") # split each string on '-' into a list of parts
print(dates.head(),'\n')

0    [2018, 07, 02]
1    [2018, 06, 29]
2    [2018, 06, 28]
3    [2018, 06, 27]
4    [2018, 06, 26]
Name: 연월일, dtype: object 



In [6]:
# Store the 1st, 2nd and 3rd element of each split list (year, month, day) in separate columns.
df['연']=dates.str.get(0)
df['월']=dates.str.get(1)
df['일']=dates.str.get(2)
df.head()

Unnamed: 0,연월일,당일종가,전일종가,시가,고가,저가,거래량,연,월,일
0,2018-07-02,10100,600,10850,10900,10000,137977,2018,7,2
1,2018-06-29,10700,300,10550,10900,9990,170253,2018,6,29
2,2018-06-28,10400,500,10900,10950,10150,155769,2018,6,28
3,2018-06-27,10900,100,10800,11050,10500,133548,2018,6,27
4,2018-06-26,10800,350,10900,11000,10700,63039,2018,6,26


## 3-1 불린 인덱싱
### DataFrame 객체[불린시리즈]

In [9]:
titanic=sns.load_dataset('titanic')
mask1=(titanic.age>=10)&(titanic.age<20) # boolean mask: passengers with 10 <= age < 20
df_teenage=titanic.loc[mask1,:] # rows where the mask is True, all columns
df_teenage.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,...,adult_male,deck,embark_town,alive,alone
9,1,2,female,14.0,1,...,False,,Cherbourg,yes,False
14,0,3,female,14.0,0,...,False,,Southampton,no,True
22,1,3,female,15.0,0,...,False,,Queenstown,yes,True
27,0,1,male,19.0,3,...,True,C,Southampton,no,False
38,0,3,female,18.0,2,...,False,,Southampton,no,False


In [10]:
mask2=(titanic.age<10)&(titanic.sex=='female') # age under 10 AND female
df_female_under10=titanic.loc[mask2,:]
df_female_under10.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,...,adult_male,deck,embark_town,alive,alone
10,1,3,female,4.0,1,...,False,G,Southampton,yes,False
24,0,3,female,8.0,3,...,False,,Southampton,no,False
43,1,2,female,3.0,1,...,False,,Cherbourg,yes,False
58,1,2,female,5.0,1,...,False,,Southampton,yes,False
119,0,3,female,2.0,4,...,False,,Southampton,no,False


In [11]:
mask3=(titanic.age<10)|(titanic.age>=60) # age under 10 OR 60 and over
df_under10_morethan60=titanic.loc[mask3,['age','sex','alone']] # matching rows, three columns only
df_under10_morethan60.head()

Unnamed: 0,age,sex,alone
7,2.0,male,False
10,4.0,female,False
16,2.0,male,False
24,8.0,female,False
33,66.0,male,True


## 3-2 isin()
### DataFrame의 열 객체.isin(추출 값의 리스트)

In [12]:
pd.set_option('display.max_columns',10) # display at most 10 columns
# One boolean mask per sibsp value, combined with OR below.
mask3=titanic['sibsp']==3 # NOTE: rebinds mask3, which the previous cell used for its age filter
mask4=titanic['sibsp']==4
mask5=titanic['sibsp']==5
df_boolean=titanic[mask3|mask4|mask5]
df_boolean.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,...,adult_male,deck,embark_town,alive,alone
7,0,3,male,2.0,3,...,False,,Southampton,no,False
16,0,3,male,2.0,4,...,False,,Queenstown,no,False
24,0,3,female,8.0,3,...,False,,Southampton,no,False
27,0,1,male,19.0,3,...,True,C,Southampton,no,False
50,0,3,male,7.0,4,...,False,,Southampton,no,False


In [13]:
isin_filter=titanic['sibsp'].isin([3,4,5]) # True where sibsp is 3, 4 or 5 — the same selection as OR-ing three == masks
df_isin=titanic[isin_filter]
df_isin.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,...,adult_male,deck,embark_town,alive,alone
7,0,3,male,2.0,3,...,False,,Southampton,no,False
16,0,3,male,2.0,4,...,False,,Queenstown,no,False
24,0,3,female,8.0,3,...,False,,Southampton,no,False
27,0,1,male,19.0,3,...,True,C,Southampton,no,False
50,0,3,male,7.0,4,...,False,,Southampton,no,False


## 4-1 데이터 프레임 연결
### pandas.concat(데이터프레임의 리스트)

In [27]:
import pandas as pd

In [28]:
# Two sample frames whose index labels overlap on 2 and 3; only df2 has a column 'd'.
df1=pd.DataFrame({'a':['a0','a1','a2','a3'],
                  'b':['b0','b1','b2','b3'],
                  'c':['c0','c1','c2','c3']},
                 index=[0,1,2,3])

df2=pd.DataFrame({'a':['a2','a3','a4','a5'],
                  'b':['b2','b3','b4','b5'],
                  'c':['c2','c3','c4','c5'],
                  'd':['d2','d3','d4','d5'],},
                 index=[2,3,4,5])

print(df1,'\n',df2)
# Stack df2's rows under df1's. The two frames do not share the same columns, so
# state the column handling explicitly: sort=False keeps the columns in order of
# appearance and avoids the FutureWarning that older pandas emits when `sort` is omitted.
result1=pd.concat([df1,df2],sort=False)
print(result1,'\n')

    a   b   c
0  a0  b0  c0
1  a1  b1  c1
2  a2  b2  c2
3  a3  b3  c3 
     a   b   c   d
2  a2  b2  c2  d2
3  a3  b3  c3  d3
4  a4  b4  c4  d4
5  a5  b5  c5  d5
    a   b   c    d
0  a0  b0  c0  NaN
1  a1  b1  c1  NaN
2  a2  b2  c2  NaN
3  a3  b3  c3  NaN
2  a2  b2  c2   d2
3  a3  b3  c3   d3
4  a4  b4  c4   d4
5  a5  b5  c5   d5 



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  del sys.path[0]


In [29]:
# Row-wise concat with a fresh 0..n-1 index. The frames' columns differ, so pass
# sort=False explicitly: column order is kept as-is and older pandas does not warn.
result2=pd.concat([df1,df2],ignore_index=True,sort=False)
print(result2,'\n')
# axis=1 places the frames side by side, aligned on the index labels.
result3=pd.concat([df1,df2],axis=1)
print(result3,'\n')
result3_in=pd.concat([df1,df2],axis=1,join='inner') # keep only index labels present in both frames (intersection)
print(result3_in,'\n')

    a   b   c    d
0  a0  b0  c0  NaN
1  a1  b1  c1  NaN
2  a2  b2  c2  NaN
3  a3  b3  c3  NaN
4  a2  b2  c2   d2
5  a3  b3  c3   d3
6  a4  b4  c4   d4
7  a5  b5  c5   d5 

     a    b    c    a    b    c    d
0   a0   b0   c0  NaN  NaN  NaN  NaN
1   a1   b1   c1  NaN  NaN  NaN  NaN
2   a2   b2   c2   a2   b2   c2   d2
3   a3   b3   c3   a3   b3   c3   d3
4  NaN  NaN  NaN   a4   b4   c4   d4
5  NaN  NaN  NaN   a5   b5   c5   d5 

    a   b   c   a   b   c   d
2  a2  b2  c2  a2  b2  c2  d2
3  a3  b3  c3  a3  b3  c3  d3 



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [30]:
# Three named Series; sr2 carries an explicit index (3, 4, 5), the others use the default index.
sr1=pd.Series(['e0','e1','e2','e3'],name='e')
sr2=pd.Series(['f0','f1','f2'],name='f',index=[3,4,5])
sr3=pd.Series(['g0','g1','g2','g3'],name='g')

result4=pd.concat([df1,df2],axis=1) # same call as result3 in the previous cell
print(result4,'\n')
result5=pd.concat([df2,sr2],axis=1,sort=True) # append the Series as a new column named 'f', aligned on the index
print(result5,'\n')
result6=pd.concat([sr1,sr3],axis=1) # two Series side by side -> DataFrame with columns 'e' and 'g'
print(result6,'\n')
result7=pd.concat([sr1,sr3],axis=0) # two Series end to end -> one longer Series (index labels repeat)
print(result7,'\n')

     a    b    c    a    b    c    d
0   a0   b0   c0  NaN  NaN  NaN  NaN
1   a1   b1   c1  NaN  NaN  NaN  NaN
2   a2   b2   c2   a2   b2   c2   d2
3   a3   b3   c3   a3   b3   c3   d3
4  NaN  NaN  NaN   a4   b4   c4   d4
5  NaN  NaN  NaN   a5   b5   c5   d5 

    a   b   c   d    f
2  a2  b2  c2  d2  NaN
3  a3  b3  c3  d3   f0
4  a4  b4  c4  d4   f1
5  a5  b5  c5  d5   f2 

    e   g
0  e0  g0
1  e1  g1
2  e2  g2
3  e3  g3 

0    e0
1    e1
2    e2
3    e3
0    g0
1    g1
2    g2
3    g3
dtype: object 



## 4-2 데이터 프레임 병합
### pandas.merge(df_left,df_right,how='inner',on=None)
- 기준(열, 인덱스 = key)에 의해 두 데이터프레임을 병합
- key가 되는 열이나 인덱스는 반드시 양쪽 데이터프레임에 모두 존재해야 함.

In [32]:
# Display options: at most 10 columns, column width capped at 20, East Asian character width taken into account.
pd.set_option('display.max_columns',10)
pd.set_option('display.max_colwidth',20)
pd.set_option('display.unicode.east_asian_width',True)

# Load the two example workbooks that the merge examples below combine.
df1=pd.read_excel('./stock price.xlsx')
df2=pd.read_excel('./stock valuation.xlsx')

print(df1,'\n')
print(df2)

       id    stock_name          value   price
0  128940      한미약품   59385.666667  421000
1  130960        CJ E&M   58540.666667   98900
2  138250    엔에스쇼핑   14558.666667   13200
3  139480        이마트  239230.833333  254500
4  142280  녹십자엠에스     468.833333   10200
5  145990        삼양사   82750.000000   82000
6  185750        종근당   40293.666667  100500
7  192400    쿠쿠홀딩스  179204.666667  177500
8  199800          툴젠   -2514.333333  115400
9  204210  모두투어리츠    3093.333333    3475 

       id              name           eps     bps        per       pbr
0  130960            CJ E&M   6301.333333   54068  15.695091  1.829178
1  136480              하림    274.166667    3551  11.489362  0.887074
2  138040    메리츠금융지주   2122.333333   14894   6.313806  0.899691
3  139480            이마트  18268.166667  295780  13.931338  0.860437
4  145990            삼양사   5741.000000  108090  14.283226  0.758627
5  161390        한국타이어   5648.500000   51341   7.453306  0.820007
6  181710   NHN엔터테인먼트   2110.166667   784

In [33]:
# No `how`/`on` given: an inner merge on the column name the two frames share ('id' in the frames printed above).
merge_inner=pd.merge(df1,df2)
merge_inner

Unnamed: 0,id,stock_name,value,price,name,eps,bps,per,pbr
0,130960,CJ E&M,58540.666667,98900,CJ E&M,6301.333333,54068,15.695091,1.829178
1,139480,이마트,239230.833333,254500,이마트,18268.166667,295780,13.931338,0.860437
2,145990,삼양사,82750.0,82000,삼양사,5741.0,108090,14.283226,0.758627
3,185750,종근당,40293.666667,100500,종근당,3990.333333,40684,25.185866,2.470259
4,204210,모두투어리츠,3093.333333,3475,모두투어리츠,85.166667,5335,40.802348,0.651359


In [34]:
merge_outer=pd.merge(df1,df2,how='outer',on='id') # outer join on 'id': union of the rows of both frames
merge_outer

Unnamed: 0,id,stock_name,value,price,name,eps,bps,per,pbr
0,128940,한미약품,59385.666667,421000.0,,,,,
1,130960,CJ E&M,58540.666667,98900.0,CJ E&M,6301.333333,54068.0,15.695091,1.829178
2,138250,엔에스쇼핑,14558.666667,13200.0,,,,,
3,139480,이마트,239230.833333,254500.0,이마트,18268.166667,295780.0,13.931338,0.860437
4,142280,녹십자엠에스,468.833333,10200.0,,,,,
5,145990,삼양사,82750.0,82000.0,삼양사,5741.0,108090.0,14.283226,0.758627
6,185750,종근당,40293.666667,100500.0,종근당,3990.333333,40684.0,25.185866,2.470259
7,192400,쿠쿠홀딩스,179204.666667,177500.0,,,,,
8,199800,툴젠,-2514.333333,115400.0,,,,,
9,204210,모두투어리츠,3093.333333,3475.0,모두투어리츠,85.166667,5335.0,40.802348,0.651359


In [35]:
# Left join with differently named key columns: every row of df1 is kept, matched on stock_name == name.
merge_left=pd.merge(df1,df2,how='left',left_on='stock_name',right_on='name') 
merge_left

Unnamed: 0,id_x,stock_name,value,price,id_y,name,eps,bps,per,pbr
0,128940,한미약품,59385.666667,421000,,,,,,
1,130960,CJ E&M,58540.666667,98900,130960.0,CJ E&M,6301.333333,54068.0,15.695091,1.829178
2,138250,엔에스쇼핑,14558.666667,13200,,,,,,
3,139480,이마트,239230.833333,254500,139480.0,이마트,18268.166667,295780.0,13.931338,0.860437
4,142280,녹십자엠에스,468.833333,10200,,,,,,
5,145990,삼양사,82750.0,82000,145990.0,삼양사,5741.0,108090.0,14.283226,0.758627
6,185750,종근당,40293.666667,100500,185750.0,종근당,3990.333333,40684.0,25.185866,2.470259
7,192400,쿠쿠홀딩스,179204.666667,177500,,,,,,
8,199800,툴젠,-2514.333333,115400,,,,,,
9,204210,모두투어리츠,3093.333333,3475,204210.0,모두투어리츠,85.166667,5335.0,40.802348,0.651359


In [36]:
# Right join on the same key pair: every row of df2 is kept.
merge_right=pd.merge(df1,df2,how='right',left_on='stock_name',right_on='name')
merge_right

Unnamed: 0,id_x,stock_name,value,price,id_y,name,eps,bps,per,pbr
0,130960.0,CJ E&M,58540.666667,98900.0,130960,CJ E&M,6301.333333,54068,15.695091,1.829178
1,139480.0,이마트,239230.833333,254500.0,139480,이마트,18268.166667,295780,13.931338,0.860437
2,145990.0,삼양사,82750.0,82000.0,145990,삼양사,5741.0,108090,14.283226,0.758627
3,185750.0,종근당,40293.666667,100500.0,185750,종근당,3990.333333,40684,25.185866,2.470259
4,204210.0,모두투어리츠,3093.333333,3475.0,204210,모두투어리츠,85.166667,5335,40.802348,0.651359
5,,,,,136480,하림,274.166667,3551,11.489362,0.887074
6,,,,,138040,메리츠금융지주,2122.333333,14894,6.313806,0.899691
7,,,,,161390,한국타이어,5648.5,51341,7.453306,0.820007
8,,,,,181710,NHN엔터테인먼트,2110.166667,78434,30.755864,0.827447
9,,,,,207940,삼성바이오로직스,4644.166667,60099,89.790059,6.938551


In [37]:
price=df1[df1['price']<50000] # rows of df1 whose price is below 50000
print(price.head(),'\n')
value=pd.merge(price,df2) # default merge of the filtered rows with df2
print(value)

       id    stock_name         value  price
2  138250    엔에스쇼핑  14558.666667  13200
4  142280  녹십자엠에스    468.833333  10200
9  204210  모두투어리츠   3093.333333   3475 

       id    stock_name        value  price          name        eps   bps  \
0  204210  모두투어리츠  3093.333333   3475  모두투어리츠  85.166667  5335   

         per       pbr  
0  40.802348  0.651359  


## 4-3 데이터프레임 결합
### DataFrame1.join(DataFrame2,how='left')

In [38]:
# Same display options as in the merge section.
pd.set_option('display.max_columns',10)
pd.set_option('display.max_colwidth',20)
pd.set_option('display.unicode.east_asian_width',True)

# Reload both workbooks with 'id' as the row index, because join() aligns on the index.
df1=pd.read_excel('./stock price.xlsx',index_col='id')
df2=pd.read_excel('./stock valuation.xlsx',index_col='id')

df3=df1.join(df2) # no `how` given: every row of df1 is kept, as the output shows
df3

Unnamed: 0_level_0,stock_name,value,price,name,eps,bps,per,pbr
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
128940,한미약품,59385.666667,421000,,,,,
130960,CJ E&M,58540.666667,98900,CJ E&M,6301.333333,54068.0,15.695091,1.829178
138250,엔에스쇼핑,14558.666667,13200,,,,,
139480,이마트,239230.833333,254500,이마트,18268.166667,295780.0,13.931338,0.860437
142280,녹십자엠에스,468.833333,10200,,,,,
145990,삼양사,82750.0,82000,삼양사,5741.0,108090.0,14.283226,0.758627
185750,종근당,40293.666667,100500,종근당,3990.333333,40684.0,25.185866,2.470259
192400,쿠쿠홀딩스,179204.666667,177500,,,,,
199800,툴젠,-2514.333333,115400,,,,,
204210,모두투어리츠,3093.333333,3475,모두투어리츠,85.166667,5335.0,40.802348,0.651359


In [None]:
df4=df1.join(df2,how='inner') # keep only the index labels present in both frames
df4

## 5-1 그룹 객체 만들기
### DataFrame객체.groupby(기준이 되는 열/리스트)

In [39]:
import pandas as pd
import seaborn as sns

In [40]:
# Work on five columns of the titanic dataset.
titanic=sns.load_dataset('titanic')
df=titanic.loc[:,['age','sex','class','fare','survived']]
print('승객 수:',len(df))
print(df.head(),'\n')
# Group by the single column 'class'. The key is passed as a scalar, not a
# one-element list: with a length-1 list, pandas >= 2.0 yields 1-tuple keys such as
# ('First',) on iteration and deprecates scalar lookups like get_group('Third'),
# both of which the following cells rely on.
grouped=df.groupby('class')
print(grouped)

승객 수: 891
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0 

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000197C3643470>


In [41]:
# Iterating a groupby object yields (group key, sub-DataFrame) pairs.
for key, group in grouped:
  print('* key :',key)
  print('* number :',len(group))
  print(group.head())
  print('\n')

* key : First
* number : 216
     age     sex  class     fare  survived
1   38.0  female  First  71.2833         1
3   35.0  female  First  53.1000         1
6   54.0    male  First  51.8625         0
11  58.0  female  First  26.5500         1
23  28.0    male  First  35.5000         1


* key : Second
* number : 184
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
17   NaN    male  Second  13.0000         1
20  35.0    male  Second  26.0000         0
21  34.0    male  Second  13.0000         1


* key : Third
* number : 491
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
2  26.0  female  Third   7.9250         1
4  35.0    male  Third   8.0500         0
5   NaN    male  Third   8.4583         0
7   2.0    male  Third  21.0750         0




In [42]:
# Per-class mean of the numeric columns. numeric_only=True is stated explicitly
# because the frame also holds the string column 'sex'; pandas >= 2.0 no longer
# drops non-numeric columns silently and raises a TypeError without it.
average=grouped.mean(numeric_only=True)
average

Unnamed: 0_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,38.233441,84.154687,0.62963
Second,29.87763,20.662183,0.472826
Third,25.14062,13.67555,0.242363


In [43]:
group3=grouped.get_group('Third') # pull out the rows of a single group by its key
group3.head()

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
2,26.0,female,Third,7.925,1
4,35.0,male,Third,8.05,0
5,,male,Third,8.4583,0
7,2.0,male,Third,21.075,0


### 여러 열을 기준으로 그룹화

In [44]:
grouped_two=df.groupby(['class','sex']) # group on two columns; each key is a (class, sex) tuple

# Print the key, the row count and the first rows of every group.
for key, group in grouped_two:
  print('*key:',key)
  print('*number:',len(group))
  print(group.head())
  print('\n')

*key: ('First', 'female')
*number: 94
     age     sex  class      fare  survived
1   38.0  female  First   71.2833         1
3   35.0  female  First   53.1000         1
11  58.0  female  First   26.5500         1
31   NaN  female  First  146.5208         1
52  49.0  female  First   76.7292         1


*key: ('First', 'male')
*number: 122
     age   sex  class      fare  survived
6   54.0  male  First   51.8625         0
23  28.0  male  First   35.5000         1
27  19.0  male  First  263.0000         0
30  40.0  male  First   27.7208         0
34  28.0  male  First   82.1708         0


*key: ('Second', 'female')
*number: 76
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
41  27.0  female  Second  21.0000         0
43   3.0  female  Second  41.5792         1
53  29.0  female  Second  26.0000         1


*key: ('Second', 'male')
*number: 108
     age   sex   class  fare  survived
17   NaN  male  Second 

In [45]:
average_two=grouped_two.mean() # mean per (class, sex) group; the result is indexed by both keys
print(average_two,'\n')
print(type(average_two))

                     age        fare  survived
class  sex                                    
First  female  34.611765  106.125798  0.968085
       male    41.281386   67.226127  0.368852
Second female  28.722973   21.970121  0.921053
       male    30.740707   19.741782  0.157407
Third  female  21.750000   16.118810  0.500000
       male    26.507589   12.661633  0.135447 

<class 'pandas.core.frame.DataFrame'>


In [46]:
group3f=grouped_two.get_group(('Third','female')) # with two grouping columns the key is passed as a tuple
group3f.head()

Unnamed: 0,age,sex,class,fare,survived
2,26.0,female,Third,7.925,1
8,27.0,female,Third,11.1333,1
10,4.0,female,Third,16.7,1
14,14.0,female,Third,7.8542,0
18,31.0,female,Third,18.0,0


## 5-2 그룹 연산 메소드 (적용-결합 단계)
- 데이터 집계 : mean(), max(), min(), sum(), count(), size(), var(), std(), describe(), info(), first(), last()

In [47]:
titanic=sns.load_dataset('titanic')
df=titanic.loc[:,['age','sex','class','fare','survived']]
# Scalar grouping key (not a one-element list) so that group keys stay plain labels
# such as 'First' on every pandas version.
grouped=df.groupby('class')
# Standard deviation is only defined for numbers. Select the numeric columns
# explicitly so the string column 'sex' cannot make the aggregation fail on
# pandas >= 2.0, where non-numeric columns are no longer dropped silently.
std_all=grouped[['age','fare','survived']].std()
print(std_all,'\n')
print(type(std_all),'\n')

# Selecting one column from the groupby object first gives a Series result.
std_fare=grouped.fare.std()
print(std_fare,'\n')
print(type(std_fare))

              age       fare  survived
class                                 
First   14.802856  78.380373  0.484026
Second  14.001077  13.417399  0.500623
Third   12.495398  11.778142  0.428949 

<class 'pandas.core.frame.DataFrame'> 

class
First     78.380373
Second    13.417399
Third     11.778142
Name: fare, dtype: float64 

<class 'pandas.core.series.Series'>


- 사용자 정의 함수를 그룹 객체에 적용하여 집계 : group객체.agg(매핑함수)

In [48]:
def min_max(x):
  """Return the spread of ``x``: its maximum minus its minimum."""
  return x.max() - x.min()

# The subtraction in min_max only works on numbers, so aggregate the numeric
# columns explicitly; passing the string column 'sex' raises a TypeError on
# pandas >= 2.0 (older versions dropped such columns silently).
agg_minmax=grouped[['age','fare','survived']].agg(min_max)
agg_minmax.head()
# The max-min difference of each numeric column per group shows how widely the values are spread.

Unnamed: 0_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,79.08,512.3292,1
Second,69.33,73.5,1
Third,73.58,69.55,1


- 모든 열에 여러 함수를 매핑 : group객체.agg([함수1, 함수2, ... ,함수n])
- 각 열마다 다른 함수를 매핑 : group객체.agg({'열1':함수1, '열2':함수2, ..., '열n':함수n})


In [49]:
agg_all=grouped.agg(['min','max']) # apply both functions to every column; each column gets a 'min' and a 'max' sub-column
agg_all.head()

Unnamed: 0_level_0,age,age,sex,sex,fare,fare,survived,survived
Unnamed: 0_level_1,min,max,min,max,min,max,min,max
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
First,0.92,80.0,female,male,0.0,512.3292,0,1
Second,0.67,70.0,female,male,0.0,73.5,0,1
Third,0.42,74.0,female,male,0.0,69.55,0,1


In [50]:
agg_sep=grouped.agg({'fare':['min','max'],'age':'mean'})
# A dict maps each column to its own function(s): 'fare' gets two functions, 'age' gets one.
# The function names are added under the column names, producing two-level column labels.
agg_sep.head()

Unnamed: 0_level_0,fare,fare,age
Unnamed: 0_level_1,min,max,mean
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
First,0.0,512.3292,38.233441
Second,0.0,73.5,29.87763
Third,0.0,69.55,25.14062


- 그룹 연산 데이터 변환 : group객체.transform(매핑함수)
- 그룹 연산의 결과를 원본 데이터 프레임과 같은 형태로 변형하여 정리

![표준점수](https://upload.wikimedia.org/wikipedia/commons/thumb/b/bb/Normal_distribution_and_scales.gif/1024px-Normal_distribution_and_scales.gif)

In [51]:
age_mean=grouped.age.mean() # mean age of each group
print(age_mean,'\n')
age_std=grouped.age.std() # standard deviation of age within each group
print(age_std,'\n')
# Manual z-score: subtract the group's own mean from each age and divide by the group's own standard deviation.
for key, group in grouped.age:
  group_zscore=(group-age_mean.loc[key])/age_std.loc[key]
  print('* origin :',key)
  print(group_zscore.head(3))

class
First     38.233441
Second    29.877630
Third     25.140620
Name: age, dtype: float64 

class
First     14.802856
Second    14.001077
Third     12.495398
Name: age, dtype: float64 

* origin : First
1   -0.015770
3   -0.218434
6    1.065103
Name: age, dtype: float64
* origin : Second
9    -1.134029
15    1.794317
17         NaN
Name: age, dtype: float64
* origin : Third
0   -0.251342
2    0.068776
4    0.789041
Name: age, dtype: float64


In [52]:
def z_score(x):
    """Standardize ``x``: subtract its mean, then divide by its standard deviation."""
    centered = x - x.mean()
    return centered / x.std()


# transform() runs z_score on each group's ages and returns the results in the
# original row order, so the output has one value per input row.
age_zscore = grouped.age.transform(z_score)
print(age_zscore.loc[[1, 9, 0]], '\n')
print(len(age_zscore), '\n')
print(age_zscore.loc[0:9], '\n')
print(type(age_zscore))

1   -0.015770
9   -1.134029
0   -0.251342
Name: age, dtype: float64 

891 

0   -0.251342
1   -0.015770
2    0.068776
3   -0.218434
4    0.789041
5         NaN
6    1.065103
7   -1.851931
8    0.148805
9   -1.134029
Name: age, dtype: float64 

<class 'pandas.core.series.Series'>


In [53]:
grouped_filter=grouped.filter(lambda x: len(x) >=200) # keep the rows of groups that have at least 200 members
print(grouped_filter.head(),'\n')
print(type(grouped_filter))

    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0 

<class 'pandas.core.frame.DataFrame'>


In [54]:
age_filter=grouped.filter(lambda x: x.age.mean() < 30) # keep the rows of groups whose mean age is below 30
print(age_filter.tail(),'\n')
print(type(age_filter))

      age     sex   class    fare  survived
884  25.0    male   Third   7.050         0
885  39.0  female   Third  29.125         0
886  27.0    male  Second  13.000         0
888   NaN  female   Third  23.450         0
890  32.0    male   Third   7.750         0 

<class 'pandas.core.frame.DataFrame'>


In [55]:
agg_grouped=grouped.apply(lambda x: x.describe()) # run describe() on each group; results are stacked under the group key
agg_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
First,count,186.0,216.0,216.0
First,mean,38.233441,84.154687,0.62963
First,std,14.802856,78.380373,0.484026
First,min,0.92,0.0,0.0
First,25%,27.0,30.92395,0.0
First,50%,37.0,60.2875,1.0
First,75%,49.0,93.5,1.0
First,max,80.0,512.3292,1.0
Second,count,173.0,184.0,184.0
Second,mean,29.87763,20.662183,0.472826


In [56]:
# NOTE(review): z_score is defined identically in an earlier cell; this definition simply rebinds the name.
def z_score(x):
  """Standardize ``x``: subtract its mean and divide by its standard deviation."""
  return (x-x.mean())/x.std()

age_zscore=grouped.age.apply(z_score)
print(age_zscore.head())    # each value is the age expressed as (age - group mean) / group std, computed within the passenger's own group
print(df['age'].head())   # the raw ages, for comparison

0   -0.251342
1   -0.015770
2    0.068776
3   -0.218434
4    0.789041
Name: age, dtype: float64
0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64


In [57]:
# Filtering: print only the groups whose mean age is below 30.

# apply() calls the lambda once per group and returns one boolean per group key.
age_filter=grouped.apply(lambda x: x.age.mean() <30)
print(age_filter,'\n')

# Walk the (group key, flag) pairs directly and test the flag's truthiness,
# instead of looking each label up again and comparing it with `== True`.
for x, keep in age_filter.items():
  if keep:
    age_filter_df=grouped.get_group(x)
    print(age_filter_df.head())

class
First     False
Second     True
Third      True
dtype: bool 

     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
17   NaN    male  Second  13.0000         1
20  35.0    male  Second  26.0000         0
21  34.0    male  Second  13.0000         1
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
2  26.0  female  Third   7.9250         1
4  35.0    male  Third   8.0500         0
5   NaN    male  Third   8.4583         0
7   2.0    male  Third  21.0750         0


In [58]:
grouped=df.groupby(['class','sex']) # NOTE: rebinds `grouped`, which the earlier cells used for the single-key grouping

gdf=grouped.mean()
print(gdf,'\n') # gdf holds the mean age, fare and survived value for each (class, sex) combination
print(type(gdf),'\n') # a DataFrame
print()
print(gdf.loc['First'],'\n') # all rows whose first index level is 'First'
print(gdf.loc['First','female'],'\n') # the single row for ('First', 'female'), returned as a Series
print(gdf.xs('male',level='sex')) # cross-section on the 'sex' level: the 'male' rows of every class, with all mean columns (not only fare)

                     age        fare  survived
class  sex                                    
First  female  34.611765  106.125798  0.968085
       male    41.281386   67.226127  0.368852
Second female  28.722973   21.970121  0.921053
       male    30.740707   19.741782  0.157407
Third  female  21.750000   16.118810  0.500000
       male    26.507589   12.661633  0.135447 

<class 'pandas.core.frame.DataFrame'> 


              age        fare  survived
sex                                    
female  34.611765  106.125798  0.968085
male    41.281386   67.226127  0.368852 

age          34.611765
fare        106.125798
survived      0.968085
Name: (First, female), dtype: float64 

              age       fare  survived
class                                 
First   41.281386  67.226127  0.368852
Second  30.740707  19.741782  0.157407
Third   26.507589  12.661633  0.135447


In [59]:
import pandas as pd
import seaborn as sns
# Display options: at most 10 columns, column width capped at 20.
pd.set_option('display.max_columns',10)
pd.set_option('display.max_colwidth',20)
# Reload titanic and keep the five columns used by the pivot-table examples.
titanic=sns.load_dataset('titanic')
df=titanic.loc[:,['age','sex','class','fare','survived']]
print(df.head())

    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0


In [60]:
# Rows: class, columns: sex, cell value: mean of 'age'.
pdf1=pd.pivot_table(df,index='class',columns='sex',values='age',aggfunc='mean')
pdf1.head()

sex,female,male
class,Unnamed: 1_level_1,Unnamed: 2_level_1
First,34.611765,41.281386
Second,28.722973,30.740707
Third,21.75,26.507589


In [61]:
# Two aggregations of 'survived' (mean and sum); the function name becomes the top column level.
pdf2=pd.pivot_table(df,index='class',columns='sex',values='survived',aggfunc=['mean','sum'])
pdf2.head()

Unnamed: 0_level_0,mean,mean,sum,sum
sex,female,male,female,male
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
First,0.968085,0.368852,91,45
Second,0.921053,0.157407,70,17
Third,0.5,0.135447,72,47


In [62]:
# Two-level row index (class, sex) and three-level columns (function, value column, survived).
pdf3=pd.pivot_table(df,index=['class','sex'],columns='survived',values=['age','fare'],aggfunc=['mean','max'])
pdf3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean,mean,mean,max,max,max,max
Unnamed: 0_level_1,Unnamed: 1_level_1,age,age,fare,fare,age,age,fare,fare
Unnamed: 0_level_2,survived,0,1,0,1,0,1,0,1
class,sex,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
First,female,25.666667,34.939024,110.604167,105.978159,50.0,63.0,151.55,512.3292
First,male,44.581967,36.248,62.89491,74.63732,71.0,80.0,263.0,512.3292
Second,female,36.0,28.080882,18.25,22.288989,57.0,55.0,26.0,65.0
Second,male,33.369048,16.022,19.488965,21.0951,70.0,62.0,73.5,39.0
Third,female,23.818182,19.329787,19.773093,12.464526,48.0,63.0,69.55,31.3875


In [63]:
pdf3.xs('First') # rows whose first index level is 'First'; the remaining index is 'sex'

Unnamed: 0_level_0,mean,mean,mean,mean,max,max,max,max
Unnamed: 0_level_1,age,age,fare,fare,age,age,fare,fare
survived,0,1,0,1,0,1,0,1
sex,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
female,25.666667,34.939024,110.604167,105.978159,50.0,63.0,151.55,512.3292
male,44.581967,36.248,62.89491,74.63732,71.0,80.0,263.0,512.3292


In [64]:
pdf3.xs(('First','female')) # the single row for ('First', 'female'), returned as a Series

            survived
mean  age   0            25.666667
            1            34.939024
      fare  0           110.604167
            1           105.978159
max   age   0            50.000000
            1            63.000000
      fare  0           151.550000
            1           512.329200
Name: (First, female), dtype: float64

In [65]:
pdf3.xs('male',level='sex') # select on the 'sex' index level: the 'male' row of every class

Unnamed: 0_level_0,mean,mean,mean,mean,max,max,max,max
Unnamed: 0_level_1,age,age,fare,fare,age,age,fare,fare
survived,0,1,0,1,0,1,0,1
class,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
First,44.581967,36.248,62.89491,74.63732,71.0,80.0,263.0,512.3292
Second,33.369048,16.022,19.488965,21.0951,70.0,62.0,73.5,39.0
Third,27.255814,22.274211,12.204469,15.579696,74.0,45.0,69.55,56.4958


In [66]:
pdf3.xs(('Second','male'),level=[0,1]) # key given per index level position; the result keeps both index levels

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean,mean,mean,max,max,max,max
Unnamed: 0_level_1,Unnamed: 1_level_1,age,age,fare,fare,age,age,fare,fare
Unnamed: 0_level_2,survived,0,1,0,1,0,1,0,1
class,sex,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
Second,male,33.369048,16.022,19.488965,21.0951,70.0,62.0,73.5,39.0


In [67]:
pdf3.xs('mean',axis=1) # axis=1 selects on the column labels: everything under the top-level label 'mean'

Unnamed: 0_level_0,Unnamed: 1_level_0,age,age,fare,fare
Unnamed: 0_level_1,survived,0,1,0,1
class,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
First,female,25.666667,34.939024,110.604167,105.978159
First,male,44.581967,36.248,62.89491,74.63732
Second,female,36.0,28.080882,18.25,22.288989
Second,male,33.369048,16.022,19.488965,21.0951
Third,female,23.818182,19.329787,19.773093,12.464526
Third,male,27.255814,22.274211,12.204469,15.579696


In [68]:
pdf3.xs(('mean','age'),axis=1) # columns under ('mean', 'age'): mean age split by the survived value

Unnamed: 0_level_0,survived,0,1
class,sex,Unnamed: 2_level_1,Unnamed: 3_level_1
First,female,25.666667,34.939024
First,male,44.581967,36.248
Second,female,36.0,28.080882
Second,male,33.369048,16.022
Third,female,23.818182,19.329787
Third,male,27.255814,22.274211


In [69]:
pdf3.xs(1,level='survived',axis=1) # columns whose 'survived' level equals 1

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean,max,max
Unnamed: 0_level_1,Unnamed: 1_level_1,age,fare,age,fare
class,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
First,female,34.939024,105.978159,63.0,512.3292
First,male,36.248,74.63732,80.0,512.3292
Second,female,28.080882,22.288989,55.0,65.0
Second,male,16.022,21.0951,62.0,39.0
Third,female,19.329787,12.464526,63.0,31.3875
Third,male,22.274211,15.579696,45.0,56.4958


In [70]:
pdf3.xs(('max','fare',0),level=[0,1,2],axis=1) # one key per column level: the 'max' of 'fare' where survived is 0

Unnamed: 0_level_0,Unnamed: 1_level_0,max
Unnamed: 0_level_1,Unnamed: 1_level_1,fare
Unnamed: 0_level_2,survived,0
class,sex,Unnamed: 2_level_3
First,female,151.55
First,male,263.0
Second,female,26.0
Second,male,73.5
Third,female,69.55
Third,male,69.55
