# Pandas #2
- 학습목표
    - 정제되지 않은 데이터를 여러 형태로 변환
    - 여러 기능 실습
    - 데이터 전처리 하는 방법
    
    
- 핵심키워드
    - pandas
    - groupby
    - pivot_table
    - Hierarchical index
    - aggregation
    - transformation
    - merge & concat
    - join
    - DB persistence

In [1]:
import pandas as pd
import numpy as np

# Groupby
- SQL groupby 명령어와 같음
- split => apply => combine 과정을 거쳐 연산함
- `df.groupby('기준컬럼')['적용받는컬럼'].agg(['적용연산'])`

<center><img src="https://image.slidesharecdn.com/slides-151008060416-lva1-app6892/95/pandas-powerful-data-analysis-tools-for-python-19-638.jpg?cb=1444284343" style="height: 80%; width: 80%"></center>

In [11]:
# Toy frame: nine values in a single column (named 0), labelled with the
# repeating keys A/B/C so that every key owns three rows.
df = pd.DataFrame(
    [[0], [5], [10], [5], [10], [15], [10], [15], [20]],
    index=['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
)
print(df, end='\n\n')

# reset_index() moves the labels into a regular column called 'index';
# grouping on it and aggregating column 0 yields the per-key sum and mean.
(
    df.reset_index()
    .groupby(['index'])[0]
    .agg(['sum', 'mean'])
)

    0
A   0
B   5
C  10
A   5
B  10
C  15
A  10
B  15
C  20



Unnamed: 0_level_0,sum,mean
index,Unnamed: 1_level_1,Unnamed: 2_level_1
A,15,5
B,30,10
C,45,15


In [12]:
# Load the wages data set from the working directory and preview the
# first five rows (the default for head()).
df = pd.read_csv(filepath_or_buffer='wages.csv')
df.head(n=5)

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,male,white,16,49
1,96396.988643,66.23,female,white,16,62
2,48710.666947,63.77,female,white,16,33
3,80478.096153,63.22,female,other,16,95
4,82089.345498,63.08,female,white,17,43


In [25]:
# Mean of 'earn' for every (sex, race) combination; grouping on two keys
# produces a Series with a two-level index.
(
    df
    .groupby(['sex', 'race'])['earn']
    .mean()
)

sex     race    
female  black       26413.283253
        hispanic    21217.352092
        other       34164.346197
        white       23948.241172
male    black       31778.720282
        hispanic    31818.390677
        other       29189.706266
        white       48951.731450
Name: earn, dtype: float64

## Hierarchical index - unstack()
- Group으로 묶여진 데이터를 matrix 형태로 전환해줌
- Series 자료형에서 사용가능 (multi index)

In [37]:
# Keep the grouped means (indexed by sex, then race) under a name so the
# following cells can reshape them.
df_unstack = (
    df.groupby(['sex', 'race'])['earn']
    .mean()
)
df_unstack

sex     race    
female  black       26413.283253
        hispanic    21217.352092
        other       34164.346197
        white       23948.241172
male    black       31778.720282
        hispanic    31818.390677
        other       29189.706266
        white       48951.731450
Name: earn, dtype: float64

In [38]:
# Pivot the innermost index level (level=-1, the default) into columns,
# turning the two-level Series into a 2-D table.
df_unstack.unstack(level=-1)

race,black,hispanic,other,white
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,26413.283253,21217.352092,34164.346197,23948.241172
male,31778.720282,31818.390677,29189.706266,48951.73145


In [35]:
# Write the grouped means to an Excel workbook. The sheet name is passed by
# keyword: pandas 2.x deprecates positional arguments after the file path
# and pandas 3.0 makes them keyword-only.
df_unstack.to_excel('test.xlsx', sheet_name='성별&인종에 따른 수입평균')

## Hierarchical index - swaplevel()
- Multi Index의 순서를 바꿔준다.
- index level을 변경할 수 있음

In [55]:
# Print the original (sex, race) ordering followed by one blank line.
print(df_unstack, end='\n\n')

# Swap the two index levels so the order becomes (race, sex), then sort the
# index so rows that share a race are listed together.
(
    df_unstack
    .swaplevel()
    .sort_index()
)

sex     race    
female  black       26413.283253
        hispanic    21217.352092
        other       34164.346197
        white       23948.241172
male    black       31778.720282
        hispanic    31818.390677
        other       29189.706266
        white       48951.731450
Name: earn, dtype: float64



race      sex   
black     female    26413.283253
          male      31778.720282
hispanic  female    21217.352092
          male      31818.390677
other     female    34164.346197
          male      29189.706266
white     female    23948.241172
          male      48951.731450
Name: earn, dtype: float64

## Hierarchical Index - Operations
- index level을 기준으로 기본 연산 수행 가능

In [60]:
# Same (race, sex) view as before: swap the two index levels (the default
# i=-2, j=-1 pair) and sort the resulting index.
df_unstack.swaplevel(i=-2, j=-1).sort_index()

race      sex   
black     female    26413.283253
          male      31778.720282
hispanic  female    21217.352092
          male      31818.390677
other     female    34164.346197
          male      29189.706266
white     female    23948.241172
          male      48951.731450
Name: earn, dtype: float64

In [58]:
# Sum over the first index level (race after the swap).
# Series.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping by the index level is the supported equivalent.
df_unstack.swaplevel().sort_index().groupby(level=0).sum()

race
black       58192.003536
hispanic    53035.742769
other       63354.052463
white       72899.972622
Name: earn, dtype: float64

In [59]:
# Sum over the second index level (sex after the swap).
# Series.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping by the index level is the supported equivalent.
df_unstack.swaplevel().sort_index().groupby(level=1).sum()

sex
female    105743.222714
male      141738.548676
Name: earn, dtype: float64

# Groupby #2