In [1]:
import pandas as pd
import numpy as np



In [2]:
# One (state, year, population) record per row; the label list and the value
# list used by the following cells are both derived from this table.
records = [
    ('California', 2000, 33871648),
    ('California', 2010, 37254834),
    ('New York', 2000, 17829401),
    ('New York', 2010, 12332123),
    ('Texas', 2000, 59812023),
    ('Texas', 2010, 50123003),
]

# Plain list of (state, year) tuples -- not a MultiIndex at this point.
index = [(state, year) for state, year, _ in records]
population = [count for _, _, count in records]

# Series keyed by the (state, year) tuples.
pop = pd.Series(population, index=index)
pop


(California, 2000)    33871648
(California, 2010)    37254834
(New York, 2000)      17829401
(New York, 2010)      12332123
(Texas, 2000)         59812023
(Texas, 2010)         50123003
dtype: int64

In [3]:
# Convert the list of (state, year) tuples into a two-level MultiIndex.
# Note: this rebinds `index` from a plain list to a MultiIndex.
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [4]:
# Reindex the tuple-keyed Series with the MultiIndex so that it carries a
# hierarchical (state, year) index instead of flat tuple labels.
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37254834
New York    2000    17829401
            2010    12332123
Texas       2000    59812023
            2010    50123003
dtype: int64

In [5]:
# Partial indexing: every outer-level label, inner level equal to 2010.
pop[:, 2010]

California    37254834
New York      12332123
Texas         50123003
dtype: int64

In [6]:
# Pivot the inner index level (the years) into columns: one row per state.
pop_df = pop.unstack()

In [7]:
# Show the unstacked frame: states as rows, years as columns.
pop_df

Unnamed: 0,2000,2010
California,33871648,37254834
New York,17829401,12332123
Texas,59812023,50123003


In [8]:
# stack() reverses unstack(): the year columns move back into the inner index level.
pop_df.stack()

California  2000    33871648
            2010    37254834
New York    2000    17829401
            2010    12332123
Texas       2000    59812023
            2010    50123003
dtype: int64

In [9]:
# Under-18 counts, listed in the same row order as `pop`.
under18 = [9267089, 9284094,
           4687374, 4318033,
           5906301, 6879014]

# Two-column frame: 'total' brings along the index of `pop`, and the plain
# list is attached to those rows positionally.
pop_df = pd.DataFrame({'total': pop, 'under18': under18})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37254834,9284094
New York,2000,17829401,4687374
New York,2010,12332123,4318033
Texas,2000,59812023,5906301
Texas,2010,50123003,6879014


In [10]:
# Fraction of each (state, year) population that is under 18.
under18_counts = pop_df['under18']
total_counts = pop_df['total']
f_u18 = under18_counts / total_counts
f_u18


California  2000    0.273594
            2010    0.249205
New York    2000    0.262901
            2010    0.350145
Texas       2000    0.098748
            2010    0.137243
dtype: float64

In [11]:
# Spread the ratios into a state-by-year table.
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249205
New York,0.262901,0.350145
Texas,0.098748,0.137243


In [12]:
## MultiIndex on the columns

# Row labels first: the Cartesian product of years and visit numbers,
# with the two levels named 'year' and 'visit'.
years = [2013, 2014]
visits = [1, 2]
index = pd.MultiIndex.from_product([years, visits], names=['year', 'visit'])
index

MultiIndex([(2013, 1),
            (2013, 2),
            (2014, 1),
            (2014, 2)],
           names=['year', 'visit'])

In [13]:
# Two-level column labels: every subject paired with every measurement type.
subjects = ['Bob', 'Guido', 'Sue']
measurements = ['HR', 'Temp']
columns = pd.MultiIndex.from_product([subjects, measurements],
                                     names=['subject', 'type'])

In [14]:
# Inspect the column MultiIndex (levels: subject, type).
columns

MultiIndex([(  'Bob',   'HR'),
            (  'Bob', 'Temp'),
            ('Guido',   'HR'),
            ('Guido', 'Temp'),
            (  'Sue',   'HR'),
            (  'Sue', 'Temp')],
           names=['subject', 'type'])

In [15]:
# Mock some data: a 4 x 6 array of standard-normal draws rounded to 1 decimal.
# A locally seeded generator is used so that Restart & Run All reproduces the
# same values; the previous unseeded np.random.randn call produced different
# numbers on every run, so outputs saved from such a run will not match.
rng = np.random.default_rng(0)
data = np.round(rng.standard_normal((4, 6)), 1)
data

array([[ 1.2,  0. , -1.5,  0.6,  1.1, -1.6],
       [ 0.4,  1.1,  1.8, -0. ,  1.2,  1. ],
       [-0.8, -1.5, -0.3,  1.8,  0.3,  0.2],
       [ 1.3, -0. ,  0.7, -1.9, -1.9, -0.2]])

In [16]:
# Multiply every second column (positions 0, 2, 4) by 10.
# This updates `data` in place, so re-running the cell scales those columns again.
data[:, ::2] *= 10

In [17]:
# Check the array after the column scaling.
data 

array([[ 12. ,   0. , -15. ,   0.6,  11. ,  -1.6],
       [  4. ,   1.1,  18. ,  -0. ,  12. ,   1. ],
       [ -8. ,  -1.5,  -3. ,   1.8,   3. ,   0.2],
       [ 13. ,  -0. ,   7. ,  -1.9, -19. ,  -0.2]])

In [18]:
# Add 37 to every element, in place (re-running the cell adds 37 again).
data += 37

In [19]:
# Check the array after the shift.
data

array([[49. , 37. , 22. , 37.6, 48. , 35.4],
       [41. , 38.1, 55. , 37. , 49. , 38. ],
       [29. , 35.5, 34. , 38.8, 40. , 37.2],
       [50. , 37. , 44. , 35.1, 18. , 36.8]])

In [20]:
# Create the DataFrame: rows labelled by (year, visit), columns by (subject, type).

health_data = pd.DataFrame(data=data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,49.0,37.0,22.0,37.6,48.0,35.4
2013,2,41.0,38.1,55.0,37.0,49.0,38.0
2014,1,29.0,35.5,34.0,38.8,40.0,37.2
2014,2,50.0,37.0,44.0,35.1,18.0,36.8


In [21]:
# Indexing with a top-level column label returns the sub-frame of that subject's columns.
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,22.0,37.6
2013,2,55.0,37.0
2014,1,34.0,38.8
2014,2,44.0,35.1


In [22]:
# Redisplay the multi-indexed population Series before the indexing examples.
pop

California  2000    33871648
            2010    37254834
New York    2000    17829401
            2010    12332123
Texas       2000    59812023
            2010    50123003
dtype: int64

In [23]:
# Full key (outer label, inner label) -> a single scalar value.
pop['California', 2000]

33871648

In [24]:
# Partial key (outer level only) -> Series indexed by the remaining level.
pop['California']

2000    33871648
2010    37254834
dtype: int64

In [25]:
# Name the two index levels by assigning to the existing index's `names` attribute.
pop.index.names= ['state', 'year']

In [26]:
# The level names now appear as a header above the index.
pop

state       year
California  2000    33871648
            2010    37254834
New York    2000    17829401
            2010    12332123
Texas       2000    59812023
            2010    50123003
dtype: int64

In [27]:
# Label slice on the outer level; both endpoints are included in the result.
pop.loc['California': 'New York']

state       year
California  2000    33871648
            2010    37254834
New York    2000    17829401
            2010    12332123
dtype: int64

In [28]:
# Empty slice on the outer level, 2000 on the inner level: the year-2000 value of every state.
pop[:, 2000]

state
California    33871648
New York      17829401
Texas         59812023
dtype: int64

In [29]:
# Boolean mask: keep only the entries greater than 22,000,000.
pop[pop > 22000000]

state       year
California  2000    33871648
            2010    37254834
Texas       2000    59812023
            2010    50123003
dtype: int64

In [30]:
# Fancy indexing with a list of outer-level labels.
pop[['California', 'Texas']]

state       year
California  2000    33871648
            2010    37254834
Texas       2000    59812023
            2010    50123003
dtype: int64

In [31]:
# DataFrame with a MultiIndex

health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,49.0,37.0,22.0,37.6,48.0,35.4
2013,2,41.0,38.1,55.0,37.0,49.0,38.0
2014,1,29.0,35.5,34.0,38.8,40.0,37.2
2014,2,50.0,37.0,44.0,35.1,18.0,36.8


In [32]:
# A full (subject, type) column key selects a single column as a Series.
health_data['Guido', 'HR']

year  visit
2013  1        22.0
      2        55.0
2014  1        34.0
      2        44.0
Name: (Guido, HR), dtype: float64

In [33]:
# Position-based selection: first two rows, first two columns.
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,49.0,37.0
2013,2,41.0,38.1


In [34]:
# Position-based selection: first three rows, first three columns.
health_data.iloc[:3, :3]

Unnamed: 0_level_0,subject,Bob,Bob,Guido
Unnamed: 0_level_1,type,HR,Temp,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,49.0,37.0,22.0
2013,2,41.0,38.1,55.0
2014,1,29.0,35.5,34.0


In [35]:
# Label-based selection: all rows, the single column keyed by ('Bob', 'HR').
health_data.loc[:, ('Bob', 'HR')]

year  visit
2013  1        49.0
      2        41.0
2014  1        29.0
      2        50.0
Name: (Bob, HR), dtype: float64

In [36]:
# Confirm that selecting one full column key yields a Series rather than a DataFrame.
type(health_data.loc[:, ('Bob', 'HR')])

pandas.core.series.Series

In [37]:
# Shorthand for pd.IndexSlice, used below to write slices inside per-level keys for .loc.
idx = pd.IndexSlice

In [38]:
# Rows: any year, visit == 1.  Columns: any subject, type == 'HR'.
health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,49.0,22.0,48.0
2014,1,29.0,34.0,40.0


In [40]:
# Skipping MultiIndex re-sorting for now

# Setting and resetting the index
# reset_index turns the index levels (state, year) into ordinary columns;
# `name` labels the column that holds the Series values.
pop_flat = pop.reset_index(name='population')
pop_flat

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37254834
2,New York,2000,17829401
3,New York,2010,12332123
4,Texas,2000,59812023
5,Texas,2010,50123003


In [41]:
# `pop` still has its MultiIndex: reset_index returned a new object.
pop

state       year
California  2000    33871648
            2010    37254834
New York    2000    17829401
            2010    12332123
Texas       2000    59812023
            2010    50123003
dtype: int64

In [42]:
# Inverse operation: build a MultiIndex from the state and year columns.
# The result is only displayed here, not assigned to a name.
pop_flat.set_index(['state', 'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37254834
New York,2000,17829401
New York,2010,12332123
Texas,2000,59812023
Texas,2010,50123003


In [43]:
# Redisplay health_data before aggregating over its index levels.
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,49.0,37.0,22.0,37.6,48.0,35.4
2013,2,41.0,38.1,55.0,37.0,49.0,38.0
2014,1,29.0,35.5,34.0,38.8,40.0,37.2
2014,2,50.0,37.0,44.0,35.1,18.0,36.8


In [44]:
# Average the visits within each year.
# The `level=` argument of DataFrame.mean is deprecated and was removed in
# pandas 2.0; grouping on the index level is the supported equivalent and
# gives the same per-year means.
data_mean = health_data.groupby(level='year').mean()
data_mean

  data_mean = health_data.mean(level='year')


subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,45.0,37.55,38.5,37.3,48.5,36.7
2014,39.5,36.25,39.0,36.95,29.0,37.0


In [45]:
# Average across subjects for each measurement type (column level 'type').
# mean(axis=1, level=...) is deprecated and was removed in pandas 2.0, so
# transpose to turn the column levels into index levels, group on 'type',
# take the mean, and transpose back to the original orientation.
data_mean.T.groupby(level='type').mean().T

  data_mean.mean(axis=1, level='type')


type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,44.0,37.183333
2014,35.833333,36.733333
