In [1]:
import numpy as np
import pandas as pd

In [2]:
# Reshaping (pivoting) rearranges tabular data. The two core operations are:
#   stack   -- pivots from the columns of the data into the rows
#   unstack -- pivots from the rows into the columns
# Start from a plain 1-D array of six consecutive integers.
np.arange(0, 6)

array([0, 1, 2, 3, 4, 5])

In [3]:
# Fold the six values into a 2 x 3 grid (two rows, three columns).
np.arange(0, 6).reshape(2, 3)

array([[0, 1, 2],
       [3, 4, 5]])

In [4]:
# Wrap the 2 x 3 array in a DataFrame; both axes get default integer labels.
pd.DataFrame(data=np.arange(0, 6).reshape(2, 3))

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5


In [5]:
# Same frame, now with explicit row labels supplied through a pd.Index.
pd.DataFrame(
    data=np.arange(0, 6).reshape(2, 3),
    index=pd.Index(["Ohio", "Colorado"]),
)

Unnamed: 0,0,1,2
Ohio,0,1,2
Colorado,3,4,5


In [6]:
# Label both axes: states along the rows, number words along the columns.
pd.DataFrame(
    data=np.arange(0, 6).reshape(2, 3),
    index=pd.Index(["Ohio", "Colorado"]),
    columns=pd.Index(["one", "two", "three"]),
)

Unnamed: 0,one,two,three
Ohio,0,1,2
Colorado,3,4,5


In [7]:
# Name both axes ('state' for the rows, 'number' for the columns) so the
# levels can later be referred to by name when stacking and unstacking.
data = pd.DataFrame(
    data=np.arange(0, 6).reshape(2, 3),
    index=pd.Index(["Ohio", "Colorado"], name="state"),
    columns=pd.Index(["one", "two", "three"], name="number"),
)

In [8]:
# Display the frame: the row index is named 'state', the column index 'number'.
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [9]:
# Use the stack method to pivot the columns into the rows; for this frame the
# result is a Series indexed by (state, number).
result = data.stack()

In [10]:
# Display the stacked Series.
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int64

In [11]:
# From a hierarchically indexed Series, unstack rearranges the data back into
# a DataFrame.
result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [12]:
# By default unstack pivots the innermost level. Pass a level number to pick a
# different one -- here level 0 ('state'), the outermost.
result.unstack(level=0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [13]:
# A level can also be selected by its name instead of its position.
result.unstack(level='state')

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [14]:
# Reproduce the default behaviour explicitly: level 1 ('number') is the
# innermost level of the index.
result.unstack(level=1) # same as result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [15]:
# The innermost level again, this time selected by name.
result.unstack(level='number') # same as level=1

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [16]:
#unstacking and missing data
#if, while unstacking, we find data not present in each subgroup,
#then pandas will introduce missing data, as in the following example
# First Series for the missing-data demonstration: labels 'a' through 'd',
# stored with the nullable Int64 dtype.
s1 = pd.Series(data=[0, 1, 2, 3], index=list("abcd"), dtype="Int64")

In [17]:
# Second Series: labels 'c' through 'e', also with the nullable Int64 dtype.
s2 = pd.Series(data=[4, 5, 6], index=list("cde"), dtype="Int64")

In [18]:
# Display s1.
s1

a    0
b    1
c    2
d    3
dtype: Int64

In [19]:
# Display s2.
s2

c    4
d    5
e    6
dtype: Int64

In [20]:
# Concatenate the two Series end to end; the mapping keys ('one', 'zero')
# become the outer level of the resulting hierarchical index.
data2 = pd.concat({"one": s1, "zero": s2})

In [21]:
# Display the concatenated Series; the concat keys form the outer index level.
data2

one   a    0
      b    1
      c    2
      d    3
zero  c    4
      d    5
      e    6
dtype: Int64

In [22]:
# Selecting an outer-level key returns the inner Series for that group.
data2['one']

a    0
b    1
c    2
d    3
dtype: Int64

In [23]:
# Positional access: the first element of the 'zero' group.
data2['zero'].iloc[0]

4

In [24]:
# Label-based access within the 'zero' group.
data2['zero'].loc['c']

4

In [25]:
# Label slice from 'c' through 'e'; the recorded output includes the end label.
data2['zero'].loc['c':'e']

c    4
d    5
e    6
dtype: Int64

In [26]:
# pg. 272
# Unstacking introduces missing values wherever a label is absent from a group
# (in the recorded output: 'e' under 'one', 'a' and 'b' under 'zero').
# Stacking filters out missing data by default, so the operation is more
# easily invertible.
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2,3,
zero,,,4,5,6.0


In [27]:
# Round trip: in the recorded output, stacking drops the missing entries and
# gives back the same values as data2.
data2.unstack().stack()

one   a    0
      b    1
      c    2
      d    3
zero  c    4
      d    5
      e    6
dtype: Int64

In [28]:
# Pass dropna=False to keep the missing entries when stacking.
# NOTE(review): pandas 2.1 added a new stack implementation (future_stack=True)
# that does not accept dropna -- confirm against the pandas version in use.
data2.unstack().stack(dropna=False)

one   a       0
      b       1
      c       2
      d       3
      e    <NA>
zero  a    <NA>
      b    <NA>
      c       4
      d       5
      e       6
dtype: Int64

In [29]:
# When a DataFrame is unstacked, the level that was unstacked becomes the
# lowest level of the resulting columns. To show this, first build a
# two-column frame from the stacked Series.
pd.DataFrame(dict(left=result, right=result + 5))

Unnamed: 0_level_0,Unnamed: 1_level_0,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [30]:
# Same frame, with the column labels given explicitly as a pd.Index.
pd.DataFrame(
    dict(left=result, right=result + 5),
    columns=pd.Index(["left", "right"]),
)

Unnamed: 0_level_0,Unnamed: 1_level_0,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [31]:
# Name the column index 'side' so that level can be referred to by name later.
df = pd.DataFrame(
    dict(left=result, right=result + 5),
    columns=pd.Index(["left", "right"], name="side"),
)

In [32]:
# Display the frame; its column index is named 'side'.
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [33]:
# Unstack the 'state' row level: it becomes the lowest column level, beneath
# 'side'.
df.unstack(level='state')

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [36]:
# As with unstack, stack accepts the name of the level to pivot; here the
# 'side' column level moves back into the rows.
# In the recorded output the remaining 'state' columns come back in sorted
# order (Colorado, Ohio).
df.unstack(level='state').stack(level='side')

Unnamed: 0_level_0,state,Colorado,Ohio
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7


In [38]:
# Same idea, but unstacking 'number' instead: 'state' stays in the rows and
# 'side' is stacked back beside it.
# In the recorded output the 'number' columns come back in sorted order
# (one, three, two).
df.unstack(level='number').stack(level='side')

Unnamed: 0_level_0,number,one,three,two
state,side,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ohio,left,0,2,1
Ohio,right,5,7,6
Colorado,left,3,5,4
Colorado,right,8,10,9
