In [1]:
import numpy as np
import pandas as pd

In [2]:
#load the macrodata sample CSV; the path is relative to the current working directory
data = pd.read_csv('examples/macrodata.csv')

In [3]:
#keep only the two time fields and the three series used in the rest of the notebook
data = data[['year', 'quarter', 'realgdp', 'infl', 'unemp']]

In [4]:
data

Unnamed: 0,year,quarter,realgdp,infl,unemp
0,1959,1,2710.349,0.00,5.8
1,1959,2,2778.801,2.34,5.1
2,1959,3,2775.488,2.74,5.3
3,1959,4,2785.204,0.27,5.6
4,1960,1,2847.699,2.31,5.2
...,...,...,...,...,...
198,2008,3,13324.600,-3.16,6.0
199,2008,4,13141.920,-8.79,6.9
200,2009,1,12925.410,0.94,8.1
201,2009,2,12901.504,3.37,9.2


In [5]:
data.head()

Unnamed: 0,year,quarter,realgdp,infl,unemp
0,1959,1,2710.349,0.0,5.8
1,1959,2,2778.801,2.34,5.1
2,1959,3,2775.488,2.74,5.3
3,1959,4,2785.204,0.27,5.6
4,1960,1,2847.699,2.31,5.2


In [6]:
#first, combine the year and quarter columns into a quarterly PeriodIndex
#pop returns each column and removes it from data in the same step
#the next cell converts this index to timestamps at the start of each quarter
#NOTE(review): pandas 2.2+ deprecates building a PeriodIndex from field keywords
#in favour of PeriodIndex.from_fields -- confirm against the installed version
year_values = data.pop('year')
quarter_values = data.pop('quarter')
periods = pd.PeriodIndex(year=year_values, quarter=quarter_values, name='date')

In [7]:
periods

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', name='date', length=203)

In [8]:
#to_timestamp defaults to how='start', so each label is the first day of its quarter
data.index = periods.to_timestamp('D')

In [9]:
data.head()

Unnamed: 0_level_0,realgdp,infl,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,2710.349,0.0,5.8
1959-04-01,2778.801,2.34,5.1
1959-07-01,2775.488,2.74,5.3
1959-10-01,2785.204,0.27,5.6
1960-01-01,2847.699,2.31,5.2


In [10]:
#note: the pop method used earlier on year and quarter returns a column and,
#at the same time, deletes that column from the DataFrame
#next, select the subset of items; the columns index is named "item" in the next cell
item_columns = ['realgdp', 'infl', 'unemp']
data = data.reindex(columns=item_columns)

In [11]:
data.columns.name = 'item'

In [12]:
data.head()

item,realgdp,infl,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,2710.349,0.0,5.8
1959-04-01,2778.801,2.34,5.1
1959-07-01,2775.488,2.74,5.3
1959-10-01,2785.204,0.27,5.6
1960-01-01,2847.699,2.31,5.2


In [13]:
#lastly, reshape with stack and turn the new index levels into columns with reset_index
#then give the column containing the data values the name "value"
data.stack()

date        item   
1959-01-01  realgdp     2710.349
            infl           0.000
            unemp          5.800
1959-04-01  realgdp     2778.801
            infl           2.340
                         ...    
2009-04-01  infl           3.370
            unemp          9.200
2009-07-01  realgdp    12990.341
            infl           3.560
            unemp          9.600
Length: 609, dtype: float64

In [14]:
data.stack().reset_index()

Unnamed: 0,date,item,0
0,1959-01-01,realgdp,2710.349
1,1959-01-01,infl,0.000
2,1959-01-01,unemp,5.800
3,1959-04-01,realgdp,2778.801
4,1959-04-01,infl,2.340
...,...,...,...
604,2009-04-01,infl,3.370
605,2009-04-01,unemp,9.200
606,2009-07-01,realgdp,12990.341
607,2009-07-01,infl,3.560


In [41]:
data.stack().reset_index().rename(columns={0: 'value'})

Unnamed: 0,date,item,value
0,1959-01-01,realgdp,2710.349
1,1959-01-01,infl,0.000
2,1959-01-01,unemp,5.800
3,1959-04-01,realgdp,2778.801
4,1959-04-01,infl,2.340
...,...,...,...
604,2009-04-01,infl,3.370
605,2009-04-01,unemp,9.200
606,2009-07-01,realgdp,12990.341
607,2009-07-01,infl,3.560


In [42]:
#long format: one row per (date, item) pair
#stack leaves the values column labelled 0, so it is renamed to 'value'
long_data = data.stack().reset_index()
long_data = long_data.rename(columns={0: 'value'})

In [43]:
long_data[:10]

Unnamed: 0,date,item,value
0,1959-01-01,realgdp,2710.349
1,1959-01-01,infl,0.0
2,1959-01-01,unemp,5.8
3,1959-04-01,realgdp,2778.801
4,1959-04-01,infl,2.34
5,1959-04-01,unemp,5.1
6,1959-07-01,realgdp,2775.488
7,1959-07-01,infl,2.74
8,1959-07-01,unemp,5.3
9,1959-10-01,realgdp,2785.204


In [49]:
#pivoted0 = long_data.pivot(index='item', columns='value',
#                          values='date')
#pivoted0

In [18]:
#back to wide format: one row per date, one column per item, filled from 'value'
pivoted = long_data.pivot(values='value', index='date', columns='item')

In [19]:
pivoted.head()

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,0.0,2710.349,5.8
1959-04-01,2.34,2778.801,5.1
1959-07-01,2.74,2775.488,5.3
1959-10-01,0.27,2785.204,5.6
1960-01-01,2.31,2847.699,5.2


In [20]:
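#in the earlier pivot call, the first two arguments are the columns to be used
#as the row index and the column index, respectively
#the optional third argument names the value column used to fill the DataFrame
#to see what happens without it, add a second value column first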
#seed a dedicated generator so the illustrative 'value2' column (and every output
#derived from it) is identical on each Restart & Run All
rng = np.random.default_rng(12345)
long_data['value2'] = rng.standard_normal(len(long_data))

In [21]:
long_data[:10]

Unnamed: 0,date,item,value,value2
0,1959-01-01,realgdp,2710.349,0.41255
1,1959-01-01,infl,0.0,-0.488508
2,1959-01-01,unemp,5.8,-1.379543
3,1959-04-01,realgdp,2778.801,-0.520984
4,1959-04-01,infl,2.34,-1.434506
5,1959-04-01,unemp,5.1,0.099148
6,1959-07-01,realgdp,2775.488,1.44577
7,1959-07-01,infl,2.74,-1.000864
8,1959-07-01,unemp,5.3,0.662867
9,1959-10-01,realgdp,2785.204,1.237628


In [22]:
#by omitting the last argument (values), we get a DataFrame with hierarchical columns:
#the outer level holds the value column names, the inner level holds the items
pivoted = long_data.pivot(index='date', columns='item')

In [23]:
pivoted.head()

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-01-01,0.0,2710.349,5.8,-0.488508,0.41255,-1.379543
1959-04-01,2.34,2778.801,5.1,-1.434506,-0.520984,0.099148
1959-07-01,2.74,2775.488,5.3,-1.000864,1.44577,0.662867
1959-10-01,0.27,2785.204,5.6,-0.224166,1.237628,0.235112
1960-01-01,2.31,2847.699,5.2,0.77905,-0.513827,1.019909


In [24]:
pivoted['value'].head()

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,0.0,2710.349,5.8
1959-04-01,2.34,2778.801,5.1
1959-07-01,2.74,2775.488,5.3
1959-10-01,0.27,2785.204,5.6
1960-01-01,2.31,2847.699,5.2


In [25]:
#note that pivot is equivalent to creating a hierarchical index using
#set_index followed by a call to unstack; this cell shows the set_index step alone
long_data.set_index(['date', 'item'])

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value2
date,item,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,realgdp,2710.349,0.412550
1959-01-01,infl,0.000,-0.488508
1959-01-01,unemp,5.800,-1.379543
1959-04-01,realgdp,2778.801,-0.520984
1959-04-01,infl,2.340,-1.434506
...,...,...,...
2009-04-01,infl,3.370,0.681496
2009-04-01,unemp,9.200,-2.170316
2009-07-01,realgdp,12990.341,-1.160741
2009-07-01,infl,3.560,-0.629935


In [26]:
#note that pivot is equivalent to creating a hierarchical index using
#set_index followed by a call to unstack on the 'item' level
long_data.set_index(['date', 'item']).unstack(level='item')

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-01-01,0.00,2710.349,5.8,-0.488508,0.412550,-1.379543
1959-04-01,2.34,2778.801,5.1,-1.434506,-0.520984,0.099148
1959-07-01,2.74,2775.488,5.3,-1.000864,1.445770,0.662867
1959-10-01,0.27,2785.204,5.6,-0.224166,1.237628,0.235112
1960-01-01,2.31,2847.699,5.2,0.779050,-0.513827,1.019909
...,...,...,...,...,...,...
2008-07-01,-3.16,13324.600,6.0,-0.214497,-0.583226,0.005574
2008-10-01,-8.79,13141.920,6.9,0.257813,-1.367470,-0.064770
2009-01-01,0.94,12925.410,8.1,1.058445,0.245423,0.710216
2009-04-01,3.37,12901.504,9.2,0.681496,0.063339,-2.170316


In [27]:
#note that pivot is equivalent to creating a hierarchical index using
#set_index followed by a call to unstack; keep the result for inspection
unstacked = long_data.set_index(['date', 'item']).unstack(level='item')

In [28]:
unstacked

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-01-01,0.00,2710.349,5.8,-0.488508,0.412550,-1.379543
1959-04-01,2.34,2778.801,5.1,-1.434506,-0.520984,0.099148
1959-07-01,2.74,2775.488,5.3,-1.000864,1.445770,0.662867
1959-10-01,0.27,2785.204,5.6,-0.224166,1.237628,0.235112
1960-01-01,2.31,2847.699,5.2,0.779050,-0.513827,1.019909
...,...,...,...,...,...,...
2008-07-01,-3.16,13324.600,6.0,-0.214497,-0.583226,0.005574
2008-10-01,-8.79,13141.920,6.9,0.257813,-1.367470,-0.064770
2009-01-01,0.94,12925.410,8.1,1.058445,0.245423,0.710216
2009-04-01,3.37,12901.504,9.2,0.681496,0.063339,-2.170316


In [29]:
#pivoting wide to long format
df = pd.DataFrame({'key': ['foo', 'bar', 'baz'],
                  'A': [1, 2, 3],
                  'B': [4, 5, 6],
                  'C': [7, 8, 9]})

In [30]:
df

Unnamed: 0,key,A,B,C
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


In [31]:
#the 'key' column can serve as a group indicator, with the other columns as data values
#pd.melt has to be told which columns (if any) are group indicators
#here "key" is the only one, so every remaining column is melted into rows
melted = pd.melt(df, id_vars=['key'])

In [32]:
melted

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9


In [33]:
#pivot undoes the melt: 'key' becomes the row index and each variable a column again
reshaped = melted.pivot(values='value', index='key', columns='variable')

In [34]:
reshaped

variable,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2,5,8
baz,3,6,9
foo,1,4,7


In [35]:
#pivot creates an index from the column used as the row labels.
#we can use reset_index to move this data back into a column
reshaped.reset_index()

variable,key,A,B,C
0,bar,2,5,8
1,baz,3,6,9
2,foo,1,4,7


In [36]:
#we can also specify subset of columns to be value columns
pd.melt(df, id_vars='key', value_vars=['A', 'B'])

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6


In [37]:
#pd.melt can be used without group identifiers
pd.melt(df, value_vars=['A', 'B', 'C'])

Unnamed: 0,variable,value
0,A,1
1,A,2
2,A,3
3,B,4
4,B,5
5,B,6
6,C,7
7,C,8
8,C,9


In [38]:
pd.melt(df, value_vars=['key', 'A', 'B'])

Unnamed: 0,variable,value
0,key,foo
1,key,bar
2,key,baz
3,A,1
4,A,2
5,A,3
6,B,4
7,B,5
8,B,6
