# Pivot Data from Long to Wide Form

In [1]:
import pandas as pd 
import janitor as jn 

In [2]:
# Long-form data: one row per (name, week) observation.
df = pd.DataFrame(
    {
        "name": ["Alice"] * 4 + ["Bob"] * 4 + ["Carla"] * 4,
        "variable": ["wk1", "wk2", "wk3", "wk4"] * 3,
        "value": [5, 9, 20, 22, 7, 11, 17, 33, 6, 13, 39, 40],
    }
)

df

Unnamed: 0,name,variable,value
0,Alice,wk1,5
1,Alice,wk2,9
2,Alice,wk3,20
3,Alice,wk4,22
4,Bob,wk1,7
5,Bob,wk2,11
6,Bob,wk3,17
7,Bob,wk4,33
8,Carla,wk1,6
9,Carla,wk2,13


Reshaping to wide form:

In [3]:
# One row per `name`; each distinct `variable` label becomes its own column.
df.pivot_wider(index="name", names_from="variable", values_from="value")

Unnamed: 0,name,wk1,wk2,wk3,wk4
0,Alice,5,9,20,22
1,Bob,7,11,17,33
2,Carla,6,13,39,40


Pivoting on multiple columns is possible:

In [4]:
# Small frame with two value columns (`n`, `pct`) keyed by `name`.
df = pd.DataFrame(
    {
        "name": [1, 2, 3],
        "n": [10.0, 20.0, 30.0],
        "pct": [0.1, 0.2, 0.3],
    }
)

df

Unnamed: 0,name,n,pct
0,1,10.0,0.1
1,2,20.0,0.2
2,3,30.0,0.3


In [5]:
# `num` is a constant helper column used as the index, so every row lands in
# the same output row; both `n` and `pct` are spread across `name`.
df.assign(num=0).pivot_wider(
    index="num",
    names_from="name",
    values_from=["n", "pct"],
    names_sep="_",
)

Unnamed: 0,num,n_1,n_2,n_3,pct_1,pct_2,pct_3
0,0,10.0,20.0,30.0,0.1,0.2,0.3


You may choose not to flatten the columns by setting `flatten_levels` to `False`:



In [6]:
# Two measurements (`a`, `b`) recorded at each (dep, step) combination.
df = pd.DataFrame(
    {
        "dep": [5.5, 5.5, 6.1, 6.1],
        "step": [1, 2, 1, 2],
        "a": [20, 25, 22, 18],
        "b": [30, 37, 19, 29],
    }
)

df

Unnamed: 0,dep,step,a,b
0,5.5,1,20,30
1,5.5,2,25,37
2,6.1,1,22,19
3,6.1,2,18,29


In [7]:
# Keep the pivoted columns as a MultiIndex instead of flattening them.
df.pivot_wider(index="dep", names_from="step", flatten_levels=False)

Unnamed: 0_level_0,a,a,b,b
step,1,2,1,2
dep,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
5.5,20,25,30,37
6.1,22,18,19,29


The order of the levels can be changed with the `levels_order` parameter, which internally uses pandas' [reorder_levels](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reorder_levels.html):

In [8]:
# Same unflattened pivot, but with the `step` level moved to the top.
df.pivot_wider(
    index="dep",
    names_from="step",
    flatten_levels=False,
    levels_order=["step", None],
)

step,1,2,1,2
Unnamed: 0_level_1,a,a,b,b
dep,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
5.5,20,25,30,37
6.1,22,18,19,29


In [9]:
# Explicitly request flattened (single-level) column names.
df.pivot_wider(index="dep", names_from="step", flatten_levels=True)

Unnamed: 0,dep,a_1,a_2,b_1,b_2
0,5.5,20,25,30,37
1,6.1,22,18,19,29


In [10]:
# Flattened names with the `step` level placed first in each label.
df.pivot_wider(
    index="dep",
    names_from="step",
    flatten_levels=True,
    levels_order=["step", None],
)

Unnamed: 0,dep,1_a,2_a,1_b,2_b
0,5.5,20,25,30,37
1,6.1,22,18,19,29


`names_sep` and `names_glue` come in handy when `names_from` and/or `values_from` contain multiple variables; they are used primarily when the columns are flattened. The default value for `names_sep` is `_`:

In [11]:
# `names_sep` is not passed here, so its default ('_') joins the levels.
df.pivot_wider(
    index="dep",
    names_from="step",
)

Unnamed: 0,dep,a_1,a_2,b_1,b_2
0,5.5,20,25,30,37
1,6.1,22,18,19,29


In [12]:
# An empty separator concatenates the column levels directly.
df.pivot_wider(index="dep", names_from="step", names_sep="")


Unnamed: 0,dep,a1,a2,b1,b2
0,5.5,20,25,30,37
1,6.1,22,18,19,29


With `names_glue` you can glue the individual levels (if MultiIndex) into one (similar to `names_sep`), or you can modify the final columns, as long as it can be passed to `pd.Index.map`:

In [13]:
# Reproduce the effect of `names_sep="_"` by gluing the levels with a callable.
df.pivot_wider(index="dep", names_from="step", names_sep=None, names_glue="_".join)

Unnamed: 0,dep,a_1,a_2,b_1,b_2
0,5.5,20,25,30,37
1,6.1,22,18,19,29


In [14]:
# A custom callable can build labels that a plain separator cannot express.
df.pivot_wider(
    index="dep",
    names_from="step",
    names_sep=None,
    names_glue=lambda col: f"{col[0]}_step{col[1]}",
)

Unnamed: 0,dep,a_step1,a_step2,b_step1,b_step2
0,5.5,20,25,30,37
1,6.1,22,18,19,29


There are scenarios where the column order of the final dataframe is important:

In [15]:
# One row per product sold by each salesman.
df = pd.DataFrame(
    {
        "Salesman": ["Knut", "Knut", "Knut", "Steve"],
        "Height": [6, 6, 6, 5],
        "product": ["bat", "ball", "wand", "pen"],
        "price": [5, 1, 3, 2],
    }
)

df

Unnamed: 0,Salesman,Height,product,price
0,Knut,6,bat,5
1,Knut,6,ball,1
2,Knut,6,wand,3
3,Steve,5,pen,2


In [16]:
# 1-based running counter of rows within each (Salesman, Height) group.
idx = df.groupby(["Salesman", "Height"]).cumcount() + 1

df.assign(idx=idx).pivot_wider(index=["Salesman", "Height"], names_from="idx")

Unnamed: 0,Salesman,Height,product_1,product_2,product_3,price_1,price_2,price_3
0,Knut,6,bat,ball,wand,5.0,1.0,3.0
1,Steve,5,pen,,,2.0,,


To get the columns in a form where `product` alternates with `price`, we can combine `pivot_wider` (or plain `pd.pivot`) with `pd.DataFrame.sort_index` and `janitor.collapse_levels`:

In [17]:
(
    df.assign(idx=idx)
    # Keep the MultiIndex columns so they can be sorted by level below.
    .pivot_wider(
        index=["Salesman", "Height"],
        names_from="idx",
        flatten_levels=False,
    )
    # Sort on the `idx` level only, leaving the other level's order untouched.
    .sort_index(level="idx", axis="columns", sort_remaining=False)
    .collapse_levels()
    .reset_index()
)

Unnamed: 0,Salesman,Height,product_1,price_1,product_2,price_2,product_3,price_3
0,Knut,6,bat,5.0,ball,1.0,wand,3.0
1,Steve,5,pen,2.0,,,,


In [18]:
# Two variables per state, each with an estimate and its error.
df = pd.DataFrame(
    [
        (1, "Alabama", "pop_renter", 1434765, 16736),
        (1, "Alabama", "median_rent", 747, 3),
        (13, "Georgia", "pop_renter", 3592422, 33385),
        (13, "Georgia", "median_rent", 927, 3),
    ],
    columns=["geoid", "name", "variable", "estimate", "error"],
)

df

Unnamed: 0,geoid,name,variable,estimate,error
0,1,Alabama,pop_renter,1434765,16736
1,1,Alabama,median_rent,747,3
2,13,Georgia,pop_renter,3592422,33385
3,13,Georgia,median_rent,927,3


In [19]:
# Spread both value columns across `variable`, with the `variable` label
# placed first in each flattened column name.
df.pivot_wider(
    index=["geoid", "name"],
    names_from="variable",
    values_from=["estimate", "error"],
    levels_order=["variable", None],
)

Unnamed: 0,geoid,name,median_rent_estimate,pop_renter_estimate,median_rent_error,pop_renter_error
0,1,Alabama,747,1434765,3,16736
1,13,Georgia,927,3592422,3,33385


For the reshaping above, we would like to maintain the order in `variable`, where `pop_renter` comes before `median_rent`; this can be achieved by converting the `variable` column to a categorical, before reshaping:

In [20]:
(
    # Convert `variable` to a categorical ordered by appearance before pivoting.
    df.encode_categorical(variable=(None, "appearance"))
    .pivot_wider(
        index=["geoid", "name"],
        names_from="variable",
        values_from=["estimate", "error"],
        levels_order=["variable", None],
    )
)

Unnamed: 0,geoid,name,pop_renter_estimate,median_rent_estimate,pop_renter_error,median_rent_error
0,1,Alabama,1434765,747,16736,3
1,13,Georgia,3592422,927,33385,3
