# Topics: 
    
    
1. Tidying variable values as column names with stack
2. Tidying variable values as column names with melt
3. Stacking multiple groups of variables simultaneously
4. Inverting stacked data
5. Unstacking after a groupby aggregation
6. Replicating pivot_table with a groupby aggregation
7. Renaming axis levels for easy reshaping
8. Tidying when multiple variables are stored as column names
9. Tidying when multiple variables are stored as column values
10. Tidying when two or more values are stored in the same cell
11. Tidying when variables are stored in column names and values
12. Tidying when multiple observational units are stored in the same table

In [13]:
import pandas as pd 
import numpy as np

### 1. Tidying variable values as column names with stack

In [14]:
state_fruit = pd.read_csv('data/state_fruit2.csv')
state_fruit

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


In [15]:
state_fruit.stack()

0  State       Texas
   Apple          12
   Orange         10
   Banana         40
1  State     Arizona
   Apple           9
   Orange          7
   Banana         12
2  State     Florida
   Apple           0
   Orange         14
   Banana        190
dtype: object

In [16]:
state_fruit_tidy = state_fruit.stack().reset_index()
state_fruit_tidy

Unnamed: 0,level_0,level_1,0
0,0,State,Texas
1,0,Apple,12
2,0,Orange,10
3,0,Banana,40
4,1,State,Arizona
5,1,Apple,9
6,1,Orange,7
7,1,Banana,12
8,2,State,Florida
9,2,Apple,0


In [17]:
state_fruit_tidy.columns = ['state', 'fruit', 'weight']
state_fruit_tidy

Unnamed: 0,state,fruit,weight
0,0,State,Texas
1,0,Apple,12
2,0,Orange,10
3,0,Banana,40
4,1,State,Arizona
5,1,Apple,9
6,1,Orange,7
7,1,Banana,12
8,2,State,Florida
9,2,Apple,0


In [18]:
state_fruit.stack().rename_axis(['state', 'fruit'])

state  fruit 
0      State       Texas
       Apple          12
       Orange         10
       Banana         40
1      State     Arizona
       Apple           9
       Orange          7
       Banana         12
2      State     Florida
       Apple           0
       Orange         14
       Banana        190
dtype: object

In [19]:
state_fruit.stack().rename_axis(['state', 'fruit']).reset_index(name='weight')

Unnamed: 0,state,fruit,weight
0,0,State,Texas
1,0,Apple,12
2,0,Orange,10
3,0,Banana,40
4,1,State,Arizona
5,1,Apple,9
6,1,Orange,7
7,1,Banana,12
8,2,State,Florida
9,2,Apple,0


In [20]:
#Make STATE as index first and then Stack

In [21]:
state_fruit.stack()

0  State       Texas
   Apple          12
   Orange         10
   Banana         40
1  State     Arizona
   Apple           9
   Orange          7
   Banana         12
2  State     Florida
   Apple           0
   Orange         14
   Banana        190
dtype: object

In [22]:
state_fruit.set_index('State').stack()

State          
Texas    Apple      12
         Orange     10
         Banana     40
Arizona  Apple       9
         Orange      7
         Banana     12
Florida  Apple       0
         Orange     14
         Banana    190
dtype: int64

### 2. Tidying variable values as column names with melt

In [23]:
state_fruit2 = pd.read_csv('data/state_fruit2.csv')
state_fruit2

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


In [24]:
state_fruit2.melt(id_vars=['State'], value_vars=['Apple', 'Orange', 'Banana'])

Unnamed: 0,State,variable,value
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


In [25]:
state_fruit2.melt(id_vars=['State'],
 value_vars=['Apple', 'Orange', 'Banana'],
 var_name='Fruit',
 value_name='Weight')

Unnamed: 0,State,Fruit,Weight
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


In [26]:
state_fruit2.melt()

Unnamed: 0,variable,value
0,State,Texas
1,State,Arizona
2,State,Florida
3,Apple,12
4,Apple,9
5,Apple,0
6,Orange,10
7,Orange,7
8,Orange,14
9,Banana,40


In [27]:
state_fruit2.melt(id_vars='State')

Unnamed: 0,State,variable,value
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


### 3. Stacking multiple groups of variables simultaneously

In [28]:
movie = pd.read_csv('data/movie.csv')
actor = movie[['movie_title', 'actor_1_name',
 'actor_2_name', 'actor_3_name',
 'actor_1_facebook_likes',
 'actor_2_facebook_likes',
 'actor_3_facebook_likes']]
actor.head()

Unnamed: 0,movie_title,actor_1_name,actor_2_name,actor_3_name,actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes
0,Avatar,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,Rob Walker,,131.0,12.0,


In [29]:
def change_col_name(col_name):
    col_name = col_name.replace('_name', '')
    if 'facebook' in col_name:
        fb_idx = col_name.find('facebook')
        col_name = col_name[:5] + col_name[fb_idx - 1:] \
         + col_name[5:fb_idx-1]
    return col_name

In [30]:
actor2 = actor.rename(columns=change_col_name)
actor2.head()

Unnamed: 0,movie_title,actor_1,actor_2,actor_3,actor_facebook_likes_1,actor_facebook_likes_2,actor_facebook_likes_3
0,Avatar,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,Rob Walker,,131.0,12.0,


In [38]:
stubs = ['actor', 'actor_facebook_likes']
actor2_tidy = pd.wide_to_long(actor2,
 stubnames=stubs,
 i=['movie_title'],
 j='actor_num',
 sep='_')
actor2_tidy.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,actor,actor_facebook_likes
movie_title,actor_num,Unnamed: 2_level_1,Unnamed: 3_level_1
Avatar,1,CCH Pounder,1000.0
Pirates of the Caribbean: At World's End,1,Johnny Depp,40000.0
Spectre,1,Christoph Waltz,11000.0
The Dark Knight Rises,1,Tom Hardy,27000.0
Star Wars: Episode VII - The Force Awakens,1,Doug Walker,131.0


In [35]:
len(actor2_tidy)

14748