# Topics: 
    
    
1. Tidying variable values as column names with stack
2. Tidying variable values as column names with melt
3. Stacking multiple groups of variables simultaneously
4. Inverting stacked data
5. Unstacking after a groupby aggregation
6. Replicating pivot_table with a groupby aggregation
7. Renaming axis levels for easy reshaping
8. Tidying when multiple variables are stored as column names
9. Tidying when multiple variables are stored as column values
10. Tidying when two or more values are stored in the same cell
11. Tidying when variables are stored in column names and values
12. Tidying when multiple observational units are stored in the same table

In [1334]:
import pandas as pd 
import numpy as np

### 1. Tidying variable values as column names with stack

In [1335]:
state_fruit = pd.read_csv('data/state_fruit2.csv')
state_fruit

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


In [1336]:
state_fruit.stack()

0  State       Texas
   Apple          12
   Orange         10
   Banana         40
1  State     Arizona
   Apple           9
   Orange          7
   Banana         12
2  State     Florida
   Apple           0
   Orange         14
   Banana        190
dtype: object

In [1337]:
state_fruit_tidy = state_fruit.stack().reset_index()
state_fruit_tidy

Unnamed: 0,level_0,level_1,0
0,0,State,Texas
1,0,Apple,12
2,0,Orange,10
3,0,Banana,40
4,1,State,Arizona
5,1,Apple,9
6,1,Orange,7
7,1,Banana,12
8,2,State,Florida
9,2,Apple,0


In [1338]:
state_fruit_tidy.columns = ['state', 'fruit', 'weight']
state_fruit_tidy

Unnamed: 0,state,fruit,weight
0,0,State,Texas
1,0,Apple,12
2,0,Orange,10
3,0,Banana,40
4,1,State,Arizona
5,1,Apple,9
6,1,Orange,7
7,1,Banana,12
8,2,State,Florida
9,2,Apple,0


In [1339]:
state_fruit.stack().rename_axis(['state', 'fruit'])

state  fruit 
0      State       Texas
       Apple          12
       Orange         10
       Banana         40
1      State     Arizona
       Apple           9
       Orange          7
       Banana         12
2      State     Florida
       Apple           0
       Orange         14
       Banana        190
dtype: object

In [1340]:
state_fruit.stack().rename_axis(['state', 'fruit']).reset_index(name='weight')

Unnamed: 0,state,fruit,weight
0,0,State,Texas
1,0,Apple,12
2,0,Orange,10
3,0,Banana,40
4,1,State,Arizona
5,1,Apple,9
6,1,Orange,7
7,1,Banana,12
8,2,State,Florida
9,2,Apple,0


In [1341]:
#Make STATE as index first and then Stack

In [1342]:
state_fruit.stack()

0  State       Texas
   Apple          12
   Orange         10
   Banana         40
1  State     Arizona
   Apple           9
   Orange          7
   Banana         12
2  State     Florida
   Apple           0
   Orange         14
   Banana        190
dtype: object

In [1343]:
state_fruit.set_index('State').stack()

State          
Texas    Apple      12
         Orange     10
         Banana     40
Arizona  Apple       9
         Orange      7
         Banana     12
Florida  Apple       0
         Orange     14
         Banana    190
dtype: int64

### 2. Tidying variable values as column names with melt

In [1344]:
state_fruit2 = pd.read_csv('data/state_fruit2.csv')
state_fruit2

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


In [1345]:
state_fruit2.melt(id_vars=['State'], value_vars=['Apple', 'Orange', 'Banana'])

Unnamed: 0,State,variable,value
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


In [1346]:
state_fruit2.melt(id_vars=['State'],
 value_vars=['Apple', 'Orange', 'Banana'],
 var_name='Fruit',
 value_name='Weight')

Unnamed: 0,State,Fruit,Weight
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


In [1347]:
state_fruit2.melt()

Unnamed: 0,variable,value
0,State,Texas
1,State,Arizona
2,State,Florida
3,Apple,12
4,Apple,9
5,Apple,0
6,Orange,10
7,Orange,7
8,Orange,14
9,Banana,40


In [1348]:
state_fruit2.melt(id_vars='State')

Unnamed: 0,State,variable,value
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


### 3. Stacking multiple groups of variables simultaneously

In [1349]:
movie = pd.read_csv('data/movie.csv')
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [1350]:
movie.columns.tolist()

['color',
 'director_name',
 'num_critic_for_reviews',
 'duration',
 'director_facebook_likes',
 'actor_3_facebook_likes',
 'actor_2_name',
 'actor_1_facebook_likes',
 'gross',
 'genres',
 'actor_1_name',
 'movie_title',
 'num_voted_users',
 'cast_total_facebook_likes',
 'actor_3_name',
 'facenumber_in_poster',
 'plot_keywords',
 'movie_imdb_link',
 'num_user_for_reviews',
 'language',
 'country',
 'content_rating',
 'budget',
 'title_year',
 'actor_2_facebook_likes',
 'imdb_score',
 'aspect_ratio',
 'movie_facebook_likes']

In [1351]:
actor = movie[['movie_title', 'actor_1_name',
 'actor_2_name', 'actor_3_name',
 'actor_1_facebook_likes',
 'actor_2_facebook_likes',
 'actor_3_facebook_likes']]
actor.head()

Unnamed: 0,movie_title,actor_1_name,actor_2_name,actor_3_name,actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes
0,Avatar,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,Rob Walker,,131.0,12.0,


In [1352]:
actor.columns.tolist()

['movie_title',
 'actor_1_name',
 'actor_2_name',
 'actor_3_name',
 'actor_1_facebook_likes',
 'actor_2_facebook_likes',
 'actor_3_facebook_likes']

In [1353]:
def change_col_name(col_name):
    col_name = col_name.replace('_name', '')
    if 'facebook' in col_name:
        fb_idx = col_name.find('facebook')
        col_name = col_name[:5] + col_name[fb_idx - 1:] \
         + col_name[5:fb_idx-1]
    return col_name

In [1354]:
actor2 = actor.rename(columns=change_col_name)
actor2.head()

Unnamed: 0,movie_title,actor_1,actor_2,actor_3,actor_facebook_likes_1,actor_facebook_likes_2,actor_facebook_likes_3
0,Avatar,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,Rob Walker,,131.0,12.0,


In [1355]:
stubs = ['actor', 'actor_facebook_likes']
actor2_tidy = pd.wide_to_long(actor2,
 stubnames=stubs,
 i=['movie_title'],
 j='actor_num',
 sep='_')
actor2_tidy.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,actor,actor_facebook_likes
movie_title,actor_num,Unnamed: 2_level_1,Unnamed: 3_level_1
Avatar,1,CCH Pounder,1000.0
Pirates of the Caribbean: At World's End,1,Johnny Depp,40000.0
Spectre,1,Christoph Waltz,11000.0
The Dark Knight Rises,1,Tom Hardy,27000.0
Star Wars: Episode VII - The Force Awakens,1,Doug Walker,131.0


In [1356]:
len(actor2_tidy)

14748

In [1357]:
df = pd.read_csv('data/stackme.csv')
df

Unnamed: 0,State,Country,a1,b2,Test,d,e
0,TX,US,0.45,0.3,Test1,2,6
1,MA,US,0.03,1.2,Test2,9,7
2,ON,CAN,0.7,4.2,Test3,4,2


In [1358]:
df2 = df.rename(columns = {'a1':'group1_a1', 'b2':'group1_b2',
 'd':'group2_a1', 'e':'group2_b2'})
df2

Unnamed: 0,State,Country,group1_a1,group1_b2,Test,group2_a1,group2_b2
0,TX,US,0.45,0.3,Test1,2,6
1,MA,US,0.03,1.2,Test2,9,7
2,ON,CAN,0.7,4.2,Test3,4,2


In [1359]:
pd.wide_to_long(df2,
 stubnames=['group1', 'group2'],
 i=['State', 'Country', 'Test'],
 j='Label',
 suffix='.+',
 sep='_')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,group1,group2
State,Country,Test,Label,Unnamed: 4_level_1,Unnamed: 5_level_1
TX,US,Test1,a1,0.45,2
TX,US,Test1,b2,0.3,6
MA,US,Test2,a1,0.03,9
MA,US,Test2,b2,1.2,7
ON,CAN,Test3,a1,0.7,4
ON,CAN,Test3,b2,4.2,2


### 4. Inverting stacked data

* Stack == Melt
* Unstack == Pivot
* Stack and Unstack are simpler methods - less flexibilty
* Melt and Pivot - more control to the user

In [1360]:
usecol_func = lambda x: 'UGDS_' in x or x == 'INSTNM'
college = pd.read_csv('data/college.csv',
 index_col='INSTNM',
 usecols=usecol_func)

In [1361]:
college.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [1362]:
college_stacked = college.stack()
college_stacked

INSTNM                                     
Alabama A & M University         UGDS_WHITE    0.0333
                                 UGDS_BLACK    0.9353
                                 UGDS_HISP     0.0055
                                 UGDS_ASIAN    0.0019
                                 UGDS_AIAN     0.0024
                                                ...  
Coastal Pines Technical College  UGDS_AIAN     0.0034
                                 UGDS_NHPI     0.0017
                                 UGDS_2MOR     0.0191
                                 UGDS_NRA      0.0028
                                 UGDS_UNKN     0.0056
Length: 61866, dtype: float64

In [1363]:
college_stacked.unstack()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0000,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.2600,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.0100
Amridge University,0.2990,0.4192,0.0069,0.0034,0.0000,0.0000,0.0000,0.0000,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.0350
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.0010,0.0006,0.0098,0.0243,0.0137
...,...,...,...,...,...,...,...,...,...
Hollywood Institute of Beauty Careers-West Palm Beach,0.2182,0.4182,0.2364,0.0182,0.0000,0.0000,0.0000,0.0182,0.0909
Hollywood Institute of Beauty Careers-Casselberry,0.1200,0.3333,0.4400,0.0000,0.0000,0.0000,0.0400,0.0000,0.0667
Coachella Valley Beauty College-Beaumont,0.3284,0.1045,0.4925,0.0149,0.0299,0.0149,0.0149,0.0000,0.0000
Dewey University-Mayaguez,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000


In [1364]:
college2 = pd.read_csv('data/college.csv',
 usecols=usecol_func)
college2.head()

Unnamed: 0,INSTNM,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
0,Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
1,University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
2,Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
3,University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
4,Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [1365]:
college_melted = college2.melt(id_vars='INSTNM',
 var_name='Race',
 value_name='Percentage')
college_melted.head()

Unnamed: 0,INSTNM,Race,Percentage
0,Alabama A & M University,UGDS_WHITE,0.0333
1,University of Alabama at Birmingham,UGDS_WHITE,0.5922
2,Amridge University,UGDS_WHITE,0.299
3,University of Alabama in Huntsville,UGDS_WHITE,0.6988
4,Alabama State University,UGDS_WHITE,0.0158


In [1366]:
melted_inv = college_melted.pivot(index='INSTNM',
 columns='Race',
 values='Percentage')
melted_inv.head()

Race,UGDS_2MOR,UGDS_AIAN,UGDS_ASIAN,UGDS_BLACK,UGDS_HISP,UGDS_NHPI,UGDS_NRA,UGDS_UNKN,UGDS_WHITE
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A & W Healthcare Educators,0.0,0.0,0.0,0.975,0.025,0.0,0.0,0.0,0.0
A T Still University of Health Sciences,,,,,,,,,
ABC Beauty Academy,0.0,0.0,0.9333,0.0333,0.0333,0.0,0.0,0.0,0.0
ABC Beauty College Inc,0.0,0.0,0.0,0.6579,0.0526,0.0,0.0,0.0,0.2895
AI Miami International University of Art and Design,0.0018,0.0,0.0018,0.0198,0.4773,0.0,0.0025,0.4644,0.0324


In [1367]:
college2_replication = melted_inv.loc[college2['INSTNM'],
college2.columns[1:]]\
.reset_index()

In [1368]:
college2.equals(college2_replication)

True

### 5. Unstacking after a groupby aggregation

In [1369]:
employee = pd.read_csv('data/employee.csv')
employee.head()

Unnamed: 0,UNIQUE_ID,POSITION_TITLE,DEPARTMENT,BASE_SALARY,RACE,EMPLOYMENT_TYPE,GENDER,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE
0,0,ASSISTANT DIRECTOR (EX LVL),Municipal Courts Department,121862.0,Hispanic/Latino,Full Time,Female,Active,2006-06-12,2012-10-13
1,1,LIBRARY ASSISTANT,Library,26125.0,Hispanic/Latino,Full Time,Female,Active,2000-07-19,2010-09-18
2,2,POLICE OFFICER,Houston Police Department-HPD,45279.0,White,Full Time,Male,Active,2015-02-03,2015-02-03
3,3,ENGINEER/OPERATOR,Houston Fire Department (HFD),63166.0,White,Full Time,Male,Active,1982-02-08,1991-05-25
4,4,ELECTRICIAN,General Services Department,56347.0,White,Full Time,Male,Active,1989-06-19,1994-10-22


In [1370]:
employee.groupby('RACE')['BASE_SALARY'].mean().astype(int)

RACE
American Indian or Alaskan Native    60272
Asian/Pacific Islander               61660
Black or African American            50137
Hispanic/Latino                      52345
Others                               51278
White                                64419
Name: BASE_SALARY, dtype: int32

In [1371]:
agg = employee.groupby(['RACE', 'GENDER'])['BASE_SALARY'].mean().astype(int)
agg

RACE                               GENDER
American Indian or Alaskan Native  Female    60238
                                   Male      60305
Asian/Pacific Islander             Female    63226
                                   Male      61033
Black or African American          Female    48915
                                   Male      51082
Hispanic/Latino                    Female    46503
                                   Male      54782
Others                             Female    63785
                                   Male      38771
White                              Female    66793
                                   Male      63940
Name: BASE_SALARY, dtype: int32

In [1372]:
agg.unstack('GENDER')

GENDER,Female,Male
RACE,Unnamed: 1_level_1,Unnamed: 2_level_1
American Indian or Alaskan Native,60238,60305
Asian/Pacific Islander,63226,61033
Black or African American,48915,51082
Hispanic/Latino,46503,54782
Others,63785,38771
White,66793,63940


In [1373]:
agg.unstack('RACE')

RACE,American Indian or Alaskan Native,Asian/Pacific Islander,Black or African American,Hispanic/Latino,Others,White
GENDER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,60238,63226,48915,46503,63785,66793
Male,60305,61033,51082,54782,38771,63940


In [1374]:
agg.unstack(level = 0)

RACE,American Indian or Alaskan Native,Asian/Pacific Islander,Black or African American,Hispanic/Latino,Others,White
GENDER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,60238,63226,48915,46503,63785,66793
Male,60305,61033,51082,54782,38771,63940


In [1375]:
agg.unstack(level = 1)

GENDER,Female,Male
RACE,Unnamed: 1_level_1,Unnamed: 2_level_1
American Indian or Alaskan Native,60238,60305
Asian/Pacific Islander,63226,61033
Black or African American,48915,51082
Hispanic/Latino,46503,54782
Others,63785,38771
White,66793,63940


In [1376]:
agg2 = employee.groupby(['RACE', 'GENDER'])['BASE_SALARY'] \
 .agg(['mean', 'max', 'min']).astype(int)
agg2

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,max,min
RACE,GENDER,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
American Indian or Alaskan Native,Female,60238,98536,26125
American Indian or Alaskan Native,Male,60305,81239,26125
Asian/Pacific Islander,Female,63226,130416,26125
Asian/Pacific Islander,Male,61033,163228,27914
Black or African American,Female,48915,150416,24960
Black or African American,Male,51082,275000,26125
Hispanic/Latino,Female,46503,126115,26125
Hispanic/Latino,Male,54782,165216,26104
Others,Female,63785,63785,63785
Others,Male,38771,38771,38771


In [1377]:
agg2.unstack('GENDER')

Unnamed: 0_level_0,mean,mean,max,max,min,min
GENDER,Female,Male,Female,Male,Female,Male
RACE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
American Indian or Alaskan Native,60238,60305,98536,81239,26125,26125
Asian/Pacific Islander,63226,61033,130416,163228,26125,27914
Black or African American,48915,51082,150416,275000,24960,26125
Hispanic/Latino,46503,54782,126115,165216,26125,26104
Others,63785,38771,63785,38771,63785,38771
White,66793,63940,178331,210588,27955,26125


### 6. Replicating pivot_table with a groupby aggregation

In [1378]:
flights = pd.read_csv('data/flights.csv')
fp = flights.pivot_table(index='AIRLINE',
 columns='ORG_AIR',
 values='CANCELLED',
 aggfunc='sum',
 fill_value=0).round(2)
fp.head()

ORG_AIR,ATL,DEN,DFW,IAH,LAS,LAX,MSP,ORD,PHX,SFO
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AA,3,4,86,3,3,11,3,35,4,2
AS,0,0,0,0,0,0,0,0,0,0
B6,0,0,0,0,0,0,0,0,0,1
DL,28,1,0,0,1,1,4,0,1,2
EV,18,6,27,36,0,0,6,53,0,0


In [1379]:
fg = flights.groupby(['AIRLINE', 'ORG_AIR'])['CANCELLED'].sum()
fg.head(10)

AIRLINE  ORG_AIR
AA       ATL         3
         DEN         4
         DFW        86
         IAH         3
         LAS         3
         LAX        11
         MSP         3
         ORD        35
         PHX         4
         SFO         2
Name: CANCELLED, dtype: int64

In [1380]:
fg_unstack = fg.unstack('ORG_AIR', fill_value=0)
fg_unstack

ORG_AIR,ATL,DEN,DFW,IAH,LAS,LAX,MSP,ORD,PHX,SFO
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AA,3,4,86,3,3,11,3,35,4,2
AS,0,0,0,0,0,0,0,0,0,0
B6,0,0,0,0,0,0,0,0,0,1
DL,28,1,0,0,1,1,4,0,1,2
EV,18,6,27,36,0,0,6,53,0,0
F9,0,2,1,0,1,1,1,4,0,0
HA,0,0,0,0,0,0,0,0,0,0
MQ,5,0,62,0,0,0,0,85,0,0
NK,1,1,6,0,1,1,3,10,2,0
OO,3,25,2,10,0,15,4,41,9,33


In [1381]:
fp.equals(fg_unstack)

True

*To achive the same thing:\
        pivot_table takes 1 step\
        groupby + unstack takes two steps 

* fill_value is used at the unstack part and not in the groupby part
* fill_value is not available in groupby parameters
* fill_value is available only in the pivot_table

In [1382]:
#complex aggregations: pivot_table requires np.sum kind of functions
#so use groupby for better results

In [1383]:
flights.pivot_table(index=['AIRLINE', 'MONTH'],
 columns=['ORG_AIR', 'CANCELLED'],
 values=['DEP_DELAY', 'DIST'],
 aggfunc=[np.sum, np.mean],
 fill_value=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,...,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,...,DIST,DIST,DIST,DIST,DIST,DIST,DIST,DIST,DIST,DIST
Unnamed: 0_level_2,ORG_AIR,ATL,ATL,DEN,DEN,DFW,DFW,IAH,IAH,LAS,LAS,...,LAX,LAX,MSP,MSP,ORD,ORD,PHX,PHX,SFO,SFO
Unnamed: 0_level_3,CANCELLED,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
AIRLINE,MONTH,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4
AA,1,-13,0,113,0,4276,-3,117,0,1036,0,...,1678.037037,2475.000000,809.000000,0.0,1068.876033,0.000000,1167.666667,0.0,1860.166667,0.0
AA,2,-39,0,71,0,2662,0,8,0,-55,0,...,1745.892308,1818.000000,1008.000000,0.0,1193.782178,771.142857,1311.461538,868.0,1337.916667,2586.0
AA,3,-2,0,69,0,5692,0,109,0,326,0,...,1781.567568,1744.000000,964.733333,0.0,1058.933333,802.000000,1171.363636,0.0,1502.758621,0.0
AA,4,1,0,304,0,3518,0,104,0,790,0,...,1850.923913,0.000000,648.714286,0.0,1094.633094,943.600000,1266.214286,0.0,1646.903226,0.0
AA,5,52,0,352,0,5510,0,55,0,93,0,...,1820.478261,0.000000,787.250000,0.0,998.774775,999.500000,1240.444444,0.0,1436.892857,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WN,7,2604,0,1919,0,0,0,0,0,4600,0,...,912.453704,327.777778,647.266667,0.0,0.000000,0.000000,799.160256,369.0,636.210526,0.0
WN,8,1718,0,1180,0,0,0,0,0,3151,0,...,835.404040,346.000000,508.703704,0.0,0.000000,0.000000,891.569767,0.0,644.857143,392.0
WN,9,1033,0,705,0,0,0,0,0,1400,0,...,830.210000,317.666667,644.416667,0.0,0.000000,0.000000,872.840000,0.0,731.578947,354.5
WN,11,700,0,1372,0,0,0,0,0,1309,0,...,748.404040,459.333333,573.642857,0.0,0.000000,0.000000,823.258741,872.0,580.875000,392.0


In [1384]:
flights.groupby(['AIRLINE', 'MONTH', 'ORG_AIR', 'CANCELLED']) \
 ['DEP_DELAY', 'DIST'] \
 .agg(['mean', 'sum']) \
 .unstack(['ORG_AIR', 'CANCELLED'], fill_value=0) \
 .swaplevel(0, 1, axis='columns')

  flights.groupby(['AIRLINE', 'MONTH', 'ORG_AIR', 'CANCELLED']) \


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,DEP_DELAY,...,DIST,DIST,DIST,DIST,DIST,DIST,DIST,DIST,DIST,DIST
Unnamed: 0_level_2,ORG_AIR,ATL,ATL,DEN,DEN,DFW,DFW,IAH,IAH,LAS,LAS,...,LAX,LAX,MSP,MSP,ORD,ORD,PHX,PHX,SFO,SFO
Unnamed: 0_level_3,CANCELLED,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
AIRLINE,MONTH,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4
AA,1,-3.250000,0.0,7.062500,0.0,11.977591,-3.0,9.750000,0.0,32.375000,0.0,...,135921,2475,7281,0,129334,0,21018,0,33483,0
AA,2,-3.000000,,5.461538,,8.756579,,1.000000,,-3.055556,,...,113483,5454,5040,0,120572,5398,17049,868,32110,2586
AA,3,-0.166667,,7.666667,0.0,15.383784,,10.900000,0.0,12.074074,0.0,...,131836,1744,14471,0,127072,802,25770,0,43580,0
AA,4,0.071429,0.0,20.266667,0.0,10.501493,,6.933333,0.0,27.241379,0.0,...,170285,0,4541,0,152154,4718,17727,0,51054,0
AA,5,5.777778,0.0,23.466667,,16.798780,,3.055556,,2.818182,0.0,...,167484,0,6298,0,110864,1999,11164,0,40233,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WN,7,21.700000,0.0,13.143836,0.0,0.000000,0.0,0.000000,0.0,22.439024,,...,98545,2950,19418,0,0,0,124669,369,24176,0
WN,8,16.207547,0.0,7.375000,0.0,0.000000,0.0,0.000000,0.0,16.158974,,...,82705,1384,13735,0,0,0,153350,0,18056,784
WN,9,8.680672,0.0,4.378882,0.0,0.000000,0.0,0.000000,0.0,7.179487,0.0,...,83021,953,15466,0,0,0,130926,0,27800,709
WN,11,5.932203,,8.215569,,0.000000,0.0,0.000000,0.0,7.522989,,...,74092,1378,8031,0,0,0,117726,872,23235,784


In [1385]:
#swaplevel why?

### 7. Renaming axis levels for easy reshaping

In [1386]:
college = pd.read_csv('data/college.csv')

In [1387]:
cg = college.groupby(['STABBR', 'RELAFFIL']) \
['UGDS', 'SATMTMID'] \
.agg(['size', 'min', 'max']).head(6)

  cg = college.groupby(['STABBR', 'RELAFFIL']) \


In [1388]:
cg

Unnamed: 0_level_0,Unnamed: 1_level_0,UGDS,UGDS,UGDS,SATMTMID,SATMTMID,SATMTMID
Unnamed: 0_level_1,Unnamed: 1_level_1,size,min,max,size,min,max
STABBR,RELAFFIL,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
AK,0,7,109.0,12865.0,7,,
AK,1,3,27.0,275.0,3,503.0,503.0
AL,0,72,12.0,29851.0,72,420.0,590.0
AL,1,24,13.0,3033.0,24,400.0,560.0
AR,0,68,18.0,21405.0,68,427.0,565.0
AR,1,18,20.0,4485.0,18,495.0,600.0


In [1389]:
cg = cg.rename_axis(['AGG_COLS', 'AGG_FUNCS'], axis='columns')
cg

Unnamed: 0_level_0,AGG_COLS,UGDS,UGDS,UGDS,SATMTMID,SATMTMID,SATMTMID
Unnamed: 0_level_1,AGG_FUNCS,size,min,max,size,min,max
STABBR,RELAFFIL,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
AK,0,7,109.0,12865.0,7,,
AK,1,3,27.0,275.0,3,503.0,503.0
AL,0,72,12.0,29851.0,72,420.0,590.0
AL,1,24,13.0,3033.0,24,400.0,560.0
AR,0,68,18.0,21405.0,68,427.0,565.0
AR,1,18,20.0,4485.0,18,495.0,600.0


In [1390]:
cg.stack('AGG_FUNCS').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,AGG_COLS,UGDS,SATMTMID
STABBR,RELAFFIL,AGG_FUNCS,Unnamed: 3_level_1,Unnamed: 4_level_1
AK,0,size,7.0,7.0
AK,0,min,109.0,
AK,0,max,12865.0,
AK,1,size,3.0,3.0
AK,1,min,27.0,503.0


In [1391]:
cg.stack('AGG_FUNCS').swaplevel('AGG_FUNCS', 'STABBR',
 axis='index').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,AGG_COLS,UGDS,SATMTMID
AGG_FUNCS,RELAFFIL,STABBR,Unnamed: 3_level_1,Unnamed: 4_level_1
size,0,AK,7.0,7.0
min,0,AK,109.0,
max,0,AK,12865.0,
size,1,AK,3.0,3.0
min,1,AK,27.0,503.0


In [1392]:
cg.stack('AGG_FUNCS') \
 .swaplevel('AGG_FUNCS', 'STABBR', axis='index') \
 .sort_index(level='RELAFFIL', axis='index') \
 .sort_index(level='AGG_COLS', axis='columns').head(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,AGG_COLS,SATMTMID,UGDS
AGG_FUNCS,RELAFFIL,STABBR,Unnamed: 3_level_1,Unnamed: 4_level_1
max,0,AK,,12865.0
max,0,AL,590.0,29851.0
max,0,AR,565.0,21405.0
min,0,AK,,109.0
min,0,AL,420.0,12.0
min,0,AR,427.0,18.0


In [1393]:
cg.stack('AGG_FUNCS').unstack(['STABBR'])

Unnamed: 0_level_0,AGG_COLS,UGDS,UGDS,UGDS,SATMTMID,SATMTMID,SATMTMID
Unnamed: 0_level_1,STABBR,AK,AL,AR,AK,AL,AR
RELAFFIL,AGG_FUNCS,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,size,7.0,72.0,68.0,7.0,72.0,68.0
0,min,109.0,12.0,18.0,,420.0,427.0
0,max,12865.0,29851.0,21405.0,,590.0,565.0
1,size,3.0,24.0,18.0,3.0,24.0,18.0
1,min,27.0,13.0,20.0,503.0,400.0,495.0
1,max,275.0,3033.0,4485.0,503.0,560.0,600.0


In [1394]:
cg.stack('AGG_FUNCS').unstack(['RELAFFIL'])

Unnamed: 0_level_0,AGG_COLS,UGDS,UGDS,SATMTMID,SATMTMID
Unnamed: 0_level_1,RELAFFIL,0,1,0,1
STABBR,AGG_FUNCS,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AK,size,7.0,3.0,7.0,3.0
AK,min,109.0,27.0,,503.0
AK,max,12865.0,275.0,,503.0
AL,size,72.0,24.0,72.0,24.0
AL,min,12.0,13.0,420.0,400.0
AL,max,29851.0,3033.0,590.0,560.0
AR,size,68.0,18.0,68.0,18.0
AR,min,18.0,20.0,427.0,495.0
AR,max,21405.0,4485.0,565.0,600.0


In [1395]:
cg.stack('AGG_FUNCS').unstack(['RELAFFIL', 'STABBR'])

AGG_COLS,UGDS,UGDS,UGDS,UGDS,UGDS,UGDS,SATMTMID,SATMTMID,SATMTMID,SATMTMID,SATMTMID,SATMTMID
RELAFFIL,0,1,0,1,0,1,0,1,0,1,0,1
STABBR,AK,AK,AL,AL,AR,AR,AK,AK,AL,AL,AR,AR
AGG_FUNCS,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
size,7.0,3.0,72.0,24.0,68.0,18.0,7.0,3.0,72.0,24.0,68.0,18.0
min,109.0,27.0,12.0,13.0,18.0,20.0,,503.0,420.0,400.0,427.0,495.0
max,12865.0,275.0,29851.0,3033.0,21405.0,4485.0,,503.0,590.0,560.0,565.0,600.0


In [1396]:
cg.stack('AGG_FUNCS').unstack(['RELAFFIL', 'STABBR'])

AGG_COLS,UGDS,UGDS,UGDS,UGDS,UGDS,UGDS,SATMTMID,SATMTMID,SATMTMID,SATMTMID,SATMTMID,SATMTMID
RELAFFIL,0,1,0,1,0,1,0,1,0,1,0,1
STABBR,AK,AK,AL,AL,AR,AR,AK,AK,AL,AL,AR,AR
AGG_FUNCS,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
size,7.0,3.0,72.0,24.0,68.0,18.0,7.0,3.0,72.0,24.0,68.0,18.0
min,109.0,27.0,12.0,13.0,18.0,20.0,,503.0,420.0,400.0,427.0,495.0
max,12865.0,275.0,29851.0,3033.0,21405.0,4485.0,,503.0,590.0,560.0,565.0,600.0


In [1397]:
cg.rename_axis([None, None], axis='index') \
 .rename_axis([None, None], axis='columns')

Unnamed: 0_level_0,Unnamed: 1_level_0,UGDS,UGDS,UGDS,SATMTMID,SATMTMID,SATMTMID
Unnamed: 0_level_1,Unnamed: 1_level_1,size,min,max,size,min,max
AK,0,7,109.0,12865.0,7,,
AK,1,3,27.0,275.0,3,503.0,503.0
AL,0,72,12.0,29851.0,72,420.0,590.0
AL,1,24,13.0,3033.0,24,400.0,560.0
AR,0,68,18.0,21405.0,68,427.0,565.0
AR,1,18,20.0,4485.0,18,495.0,600.0


### 8. Tidying when multiple variables are stored as column names

In [1398]:
weightlifting = pd.read_csv('data/weightlifting_men.csv')
weightlifting

Unnamed: 0,Weight Category,M35 35-39,M40 40-44,M45 45-49,M50 50-54,M55 55-59,M60 60-64,M65 65-69,M70 70-74,M75 75-79,M80 80+
0,56,137,130,125,115,102,92,80,67,62,55
1,62,152,145,137,127,112,102,90,75,67,57
2,69,167,160,150,140,125,112,97,82,75,60
3,77,182,172,165,150,135,122,107,90,82,65
4,85,192,182,175,160,142,130,112,95,87,70
5,94,202,192,182,167,150,137,120,100,90,75
6,105,210,200,190,175,157,142,122,102,95,80
7,105+,217,207,197,182,165,150,127,107,100,85


In [1399]:
wl_melt = weightlifting.melt(id_vars='Weight Category',
 var_name='sex_age',
 value_name='Qual Total')
wl_melt.head()

Unnamed: 0,Weight Category,sex_age,Qual Total
0,56,M35 35-39,137
1,62,M35 35-39,152
2,69,M35 35-39,167
3,77,M35 35-39,182
4,85,M35 35-39,192


In [1400]:
sex_age = wl_melt['sex_age'].str.split(expand=True)
sex_age.head()

Unnamed: 0,0,1
0,M35,35-39
1,M35,35-39
2,M35,35-39
3,M35,35-39
4,M35,35-39


In [1401]:
sex_age.columns = ['Sex', 'Age Group']
sex_age.head()

Unnamed: 0,Sex,Age Group
0,M35,35-39
1,M35,35-39
2,M35,35-39
3,M35,35-39
4,M35,35-39


In [1402]:
sex_age['Sex'] = sex_age['Sex'].str[0]
sex_age.head()

Unnamed: 0,Sex,Age Group
0,M,35-39
1,M,35-39
2,M,35-39
3,M,35-39
4,M,35-39


In [1403]:
wl_cat_total = wl_melt[['Weight Category', 'Qual Total']]
wl_tidy = pd.concat([sex_age, wl_cat_total], axis='columns')
wl_tidy.head()

Unnamed: 0,Sex,Age Group,Weight Category,Qual Total
0,M,35-39,56,137
1,M,35-39,62,152
2,M,35-39,69,167
3,M,35-39,77,182
4,M,35-39,85,192


In [1404]:
cols = ['Weight Category', 'Qual Total']
sex_age[cols] = wl_melt[cols]

In [1405]:
sex_age

Unnamed: 0,Sex,Age Group,Weight Category,Qual Total
0,M,35-39,56,137
1,M,35-39,62,152
2,M,35-39,69,167
3,M,35-39,77,182
4,M,35-39,85,192
...,...,...,...,...
75,M,80+,77,65
76,M,80+,85,70
77,M,80+,94,75
78,M,80+,105,80


In [1406]:
age_group = wl_melt.sex_age.str.extract('(\d{2}[-+](?:\d{2})?)',
 expand=False)

In [1407]:
age_group

0     35-39
1     35-39
2     35-39
3     35-39
4     35-39
      ...  
75      80+
76      80+
77      80+
78      80+
79      80+
Name: sex_age, Length: 80, dtype: object

In [1408]:
sex = wl_melt.sex_age.str[0]
new_cols = {'Sex':sex,
 'Age Group': age_group}

In [1409]:
sex.head()

0    M
1    M
2    M
3    M
4    M
Name: sex_age, dtype: object

In [1410]:
age_group.head()

0    35-39
1    35-39
2    35-39
3    35-39
4    35-39
Name: sex_age, dtype: object

In [1411]:
wl_tidy2 = wl_melt.assign(**new_cols) \
 .drop('sex_age',axis='columns')

In [1412]:
wl_tidy2

Unnamed: 0,Weight Category,Qual Total,Sex,Age Group
0,56,137,M,35-39
1,62,152,M,35-39
2,69,167,M,35-39
3,77,182,M,35-39
4,85,192,M,35-39
...,...,...,...,...
75,77,65,M,80+
76,85,70,M,80+
77,94,75,M,80+
78,105,80,M,80+


In [1413]:
wl_tidy2.sort_index(axis=1).equals(wl_tidy.sort_index(axis=1))

True

### 9. Tidying when multiple variables are stored as column values - part 2 

In [1414]:
inspections = pd.read_csv('data/restaurant_inspections.csv',
 parse_dates=['Date'])
inspections.head()

Unnamed: 0,Name,Date,Info,Value
0,E & E Grill House,2017-08-08,Borough,MANHATTAN
1,E & E Grill House,2017-08-08,Cuisine,American
2,E & E Grill House,2017-08-08,Description,Non-food contact surface improperly constructe...
3,E & E Grill House,2017-08-08,Grade,A
4,E & E Grill House,2017-08-08,Score,9.0


In [1415]:
# inspections.pivot(index=['Name', 'Date'], columns='Info', values='Value')
#NotImplementedError

In [1416]:
inspections.set_index(['Name','Date', 'Info']).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Value
Name,Date,Info,Unnamed: 3_level_1
E & E Grill House,2017-08-08,Borough,MANHATTAN
E & E Grill House,2017-08-08,Cuisine,American
E & E Grill House,2017-08-08,Description,Non-food contact surface improperly constructe...
E & E Grill House,2017-08-08,Grade,A
E & E Grill House,2017-08-08,Score,9.0
PIZZA WAGON,2017-04-12,Borough,BROOKLYN
PIZZA WAGON,2017-04-12,Cuisine,Pizza
PIZZA WAGON,2017-04-12,Description,"Food contact surface not properly washed, rins..."
PIZZA WAGON,2017-04-12,Grade,A
PIZZA WAGON,2017-04-12,Score,10.0


In [1417]:
inspections.set_index(['Name','Date', 'Info']) \
 .unstack('Info').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Value,Value,Value,Value,Value
Unnamed: 0_level_1,Info,Borough,Cuisine,Description,Grade,Score
Name,Date,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
3 STAR JUICE CENTER,2017-05-10,BROOKLYN,"Juice, Smoothies, Fruit Salads",Facility not vermin proof. Harborage or condit...,A,12.0
A & L PIZZA RESTAURANT,2017-08-22,BROOKLYN,Pizza,Facility not vermin proof. Harborage or condit...,A,9.0
AKSARAY TURKISH CAFE AND RESTAURANT,2017-07-25,BROOKLYN,Turkish,Plumbing not properly installed or maintained;...,A,13.0
ANTOJITOS DELI FOOD,2017-06-01,BROOKLYN,"Latin (Cuban, Dominican, Puerto Rican, South &...",Live roaches present in facility's food and/or...,A,10.0
BANGIA,2017-06-16,MANHATTAN,Korean,Covered garbage receptacle not provided or ina...,A,9.0


In [1418]:
insp_tidy = inspections.set_index(['Name','Date', 'Info']) \
.unstack('Info') \
.reset_index(col_level=-1)
insp_tidy

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Value,Value,Value,Value,Value
Info,Name,Date,Borough,Cuisine,Description,Grade,Score
0,3 STAR JUICE CENTER,2017-05-10,BROOKLYN,"Juice, Smoothies, Fruit Salads",Facility not vermin proof. Harborage or condit...,A,12.0
1,A & L PIZZA RESTAURANT,2017-08-22,BROOKLYN,Pizza,Facility not vermin proof. Harborage or condit...,A,9.0
2,AKSARAY TURKISH CAFE AND RESTAURANT,2017-07-25,BROOKLYN,Turkish,Plumbing not properly installed or maintained;...,A,13.0
3,ANTOJITOS DELI FOOD,2017-06-01,BROOKLYN,"Latin (Cuban, Dominican, Puerto Rican, South &...",Live roaches present in facility's food and/or...,A,10.0
4,BANGIA,2017-06-16,MANHATTAN,Korean,Covered garbage receptacle not provided or ina...,A,9.0
...,...,...,...,...,...,...,...
95,VALL'S PIZZERIA,2017-03-15,STATEN ISLAND,Pizza/Italian,Wiping cloths soiled or not stored in sanitizi...,A,9.0
96,VIP GRILL,2017-06-12,BROOKLYN,Jewish/Kosher,Hot food item not held at or above 140Âº F.,A,10.0
97,WAHIZZA,2017-04-13,MANHATTAN,Pizza,"No facilities available to wash, rinse and san...",A,10.0
98,WANG MANDOO HOUSE,2017-08-29,QUEENS,Korean,Accurate thermometer not provided in refrigera...,A,12.0


In [1419]:
insp_tidy.columns = insp_tidy.columns.droplevel(0).rename(None)
insp_tidy.head()

Unnamed: 0,Name,Date,Borough,Cuisine,Description,Grade,Score
0,3 STAR JUICE CENTER,2017-05-10,BROOKLYN,"Juice, Smoothies, Fruit Salads",Facility not vermin proof. Harborage or condit...,A,12.0
1,A & L PIZZA RESTAURANT,2017-08-22,BROOKLYN,Pizza,Facility not vermin proof. Harborage or condit...,A,9.0
2,AKSARAY TURKISH CAFE AND RESTAURANT,2017-07-25,BROOKLYN,Turkish,Plumbing not properly installed or maintained;...,A,13.0
3,ANTOJITOS DELI FOOD,2017-06-01,BROOKLYN,"Latin (Cuban, Dominican, Puerto Rican, South &...",Live roaches present in facility's food and/or...,A,10.0
4,BANGIA,2017-06-16,MANHATTAN,Korean,Covered garbage receptacle not provided or ina...,A,9.0


In [1420]:
inspections.set_index(['Name','Date', 'Info']) \
.squeeze() \
.unstack('Info') \
.reset_index() \
.rename_axis(None, axis='columns')

Unnamed: 0,Name,Date,Borough,Cuisine,Description,Grade,Score
0,3 STAR JUICE CENTER,2017-05-10,BROOKLYN,"Juice, Smoothies, Fruit Salads",Facility not vermin proof. Harborage or condit...,A,12.0
1,A & L PIZZA RESTAURANT,2017-08-22,BROOKLYN,Pizza,Facility not vermin proof. Harborage or condit...,A,9.0
2,AKSARAY TURKISH CAFE AND RESTAURANT,2017-07-25,BROOKLYN,Turkish,Plumbing not properly installed or maintained;...,A,13.0
3,ANTOJITOS DELI FOOD,2017-06-01,BROOKLYN,"Latin (Cuban, Dominican, Puerto Rican, South &...",Live roaches present in facility's food and/or...,A,10.0
4,BANGIA,2017-06-16,MANHATTAN,Korean,Covered garbage receptacle not provided or ina...,A,9.0
...,...,...,...,...,...,...,...
95,VALL'S PIZZERIA,2017-03-15,STATEN ISLAND,Pizza/Italian,Wiping cloths soiled or not stored in sanitizi...,A,9.0
96,VIP GRILL,2017-06-12,BROOKLYN,Jewish/Kosher,Hot food item not held at or above 140Âº F.,A,10.0
97,WAHIZZA,2017-04-13,MANHATTAN,Pizza,"No facilities available to wash, rinse and san...",A,10.0
98,WANG MANDOO HOUSE,2017-08-29,QUEENS,Korean,Accurate thermometer not provided in refrigera...,A,12.0


In [1421]:
#squeeze()

In [1422]:
inspections.head()

Unnamed: 0,Name,Date,Info,Value
0,E & E Grill House,2017-08-08,Borough,MANHATTAN
1,E & E Grill House,2017-08-08,Cuisine,American
2,E & E Grill House,2017-08-08,Description,Non-food contact surface improperly constructe...
3,E & E Grill House,2017-08-08,Grade,A
4,E & E Grill House,2017-08-08,Score,9.0


In [1423]:
inspections.pivot_table(index=['Name', 'Date'],
columns='Info',
values='Value',
aggfunc='first') \
.reset_index() \
.rename_axis(None, axis='columns')

Unnamed: 0,Name,Date,Borough,Cuisine,Description,Grade,Score
0,3 STAR JUICE CENTER,2017-05-10,BROOKLYN,"Juice, Smoothies, Fruit Salads",Facility not vermin proof. Harborage or condit...,A,12.0
1,A & L PIZZA RESTAURANT,2017-08-22,BROOKLYN,Pizza,Facility not vermin proof. Harborage or condit...,A,9.0
2,AKSARAY TURKISH CAFE AND RESTAURANT,2017-07-25,BROOKLYN,Turkish,Plumbing not properly installed or maintained;...,A,13.0
3,ANTOJITOS DELI FOOD,2017-06-01,BROOKLYN,"Latin (Cuban, Dominican, Puerto Rican, South &...",Live roaches present in facility's food and/or...,A,10.0
4,BANGIA,2017-06-16,MANHATTAN,Korean,Covered garbage receptacle not provided or ina...,A,9.0
...,...,...,...,...,...,...,...
95,VALL'S PIZZERIA,2017-03-15,STATEN ISLAND,Pizza/Italian,Wiping cloths soiled or not stored in sanitizi...,A,9.0
96,VIP GRILL,2017-06-12,BROOKLYN,Jewish/Kosher,Hot food item not held at or above 140Âº F.,A,10.0
97,WAHIZZA,2017-04-13,MANHATTAN,Pizza,"No facilities available to wash, rinse and san...",A,10.0
98,WANG MANDOO HOUSE,2017-08-29,QUEENS,Korean,Accurate thermometer not provided in refrigera...,A,12.0


### 10. Tidying when two or more values are stored in the same cell

In [1424]:
cities = pd.read_csv('data/texas_cities.csv')
cities

Unnamed: 0,City,Geolocation
0,Houston,"29.7604° N, 95.3698° W"
1,Dallas,"32.7767° N, 96.7970° W"
2,Austin,"30.2672° N, 97.7431° W"


In [1425]:
geolocations = cities.Geolocation.str.split(pat='. ',
 expand=True)
geolocations

Unnamed: 0,0,1,2,3
0,29.7604,N,95.3698,W
1,32.7767,N,96.797,W
2,30.2672,N,97.7431,W


In [1426]:
geolocations.columns = ['latitude', 'latitude direction',
 'longitude', 'longitude direction']
geolocations

Unnamed: 0,latitude,latitude direction,longitude,longitude direction
0,29.7604,N,95.3698,W
1,32.7767,N,96.797,W
2,30.2672,N,97.7431,W


In [1427]:
geolocations = geolocations.astype({'latitude':'float',
 'longitude':'float'})
geolocations

Unnamed: 0,latitude,latitude direction,longitude,longitude direction
0,29.7604,N,95.3698,W
1,32.7767,N,96.797,W
2,30.2672,N,97.7431,W


In [1428]:
geolocations.dtypes

latitude               float64
latitude direction      object
longitude              float64
longitude direction     object
dtype: object

In [1429]:
cities_tidy = pd.concat([cities['City'], geolocations],
 axis='columns')
cities_tidy

Unnamed: 0,City,latitude,latitude direction,longitude,longitude direction
0,Houston,29.7604,N,95.3698,W
1,Dallas,32.7767,N,96.797,W
2,Austin,30.2672,N,97.7431,W


In [1430]:
geolocations.apply(pd.to_numeric, errors='ignore')

Unnamed: 0,latitude,latitude direction,longitude,longitude direction
0,29.7604,N,95.3698,W
1,32.7767,N,96.797,W
2,30.2672,N,97.7431,W


In [1431]:
cities

Unnamed: 0,City,Geolocation
0,Houston,"29.7604° N, 95.3698° W"
1,Dallas,"32.7767° N, 96.7970° W"
2,Austin,"30.2672° N, 97.7431° W"


In [1432]:
cities.Geolocation.str.split(pat='° |, ', expand=True)

Unnamed: 0,0,1,2,3
0,29.7604,N,95.3698,W
1,32.7767,N,96.797,W
2,30.2672,N,97.7431,W


In [1433]:
cities.Geolocation.str.split(pat='° |, ', expand=False)

0    [29.7604, N, 95.3698, W]
1    [32.7767, N, 96.7970, W]
2    [30.2672, N, 97.7431, W]
Name: Geolocation, dtype: object

In [1434]:
cities.Geolocation.str.extract('([0-9.]+). (N|S), ([0-9.]+). (E|W)',
 expand=True)

Unnamed: 0,0,1,2,3
0,29.7604,N,95.3698,W
1,32.7767,N,96.797,W
2,30.2672,N,97.7431,W


### 11. Tidying when variables are stored in column names and values

In [1435]:
sensors = pd.read_csv('data/sensors.csv')
sensors

Unnamed: 0,Group,Property,2012,2013,2014,2015,2016
0,A,Pressure,928,873,814,973,870
1,A,Temperature,1026,1038,1009,1036,1042
2,A,Flow,819,806,861,882,856
3,B,Pressure,817,877,914,806,942
4,B,Temperature,1008,1041,1009,1002,1013
5,B,Flow,887,899,837,824,873


In [1436]:
sensors.melt(id_vars=['Group', 'Property'], var_name='Year') \
 .head(6)

Unnamed: 0,Group,Property,Year,value
0,A,Pressure,2012,928
1,A,Temperature,2012,1026
2,A,Flow,2012,819
3,B,Pressure,2012,817
4,B,Temperature,2012,1008
5,B,Flow,2012,887


In [1437]:
sensors.melt(id_vars=['Group', 'Property'], var_name='Year') \
 .pivot_table(index=['Group', 'Year'],
 columns='Property', values='value') \
 .reset_index() \
 .rename_axis(None, axis='columns')

Unnamed: 0,Group,Year,Flow,Pressure,Temperature
0,A,2012,819,928,1026
1,A,2013,806,873,1038
2,A,2014,861,814,1009
3,A,2015,882,973,1036
4,A,2016,856,870,1042
5,B,2012,887,817,1008
6,B,2013,899,877,1041
7,B,2014,837,914,1009
8,B,2015,824,806,1002
9,B,2016,873,942,1013


* melt pivot_table pivot
* stack unstack

In [1438]:
sensors.set_index(['Group', 'Property']) \
 .stack() \
 .unstack('Property') \
 .rename_axis(['Group', 'Year'], axis='index') \
 .rename_axis(None, axis='columns') \
 .reset_index()

Unnamed: 0,Group,Year,Flow,Pressure,Temperature
0,A,2012,819,928,1026
1,A,2013,806,873,1038
2,A,2014,861,814,1009
3,A,2015,882,973,1036
4,A,2016,856,870,1042
5,B,2012,887,817,1008
6,B,2013,899,877,1041
7,B,2014,837,914,1009
8,B,2015,824,806,1002
9,B,2016,873,942,1013


### 12. Tidying when multiple observational units are stored in the same table

* Normalization

In [1439]:
movie = pd.read_csv('data/movie_altered.csv')
movie.head()

Unnamed: 0,title,rating,year,duration,director_1,director_fb_likes_1,actor_1,actor_2,actor_3,actor_fb_likes_1,actor_fb_likes_2,actor_fb_likes_3
0,Avatar,PG-13,2009.0,178.0,James Cameron,0.0,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Pirates of the Caribbean: At World's End,PG-13,2007.0,169.0,Gore Verbinski,563.0,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Spectre,PG-13,2015.0,148.0,Sam Mendes,0.0,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,The Dark Knight Rises,PG-13,2012.0,164.0,Christopher Nolan,22000.0,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Star Wars: Episode VII - The Force Awakens,,,,Doug Walker,131.0,Doug Walker,Rob Walker,,131.0,12.0,


In [1440]:
movie.insert(0, 'id', np.arange(len(movie)))
movie.head()

Unnamed: 0,id,title,rating,year,duration,director_1,director_fb_likes_1,actor_1,actor_2,actor_3,actor_fb_likes_1,actor_fb_likes_2,actor_fb_likes_3
0,0,Avatar,PG-13,2009.0,178.0,James Cameron,0.0,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,1,Pirates of the Caribbean: At World's End,PG-13,2007.0,169.0,Gore Verbinski,563.0,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,2,Spectre,PG-13,2015.0,148.0,Sam Mendes,0.0,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,3,The Dark Knight Rises,PG-13,2012.0,164.0,Christopher Nolan,22000.0,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,4,Star Wars: Episode VII - The Force Awakens,,,,Doug Walker,131.0,Doug Walker,Rob Walker,,131.0,12.0,


In [1441]:
stubnames = ['director', 'director_fb_likes',
 'actor', 'actor_fb_likes']


In [1442]:
movie_long = pd.wide_to_long(movie,
stubnames=stubnames,
i='id',
j='num',
sep='_').reset_index()
movie_long

Unnamed: 0,id,num,year,title,duration,rating,director,director_fb_likes,actor,actor_fb_likes
0,0,1,2009.0,Avatar,178.0,PG-13,James Cameron,0.0,CCH Pounder,1000.0
1,0,2,2009.0,Avatar,178.0,PG-13,,,Joel David Moore,936.0
2,0,3,2009.0,Avatar,178.0,PG-13,,,Wes Studi,855.0
3,1,1,2007.0,Pirates of the Caribbean: At World's End,169.0,PG-13,Gore Verbinski,563.0,Johnny Depp,40000.0
4,1,2,2007.0,Pirates of the Caribbean: At World's End,169.0,PG-13,,,Orlando Bloom,5000.0
...,...,...,...,...,...,...,...,...,...,...
14743,4914,2,2012.0,Shanghai Calling,100.0,PG-13,,,Daniel Henney,719.0
14744,4914,3,2012.0,Shanghai Calling,100.0,PG-13,,,Eliza Coupe,490.0
14745,4915,1,2004.0,My Date with Drew,90.0,PG,Jon Gunn,16.0,John August,86.0
14746,4915,2,2004.0,My Date with Drew,90.0,PG,,,Brian Herzlinger,23.0


In [1443]:
movie_long['num'] = movie_long['num'].astype(int)
movie_long.head(9)

Unnamed: 0,id,num,year,title,duration,rating,director,director_fb_likes,actor,actor_fb_likes
0,0,1,2009.0,Avatar,178.0,PG-13,James Cameron,0.0,CCH Pounder,1000.0
1,0,2,2009.0,Avatar,178.0,PG-13,,,Joel David Moore,936.0
2,0,3,2009.0,Avatar,178.0,PG-13,,,Wes Studi,855.0
3,1,1,2007.0,Pirates of the Caribbean: At World's End,169.0,PG-13,Gore Verbinski,563.0,Johnny Depp,40000.0
4,1,2,2007.0,Pirates of the Caribbean: At World's End,169.0,PG-13,,,Orlando Bloom,5000.0
5,1,3,2007.0,Pirates of the Caribbean: At World's End,169.0,PG-13,,,Jack Davenport,1000.0
6,2,1,2015.0,Spectre,148.0,PG-13,Sam Mendes,0.0,Christoph Waltz,11000.0
7,2,2,2015.0,Spectre,148.0,PG-13,,,Rory Kinnear,393.0
8,2,3,2015.0,Spectre,148.0,PG-13,,,Stephanie Sigman,161.0


In [1444]:
movie_table = movie_long[['id', 'year', 'duration', 'rating']]
movie_table.head()

Unnamed: 0,id,year,duration,rating
0,0,2009.0,178.0,PG-13
1,0,2009.0,178.0,PG-13
2,0,2009.0,178.0,PG-13
3,1,2007.0,169.0,PG-13
4,1,2007.0,169.0,PG-13


In [1445]:
director_table = movie_long[['id', 'num',
'director', 'director_fb_likes']]

In [1446]:
director_table.head()

Unnamed: 0,id,num,director,director_fb_likes
0,0,1,James Cameron,0.0
1,0,2,,
2,0,3,,
3,1,1,Gore Verbinski,563.0
4,1,2,,


In [1447]:
actor_table = movie_long[['id', 'num',
'actor', 'actor_fb_likes']]

In [1448]:
actor_table.head()

Unnamed: 0,id,num,actor,actor_fb_likes
0,0,1,CCH Pounder,1000.0
1,0,2,Joel David Moore,936.0
2,0,3,Wes Studi,855.0
3,1,1,Johnny Depp,40000.0
4,1,2,Orlando Bloom,5000.0


In [1449]:
movie_entity = movie_table.drop_duplicates().reset_index(drop=True)
movie_entity.head()

Unnamed: 0,id,year,duration,rating
0,0,2009.0,178.0,PG-13
1,1,2007.0,169.0,PG-13
2,2,2015.0,148.0,PG-13
3,3,2012.0,164.0,PG-13
4,4,,,


In [1450]:
director_entity = director_table.dropna() \
 .reset_index(drop=True)
director_entity

Unnamed: 0,id,num,director,director_fb_likes
0,0,1,James Cameron,0.0
1,1,1,Gore Verbinski,563.0
2,2,1,Sam Mendes,0.0
3,3,1,Christopher Nolan,22000.0
4,4,1,Doug Walker,131.0
...,...,...,...,...
4809,4910,1,Edward Burns,0.0
4810,4911,1,Scott Smith,2.0
4811,4913,1,Benjamin Roberds,0.0
4812,4914,1,Daniel Hsia,0.0


In [1451]:
actor_entity= actor_table.dropna().reset_index(drop=True)
actor_entity.head()

Unnamed: 0,id,num,actor,actor_fb_likes
0,0,1,CCH Pounder,1000.0
1,0,2,Joel David Moore,936.0
2,0,3,Wes Studi,855.0
3,1,1,Johnny Depp,40000.0
4,1,2,Orlando Bloom,5000.0


In [1452]:
movie.memory_usage(deep=True).sum()

2280714

In [1453]:
movie_table.memory_usage(deep=True).sum() + \
director_table.memory_usage(deep=True).sum() + \
actor_table.memory_usage(deep=True).sum()

3499072

In [1454]:
director_cat = pd.Categorical(director_table['director'])

In [1455]:
director_cat

[James Cameron, NaN, NaN, Gore Verbinski, NaN, ..., NaN, NaN, Jon Gunn, NaN, NaN]
Length: 14748
Categories (2397, object): [A. Raven Cruz, Aaron Hann, Aaron Schneider, Aaron Seltzer, ..., Álex de la Iglesia, Émile Gaudreault, Éric Tessier, Étienne Faure]

In [1456]:
director_table.insert(1, 'director_id', director_cat.codes)

In [1457]:
director_table

Unnamed: 0,id,director_id,num,director,director_fb_likes
0,0,922,1,James Cameron,0.0
1,0,-1,2,,
2,0,-1,3,,
3,1,794,1,Gore Verbinski,563.0
4,1,-1,2,,
...,...,...,...,...,...
14743,4914,-1,2,,
14744,4914,-1,3,,
14745,4915,1158,1,Jon Gunn,16.0
14746,4915,-1,2,,


In [1458]:
actor_cat = pd.Categorical(actor_table['actor'])
actor_cat

[CCH Pounder, Joel David Moore, Wes Studi, Johnny Depp, Orlando Bloom, ..., Daniel Henney, Eliza Coupe, John August, Brian Herzlinger, Jon Gunn]
Length: 14748
Categories (6250, object): [50 Cent, A. Michael Baldwin, A.J. Buckley, A.J. DeLucia, ..., Ángela Molina, Émilie Dequenne, Ólafur Darri Ólafsson, Óscar Jaenada]

In [1459]:
actor_table.insert(1, 'actor_id', actor_cat.codes)
actor_table.head()

Unnamed: 0,id,actor_id,num,actor,actor_fb_likes
0,0,824,1,CCH Pounder,1000.0
1,0,2867,2,Joel David Moore,936.0
2,0,6099,3,Wes Studi,855.0
3,1,2971,1,Johnny Depp,40000.0
4,1,4536,2,Orlando Bloom,5000.0


In [1460]:
director_associative = director_table[['id', 'director_id', 'num']]

In [1461]:
dcols = ['director_id', 'director', 'director_fb_likes']

In [1462]:
director_unique = director_table[dcols].drop_duplicates() \
 .reset_index(drop=True)
director_unique.head()

Unnamed: 0,director_id,director,director_fb_likes
0,922,James Cameron,0.0
1,-1,,
2,794,Gore Verbinski,563.0
3,2020,Sam Mendes,0.0
4,373,Christopher Nolan,22000.0


In [1463]:
actor_associative = actor_table[['id', 'actor_id', 'num']]
actor_associative.head()

Unnamed: 0,id,actor_id,num
0,0,824,1
1,0,2867,2
2,0,6099,3
3,1,2971,1
4,1,4536,2


In [1464]:
acols = ['actor_id', 'actor', 'actor_fb_likes']
acols

['actor_id', 'actor', 'actor_fb_likes']

In [1465]:
actor_unique = actor_table[acols].drop_duplicates() \
.reset_index(drop=True)
actor_unique.head()

Unnamed: 0,actor_id,actor,actor_fb_likes
0,824,CCH Pounder,1000.0
1,2867,Joel David Moore,936.0
2,6099,Wes Studi,855.0
3,2971,Johnny Depp,40000.0
4,4536,Orlando Bloom,5000.0


In [1466]:
movie_table.memory_usage(deep=True).sum() + \
 director_associative.memory_usage(deep=True).sum() + \
 director_unique.memory_usage(deep=True).sum() + \
 actor_associative.memory_usage(deep=True).sum() + \
 actor_unique.memory_usage(deep=True).sum()

#deep = True to get memory from all kinds of dtypes including objects

2324279

In [1467]:
actors.head()

Unnamed: 0_level_0,actor_1,actor_2,actor_3,actor_fb_likes_1,actor_fb_likes_2,actor_fb_likes_3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Doug Walker,Rob Walker,,131.0,12.0,


In [1468]:
actor_associative.head()

Unnamed: 0,id,actor_id,num
0,0,824,1
1,0,2867,2
2,0,6099,3
3,1,2971,1
4,1,4536,2


In [1469]:
actor_unique.head()

Unnamed: 0,actor_id,actor,actor_fb_likes
0,824,CCH Pounder,1000.0
1,2867,Joel David Moore,936.0
2,6099,Wes Studi,855.0
3,2971,Johnny Depp,40000.0
4,4536,Orlando Bloom,5000.0


In [1470]:
actors = actor_associative.merge(actor_unique, on='actor_id') \
 .drop('actor_id', 1) \
 .pivot_table(index='id',
 columns='num',
 aggfunc='first')
actors.head()

Unnamed: 0_level_0,actor,actor,actor,actor_fb_likes,actor_fb_likes,actor_fb_likes
num,1,2,3,1,2,3
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Doug Walker,Rob Walker,,131.0,12.0,


In [1471]:
df = pd.DataFrame([[1, 2, "A"], [1, 4, "A"], [5, 6, "B"]])
df

Unnamed: 0,0,1,2
0,1,2,A
1,1,4,A
2,5,6,B


In [1472]:
df.pivot_table(index=0, values=1, columns=2)  # default aggfunc is 'mean'

2,A,B
0,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.0,
5,,6.0


In [1473]:
df.pivot_table(index=0, values=1, columns=2, aggfunc='first')

2,A,B
0,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.0,
5,,6.0


In [1474]:
actors.columns = actors.columns.get_level_values(0) + '_' + \
 actors.columns.get_level_values(1).astype(str)

In [1475]:
actors.head()

Unnamed: 0_level_0,actor_1,actor_2,actor_3,actor_fb_likes_1,actor_fb_likes_2,actor_fb_likes_3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Doug Walker,Rob Walker,,131.0,12.0,


In [1476]:
directors= director_associative.merge(director_unique,
on='director_id') \
.drop('director_id', 1) \
.pivot_table(index='id',
columns='num',
aggfunc='first')
directors.head()

Unnamed: 0_level_0,director,director_fb_likes
num,1,1
id,Unnamed: 1_level_2,Unnamed: 2_level_2
0,James Cameron,0.0
1,Gore Verbinski,563.0
2,Sam Mendes,0.0
3,Christopher Nolan,22000.0
4,Doug Walker,131.0


In [1477]:
#get_level_values why?

In [1478]:
directors.columns = directors.columns.get_level_values(0) + '_' + \
 directors.columns.get_level_values(1) \
 .astype(str)

In [1479]:
directors.head()

Unnamed: 0_level_0,director_1,director_fb_likes_1
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,James Cameron,0.0
1,Gore Verbinski,563.0
2,Sam Mendes,0.0
3,Christopher Nolan,22000.0
4,Doug Walker,131.0


In [1480]:
movie2 = movie_table.merge(directors.reset_index(),
on='id', how='left') \
.merge(actors.reset_index(),
on='id', how='left')

In [1481]:
movie2.head()

Unnamed: 0,id,year,duration,rating,director_1,director_fb_likes_1,actor_1,actor_2,actor_3,actor_fb_likes_1,actor_fb_likes_2,actor_fb_likes_3
0,0,2009.0,178.0,PG-13,James Cameron,0.0,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,0,2009.0,178.0,PG-13,James Cameron,0.0,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
2,0,2009.0,178.0,PG-13,James Cameron,0.0,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
3,1,2007.0,169.0,PG-13,Gore Verbinski,563.0,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
4,1,2007.0,169.0,PG-13,Gore Verbinski,563.0,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0


In [1482]:
movie.equals(movie2[movie.columns]) 
#shoul

KeyError: "['title'] not in index"