# Agenda

1. Pivot tables
2. Stack, unstack, melt
3. Joins
4. `filter` on columns
5. Cleaning data

In [1]:
import numpy as np
import pandas as pd 
from pandas import Series, DataFrame

In [2]:
df = DataFrame(np.random.randint(0, 1000, [4,5]),
              index=list('abcd'),
              columns=list('vwxyz'))
df

Unnamed: 0,v,w,x,y,z
a,906,704,539,445,618
b,692,477,543,649,594
c,853,732,548,193,643
d,501,406,614,962,466


In [3]:
df.loc['e'] = [1,2,3,4,5]
df

Unnamed: 0,v,w,x,y,z
a,906,704,539,445,618
b,692,477,543,649,594
c,853,732,548,193,643
d,501,406,614,962,466
e,1,2,3,4,5


In [4]:
new_row = {'w':20, 'x':30, 'z':50}
df.loc['f'] = new_row

In [5]:
df

Unnamed: 0,v,w,x,y,z
a,906.0,704,539,445.0,618
b,692.0,477,543,649.0,594
c,853.0,732,548,193.0,643
d,501.0,406,614,962.0,466
e,1.0,2,3,4.0,5
f,,20,30,,50


In [6]:
df['w'] = [10, 20, 30, 40, 50, 60]
df

Unnamed: 0,v,w,x,y,z
a,906.0,10,539,445.0,618
b,692.0,20,543,649.0,594
c,853.0,30,548,193.0,643
d,501.0,40,614,962.0,466
e,1.0,50,3,4.0,5
f,,60,30,,50


In [7]:
df['u'] = {'a':100, 'c':300, 'e':500}

In [8]:
df

Unnamed: 0,v,w,x,y,z,u
a,906.0,10,539,445.0,618,100.0
b,692.0,20,543,649.0,594,
c,853.0,30,548,193.0,643,300.0
d,501.0,40,614,962.0,466,
e,1.0,50,3,4.0,5,500.0
f,,60,30,,50,


# Grouping

If we have a data frame with:
- One categorical column
- One numeric column

We can use `groupby` to calculate an aggregation method once per category, for all numeric rows matching.

In [9]:
df = pd.read_csv('taxi.csv')
df.groupby('passenger_count')['total_amount'].mean()

passenger_count
0    25.570000
1    17.368569
2    18.406306
3    17.994704
4    18.881648
5    17.211269
6    17.401355
Name: total_amount, dtype: float64

In [10]:
# we can do a 2D grouping

df.groupby(['VendorID', 'passenger_count'])['total_amount'].mean()

VendorID  passenger_count
1         0                  25.570000
          1                  16.941386
          2                  19.076807
          3                  19.002803
          4                  20.518657
          5                  20.466667
2         1                  17.904989
          2                  17.855770
          3                  17.359076
          4                  17.927913
          5                  17.192379
          6                  17.401355
Name: total_amount, dtype: float64

# Pivot table

A pivot table is a 2-dimensional grouping!  It's similar to our multi-index, but uses a table

- We need one categorical column -- this will be the index, for the rows
- A second categorical column - this will be for the columns
- A numeric column
- Aggregation method

In [11]:
df.pivot_table(index='VendorID', columns='passenger_count', values='total_amount')

passenger_count,0,1,2,3,4,5,6
VendorID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,25.57,16.941386,19.076807,19.002803,20.518657,20.466667,
2,,17.904989,17.85577,17.359076,17.927913,17.192379,17.401355


In [12]:
df.groupby(['VendorID', 'passenger_count'])['total_amount'].mean()

VendorID  passenger_count
1         0                  25.570000
          1                  16.941386
          2                  19.076807
          3                  19.002803
          4                  20.518657
          5                  20.466667
2         1                  17.904989
          2                  17.855770
          3                  17.359076
          4                  17.927913
          5                  17.192379
          6                  17.401355
Name: total_amount, dtype: float64

In [13]:
df.pivot(index='VendorID', columns='passenger_count', values='total_amount')

ValueError: Index contains duplicate entries, cannot reshape

In [15]:
df.pivot_table(index='VendorID', columns='passenger_count', values='total_amount',
              aggfunc='mean')

passenger_count,0,1,2,3,4,5,6
VendorID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,25.57,16.941386,19.076807,19.002803,20.518657,20.466667,
2,,17.904989,17.85577,17.359076,17.927913,17.192379,17.401355


In [16]:
df.pivot_table(index='VendorID', columns='passenger_count', values='total_amount',
              aggfunc='max')

passenger_count,0,1,2,3,4,5,6
VendorID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,36.39,252.35,102.35,73.55,73.84,27.3,
2,,137.59,138.84,74.46,72.92,102.11,83.12


In [18]:
# what if we want both mean + standard deviation?
df.pivot_table(columns='VendorID', index='passenger_count', values='total_amount',
              aggfunc=['mean', 'std'])

Unnamed: 0_level_0,mean,mean,std,std
VendorID,1,2,1,2
passenger_count,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,25.57,,15.301791,
1,16.941386,17.904989,15.369459,15.16749
2,19.076807,17.85577,16.049125,15.23353
3,19.002803,17.359076,15.389968,12.540837
4,20.518657,17.927913,15.845568,14.630713
5,20.466667,17.192379,6.751543,14.064202
6,,17.401355,,13.363827


In [19]:
# what if we want to look at more than one value column?
df.pivot_table(columns='VendorID', index='passenger_count', values=['total_amount', 'trip_distance'],
              aggfunc='mean')

Unnamed: 0_level_0,total_amount,total_amount,trip_distance,trip_distance
VendorID,1,2,1,2
passenger_count,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,25.57,,4.6,
1,16.941386,17.904989,2.956456,3.262967
2,19.076807,17.85577,3.452027,3.328849
3,19.002803,17.359076,3.588535,3.187189
4,20.518657,17.927913,3.952239,3.440522
5,20.466667,17.192379,4.933333,3.172553
6,,17.401355,,3.170976


In [23]:
# what if we want to look at more than one category for columns?
df.pivot_table(columns=['VendorID', 'payment_type'], index='passenger_count', values='total_amount',
              aggfunc='mean')

VendorID,1,1,1,1,2,2,2,2
payment_type,1,2,3,4,1,2,3,4
passenger_count,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,25.57,,,,,,,
1,19.673469,12.698253,13.215172,11.675,20.14739,14.510453,-5.3,-7.8
2,21.338436,16.006464,42.48,,20.623066,13.905152,,
3,19.225065,18.763418,,20.8,18.548926,15.5862,,
4,24.138387,17.401667,,,21.539123,14.378966,,
5,,20.466667,,,19.63475,13.225076,,
6,,,,,18.980288,15.361491,,


In [22]:
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'pickup_longitude',
       'pickup_latitude', 'RateCodeID', 'store_and_fwd_flag',
       'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
       'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount'],
      dtype='object')

In [24]:
# everything can be a list!
df.pivot_table(columns='VendorID', index=['passenger_count', 'payment_type'],
               values=['total_amount', 'trip_distance'],
              aggfunc=['mean', 'std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean,mean,mean,std,std,std,std
Unnamed: 0_level_1,Unnamed: 1_level_1,total_amount,total_amount,trip_distance,trip_distance,total_amount,total_amount,trip_distance,trip_distance
Unnamed: 0_level_2,VendorID,1,2,1,2,1,2,1,2
passenger_count,payment_type,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
0,1,25.57,,4.6,,15.301791,,4.666905,
1,1,19.673469,20.14739,3.346904,3.486413,17.40171,16.330567,4.120243,4.23813
1,2,12.698253,14.510453,2.358766,2.925338,10.068471,12.397838,3.495789,3.973524
1,3,13.215172,-5.3,2.0,0.465,15.363039,2.828427,3.3,0.657609
1,4,11.675,-7.8,1.925,0.89,6.75,,1.96023,
2,1,21.338436,20.623066,3.46135,3.751722,17.341582,16.457943,4.038718,4.584112
2,2,16.006464,13.905152,3.346388,2.725152,13.427005,12.278545,4.50227,3.465173
2,3,42.48,,11.7,,27.06136,,9.134002,
3,1,19.225065,18.548926,3.11039,3.278926,14.268286,13.073727,3.406595,3.529512
3,2,18.763418,15.5862,4.05443,3.0505,16.588856,11.539061,5.085722,3.363113


In [25]:
pt = df.pivot_table(columns='VendorID', index=['passenger_count', 'payment_type'],
               values=['total_amount', 'trip_distance'],
              aggfunc=['mean', 'std'])

In [29]:
# passenger_count 4
# payment_type 1

# function mean
# column total_amount
# vendor ID 2

pt.loc[
   (4, 1)   # row selector
,
   # column selector
   ('mean', 'total_amount', 2)
]

21.539122807017545

In [30]:
# I want payment type 1 for all passenger counts

pt.xs(1, level='payment_type')

Unnamed: 0_level_0,mean,mean,mean,mean,std,std,std,std
Unnamed: 0_level_1,total_amount,total_amount,trip_distance,trip_distance,total_amount,total_amount,trip_distance,trip_distance
VendorID,1,2,1,2,1,2,1,2
passenger_count,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
0,25.57,,4.6,,15.301791,,4.666905,
1,19.673469,20.14739,3.346904,3.486413,17.40171,16.330567,4.120243,4.23813
2,21.338436,20.623066,3.46135,3.751722,17.341582,16.457943,4.038718,4.584112
3,19.225065,18.548926,3.11039,3.278926,14.268286,13.073727,3.406595,3.529512
4,24.138387,21.539123,4.348387,4.059298,18.045435,17.046583,4.541429,4.831724
5,,19.63475,,3.51875,,15.734323,,4.261502
6,,18.980288,,3.178606,,13.856405,,3.499148


In [31]:
# we can use xs on columns, too!

pt.xs('trip_distance', level=1, axis='columns')

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean,std,std
Unnamed: 0_level_1,VendorID,1,2,1,2
passenger_count,payment_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,1,4.6,,4.666905,
1,1,3.346904,3.486413,4.120243,4.23813
1,2,2.358766,2.925338,3.495789,3.973524
1,3,2.0,0.465,3.3,0.657609
1,4,1.925,0.89,1.96023,
2,1,3.46135,3.751722,4.038718,4.584112
2,2,3.346388,2.725152,4.50227,3.465173
2,3,11.7,,9.134002,
3,1,3.11039,3.278926,3.406595,3.529512
3,2,4.05443,3.0505,5.085722,3.363113


# Exercise: Olympic pivot tables

1. Load the Olympic data (`olympic_athlete_events.csv`) into a data frame. Keep only the rows for the teams from : Israel, US, UK, France, India, China.
2. Create a pivot table of teams vs. sport, average height, where the year is equal to or after 1980, and the sport has `i` in its name.
3. Show a pivot table of teams vs. sport, min + max weight. 
4. Show a pivot table of teams vs. sport, min and max weight + height.
5. Show a pivot table of years vs. teams, number of competitors.  Retrieve the number for Israel in each year.

In [32]:
pt = df.pivot_table(columns='VendorID', index=['passenger_count', 'payment_type'],
               values=['total_amount', 'trip_distance'],
              aggfunc=['mean', 'std'])

In [34]:
df = pd.read_csv('olympic_athlete_events.csv',
                usecols=['Name', 'Age', 'Height', 'Weight', 'Team', 'Year', 'Sport'])
df.columns

Index(['Name', 'Age', 'Height', 'Weight', 'Team', 'Year', 'Sport'], dtype='object')

In [39]:
# keep only rows from 1980-
df = df.loc[df['Year'] >= 1980]

In [43]:
# keep only from countries: Israel, US, UK, France, India, China

df = df.loc[df['Team'].isin(['United States', 'Israel', 'United Kingdom', 'France', 'India', 'China'])]
df.head(10)

Unnamed: 0,Name,Age,Height,Weight,Team,Year,Sport
0,A Dijiang,24.0,180.0,80.0,China,1992,Basketball
1,A Lamusi,23.0,170.0,60.0,China,2012,Judo
10,Per Knut Aaland,31.0,188.0,75.0,United States,1992,Cross Country Skiing
11,Per Knut Aaland,31.0,188.0,75.0,United States,1992,Cross Country Skiing
12,Per Knut Aaland,31.0,188.0,75.0,United States,1992,Cross Country Skiing
13,Per Knut Aaland,31.0,188.0,75.0,United States,1992,Cross Country Skiing
14,Per Knut Aaland,33.0,188.0,75.0,United States,1994,Cross Country Skiing
15,Per Knut Aaland,33.0,188.0,75.0,United States,1994,Cross Country Skiing
16,Per Knut Aaland,33.0,188.0,75.0,United States,1994,Cross Country Skiing
17,Per Knut Aaland,33.0,188.0,75.0,United States,1994,Cross Country Skiing


In [44]:
df['Team'].value_counts()

Team
United States    8858
France           5550
China            4874
India             759
Israel            474
Name: count, dtype: int64

In [49]:
# keep only sports with "i" in the name

df = df.loc[df['Sport'].str.contains('i')]
df.head()

Unnamed: 0,Name,Age,Height,Weight,Team,Year,Sport
10,Per Knut Aaland,31.0,188.0,75.0,United States,1992,Cross Country Skiing
11,Per Knut Aaland,31.0,188.0,75.0,United States,1992,Cross Country Skiing
12,Per Knut Aaland,31.0,188.0,75.0,United States,1992,Cross Country Skiing
13,Per Knut Aaland,31.0,188.0,75.0,United States,1992,Cross Country Skiing
14,Per Knut Aaland,33.0,188.0,75.0,United States,1994,Cross Country Skiing


In [50]:
# 2. Create a pivot table of teams vs. sport, average height, where the year is equal to or after 1980, and the sport has `i` in its name.

# index = Sport
# columns = Team
# values = Height
# aggregation method =  mean

df.pivot_table(index='Sport',
              columns='Team',
              values='Height',
              aggfunc='mean')




Team,China,France,India,Israel,United States
Sport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alpine Skiing,171.166667,173.445255,167.6,181.0,174.090379
Athletics,174.985646,177.066901,171.186047,177.166667,178.065041
Badminton,176.44898,173.65,175.0,171.0,170.818182
Biathlon,167.177215,173.54065,,,173.824645
Bobsleigh,,182.368421,,,
Boxing,176.5,175.033333,171.484848,171.0,175.989583
Canoeing,179.366972,179.234973,,180.6,178.350318
Cross Country Skiing,169.520408,172.902834,167.666667,,173.645503
Curling,172.411765,177.1,,,174.510638
Cycling,172.37234,176.2,,164.5,176.320346


In [52]:
# 3. Show a pivot table of teams vs. sport, min + max weight. 

# index = Sport
# columns = Team
# values = Weight
# aggfunc = ['min', 'max']

df.pivot_table(index='Sport',
              columns='Team',
              values='Weight',
              aggfunc=['min', 'max'])

Unnamed: 0_level_0,min,min,min,min,min,max,max,max,max,max
Team,China,France,India,Israel,United States,China,France,India,Israel,United States
Sport,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Alpine Skiing,60.0,47.0,54.0,74.0,53.0,75.0,98.0,71.0,74.0,102.0
Athletics,42.0,43.0,43.0,44.0,41.0,120.0,116.0,125.0,96.0,158.0
Badminton,52.0,55.0,59.0,61.0,52.0,81.0,80.0,85.0,61.0,91.0
Biathlon,52.0,52.0,,,48.0,71.0,80.0,,,84.0
Bobsleigh,,70.0,,,,,123.0,,,
Boxing,48.0,48.0,48.0,48.0,48.0,111.0,105.0,91.0,75.0,116.0
Canoeing,55.0,53.0,,64.0,55.0,92.0,95.0,,89.0,98.0
Cross Country Skiing,49.0,50.0,62.0,,48.0,83.0,81.0,74.0,,86.0
Curling,60.0,62.0,,,52.0,80.0,80.0,,,102.0
Cycling,51.0,47.0,,58.0,48.0,90.0,100.0,,59.0,104.0


In [53]:
# 4. Show a pivot table of teams vs. sport, min and max weight + height.

# index = 'Sport'
# columns = Team
# values = ['Weight', 'Height']
# aggfunc = ['min', 'max']

df.pivot_table(index='Sport',
              columns='Team',
              values=['Weight', 'Height'],
              aggfunc=['min', 'max'])

Unnamed: 0_level_0,min,min,min,min,min,min,min,min,min,min,max,max,max,max,max,max,max,max,max,max
Unnamed: 0_level_1,Height,Height,Height,Height,Height,Weight,Weight,Weight,Weight,Weight,Height,Height,Height,Height,Height,Weight,Weight,Weight,Weight,Weight
Team,China,France,India,Israel,United States,China,France,India,Israel,United States,China,France,India,Israel,United States,China,France,India,Israel,United States
Sport,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3
Alpine Skiing,160.0,154.0,162.0,181.0,160.0,60.0,47.0,54.0,74.0,53.0,180.0,191.0,177.0,181.0,193.0,75.0,98.0,71.0,74.0,102.0
Athletics,155.0,155.0,152.0,148.0,150.0,42.0,43.0,43.0,44.0,41.0,202.0,198.0,196.0,202.0,206.0,120.0,116.0,125.0,96.0,158.0
Badminton,161.0,164.0,160.0,171.0,157.0,52.0,55.0,59.0,61.0,52.0,190.0,186.0,189.0,171.0,187.0,81.0,80.0,85.0,61.0,91.0
Biathlon,160.0,160.0,,,152.0,52.0,52.0,,,48.0,175.0,188.0,,,190.0,71.0,80.0,,,84.0
Bobsleigh,,167.0,,,,,70.0,,,,,198.0,,,,,123.0,,,
Boxing,165.0,158.0,158.0,165.0,155.0,48.0,48.0,48.0,48.0,48.0,200.0,200.0,191.0,181.0,201.0,111.0,105.0,91.0,75.0,116.0
Canoeing,160.0,160.0,,165.0,157.0,55.0,53.0,,64.0,55.0,194.0,203.0,,192.0,205.0,92.0,95.0,,89.0,98.0
Cross Country Skiing,159.0,155.0,155.0,,157.0,49.0,50.0,62.0,,48.0,188.0,186.0,175.0,,193.0,83.0,81.0,74.0,,86.0
Curling,162.0,174.0,,,155.0,60.0,62.0,,,52.0,185.0,183.0,,,188.0,80.0,80.0,,,102.0
Cycling,160.0,155.0,,162.0,155.0,51.0,47.0,,58.0,48.0,189.0,192.0,,167.0,196.0,90.0,100.0,,59.0,104.0


In [57]:
# 5. Show a pivot table of years vs. teams, number of competitors.
# Retrieve the number for Israel in each year.

# index = Year
# column = Team
# values = Name
# aggfunc = count

df.pivot_table(index='Year',
              columns='Team',
              values='Name',
              aggfunc='count').fillna(0).astype(int)

Team,China,France,India,Israel,United States
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1980,38,230,36,0,106
1984,312,367,37,63,642
1988,348,452,36,29,712
1992,394,542,39,40,766
1994,43,131,0,1,187
1996,320,385,28,36,611
1998,68,123,0,3,212
2000,296,381,54,55,572
2002,83,138,0,3,231
2004,372,376,52,40,571


In [60]:
# 5. Show a pivot table of years vs. teams, number of competitors.
# Retrieve the number for Israel in each year.

# index = Year
# column = Team
# values = Name
# aggfunc = count

df.pivot_table(index='Year',
              columns='Team',
              values='Name',
              aggfunc=['min', 'max'])

Unnamed: 0_level_0,min,min,min,min,min,max,max,max,max,max
Team,China,France,India,Israel,United States,China,France,India,Israel,United States
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
1980,Bao Zhenghua,Alain Bondue,Adille Sumariwalla,,"Abigail E. ""Abbi"" Fisher (-Gould)",Zhao Weichang,Yvon Mougel,Tamil Selwan Muniswamy,,"William Nourse ""Billy"" Taylor"
1984,Bao Zhenghua,Agns Gosselin (-Tavernier),Baljit Singh Kharab,Arie Gamliel,Abdurrahim Kuzu,Zou Zhenxian,velyne Imbert,Vandana Rao,Zehava Shmueli,"Willie James Smith, III"
1988,Cai Jianming,Agns Gosselin (-Tavernier),"Alapurackal ""Mercy"" Kuttan-Mathews",Aharon Jacobashvili,"Abigail Knickerbocker ""Abby"" Peck",Zhuang Yong,velyne lien,Vinod Kumar,Yoel Sela,Zina Lynna Garrison (-Jackson)
1992,Bai Chongguang,"Abdelghani ""Ghani"" Yalouz",Abha Dhillan,Aleksey Bazarov,Aaron E. Pollock,Zou Sixin,ric Paul Alain Navet,Vimal Kumar,Yoel Sela,Zina Lynna Garrison (-Jackson)
1994,Chen Lu,Alexis Blanc,,Michael Shmerkin,"Alva Ross ""A J"" Kitt, IV",Zhao Guona,tienne Gouy,,Michael Shmerkin,Troy Robert Benson
1996,Ba Yanchuan,Abdel Kader Chkhmani,Ambika Radhika,Alex Tripolski,Adam Christophe Saathoff,Zou Sixin,velyne lien,Varalakshimi Pandimukkala Venkata,Yoav Bruck,Zahir A. Raheem
1998,An Yulong,Adrien Duvillard,,Galit Chait,Adam Hostetter,Zhao Hongbo,ric Le Chanony,,Sergey Sakhnovsky,"William ""Bill"" Demong"
2000,Abudoureheman,Abdel Jebahi,Abhinav Bindra,Adi Maia Bichman,Aaron Wells Peirsol,Zhu Yi,ric le Leuch,Vinita Tripathi,Yuriy Yevseychyk,Yolanda Nicole Gamble
2002,An Yulong,Alexandre Rousselet,,Olga Danilov,Aelin Peterson,Zhang Xiaolei,Vincent Vittoz,,Olga Danilov,"William Joseph N. ""Joey"" Cheek"
2004,Ai Linuer,Abderrahim El Haouzy,Abhinav Bindra,"Aleksandr ""Alex"" Averbukh",Aaron Wells Peirsol,Zou Shiming,ric Paul Alain Navet,Yogeshwar Dutt,Yuriy Yevseychyk,Yuliana Y. Martinez Perez


In [63]:
# get data for Israel

pt = df.pivot_table(index='Year',
              columns='Team',
              values='Name',
              aggfunc=['min', 'max'])

pt.xs('Israel', level='Team', axis='columns')

Unnamed: 0_level_0,min,max
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1980,,
1984,Arie Gamliel,Zehava Shmueli
1988,Aharon Jacobashvili,Yoel Sela
1992,Aleksey Bazarov,Yoel Sela
1994,Michael Shmerkin,Michael Shmerkin
1996,Alex Tripolski,Yoav Bruck
1998,Galit Chait,Sergey Sakhnovsky
2000,Adi Maia Bichman,Yuriy Yevseychyk
2002,Olga Danilov,Olga Danilov
2004,"Aleksandr ""Alex"" Averbukh",Yuriy Yevseychyk


In [64]:
df = pd.read_csv('olympic_athlete_events.csv',
                usecols=['Name', 'Age', 'Height', 'Weight', 'Team', 'Year', 'Sport'])
df.columns

Index(['Name', 'Age', 'Height', 'Weight', 'Team', 'Year', 'Sport'], dtype='object')

In [65]:
# keep only rows from 1980-
df = df.loc[df['Year'] >= 1980]

In [69]:
# how do we get sports with just "i"

df.loc[df['Sport'].str.contains('i')]

Unnamed: 0,Name,Age,Height,Weight,Team,Year,Sport
4,Christine Jacoba Aaftink,21.0,185.0,82.0,Netherlands,1988,Speed Skating
5,Christine Jacoba Aaftink,21.0,185.0,82.0,Netherlands,1988,Speed Skating
6,Christine Jacoba Aaftink,25.0,185.0,82.0,Netherlands,1992,Speed Skating
7,Christine Jacoba Aaftink,25.0,185.0,82.0,Netherlands,1992,Speed Skating
8,Christine Jacoba Aaftink,27.0,185.0,82.0,Netherlands,1994,Speed Skating
...,...,...,...,...,...,...,...
271109,Aleksandr Viktorovich Zyuzin,28.0,183.0,72.0,Russia,2004,Rowing
271112,Piotr ya,27.0,176.0,59.0,Poland,2014,Ski Jumping
271113,Piotr ya,27.0,176.0,59.0,Poland,2014,Ski Jumping
271114,Tomasz Ireneusz ya,30.0,185.0,96.0,Poland,1998,Bobsleigh


In [70]:
# how do we get sports with "i" or "j"
# option 1: Use | (or)

df.loc[df['Sport'].str.contains('i')  | df['Sport'].str.contains('j')]

Unnamed: 0,Name,Age,Height,Weight,Team,Year,Sport
4,Christine Jacoba Aaftink,21.0,185.0,82.0,Netherlands,1988,Speed Skating
5,Christine Jacoba Aaftink,21.0,185.0,82.0,Netherlands,1988,Speed Skating
6,Christine Jacoba Aaftink,25.0,185.0,82.0,Netherlands,1992,Speed Skating
7,Christine Jacoba Aaftink,25.0,185.0,82.0,Netherlands,1992,Speed Skating
8,Christine Jacoba Aaftink,27.0,185.0,82.0,Netherlands,1994,Speed Skating
...,...,...,...,...,...,...,...
271109,Aleksandr Viktorovich Zyuzin,28.0,183.0,72.0,Russia,2004,Rowing
271112,Piotr ya,27.0,176.0,59.0,Poland,2014,Ski Jumping
271113,Piotr ya,27.0,176.0,59.0,Poland,2014,Ski Jumping
271114,Tomasz Ireneusz ya,30.0,185.0,96.0,Poland,1998,Bobsleigh


In [71]:
# option 2: Use a regular expression!

df.loc[df['Sport'].str.contains('[ij]')]

Unnamed: 0,Name,Age,Height,Weight,Team,Year,Sport
4,Christine Jacoba Aaftink,21.0,185.0,82.0,Netherlands,1988,Speed Skating
5,Christine Jacoba Aaftink,21.0,185.0,82.0,Netherlands,1988,Speed Skating
6,Christine Jacoba Aaftink,25.0,185.0,82.0,Netherlands,1992,Speed Skating
7,Christine Jacoba Aaftink,25.0,185.0,82.0,Netherlands,1992,Speed Skating
8,Christine Jacoba Aaftink,27.0,185.0,82.0,Netherlands,1994,Speed Skating
...,...,...,...,...,...,...,...
271109,Aleksandr Viktorovich Zyuzin,28.0,183.0,72.0,Russia,2004,Rowing
271112,Piotr ya,27.0,176.0,59.0,Poland,2014,Ski Jumping
271113,Piotr ya,27.0,176.0,59.0,Poland,2014,Ski Jumping
271114,Tomasz Ireneusz ya,30.0,185.0,96.0,Poland,1998,Bobsleigh


In [75]:
# what if we want both i and j

df.loc[df['Sport'].str.contains('i.*s|s.*i')]

Unnamed: 0,Name,Age,Height,Weight,Team,Year,Sport
10,Per Knut Aaland,31.0,188.0,75.0,United States,1992,Cross Country Skiing
11,Per Knut Aaland,31.0,188.0,75.0,United States,1992,Cross Country Skiing
12,Per Knut Aaland,31.0,188.0,75.0,United States,1992,Cross Country Skiing
13,Per Knut Aaland,31.0,188.0,75.0,United States,1992,Cross Country Skiing
14,Per Knut Aaland,33.0,188.0,75.0,United States,1994,Cross Country Skiing
...,...,...,...,...,...,...,...
271099,Stavroula Zygouri,36.0,171.0,63.0,Greece,2004,Wrestling
271102,Olesya Nikolayevna Zykina,19.0,171.0,64.0,Russia,2000,Athletics
271103,Olesya Nikolayevna Zykina,23.0,171.0,64.0,Russia,2004,Athletics
271114,Tomasz Ireneusz ya,30.0,185.0,96.0,Poland,1998,Bobsleigh


In [76]:
pt

Unnamed: 0_level_0,min,min,min,min,min,max,max,max,max,max
Team,China,France,India,Israel,United States,China,France,India,Israel,United States
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
1980,Bao Zhenghua,Alain Bondue,Adille Sumariwalla,,"Abigail E. ""Abbi"" Fisher (-Gould)",Zhao Weichang,Yvon Mougel,Tamil Selwan Muniswamy,,"William Nourse ""Billy"" Taylor"
1984,Bao Zhenghua,Agns Gosselin (-Tavernier),Baljit Singh Kharab,Arie Gamliel,Abdurrahim Kuzu,Zou Zhenxian,velyne Imbert,Vandana Rao,Zehava Shmueli,"Willie James Smith, III"
1988,Cai Jianming,Agns Gosselin (-Tavernier),"Alapurackal ""Mercy"" Kuttan-Mathews",Aharon Jacobashvili,"Abigail Knickerbocker ""Abby"" Peck",Zhuang Yong,velyne lien,Vinod Kumar,Yoel Sela,Zina Lynna Garrison (-Jackson)
1992,Bai Chongguang,"Abdelghani ""Ghani"" Yalouz",Abha Dhillan,Aleksey Bazarov,Aaron E. Pollock,Zou Sixin,ric Paul Alain Navet,Vimal Kumar,Yoel Sela,Zina Lynna Garrison (-Jackson)
1994,Chen Lu,Alexis Blanc,,Michael Shmerkin,"Alva Ross ""A J"" Kitt, IV",Zhao Guona,tienne Gouy,,Michael Shmerkin,Troy Robert Benson
1996,Ba Yanchuan,Abdel Kader Chkhmani,Ambika Radhika,Alex Tripolski,Adam Christophe Saathoff,Zou Sixin,velyne lien,Varalakshimi Pandimukkala Venkata,Yoav Bruck,Zahir A. Raheem
1998,An Yulong,Adrien Duvillard,,Galit Chait,Adam Hostetter,Zhao Hongbo,ric Le Chanony,,Sergey Sakhnovsky,"William ""Bill"" Demong"
2000,Abudoureheman,Abdel Jebahi,Abhinav Bindra,Adi Maia Bichman,Aaron Wells Peirsol,Zhu Yi,ric le Leuch,Vinita Tripathi,Yuriy Yevseychyk,Yolanda Nicole Gamble
2002,An Yulong,Alexandre Rousselet,,Olga Danilov,Aelin Peterson,Zhang Xiaolei,Vincent Vittoz,,Olga Danilov,"William Joseph N. ""Joey"" Cheek"
2004,Ai Linuer,Abderrahim El Haouzy,Abhinav Bindra,"Aleksandr ""Alex"" Averbukh",Aaron Wells Peirsol,Zou Shiming,ric Paul Alain Navet,Yogeshwar Dutt,Yuriy Yevseychyk,Yuliana Y. Martinez Perez


In [78]:
pt.T    # don't use () here!

Unnamed: 0_level_0,Year,1980,1984,1988,1992,1994,1996,1998,2000,2002,2004,2006,2008,2010,2012,2014,2016
Unnamed: 0_level_1,Team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
min,China,Bao Zhenghua,Bao Zhenghua,Cai Jianming,Bai Chongguang,Chen Lu,Ba Yanchuan,An Yulong,Abudoureheman,An Yulong,Ai Linuer,An Weijiang,Bai Xue,Cai Xuetong,Bai Anqi,Ba Dexin,Ai Yanhan
min,France,Alain Bondue,Agns Gosselin (-Tavernier),Agns Gosselin (-Tavernier),"Abdelghani ""Ghani"" Yalouz",Alexis Blanc,Abdel Kader Chkhmani,Adrien Duvillard,Abdel Jebahi,Alexandre Rousselet,Abderrahim El Haouzy,Alexandre Rousselet,Adrien Hardy,Adrien Thaux,Abdellatif Meftah,Adeline Baud (-Mugnier),Adrien Bart
min,India,Adille Sumariwalla,Baljit Singh Kharab,"Alapurackal ""Mercy"" Kuttan-Mathews",Abha Dhillan,,Ambika Radhika,,Abhinav Bindra,,Abhinav Bindra,Bahadur Gurung Gupta,Abhinav Bindra,Jamyang Namgial,Abhinav Bindra,Himanshu Thakur,Abhinav Bindra
min,Israel,,Arie Gamliel,Aharon Jacobashvili,Aleksey Bazarov,Michael Shmerkin,Alex Tripolski,Galit Chait,Adi Maia Bichman,Olga Danilov,"Aleksandr ""Alex"" Averbukh",Mikail Renzhin,"Aleksandr ""Alex"" Averbukh",Alexandra Zaretski,Aleksandr Shatilov,Alexei Yuryevich Bychenko,Ageze Guadie
min,United States,"Abigail E. ""Abbi"" Fisher (-Gould)",Abdurrahim Kuzu,"Abigail Knickerbocker ""Abby"" Peck",Aaron E. Pollock,"Alva Ross ""A J"" Kitt, IV",Adam Christophe Saathoff,Adam Hostetter,Aaron Wells Peirsol,Aelin Peterson,Aaron Wells Peirsol,"Abigail Jean ""Abby"" Larson",Aarik Wilson,Alice McKennis,"Abdihakim ""Abdi"" Abdirahman",Aaron Blunck,Abbey Weitzeil
max,China,Zhao Weichang,Zou Zhenxian,Zhuang Yong,Zou Sixin,Zhao Guona,Zou Sixin,Zhao Hongbo,Zhu Yi,Zhang Xiaolei,Zou Shiming,Zhang Zhongqi,Zou Shiming,Zhou Yang,Zou Shiming,Zhou Yang,Zu Lijun
max,France,Yvon Mougel,velyne Imbert,velyne lien,ric Paul Alain Navet,tienne Gouy,velyne lien,ric Le Chanony,ric le Leuch,Vincent Vittoz,ric Paul Alain Navet,milie Vina,lodie Gugan,milie Vina,tienne Daille,Xavier Bertoni,tienne Hubert
max,India,Tamil Selwan Muniswamy,Vandana Rao,Vinod Kumar,Vimal Kumar,,Varalakshimi Pandimukkala Venkata,,Vinita Tripathi,,Yogeshwar Dutt,Neha Ahuja,Yogeshwar Dutt,Tashi Lundup,Yogeshwar Dutt,Nadeem Iqbal,Yogeshwar Dutt
max,Israel,,Zehava Shmueli,Yoel Sela,Yoel Sela,Michael Shmerkin,Yoav Bruck,Sergey Sakhnovsky,Yuriy Yevseychyk,Olga Danilov,Yuriy Yevseychyk,Mikail Renzhin,Veronika Vitenberg,Roman Zaretski,Zohar Zemiro,Vladislav Bykanov,Zohar Hen Shikler
max,United States,"William Nourse ""Billy"" Taylor","Willie James Smith, III",Zina Lynna Garrison (-Jackson),Zina Lynna Garrison (-Jackson),Troy Robert Benson,Zahir A. Raheem,"William ""Bill"" Demong",Yolanda Nicole Gamble,"William Joseph N. ""Joey"" Cheek",Yuliana Y. Martinez Perez,"William Joseph N. ""Joey"" Cheek","Zsuzsanna ""Susan"" Francia",Wynn Andrew Roberts,"Zsuzsanna ""Susan"" Francia","William ""Bill"" Demong","Zachery ""Zach"" Ziemek"


In [79]:
pt.transpose()    # same as pt.T, note that we need parentheses

Unnamed: 0_level_0,Year,1980,1984,1988,1992,1994,1996,1998,2000,2002,2004,2006,2008,2010,2012,2014,2016
Unnamed: 0_level_1,Team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
min,China,Bao Zhenghua,Bao Zhenghua,Cai Jianming,Bai Chongguang,Chen Lu,Ba Yanchuan,An Yulong,Abudoureheman,An Yulong,Ai Linuer,An Weijiang,Bai Xue,Cai Xuetong,Bai Anqi,Ba Dexin,Ai Yanhan
min,France,Alain Bondue,Agns Gosselin (-Tavernier),Agns Gosselin (-Tavernier),"Abdelghani ""Ghani"" Yalouz",Alexis Blanc,Abdel Kader Chkhmani,Adrien Duvillard,Abdel Jebahi,Alexandre Rousselet,Abderrahim El Haouzy,Alexandre Rousselet,Adrien Hardy,Adrien Thaux,Abdellatif Meftah,Adeline Baud (-Mugnier),Adrien Bart
min,India,Adille Sumariwalla,Baljit Singh Kharab,"Alapurackal ""Mercy"" Kuttan-Mathews",Abha Dhillan,,Ambika Radhika,,Abhinav Bindra,,Abhinav Bindra,Bahadur Gurung Gupta,Abhinav Bindra,Jamyang Namgial,Abhinav Bindra,Himanshu Thakur,Abhinav Bindra
min,Israel,,Arie Gamliel,Aharon Jacobashvili,Aleksey Bazarov,Michael Shmerkin,Alex Tripolski,Galit Chait,Adi Maia Bichman,Olga Danilov,"Aleksandr ""Alex"" Averbukh",Mikail Renzhin,"Aleksandr ""Alex"" Averbukh",Alexandra Zaretski,Aleksandr Shatilov,Alexei Yuryevich Bychenko,Ageze Guadie
min,United States,"Abigail E. ""Abbi"" Fisher (-Gould)",Abdurrahim Kuzu,"Abigail Knickerbocker ""Abby"" Peck",Aaron E. Pollock,"Alva Ross ""A J"" Kitt, IV",Adam Christophe Saathoff,Adam Hostetter,Aaron Wells Peirsol,Aelin Peterson,Aaron Wells Peirsol,"Abigail Jean ""Abby"" Larson",Aarik Wilson,Alice McKennis,"Abdihakim ""Abdi"" Abdirahman",Aaron Blunck,Abbey Weitzeil
max,China,Zhao Weichang,Zou Zhenxian,Zhuang Yong,Zou Sixin,Zhao Guona,Zou Sixin,Zhao Hongbo,Zhu Yi,Zhang Xiaolei,Zou Shiming,Zhang Zhongqi,Zou Shiming,Zhou Yang,Zou Shiming,Zhou Yang,Zu Lijun
max,France,Yvon Mougel,velyne Imbert,velyne lien,ric Paul Alain Navet,tienne Gouy,velyne lien,ric Le Chanony,ric le Leuch,Vincent Vittoz,ric Paul Alain Navet,milie Vina,lodie Gugan,milie Vina,tienne Daille,Xavier Bertoni,tienne Hubert
max,India,Tamil Selwan Muniswamy,Vandana Rao,Vinod Kumar,Vimal Kumar,,Varalakshimi Pandimukkala Venkata,,Vinita Tripathi,,Yogeshwar Dutt,Neha Ahuja,Yogeshwar Dutt,Tashi Lundup,Yogeshwar Dutt,Nadeem Iqbal,Yogeshwar Dutt
max,Israel,,Zehava Shmueli,Yoel Sela,Yoel Sela,Michael Shmerkin,Yoav Bruck,Sergey Sakhnovsky,Yuriy Yevseychyk,Olga Danilov,Yuriy Yevseychyk,Mikail Renzhin,Veronika Vitenberg,Roman Zaretski,Zohar Zemiro,Vladislav Bykanov,Zohar Hen Shikler
max,United States,"William Nourse ""Billy"" Taylor","Willie James Smith, III",Zina Lynna Garrison (-Jackson),Zina Lynna Garrison (-Jackson),Troy Robert Benson,Zahir A. Raheem,"William ""Bill"" Demong",Yolanda Nicole Gamble,"William Joseph N. ""Joey"" Cheek",Yuliana Y. Martinez Perez,"William Joseph N. ""Joey"" Cheek","Zsuzsanna ""Susan"" Francia",Wynn Andrew Roberts,"Zsuzsanna ""Susan"" Francia","William ""Bill"" Demong","Zachery ""Zach"" Ziemek"


In [81]:
# stack -- take the 2nd layer of the column multi-index, and make it
# the 2nd layer of the row multi-index

pt.stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max
Year,Team,Unnamed: 2_level_1,Unnamed: 3_level_1
1980,China,Bao Zhenghua,Zhao Weichang
1980,France,Alain Bondue,Yvon Mougel
1980,India,Adille Sumariwalla,Tamil Selwan Muniswamy
1980,United States,"Abigail E. ""Abbi"" Fisher (-Gould)","William Nourse ""Billy"" Taylor"
1984,China,Bao Zhenghua,Zou Zhenxian
...,...,...,...
2016,China,Ai Yanhan,Zu Lijun
2016,France,Adrien Bart,tienne Hubert
2016,India,Abhinav Bindra,Yogeshwar Dutt
2016,Israel,Ageze Guadie,Zohar Hen Shikler


In [82]:
pt.stack().unstack()

Unnamed: 0_level_0,min,min,min,min,min,max,max,max,max,max
Team,China,France,India,Israel,United States,China,France,India,Israel,United States
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
1980,Bao Zhenghua,Alain Bondue,Adille Sumariwalla,,"Abigail E. ""Abbi"" Fisher (-Gould)",Zhao Weichang,Yvon Mougel,Tamil Selwan Muniswamy,,"William Nourse ""Billy"" Taylor"
1984,Bao Zhenghua,Agns Gosselin (-Tavernier),Baljit Singh Kharab,Arie Gamliel,Abdurrahim Kuzu,Zou Zhenxian,velyne Imbert,Vandana Rao,Zehava Shmueli,"Willie James Smith, III"
1988,Cai Jianming,Agns Gosselin (-Tavernier),"Alapurackal ""Mercy"" Kuttan-Mathews",Aharon Jacobashvili,"Abigail Knickerbocker ""Abby"" Peck",Zhuang Yong,velyne lien,Vinod Kumar,Yoel Sela,Zina Lynna Garrison (-Jackson)
1992,Bai Chongguang,"Abdelghani ""Ghani"" Yalouz",Abha Dhillan,Aleksey Bazarov,Aaron E. Pollock,Zou Sixin,ric Paul Alain Navet,Vimal Kumar,Yoel Sela,Zina Lynna Garrison (-Jackson)
1994,Chen Lu,Alexis Blanc,,Michael Shmerkin,"Alva Ross ""A J"" Kitt, IV",Zhao Guona,tienne Gouy,,Michael Shmerkin,Troy Robert Benson
1996,Ba Yanchuan,Abdel Kader Chkhmani,Ambika Radhika,Alex Tripolski,Adam Christophe Saathoff,Zou Sixin,velyne lien,Varalakshimi Pandimukkala Venkata,Yoav Bruck,Zahir A. Raheem
1998,An Yulong,Adrien Duvillard,,Galit Chait,Adam Hostetter,Zhao Hongbo,ric Le Chanony,,Sergey Sakhnovsky,"William ""Bill"" Demong"
2000,Abudoureheman,Abdel Jebahi,Abhinav Bindra,Adi Maia Bichman,Aaron Wells Peirsol,Zhu Yi,ric le Leuch,Vinita Tripathi,Yuriy Yevseychyk,Yolanda Nicole Gamble
2002,An Yulong,Alexandre Rousselet,,Olga Danilov,Aelin Peterson,Zhang Xiaolei,Vincent Vittoz,,Olga Danilov,"William Joseph N. ""Joey"" Cheek"
2004,Ai Linuer,Abderrahim El Haouzy,Abhinav Bindra,"Aleksandr ""Alex"" Averbukh",Aaron Wells Peirsol,Zou Shiming,ric Paul Alain Navet,Yogeshwar Dutt,Yuriy Yevseychyk,Yuliana Y. Martinez Perez


In [84]:
pt.stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max
Year,Team,Unnamed: 2_level_1,Unnamed: 3_level_1
1980,China,Bao Zhenghua,Zhao Weichang
1980,France,Alain Bondue,Yvon Mougel
1980,India,Adille Sumariwalla,Tamil Selwan Muniswamy
1980,United States,"Abigail E. ""Abbi"" Fisher (-Gould)","William Nourse ""Billy"" Taylor"
1984,China,Bao Zhenghua,Zou Zhenxian
...,...,...,...
2016,China,Ai Yanhan,Zu Lijun
2016,France,Adrien Bart,tienne Hubert
2016,India,Abhinav Bindra,Yogeshwar Dutt
2016,Israel,Ageze Guadie,Zohar Hen Shikler


In [85]:
pt.stack(level=0)

Unnamed: 0_level_0,Team,China,France,India,Israel,United States
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980,max,Zhao Weichang,Yvon Mougel,Tamil Selwan Muniswamy,,"William Nourse ""Billy"" Taylor"
1980,min,Bao Zhenghua,Alain Bondue,Adille Sumariwalla,,"Abigail E. ""Abbi"" Fisher (-Gould)"
1984,max,Zou Zhenxian,velyne Imbert,Vandana Rao,Zehava Shmueli,"Willie James Smith, III"
1984,min,Bao Zhenghua,Agns Gosselin (-Tavernier),Baljit Singh Kharab,Arie Gamliel,Abdurrahim Kuzu
1988,max,Zhuang Yong,velyne lien,Vinod Kumar,Yoel Sela,Zina Lynna Garrison (-Jackson)
1988,min,Cai Jianming,Agns Gosselin (-Tavernier),"Alapurackal ""Mercy"" Kuttan-Mathews",Aharon Jacobashvili,"Abigail Knickerbocker ""Abby"" Peck"
1992,max,Zou Sixin,ric Paul Alain Navet,Vimal Kumar,Yoel Sela,Zina Lynna Garrison (-Jackson)
1992,min,Bai Chongguang,"Abdelghani ""Ghani"" Yalouz",Abha Dhillan,Aleksey Bazarov,Aaron E. Pollock
1994,max,Zhao Guona,tienne Gouy,,Michael Shmerkin,Troy Robert Benson
1994,min,Chen Lu,Alexis Blanc,,Michael Shmerkin,"Alva Ross ""A J"" Kitt, IV"


# Exercise: Pivot tables + stacking + unstacking with taxis

1. Read `taxi.csv` into a data frame.
2. Create a pivot table with the rows being the passenger_count, the columns will be VendorID, the values will be the trip_distance and total_amount, and we want to calculate min, max, and mean.
3. Move the min/max/mean to be on the rows, rather than the columns.
4. Move the trip_distance and total_amount to be on the rows, rather than the columns.
5. After you've done 4, move the passenger_count to be on the columns.

In [86]:
df = pd.read_csv('taxi.csv')
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.0,0.0,0.3,17.8
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.0,0.0,0.3,8.3
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.76033,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3


In [87]:
# 2. Create a pivot table with the rows being the passenger_count, 
# the columns will be VendorID, the values will be the 
# trip_distance and total_amount, and we want to calculate min, max, and mean.

# index = passenger_count
# columns = VendorID
# values = ['trip_distance', 'total_amount']
# aggfunc = ['min', 'max', 'mean']

pt = df.pivot_table(index='passenger_count',
                   columns='VendorID',
                   values=['trip_distance', 'total_amount'],
                   aggfunc=['min', 'max', 'mean'])
pt

Unnamed: 0_level_0,min,min,min,min,max,max,max,max,mean,mean,mean,mean
Unnamed: 0_level_1,total_amount,total_amount,trip_distance,trip_distance,total_amount,total_amount,trip_distance,trip_distance,total_amount,total_amount,trip_distance,trip_distance
VendorID,1,2,1,2,1,2,1,2,1,2,1,2
passenger_count,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
0,14.75,,1.3,,36.39,,7.9,,25.57,,4.6,
1,0.3,-7.8,0.0,0.0,252.35,137.59,64.6,35.51,16.941386,17.904989,2.956456,3.262967
2,0.3,2.3,0.0,0.0,102.35,138.84,26.0,29.78,19.076807,17.85577,3.452027,3.328849
3,4.8,4.8,0.0,0.0,73.55,74.46,18.0,21.8,19.002803,17.359076,3.588535,3.187189
4,5.8,3.3,0.0,0.0,73.84,72.92,19.3,22.67,20.518657,17.927913,3.952239,3.440522
5,13.8,3.3,3.0,0.0,27.3,102.11,8.3,24.97,20.466667,17.192379,4.933333,3.172553
6,,3.3,,0.0,,83.12,,23.81,,17.401355,,3.170976


In [88]:
# 3. Move the min/max/mean to be on the rows, rather than the columns.

pt.stack(level=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_amount,total_amount,trip_distance,trip_distance
Unnamed: 0_level_1,VendorID,1,2,1,2
passenger_count,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,max,36.39,,7.9,
0,mean,25.57,,4.6,
0,min,14.75,,1.3,
1,max,252.35,137.59,64.6,35.51
1,mean,16.941386,17.904989,2.956456,3.262967
1,min,0.3,-7.8,0.0,0.0
2,max,102.35,138.84,26.0,29.78
2,mean,19.076807,17.85577,3.452027,3.328849
2,min,0.3,2.3,0.0,0.0
3,max,73.55,74.46,18.0,21.8


In [91]:
# 4. Move the trip_distance and total_amount to be on the rows, rather than the columns.

pt = pt.stack(level=1)
pt

Unnamed: 0_level_0,Unnamed: 1_level_0,max,max,mean,mean,min,min
Unnamed: 0_level_1,VendorID,1,2,1,2,1,2
passenger_count,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,total_amount,36.39,,25.57,,14.75,
0,trip_distance,7.9,,4.6,,1.3,
1,total_amount,252.35,137.59,16.941386,17.904989,0.3,-7.8
1,trip_distance,64.6,35.51,2.956456,3.262967,0.0,0.0
2,total_amount,102.35,138.84,19.076807,17.85577,0.3,2.3
2,trip_distance,26.0,29.78,3.452027,3.328849,0.0,0.0
3,total_amount,73.55,74.46,19.002803,17.359076,4.8,4.8
3,trip_distance,18.0,21.8,3.588535,3.187189,0.0,0.0
4,total_amount,73.84,72.92,20.518657,17.927913,5.8,3.3
4,trip_distance,19.3,22.67,3.952239,3.440522,0.0,0.0


In [92]:
# 5. After you've done 4, move the passenger_count to be on the columns.

pt.unstack(level=0)

Unnamed: 0_level_0,max,max,max,max,max,max,max,max,max,max,...,min,min,min,min,min,min,min,min,min,min
VendorID,1,1,1,1,1,1,1,2,2,2,...,1,1,1,2,2,2,2,2,2,2
passenger_count,0,1,2,3,4,5,6,0,1,2,...,4,5,6,0,1,2,3,4,5,6
total_amount,36.39,252.35,102.35,73.55,73.84,27.3,,,137.59,138.84,...,5.8,13.8,,,-7.8,2.3,4.8,3.3,3.3,3.3
trip_distance,7.9,64.6,26.0,18.0,19.3,8.3,,,35.51,29.78,...,0.0,3.0,,,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
print(pt.unstack(level=0))

                   max                                                      \
VendorID             1                                           2           
passenger_count      0       1       2      3      4     5   6   0       1   
total_amount     36.39  252.35  102.35  73.55  73.84  27.3 NaN NaN  137.59   
trip_distance     7.90   64.60   26.00  18.00  19.30   8.3 NaN NaN   35.51   

                         ...  min                                              
VendorID                 ...    1             2                                
passenger_count       2  ...    4     5   6   0    1    2    3    4    5    6  
total_amount     138.84  ...  5.8  13.8 NaN NaN -7.8  2.3  4.8  3.3  3.3  3.3  
trip_distance     29.78  ...  0.0   3.0 NaN NaN  0.0  0.0  0.0  0.0  0.0  0.0  

[2 rows x 42 columns]


In [96]:
with open('/tmp/mydata.html', 'w') as f:
    f.write(pt.unstack(level=0).to_html())

In [97]:
help(df.to_html)

Help on method to_html in module pandas.core.frame:

to_html(buf: 'FilePath | WriteBuffer[str] | None' = None, columns: 'Sequence[Level] | None' = None, col_space: 'ColspaceArgType | None' = None, header: 'bool | Sequence[str]' = True, index: 'bool' = True, na_rep: 'str' = 'NaN', formatters: 'FormattersType | None' = None, float_format: 'FloatFormatType | None' = None, sparsify: 'bool | None' = None, index_names: 'bool' = True, justify: 'str | None' = None, max_rows: 'int | None' = None, max_cols: 'int | None' = None, show_dimensions: 'bool | str' = False, decimal: 'str' = '.', bold_rows: 'bool' = True, classes: 'str | list | tuple | None' = None, escape: 'bool' = True, notebook: 'bool' = False, border: 'int | bool | None' = None, table_id: 'str | None' = None, render_links: 'bool' = False, encoding: 'str | None' = None) -> 'str | None' method of pandas.core.frame.DataFrame instance
    Render a DataFrame as an HTML table.
    
    Parameters
    ----------
    buf : str, Path or Strin

In [102]:
df.pivot_table(index='passenger_count',
                   columns='VendorID',
                   values='total_amount')

VendorID,1,2
passenger_count,Unnamed: 1_level_1,Unnamed: 2_level_1
0,25.57,
1,16.941386,17.904989
2,19.076807,17.85577
3,19.002803,17.359076
4,20.518657,17.927913
5,20.466667,17.192379
6,,17.401355
