In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### See Hadley Wickham's "Tidy Data" paper: https://vita.had.co.nz/papers/tidy-data.pdf

In [2]:
# Load the Pew survey table. As the preview shows, it is in wide form:
# one row per religion and one column per income bracket.
pew = pd.read_csv('./data/pew.csv')
# Fixed seed so the five previewed rows are the same after Restart & Run All.
pew.sample(5, random_state=42)

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k,$75-100k,$100-150k,>150k,Don't know/refused
13,Orthodox,13,17,23,32,32,47,38,42,46,73
12,Muslim,6,7,9,10,9,23,16,8,6,22
1,Atheist,12,27,37,52,35,70,73,59,74,76
3,Catholic,418,617,732,670,638,1116,949,792,633,1489
6,Hindu,1,9,7,9,11,34,47,48,54,37


# Melting (columns to rows)

In [5]:
# Wide -> long: every column other than 'religion' is melted (value_vars is
# left at its default), giving one (religion, income, count) row per cell.
pew_pivot = pew.melt(
    id_vars=['religion'],
    var_name='income',
    value_name='count',
)
# Spot-check the reshaped rows for a single religion.
pew_pivot[pew_pivot['religion'] == 'Atheist']

Unnamed: 0,religion,income,count
1,Atheist,<$10k,12
19,Atheist,$10-20k,27
37,Atheist,$20-30k,37
55,Atheist,$30-40k,52
73,Atheist,$40-50k,35
91,Atheist,$50-75k,70
109,Atheist,$75-100k,73
127,Atheist,$100-150k,59
145,Atheist,>150k,74
163,Atheist,Don't know/refused,76


In [9]:
# Billboard chart data in wide form: one wk<N> column per week.
billboard = pd.read_csv('./data/billboard.csv')
billboard.head()  # head() defaults to the first 5 rows

Unnamed: 0,year,artist,track,time,date.entered,wk1,wk2,wk3,wk4,wk5,...,wk67,wk68,wk69,wk70,wk71,wk72,wk73,wk74,wk75,wk76
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,87,82.0,72.0,77.0,87.0,...,,,,,,,,,,
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,91,87.0,92.0,,,...,,,,,,,,,,
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,81,70.0,68.0,67.0,66.0,...,,,,,,,,,,
3,2000,3 Doors Down,Loser,4:24,2000-10-21,76,76.0,72.0,69.0,67.0,...,,,,,,,,,,
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,57,34.0,25.0,17.0,17.0,...,,,,,,,,,,


In [10]:
# Melt the wk<N> columns into (week, rating) pairs; the song descriptors stay
# as identifier columns. value_vars is left at its default (all non-id columns).
# Weeks with no value are kept as NaN rows.
bb_melt = billboard.melt(
    id_vars=['year', 'artist', 'track', 'time', 'date.entered'],
    var_name='week',
    value_name='rating',
)
# Inspect the melted rows for one artist.
bb_melt[bb_melt['artist'] == '2 Pac']

Unnamed: 0,year,artist,track,time,date.entered,week,rating
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk1,87.0
317,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk2,82.0
634,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk3,72.0
951,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk4,77.0
1268,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk5,87.0
...,...,...,...,...,...,...,...
22507,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk72,
22824,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk73,
23141,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk74,
23458,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk75,


# Variables stored in both rows and columns (melt, then pivot)

In [21]:
# Weather readings: the 'element' column names the measurement (tmax/tmin)
# and the d1..d31 columns hold one value per day of the month.
weather = pd.read_csv('./data/weather.csv')
weather.head()  # head() defaults to the first 5 rows

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,
1,MX17004,2010,1,tmin,,,,,,,...,,,,,,,,,14.5,
2,MX17004,2010,2,tmax,,27.3,24.1,,,,...,,29.9,,,,,,,,
3,MX17004,2010,2,tmin,,14.4,14.4,,,,...,,10.7,,,,,,,,
4,MX17004,2010,3,tmax,,,,,32.1,,...,,,,,,,,,,


In [25]:
# Step 1: melt the day columns into (day, temp) rows. value_vars is left at
# its default, so every column that is not an identifier is melted.
w_melt = weather.melt(
    id_vars=['id', 'year', 'month', 'element'],
    var_name='day',
    value_name='temp',
)
# Inspect one station / year / month; most days have no reading (NaN).
w_melt[
    (w_melt['id'] == 'MX17004')
    & (w_melt['year'] == 2010)
    & (w_melt['month'] == 1)
]

Unnamed: 0,id,year,month,element,day,temp
0,MX17004,2010,1,tmax,d1,
1,MX17004,2010,1,tmin,d1,
22,MX17004,2010,1,tmax,d2,
23,MX17004,2010,1,tmin,d2,
44,MX17004,2010,1,tmax,d3,
...,...,...,...,...,...,...
617,MX17004,2010,1,tmin,d29,
638,MX17004,2010,1,tmax,d30,27.8
639,MX17004,2010,1,tmin,d30,14.5
660,MX17004,2010,1,tmax,d31,


## Pivot (rows to columns)

In [27]:
# Show min and max temp as variables (columns).
# pivot: pure reshape with no aggregation; it raises ValueError when an
#        index/column pair has more than one value.
# pivot_table: aggregates multiple values per index/column pair via aggfunc.
# Shown in optional piping (method-chaining) notation.
#
# aggfunc is optional (pandas defaults to the mean). It is passed as the
# string alias 'min' rather than np.min: recent pandas versions emit a
# FutureWarning for the NumPy callable, while the string keeps the same
# result without the warning.
#
# As the output shows, only days with at least one reading appear
# (presumably pivot_table's default dropna=True drops all-NaN groups; confirm),
# and 'day' is a string, so it sorts lexicographically (d11 before d2).
w_pivot = (
            w_melt
                .pivot_table(
                    index=['id', 'year', 'month', 'day']
                    , columns='element' # pivot column: tmax / tmin become columns
                    , values='temp'     # value column
                    , aggfunc='min')    # optional
                .reset_index()          # turn the index levels back into columns
        )
w_pivot.head(5)

element,id,year,month,day,tmax,tmin
0,MX17004,2010,1,d30,27.8,14.5
1,MX17004,2010,2,d11,29.7,13.4
2,MX17004,2010,2,d2,27.3,14.4
3,MX17004,2010,2,d23,29.9,10.7
4,MX17004,2010,2,d3,24.1,14.4
