In [89]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Reference: Hadley Wickham, "Tidy Data" - https://vita.had.co.nz/papers/tidy-data.pdf

In [26]:
# Load the pew dataset (one row per religion, one column per income bracket
# in the preview below) and show five randomly chosen rows.
pew = pd.read_csv(filepath_or_buffer='./data/pew.csv')
pew.sample(n=5)

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k,$75-100k,$100-150k,>150k,Don't know/refused
14,Other Christian,9,7,11,13,13,14,18,14,12,18
17,Unaffiliated,217,299,374,365,341,528,407,321,258,597
15,Other Faiths,20,33,40,46,49,63,46,40,41,71
6,Hindu,1,9,7,9,11,34,47,48,54,37
5,Evangelical Prot,575,869,1064,982,881,1486,949,723,414,1529


# Melting (unpivot column headers that are values into rows)

In [30]:
# Unpivot every non-id column into long form: the former column label goes
# into `income` and the cell value into `count`, keyed by `religion`.
# NOTE: despite its name, `pew_pivot` holds the melted (long) frame.
pew_pivot = pd.melt(
    pew,
    id_vars=['religion'],
    var_name='income',
    value_name='count',
)  # value_vars omitted -> all columns not listed in id_vars are melted
pew_pivot.sample(n=5)

Unnamed: 0,religion,income,count
49,Orthodox,$20-30k,23
160,Other World Religions,>150k,4
96,Hindu,$50-75k,34
30,Muslim,$10-20k,7
11,Mormon,<$10k,29


In [31]:
# Load the billboard dataset (wide layout: one wk1..wk76 column per week in
# the preview below) and show five randomly chosen rows.
billboard = pd.read_csv(filepath_or_buffer='./data/billboard.csv')
billboard.sample(n=5)

Unnamed: 0,year,artist,track,time,date.entered,wk1,wk2,wk3,wk4,wk5,...,wk67,wk68,wk69,wk70,wk71,wk72,wk73,wk74,wk75,wk76
215,2000,Mya,Case Of The Ex (What...,3:50,2000-08-19,72,57.0,52.0,47.0,42.0,...,,,,,,,,,,
207,2000,Metallica,I Disappear,4:26,2000-05-13,86,84.0,88.0,81.0,81.0,...,,,,,,,,,,
268,2000,Son By Four,A Puro Dolor (Purest...,3:30,2000-04-08,80,80.0,80.0,79.0,72.0,...,,,,,,,,,,
18,2000,"Anthony, Marc",My Baby You,3:59,2000-09-16,82,76.0,76.0,70.0,82.0,...,,,,,,,,,,
300,2000,Vertical Horizon,Everything You Want,4:01,2000-01-22,70,61.0,53.0,46.0,40.0,...,,,,,,,,,,


In [33]:
# Melt the per-week columns into rows: each remaining column label is stored
# in `week` and its cell value in `rating`; the five id columns are repeated.
bb_melt = billboard.melt(
    id_vars=['year', 'artist', 'track', 'time', 'date.entered'],
    var_name='week',
    value_name='rating',
)  # value_vars omitted -> every non-id column is melted
bb_melt.sample(n=5)

Unnamed: 0,year,artist,track,time,date.entered,week,rating
13104,2000,Ghostface Killah,Cherchez LaGhost,3:04,2000-08-05,wk42,
6580,2000,Q-Tip,Breathe And Stop,4:06,2000-01-22,wk21,
4652,2000,Musiq,Just Friends,4:02,2000-10-14,wk15,41.0
2147,2000,Red Hot Chili Peppers,Otherside,4:13,2000-02-12,wk7,40.0
19690,2000,"Blige, Mary J.",Deep Inside,5:26,1999-11-13,wk63,


# Splitting columns that store more than one variable

In [34]:
# Load the country time-series dataset (columns named <measure>_<country> in
# the preview below) and show five randomly chosen rows.
ebola = pd.read_csv(filepath_or_buffer='./data/country_timeseries.csv')
ebola.sample(n=5)

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
15,11/30/2014,253,2164.0,,7312.0,20.0,1.0,4.0,1.0,7.0,1327.0,,1583.0,8.0,0.0,1.0,0.0,6.0
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,
94,5/10/2014,49,233.0,12.0,0.0,,,,,,157.0,11.0,0.0,,,,,
61,8/18/2014,149,579.0,972.0,907.0,15.0,,,,,396.0,576.0,374.0,4.0,,,,
20,11/16/2014,239,1971.0,,6073.0,20.0,1.0,4.0,1.0,5.0,1192.0,,1250.0,8.0,0.0,1.0,0.0,5.0


In [61]:
# Melt every non-id column into rows: the original column label is kept in
# `val_col` (split into its parts in the following cells) and the cell value
# in `count`; `Date` and `Day` identify each observation.
ebola_melt = pd.melt(
    ebola,
    id_vars=['Date', 'Day'],
    var_name='val_col',
    value_name='count',
)  # value_vars omitted -> every non-id column is melted

ebola_melt.sample(n=5)

Unnamed: 0,Date,Day,val_col,count
356,4/4/2014,13,Cases_SierraLeone,2.0
41,10/8/2014,200,Cases_Guinea,
1724,11/28/2014,251,Deaths_Spain,
974,3/24/2014,2,Cases_Mali,
1506,10/7/2014,199,Deaths_Senegal,


In [104]:
# Single assignment: split `val_col` on '_' and pick one piece per statement.
# Piece 0 is the part before the underscore, piece 1 the part after it.
ebola_melt['cases'] = ebola_melt['val_col'].str.split('_').str[0]
ebola_melt['country'] = ebola_melt['val_col'].str.split('_').str.get(1)
ebola_melt.sample(n=5)

Unnamed: 0,Date,Day,val_col,count,cases,country
1818,4/9/2014,18,Deaths_Spain,,Deaths,Spain
1244,11/9/2014,232,Deaths_SierraLeone,1169.0,Deaths,SierraLeone
575,6/5/2014,75,Cases_Senegal,,Cases,Senegal
807,7/12/2014,112,Cases_Spain,,Cases,Spain
1033,9/5/2014,167,Deaths_Guinea,517.0,Deaths,Guinea


In [72]:
# Multiple assignment: with expand=True, str.split returns a DataFrame with
# one column per piece, so both target columns are filled in one statement.
ebola_melt[['cases', 'country']] = (
    ebola_melt['val_col'].str.split(pat='_', expand=True)
)
ebola_melt.sample(n=5)

Unnamed: 0,Date,Day,val_col,count,cases,country
1732,11/9/2014,232,Deaths_Spain,0.0,Deaths,Spain
358,3/31/2014,9,Cases_SierraLeone,2.0,Cases,SierraLeone
1055,6/30/2014,100,Deaths_Guinea,303.0,Deaths,Guinea
393,11/3/2014,226,Cases_Nigeria,,Cases,Nigeria
182,8/20/2014,151,Cases_Liberia,1082.0,Cases,Liberia


# Variables stored in both rows and columns (melt, then pivot)

In [73]:
# Load the weather dataset (one d1..d31 column per day of the month and an
# `element` column in the preview below) and show five randomly chosen rows.
weather = pd.read_csv(filepath_or_buffer='./data/weather.csv')
weather.sample(n=5)

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
18,MX17004,2010,11,tmax,,31.3,,27.2,26.3,,...,,,,,28.1,27.7,,,,
2,MX17004,2010,2,tmax,,27.3,24.1,,,,...,,29.9,,,,,,,,
6,MX17004,2010,4,tmax,,,,,,,...,,,,,,36.3,,,,
8,MX17004,2010,5,tmax,,,,,,,...,,,,,,33.2,,,,
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,


In [83]:
# Step 1 of tidying: melt the per-day columns into rows. The former column
# label is stored in `day` and its cell value in `temp`; `element` stays as an
# id column and is pivoted into columns in the next cell.
w_melt = weather.melt(
    id_vars=['id', 'year', 'month', 'element'],
    var_name='day',
    value_name='temp',
)  # value_vars omitted -> every non-id column is melted
w_melt.sample(n=5)

Unnamed: 0,id,year,month,element,day,temp
170,MX17004,2010,10,tmax,d8,
525,MX17004,2010,11,tmin,d24,
494,MX17004,2010,6,tmax,d23,
531,MX17004,2010,2,tmin,d25,
320,MX17004,2010,7,tmax,d15,


In [98]:
# Show min and max temp as variables (columns): pivot the values of `element`
# into their own columns so each row is one (id, year, month, day) observation.
#   - DataFrame.pivot raises ValueError if an index/column pair is duplicated.
#   - DataFrame.pivot_table aggregates such duplicates with `aggfunc`.
# Written as a parenthesised method chain (optional piping notation).
w_pivot = (
    w_melt
        .pivot_table(
            index=['id', 'year', 'month', 'day'],
            columns='element',  # pivot column: its values become column names
            values='temp',      # value column: fills the new columns
            aggfunc='min',      # string alias; recent pandas warns on NumPy callables such as np.min
        )
        .reset_index()          # turn the index levels back into ordinary columns
)
# `w_pivot` keeps the full pivoted frame; sample only for display, matching
# how the other cells preview their results.
w_pivot.sample(5)

element,id,year,month,day,tmax,tmin
20,MX17004,2010,8,d8,29.0,17.3
28,MX17004,2010,11,d27,27.7,14.2
3,MX17004,2010,2,d23,29.9,10.7
23,MX17004,2010,10,d15,28.7,10.5
21,MX17004,2010,10,d5,27.0,14.0
