# Some important data transformation tools

## Multi Index, Hierarchical Indexing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
np.random.seed(1)
# manual multi-index creation:
# passing a list of two equal-length lists as the index builds a two-level
# index; the first list is the outer level, the second the inner level
data = pd.Series(
    np.random.randint(10, size=9),
    index=[list('aaabbbccc'), [1, 2, 3] * 3],
)

In [None]:
# display the whole Series; its index was built from a list of two lists
data

In [None]:
# inspect the index object that was built from those two lists
data.index

In [None]:
# select via the outer index (the letters)
data.loc['b']

In [None]:
# select via the inner index (the numbers): ':' keeps every outer label
data.loc[:,2] 

In [None]:
# check what type of object the inner-level selection returns
type(data.loc[:,2])

In [None]:
# look at the index that is left after selecting on the inner level
data.loc[:,2].index

In [None]:
# the unstack function returns a new DataFrame where the values have been unstacked:
# one level of the multi index is spread out across the columns
# similar to tidyr's spread() function in R
data.unstack()

In [None]:
# after unstacking, the index is no longer a multi index
data.unstack().index

In [None]:
# dimensions (rows, columns) of the unstacked table
data.unstack().shape

In [None]:
# the inverse operation of unstack() is stack()
# applying unstack() and then stack() gives back the original series
data.unstack().stack()

In [None]:
# you can swap the levels of the multi index using swaplevel
# (the numbers become the first level and the letters the second)
data.swaplevel()

In [None]:
# the .loc accessors work as expected:
# after the swap the letters are the second level, so 'a' goes in the second slot
data.swaplevel().loc[:,'a']

In [None]:
# swaplevel will keep the original row order
# you may want to sort based on the new swapped index levels
# neither call changes data itself, so assign the result if you want to keep it
data.swaplevel().sort_index()

In [None]:
# data still has its original index: the calls above were never assigned back
print(data)

In [None]:
# unstack the swapped series to get a table from the swapped index
data.swaplevel().unstack()

In [None]:
# summing and other aggregate functions can be performed on an index-based level
# calling sum() on a series, will sum the whole series
data.sum()

In [None]:
# you can sum within level 0 (the first level of the index)
# we get one sum for each value in the first level of the index
# Series.sum(level=...) was removed in pandas 2.0; grouping by the index
# level and then summing gives the same per-level totals on old and new versions
data.groupby(level = 0).sum()

In [None]:
# one sum for each value in level 1 (the second level of the index)
# groupby(level=...) is used because Series.sum(level=...) was removed in pandas 2.0
data.groupby(level = 1).sum()

# Reshaping and Pivoting Data

In [None]:
# 2 x 3 table holding the integers 0..5; the row axis is named 'letter' and
# the column axis 'number' so that the levels can be referred to by name
data = pd.DataFrame(
    np.arange(6).reshape(2, 3),
    index=['alpha', 'beta'],
    columns=['one', 'two', 'three'],
).rename_axis(index='letter', columns='number')
data

In [None]:
# stack() moves the columns into the index
data.stack()  # creates a multi-index

In [None]:
data.stack().unstack()  # unstack undoes the creation of the stacks

In [None]:
data.stack().unstack(0) # you can specify how the unstacking should be done
# here we specify that we should unstack the first level of the multi-index

In [None]:
data.stack().unstack('letter')
# you can specify the unstacking by the index level name
# ('letter' is the name given to the row index when data was created)

In [None]:
# 'number' is the name given to the column index when data was created
data.stack().unstack('number')

### Unstacking can introduce missing values

In [None]:
# two Series whose labels only partly overlap ('c' and 'd' appear in both)
s1 = pd.Series(range(4), index=list('abcd'))
s2 = pd.Series(range(4, 7), index=list('cde'))
# passing keys to concat produces a multi-index: the key ('one' / 'two')
# becomes the outer level and records which Series each value came from
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2

In [None]:
# 'one' and 'two' do not share all of their labels, so the wide table has gaps (NaN)
data2.unstack()

In [None]:
data2.unstack().stack() # stack() will filter out missing values

In [None]:
data2.unstack().stack(dropna = False) # you can force stack to keep the NaNs
# NOTE(review): recent pandas releases are reworking stack() and its dropna
# argument (see the future_stack option) - confirm against the installed version

# Small example data wrangling

In [None]:
# load the macro dataset; the relative path means the file must be in the working directory
data = pd.read_csv('macrodata.csv')

Description of the dataset (statsmodels documentation): https://www.statsmodels.org/dev/datasets/generated/macrodata.html

In [None]:
# summary of the columns that were read from the csv
data.info()

In [None]:
# peek at the first few rows
data.head()

Reference for `PeriodIndex` (pandas documentation): https://pandas.pydata.org/pandas-docs/stable/generated/pandas.PeriodIndex.html

In [None]:
# We can create a time based index of periods consisting of the year and quarter
# Passing year/quarter straight to the PeriodIndex constructor is deprecated
# since pandas 2.2 in favour of PeriodIndex.from_fields; from_fields takes no
# name argument, so the index is named afterwards with rename().
# Older pandas has no from_fields, so fall back to the constructor there.
if hasattr(pd.PeriodIndex, 'from_fields'):
    periods = pd.PeriodIndex.from_fields(year = data.year, quarter = data.quarter).rename('date')
else:
    periods = pd.PeriodIndex(year = data.year, quarter = data.quarter, name = 'date')

In [None]:
# display the period index built from the year and quarter columns
periods

In [None]:
# the three columns we want to keep; the column axis itself is named 'item'
columns = pd.Index(['realgdp', 'infl', 'unemp'], name = 'item')
columns

In [None]:
data = data.reindex(columns = columns) # forces columns to conform to the column index we specified
# year and quarter are not in that index, which is why periods was built beforehand

In [None]:
data.head(10)

In [None]:
periods.to_timestamp('D','start')  # changes 1959Q1 to a date: the start date of Q1 of 1959: 1959-01-01

In [None]:
# the current index is just integers, and we want to replace it
data.index

In [None]:
# specify a new index directly: the start date of each quarter
data.index = periods.to_timestamp('D','start')

In [None]:
# the rows are now labelled by the new index
data.head()

In [None]:
data.stack().head(10)  # stack creates a series

In [None]:
data.stack().reset_index().head()
# calling reset index turns the current index into new columns and creates a new index

In [None]:
# the index that reset_index() created in place of the old one
data.stack().reset_index().index

In [None]:
# build the 'long' table: the stacked index levels become ordinary columns,
# and reset_index(name=...) titles the values column 'value' instead of '0'
ldata = data.stack().reset_index(name = 'value')
ldata.head(10)

In [None]:
# unstack does not rebuild the wide table here: stacking and unstacking are
# powered by the multi-index, and after reset_index() ldata only has a plain
# integer index (date and item are ordinary columns)
# NOTE(review): the pandas docs say unstack() on a frame without a multi-index
# returns a Series rather than raising - confirm on the installed version
ldata.unstack()

Reference for `DataFrame.pivot` (pandas documentation): https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.pivot.html

In [None]:
# if the data is in 'long' form, you can change it to 'wide' form with pivot:
# 'date' labels the rows, every distinct 'item' becomes a column and the
# cells are filled from 'value'
# the arguments are named because pandas 2.0 made them keyword-only
ldata.pivot(index = 'date', columns = 'item', values = 'value').head()

In [None]:
# for comparison: the wide table that ldata was built from
data.head()

# Group By


In [None]:
np.random.seed(1)  # fixed seed so the two random columns are reproducible
# two label columns to group by and two columns of random integers below 20;
# data1 is drawn before data2, matching the order of the columns
df = pd.DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'two', 'one', 'two', 'one'],
    data1=np.random.randint(20, size=5),
    data2=np.random.randint(20, size=5),
))
df

In [None]:
# group the data1 column by the values in key1
# displaying grouped shows the groupby object itself, not a result
grouped = df['data1'].groupby(df['key1'])
grouped

In [None]:
# mean of data1 within each key1 group
grouped.mean()

In [None]:
# if you don't specify the column, it'll apply the function to the entire dataframe
# numeric_only=True limits the mean to data1/data2: key2 holds strings, and
# pandas 2.0+ raises a TypeError for it instead of silently leaving it out
df.groupby(df['key1']).mean(numeric_only = True)

In [None]:
# group by two keys at once: the mean of data1 for every (key1, key2) pair
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means
# means has a multi-index

In [None]:
# with the multi-index, you can unstack
means.unstack()

In [None]:
# you can perform group by on arrays that are not in the dataframe, but are of the correct length
# (one entry per row of df: five here)
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()

In [None]:
# groupby applied to the entire dataframe, not just one column
# numeric_only=True keeps the string column key2 out of the mean; pandas 2.0+
# raises a TypeError for it instead of silently leaving it out
df.groupby('key1').mean(numeric_only = True)

In [None]:
# group by both key columns; only data1 and data2 are left to average
df.groupby(['key1', 'key2']).mean()

In [None]:
# number of rows in each (key1, key2) group
df.groupby(['key1', 'key2']).size()

In [None]:
# show the full dataframe again before iterating over its groups
df

In [None]:
# iterating over a groupby yields (name, group) tuples that can be unpacked:
# name is the key1 value and group is the sub-dataframe of matching rows
for name, group in df.groupby('key1'):
    print(name)
    print(group)
    # key1/key2 hold strings, so restrict the mean to the numeric columns
    # (pandas 2.0+ raises a TypeError otherwise)
    print(group.mean(numeric_only = True))
    print('----------------')

In [None]:
# walk through the key2 groups and show, one per line, the group label,
# its rows, the column-wise sums and a separator
for label, rows in df.groupby('key2'):
    print(label, rows, rows.sum(), '----------------', sep = '\n')
