# Reshaping, Reorganizing and Aggregation

In [1]:
# import pandas and NumPy
import pandas as pd
import numpy as np

# date and time functions
import datetime

# bring in matplotlib and draw inline
import matplotlib.pyplot as plt
%matplotlib inline

# Set some pandas display options
# render DataFrames as plain text instead of HTML tables
pd.set_option('display.notebook_repr_html', False)
# cap how many columns / rows are shown before the repr is truncated
pd.set_option('display.max_columns', 15)
pd.set_option('display.max_rows', 8)
# use the fully qualified option name: the short 'precision' pattern
# does not resolve to a single option in recent pandas releases
pd.set_option('display.precision', 3)

## Loading historical stock data from the web or from files

### From Web

In [2]:
# for the DataReader
# NOTE(review): this import fails in the environment the notebook was
# last run in (see the ModuleNotFoundError shown under this cell), so
# msft and aapl are not defined by this cell; the "From file" cell
# assigns the same two names from CSV files instead.
import pandas.io.data as web

# start and end dates
start = datetime.datetime(2012, 1, 1)
end = datetime.datetime(2012, 12, 30)

# load the data
msft = web.DataReader("MSFT", 'yahoo', start, end)
aapl = web.DataReader("AAPL", 'yahoo', start, end)

# these save the data to file - optional for the examples
#msft.to_csv("msft.csv")
#aapl.to_csv("aapl.csv")

ModuleNotFoundError: No module named 'pandas.io.data'

### From file

In [4]:
# load the Microsoft and Apple price history from CSV files; the first
# column of each file becomes the index and is parsed as dates
msft, aapl = (
    pd.read_csv(csv_name, index_col=0, parse_dates=True)
    for csv_name in ("msft.csv", "aapl.csv")
)

### Organizing the data for the examples

In [5]:
msft[:3]

             Open   High    Low  Close    Volume  Adj Close
Date                                                       
2012-01-03  26.55  26.96  26.39  26.77  64731500     24.422
2012-01-04  26.82  27.47  26.78  27.40  80516100     24.997
2012-01-05  27.38  27.73  27.29  27.68  56081400     25.252

In [None]:
aapl[:3]

# Reorganizing and reshaping data

## Concatenating data in Multiple DataFrame objects

In [None]:
# get MSFT adjusted close data for Jan and Feb 2012
# the month strings select rows through .loc: partial-string row
# selection via plain [] on a DataFrame is not supported by
# current pandas releases
msftA01 = msft.loc['2012-01', ['Adj Close']]
msftA02 = msft.loc['2012-02', ['Adj Close']]
msftA01[:3]

In [None]:
msftA02[:3]

In [None]:
# stack the first three January rows on top of the first three
# February rows
pd.concat([msftA01[:3], msftA02[:3]])

In [None]:
# Extract only the Jan 2012 AAPL values.
# .loc is used for the month string because plain [] no longer
# accepts a partial date string as a row selector on a DataFrame
aaplA01 = aapl.loc['2012-01', ['Adj Close']]
# now concat the AAPL and MSFT Jan 2012 data
# there will be duplicate index labels
withDups = pd.concat([msftA01[:3], aaplA01[:3]])
withDups

In [None]:
# show the two records for the date 2012-01-03
# (.loc replaces the removed .ix indexer for this label lookup)
withDups.loc['2012-01-03']

In [None]:
# concat again, this time passing the stock tickers as keys so that
# they become an outer index level; the resulting hierarchical index
# disambiguates the duplicate dates
closes = pd.concat([msftA01.head(3), aaplA01.head(3)],
                   keys=['MSFT', 'AAPL'])
closes

In [None]:
# extract just the MSFT values by label
# (.loc replaces the removed .ix indexer)
closes.loc['MSFT'][:3]

In [None]:
# demonstrate concatenation using two DataFrame's
# that each have two columns.  pandas will align the
# data in columns by the column names (labels)
msftAV = msft[['Adj Close', 'Volume']]
aaplAV = aapl[['Adj Close', 'Volume']]
pd.concat([msftAV, aaplAV])

In [None]:
# demonstrate concatenation with DataFrame objects
# that do not have the same set of columns
# this demonstrates pandas filling in NaN values
aaplA = aapl[['Adj Close']]
pd.concat([msftAV, aaplA])

In [None]:
# perform an inner join on the DataFrame's
# since aaplA does not have a Volume column, pandas
# will not include that column in the result
pd.concat([msftAV, aaplA], join='inner')

In [None]:
# concatenate side by side (axis=1), aligning rows on the index;
# both frames have an 'Adj Close' column, so the result ends up with
# two columns that share that name
msftA = msft.loc[:, ['Adj Close']]
closes = pd.concat([msftA, aaplA], axis=1)
closes.head(3)

In [None]:
# concatenate side by side (axis=1) two DataFrame objects with a
# different number of rows: AAPL contributes only three rows versus
# five for MSFT, so the AAPL columns are NaN in the extra rows
pd.concat([msftAV.head(5), aaplAV.head(3)],
          axis=1, keys=['MSFT', 'AAPL'])

In [None]:
# an inner join also works along axis=1: rows whose index label is
# missing from either DataFrame are left out of the result
pd.concat([msftA.head(5), aaplA.head(3)],
          axis=1, keys=['MSFT', 'AAPL'], join='inner')

In [None]:
# discard the original index labels while concatenating, so the
# result gets a default integer index
pd.concat([msftA.head(3), aaplA.head(3)], ignore_index=True)

## Merging DataFrame objects

In [None]:
# build the two DataFrame objects that will be merged: each has its
# index moved into a regular column by reset_index(), next to
# either the Adj Close or the Volume values;
# then peek at the first one
msftAR = msftA.reset_index()
msftVR = msft.loc[:, ['Volume']].reset_index()
msftAR.head(3)

In [None]:
msftVR[:3]

In [None]:
# merge the two frames. With no key given, pandas joins on the
# columns the two have in common and carries over the remaining
# columns from both sides
msftCVR = msftAR.merge(msftVR)
msftCVR.head(5)

In [None]:
# first input for the join-semantics examples: rows 0 through 4
msftAR0_5 = msftAR.iloc[0:5]
msftAR0_5

In [None]:
# second input for the join-semantics examples: rows 2 and 3
msftVR2_4 = msftVR.iloc[2:4]
msftVR2_4

In [None]:
# merge with the default join type, which is an inner join
msftAR0_5.merge(msftVR2_4)

In [None]:
# the same merge, but requesting an outer join instead
msftAR0_5.merge(msftVR2_4, how='outer')

## Pivoting

In [None]:
# need to insert a Symbol column before combining
# insert() mutates the frame in place and raises if the column is
# already there, so guard it to keep this cell safe to re-run
if 'Symbol' not in msft.columns:
    msft.insert(0, 'Symbol', 'MSFT')
if 'Symbol' not in aapl.columns:
    aapl.insert(0, 'Symbol', 'AAPL')

# concatenate the MSFT and AAPL data
# the index consists of the dates, which we sort
combined = pd.concat([msft, aapl]).sort_index()

# this pushes the index into a column and resets to a
# default integer index
s4p = combined.reset_index()
s4p[:5]

In [None]:
# pivot: Date becomes the index, each unique value of the Symbol
# column becomes a column, and the cells hold the Adj Close values
closes = s4p.pivot(values='Adj Close', index='Date',
                   columns='Symbol')
closes.head(3)

## Stacking and Unstacking

In [None]:
# stack the first level of columns into the index
# essentially, moves AAPL and MSFT into the index
# leaving a single column which is the Adj Close values
stackedCloses = closes.stack()
stackedCloses

In [None]:
# using .loc we can retrieve close values by
# specifying both the date and ticker
# (.loc replaces the removed .ix indexer)
stackedCloses.loc['2012-01-03', 'AAPL']

In [None]:
# lookup on just the date, which will give us two values
# one each for AAPL and MSFT.
# (.loc replaces the removed .ix indexer)
stackedCloses.loc['2012-01-03']

In [None]:
# this looks up all values for the MSFT symbol
# (.loc replaces the removed .ix indexer)
stackedCloses.loc[:, 'MSFT']

In [None]:
# move the innermost index level back out into columns
unstackedCloses = stackedCloses.unstack()
unstackedCloses.head(3)

## Melting

In [None]:
# melt with Date and Symbol as the id_vars: every other column name
# becomes an entry in 'variable', with its value alongside in 'value'
melted = s4p.melt(id_vars=['Date', 'Symbol'])
melted.head(5)

In [None]:
# keep only the melted rows for MSFT on 2012-01-03
melted.loc[(melted['Date'] == '2012-01-03')
           & (melted['Symbol'] == 'MSFT')]

# Grouping and aggregation

## Splitting

In [None]:
# build the frame used for the splitting examples: take Symbol and
# Adj Close from combined and move the index into a regular column
s4g = combined.loc[:, ['Symbol', 'Adj Close']].reset_index()
# add Year and Month columns (positions 1 and 2) holding the year
# and month parts of each row's Date
s4g.insert(1, 'Year', pd.DatetimeIndex(s4g.Date).year)
s4g.insert(2, 'Month', pd.DatetimeIndex(s4g.Date).month)
s4g.head(5)

In [None]:
# group by the Symbol column
s4g.groupby('Symbol')

In [None]:
# group again, but save the result this time
grouped = s4g.groupby('Symbol')
# the groupby object has a property groups, which shows how
# all rows will be mapped into the groups.
# the type of this object is a python dict
type(grouped.groups)

In [None]:
# show the mappings of rows to groups
grouped.groups

In [None]:
# these report the number of groups that resulted from
# the grouping
len(grouped), grouped.ngroups

In [None]:
# this function will print the contents of a group
def print_groups(groupobject):
    """Print each group's name followed by the first rows of that group.

    Parameters
    ----------
    groupobject : iterable of (name, group) pairs
        For example a pandas GroupBy object. Each ``group`` must provide
        a ``head()`` method.

    Returns
    -------
    None
        The output is written to stdout.
    """
    for name, group in groupobject:
        # print() calls rather than print statements, which are a
        # SyntaxError on Python 3
        print(name)
        print(group.head())

In [None]:
# examine our resulting groups
print_groups(grouped)

In [None]:
# .size will tell us the count of items in each group
grouped.size()

In [None]:
# a specific group can be retrieved using .get_group()
# which returns a DataFrame representing the specified group
grouped.get_group('MSFT')

In [None]:
# group by three different fields and print the result
mcg = s4g.groupby(['Symbol', 'Year', 'Month'])
print_groups(mcg)

In [None]:
# set the index of the data to be the following three fields
# we are creating a multiindex
mi = s4g.set_index(['Symbol', 'Year', 'Month'])
mi

In [None]:
# now we can group based upon values in the actual index
# the following groups by level 0 of the index (Symbol)
mig_l1 = mi.groupby(level=0)
print_groups(mig_l1)

In [None]:
# group by three levels in the index using their names
mig_l12 = mi.groupby(level=['Symbol', 'Year', 'Month'])
print_groups(mig_l12)

# Aggregation

In [None]:
# this will apply the mean function to each group
# the string name selects pandas' own group-wise mean; passing
# np.mean emits a FutureWarning on recent pandas releases
mig_l12.agg('mean')

In [None]:
# example of groupby that also ignores the index
# resulting in a default integer index
# this also has the mean function applied (given by its string
# name, which avoids the FutureWarning raised for np.mean on
# recent pandas releases)
s4g.groupby(['Symbol', 'Year', 'Month'],
            as_index=False).agg('mean')[:5]

In [None]:
# apply multiple functions to each group in one call
# string names are used instead of np.mean / np.std, which emit a
# FutureWarning on recent pandas releases; the resulting column
# labels ('mean', 'std') are the same
mig_l12.agg(['mean', 'std'])