In [1]:
from functools import partial
import numpy as np
import pandas as pd

In [2]:
# Draw the random columns from a dedicated, seeded generator so that
# Restart & Run All rebuilds exactly the same frame every time.
rng = np.random.default_rng(12345)
df = pd.DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
                   'data': rng.standard_normal(8),
                   'weights': rng.uniform(size=8)})

In [3]:
# Show the whole frame (it has only 8 rows).
df

Unnamed: 0,category,data,weights
0,a,-0.847156,0.591943
1,a,-1.07171,0.050108
2,a,0.371956,0.949859
3,a,-0.651345,0.637547
4,b,-0.033763,0.843624
5,b,0.917313,0.433759
6,b,1.067787,0.15045
7,b,-0.178713,0.573556


In [4]:
# Group the rows by 'category'; a weighted average can then be computed per group.
grouped = df.groupby('category')

In [5]:
def get_wavg(group):
    """Return the average of ``group['data']`` weighted by ``group['weights']``."""
    values = group['data']
    group_weights = group['weights']
    return np.average(values, weights=group_weights)

In [6]:
# Select the value columns explicitly so the grouping column 'category' is not
# handed to get_wavg (pandas 2.2+ deprecates including it in apply).
grouped[['data', 'weights']].apply(get_wavg)

category
a   -0.276806
b    0.213630
dtype: float64

In [7]:
# Financial dataset obtained from Yahoo Finance: end-of-day (EOD) prices
# for a few stocks and the S&P 500 index.
close_px = pd.read_csv(
    'examples/stock_px.csv',
    index_col=0,
    parse_dates=True,
)

In [8]:
close_px.info()  # quick overview of the DataFrame: index range, columns, dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    2214 non-null   float64
 1   MSFT    2214 non-null   float64
 2   XOM     2214 non-null   float64
 3   SPX     2214 non-null   float64
dtypes: float64(4)
memory usage: 86.5 KB


In [9]:
# Peek at the last rows of the price table.
close_px.tail()

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-10,388.81,26.94,76.28,1194.89
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [10]:
# Compute a DataFrame of the yearly correlations of daily returns
# (derived from percent changes) with SPX.
def spx_corr(group):
    """Correlate every column of ``group`` with its 'SPX' column."""
    benchmark = group['SPX']
    return group.corrwith(benchmark)

In [11]:
# Daily returns: the percent change of each closing price, with rows that
# contain missing values dropped.
rets = (
    close_px
    .pct_change()
    .dropna()
)

In [12]:
# Group these percent changes by year; the year is extracted from each
# index label with a one-line function.
def get_year(x):
    """Return the ``year`` attribute of ``x``."""
    label_year = x.year
    return label_year

In [13]:
# Passing a function to groupby applies it to the index labels, so the rows
# of rets are grouped by the year of their date.
by_year = rets.groupby(get_year)

In [14]:
# Show the GroupBy object itself (its repr, not the grouped data).
by_year

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fdb50332f50>

In [15]:
# Per-year correlation of each column's daily returns with SPX.
by_year.apply(spx_corr)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [16]:
# We can also compute inter-column correlations. Here we do it between Apple and Microsoft.
def corr_two_stocks(group, s1, s2):
    """Return the correlation between columns ``s1`` and ``s2`` of ``group``."""
    first = group[s1]
    second = group[s2]
    return first.corr(second)

# Bind the two column names up front so the callable takes only the group.
corr_aapl_msft = partial(
    corr_two_stocks,
    s1='AAPL',
    s2='MSFT',
)

In [17]:
# Per-year correlation between AAPL and MSFT daily returns.
by_year.apply(corr_aapl_msft)

2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

In [18]:
#EXAMPLE: Group-wise linear regression

In [19]:
import statsmodels.api as sm

def regress(data, yvar=None, xvars=None):
    '''Executes an ordinary least squares (OLS) regression on each chunk of data.

    Parameters
    ----------
    data : DataFrame
        One chunk of data (e.g. a single group from a groupby).
    yvar : column label
        Column of ``data`` used as the dependent variable.
    xvars : list of column labels, or a single column name as a string
        Column(s) of ``data`` used as regressors. A constant column named
        'intercept' is appended to them.

    Returns
    -------
    The ``params`` attribute of the fitted OLS result.
    '''
    # A bare string would select a Series, and the intercept would then be
    # added as an extra row rather than a column; normalise to a list.
    if isinstance(xvars, str):
        xvars = [xvars]
    Y = data[yvar]
    # assign() builds a new frame, so the constant is never written into a
    # frame derived from data (which triggers SettingWithCopyWarning).
    X = data[xvars].assign(intercept=1.)
    result = sm.OLS(Y, X).fit()
    return result.params

In [20]:
# Per-year OLS of AAPL daily returns on SPX daily returns plus an intercept.
by_year.apply(regress, yvar='AAPL', xvars=['SPX'])

Unnamed: 0,SPX,intercept
2003,1.195406,0.00071
2004,1.363463,0.004201
2005,1.766415,0.003246
2006,1.645496,8e-05
2007,1.198761,0.003438
2008,0.968016,-0.00111
2009,0.879103,0.002954
2010,1.052608,0.001261
2011,0.806605,0.001514


In [21]:
# Per-year OLS of SPX daily returns on MSFT daily returns plus an intercept.
by_year.apply(regress, yvar='SPX', xvars=['MSFT'])

Unnamed: 0,MSFT,intercept
2003,0.434153,0.000739
2004,0.381352,0.000212
2005,0.401541,0.000137
2006,0.196557,0.000397
2007,0.46032,-0.000206
2008,0.679532,-0.000324
2009,0.480459,-5.1e-05
2010,0.599611,0.000645
2011,0.757835,-0.000101


In [None]:
#transform