In [1]:
import pandas as pd
import numpy as np

In [2]:
events = pd.read_pickle('out/41/events.pkl')
df = pd.read_pickle('out/21/donations.pkl')

In [3]:
events.head()

Unnamed: 0,appeal,campaign_location_id,campaign_month_id,transaction_count,amount,activity_year,activity_month,activity_ym,county,state
0,Banquet_GWOF,0,0,45,26548,2011,7,201107,Alameda,CA
1,Banquet_Gift Of Vision,0,0,46,9002,2010,10,201010,Alameda,CA
2,Banquet_Gujarat_CA_2008,0,0,16,8650,2008,7,200807,Santa Clara,CA
3,Banquet_Hyderabad,0,0,11,850,2015,10,201510,Santa Clara,CA
4,Banquet_Punjab_CA,0,0,65,15860,2009,11,200911,Alameda,CA


In [119]:
def month_diff(ym, number):
    '''
    example: month_diff(201505, 3) => 201502
    example: month_diff(201501, 3) => 201410
    '''
    y,m = int(ym/100), ym%100
    dy, dm = number/12, number%12
    m -= dm
    if m < 1:
        y, m = (y - 1), 12 + m
    return ((y - dy)*100) + m


In [320]:
def get_past_event_stats(state, year, month, prior_months_list):
    '''
    For a given list of numbers, go back that many months prior from the given month and get the event summary
    ex: state = CA, year = 2015, month = 1, prior_months_list = [1, 3, 6, 12, 18]
    Going back 1, 3, 6, 12 and 18 months from Jan 2012, return the summary stats (amount, attendance) for each duration
    '''
    target_ym = year*100 + month
    stats = (state, year, month,)
    for prior_month in prior_months_list:
        subdata = events[(events.state==state)
               & (events.activity_ym >= month_diff(target_ym, prior_month))
               & (events.activity_ym <= target_ym)]
        stats = stats + (subdata.amount.sum(),)
        stats = stats + (subdata.transaction_count.sum(),)
    return stats

In [321]:
d = get_past_event_stats('CA', 2014, 12, [1, 3, 6, 12, 18])
d

('CA',
 2014,
 12,
 0,
 0,
 20733L,
 318L,
 26400L,
 330L,
 1107523L,
 3363L,
 1137499L,
 3629L)

In [273]:
response_stats = []
for state in df.state.unique():
    for year in range(2005, 2016):
        for month in range(1, 13):
            subdata = df[(df.is_service==False) & (df.state==state)
                         & (df.activity_year==year) & (df.activity_month==month)]
            stats.append((state, year, month, subdata.amount.sum(), subdata.donor_id.nunique()))

In [283]:
data_Y = pd.DataFrame(response_stats, columns=['state','year','month','amount','donor_count'])

In [322]:
predictor_stats = data_Y.apply(lambda row: get_past_event_stats(row['state'], row['year'], row['month'], [1, 3, 6, 12, 18]), axis=1)

In [359]:
cols = ['amount_1', 'trans_1', 'amount_3', 'trans_3', 'amount_6', 'trans_6',
        'amount_12', 'trans_12','amount_18', 'trans_18']
xcols = ['state', 'year', 'month',]
xcols.extend(cols)

In [404]:
data_X = pd.DataFrame(predictor_stats.values.tolist(), columns=xcols)

In [429]:
data = data_X.merge(data_Y, on=['state','year','month'])
data = data[~(data.month==12)]

In [430]:
y = data.donor_count.values
X = data[cols].values

In [431]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import KFold

In [432]:
k_fold = KFold(len(X), 3, shuffle=True, random_state=46)

In [441]:
for k, (train, test) in enumerate(k_fold):
    rf = RandomForestRegressor()
    rf.fit(X[train], y[train])
    print rf.score(X[train], y[train])
print rf.score(X[test], y[test])

0.773291776051
0.798209824666
0.787804310589
0.531652511653


In [442]:
rf.feature_importances_

array([ 0.02518053,  0.00447784,  0.01573717,  0.03195681,  0.37892828,
        0.02715108,  0.14323535,  0.01896885,  0.25221581,  0.10214828])