In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Columns carried forward from the feature-engineered donations table.
KEEP_COLUMNS = [
    'donor_id', 'activity_date', 'amount', 'appeal', 'charitable', 'fund',
    'city', 'state', 'county', 'zipcode', 'latitude', 'longitude', 'timezone',
    'activity_year', 'activity_month', 'activity_dow', 'activity_ym',
    'appeal_category',
]

# NOTE(review): unpickling can execute arbitrary code; presumably this file is
# produced by an earlier step of this project -- confirm it is a trusted artifact.
df = pd.read_pickle('out/1/donations_featureengineered_appeal.pkl')[KEEP_COLUMNS]

### Looking at appeals by charitable flag, location, number of donors, period of activity

In [4]:
def export_appeal_summary(frame, is_charitable, group_cols, out_path):
    """Write a per-group amount/activity summary for one charitable flag to CSV.

    Rows of ``frame`` whose ``charitable`` column equals ``is_charitable`` are
    grouped by ``group_cols``. For each group the summary holds the summed
    ``amount`` and a list ``[min, max, max - min + 1]`` computed from
    ``activity_ym``. The result is sorted by ``amount`` (largest first) and
    written to ``out_path`` as a '|'-separated file.

    Parameters
    ----------
    frame : DataFrame with 'charitable', 'amount', 'activity_ym' and the
        columns named in ``group_cols``.
    is_charitable : bool, value of the ``charitable`` flag to keep.
    group_cols : list of column names to group by.
    out_path : destination file path.
    """
    def activity_span(ym):
        # NOTE(review): the "+ 1" span is only a count of months if activity_ym
        # is a consecutive period index; if it is encoded as YYYYMM integers the
        # difference is wrong across year boundaries -- confirm the encoding.
        first, last = np.min(ym), np.max(ym)
        return [first, last, last - first + 1]

    subset = frame[frame.charitable == is_charitable]
    summary = subset.groupby(group_cols).agg({'amount': np.sum, 'activity_ym': activity_span})
    summary.sort_values(by='amount', ascending=False).to_csv(out_path, sep='|')


# (charitable flag, grouping keys, output file) -- same files, same order as before.
APPEAL_EXPORTS = [
    (False, ['appeal', 'state'], 'out/contribution_by_appeal_by_state.csv'),
    (True, ['appeal', 'state'], 'out/donations_by_appeal_by_state.csv'),
    (False, ['appeal'], 'out/contribution_by_appeal.csv'),
    (True, ['appeal'], 'out/donations_by_appeal.csv'),
]

for is_charitable, group_cols, out_path in APPEAL_EXPORTS:
    export_appeal_summary(df, is_charitable, group_cols, out_path)

In [5]:
# Per-appeal size: number of distinct donors and total amount, side by side.
by_appeal = df.groupby(['appeal'])
appeal_size = pd.concat([by_appeal.donor_id.nunique(), by_appeal.amount.sum()], axis=1)

# Write the same table twice, ranked once by donor count and once by amount.
for sort_key, out_path in [('donor_id', 'out/appeal_by_donorsize.csv'),
                           ('amount', 'out/appeal_by_amount.csv')]:
    appeal_size.sort_values(by=sort_key, ascending=False).to_csv(out_path, sep='|')

# Trying out a random forest regression and classifier

In [6]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score

In [10]:
# Try a regression model to predict the amount that someone from a given
# location will give.
# Features: charitable flag (as 0/1), latitude/longitude, activity year/month/
# day-of-week, and one indicator column per appeal category.
X = df[['amount', 'charitable', 'latitude', 'longitude', 'activity_year', 'activity_month', 'activity_dow', 'appeal_category']].copy()
X['charitable'] = X['charitable'].astype(int)
cat_dummies = pd.get_dummies(X['appeal_category'])
# 'Walkathon' is dropped along with the raw category column, so it acts as the
# baseline category for the remaining indicator columns.
X = pd.concat([X, cat_dummies], axis=1).drop(['appeal_category', 'Walkathon'], axis=1)
y = X.pop('amount')

# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# on newer versions train_test_split lives in sklearn.model_selection.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X.values, y.values, test_size=0.3, random_state=10)

# Seed the forest as well as the split so the reported score is reproducible
# across kernel restarts.
rg = RandomForestRegressor(random_state=10).fit(X_train, y_train)

# .score() on a regressor is the R^2 on the held-out 30%.
# Single-argument print with %-formatting runs on both Python 2 and 3.
print('Random Forest regression:  %s' % rg.score(X_test, y_test))

 Random Forest regression:  0.208686347251


In [11]:
# Classification variant: predict whether a gift exceeds a fixed amount,
# using the same feature matrix X as the regression.
AMOUNT_THRESHOLD = 30  # gifts strictly above this amount form the positive class
yc = y > AMOUNT_THRESHOLD

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X.values, yc.values, test_size=0.3, random_state=10)

# Seed the forest as well as the split so both metrics are reproducible.
rc = RandomForestClassifier(random_state=10).fit(X_train, y_train)

# Single-argument print with %-formatting runs on both Python 2 and 3.
print('score:  %s' % rc.score(X_test, y_test))
# Column 1 of predict_proba is the second entry of rc.classes_, i.e. True for
# these boolean labels.
print('area under roc:  %s' % roc_auc_score(y_test, rc.predict_proba(X_test)[:, 1]))

score:  0.690075217999
area under roc:  0.751878003246
