In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Nomenclature:
    1. Donation - a charitable gift (rows where `charitable` is True)
    2. Contribution - a non-charitable gift (rows where `charitable` is False)

In [2]:
def get_year_bins(years, maxyear=None):
    '''
    Collapse a collection of years into runs of consecutive years.

    input:
        years: iterable of years (list or numpy array); order and duplicates do not matter
        maxyear: optional last year covered by the data. When it is given and present in
            `years`, maxyear + 1 is treated as present too, so a run that reaches the end
            of the data is extended by one year (and a lone gift in maxyear becomes a
            two-year bin). The default None applies no extension.
    output: list of (first_year, last_year) tuples in ascending order, one per run of at
        least two consecutive years; isolated single years are dropped
    example:
        get_year_bins([2002, 2003, 2004, 2006, 2010, 2011, 2013]) => [(2002, 2004), (2010, 2011)]
        get_year_bins([2013, 2014], maxyear=2014) => [(2013, 2015)]
    '''
    # A set gives O(1) membership tests and removes duplicate years, which would
    # otherwise make the start and end lists below differ in length.
    present = set(years)
    # Earlier revisions hard-coded maxyear = 2014 and called np.append without keeping
    # its result, so this extension silently never happened. It is now an explicit
    # opt-in; the default reproduces the results the old code actually produced.
    if maxyear is not None and maxyear in present:
        present.add(maxyear + 1)
    ordered = sorted(present)
    # A run starts at a year whose predecessor is missing and ends at a year whose
    # successor is missing; in sorted order the i-th start pairs with the i-th end.
    starts = [year for year in ordered if year - 1 not in present]
    ends = [year for year in ordered if year + 1 not in present]
    return [(first, last) for first, last in zip(starts, ends) if first != last]

def get_year_bins_charitable(rows, charitable):
    '''
    input: rows from dataframe for a specific donor, charitable flag
    output: bins of consecutive years in which the donor gave with that charitable flag;
            an empty list when none of the rows carries the flag
    '''
    matching = rows.charitable == charitable
    # Nothing to bin when no row has the requested flag.
    if not matching.any():
        return []
    return get_year_bins(rows[matching].activity_year.unique())
    
def get_years_charitable(rows, charitable):
    '''
    input: rows from dataframe for a specific donor, charitable flag
    output: the distinct years, sorted ascending, in which the donor gave with that
            charitable flag. A numpy array when there is at least one matching row,
            otherwise an empty list.
    '''
    matching = rows.charitable == charitable
    if not matching.any():
        return []
    distinct_years = rows.loc[matching, 'activity_year'].unique()
    return np.sort(distinct_years)

In [3]:
def is_recurring_contribution(contribution_year_bins, activity_year, charitable):
    '''
    Input:
        contribution_year_bins: list of (first_year, last_year) tuples representing bins of
            continuous years of contributions made by the donor (both ends inclusive)
        activity_year: year of activity of the row
        charitable: truthy when the row is a donation, falsy when it is a contribution
    Returns: True if the row is a contribution and the year falls inside any of the bins,
        False otherwise
    '''
    # Logical test instead of `~charitable`: on a plain Python bool `~` is bitwise
    # inversion (~True == -2), which only gave the right answer by accident and is
    # deprecated since Python 3.12.
    if charitable:
        return False
    return any(first <= activity_year <= last for first, last in contribution_year_bins)

def is_recurring_donation(donation_year_bins, activity_year, charitable):
    '''
    Input: list of (first_year, last_year) tuples representing bins of continuous years of
           donations made by the donor, year of activity, charitable flag of the row
    Returns: True if the row is a donation and the year falls inside any of the bins
             (both ends inclusive), False otherwise
    '''
    year_in_some_bin = np.any([
        first <= activity_year <= last
        for first, last in donation_year_bins
    ])
    return bool(charitable & year_in_some_bin)

In [4]:
# Load the pickled activity table from out/1 and keep only the columns used below.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
df = pd.read_pickle('out/1/donations_featureengineered_appeal.pkl')[[
    'donor_id', 'activity_date', 'amount', 'appeal', 'charitable', 'fund',
    'city', 'state', 'county', 'zipcode', 'latitude', 'longitude', 'timezone',
    'activity_year', 'activity_month', 'activity_dow', 'activity_ym',
]]

In [5]:
# Per donor: the distinct years with non-charitable rows (charitable == False),
# and the runs of consecutive years derived from them.
contribution_years = df.groupby(['donor_id']).apply(
    lambda donor_rows: get_years_charitable(donor_rows, charitable=False)
)
contribution_years.name = 'contribution_years'

contribution_year_bins = contribution_years.apply(get_year_bins)
contribution_year_bins.name = 'contribution_year_bins'

In [6]:
# Per donor: the distinct years with charitable rows (charitable == True),
# and the runs of consecutive years derived from them.
donation_years = df.groupby(['donor_id']).apply(
    lambda donor_rows: get_years_charitable(donor_rows, charitable=True)
)
donation_years.name = 'donation_years'

donation_year_bins = donation_years.apply(get_year_bins)
donation_year_bins.name = 'donation_year_bins'

In [7]:
# Attach the per-donor year lists and year bins to every row of df. All four joins
# are left joins on donor_id (the DataFrame.join default), so no row of df is dropped.
df = (
    df.set_index('donor_id')
      .join(contribution_years, how='left')
      .join(contribution_year_bins, how='left')
      .join(donation_years, how='left')
      .join(donation_year_bins, how='left')
      .reset_index()
)

In [8]:
# Row-wise flags: does this row's year fall inside one of the donor's multi-year
# bins of the same kind (contribution vs. donation)?
df['is_recurring_contribution'] = df.apply(
    lambda gift: is_recurring_contribution(
        gift['contribution_year_bins'], gift['activity_year'], gift['charitable']),
    axis=1,
)

df['is_recurring_donation'] = df.apply(
    lambda gift: is_recurring_donation(
        gift['donation_year_bins'], gift['activity_year'], gift['charitable']),
    axis=1,
)

In [9]:
# Donor-level table: location columns plus the per-donor year features, reduced to
# one row per donor_id (the first row encountered for each donor is kept).
donor_data = (
    df[[
        'donor_id',
        'city', 'state', 'county', 'zipcode', 'latitude', 'longitude', 'timezone',
        'contribution_years', 'donation_years',
        'contribution_year_bins', 'donation_year_bins',
    ]]
    .copy()
    .drop_duplicates(subset='donor_id')
)

In [10]:
# We should have the same count of distinct donors in both the data frames. Otherwise something is wrong!
# The format string keeps the printed text ("<n> <n>") identical on Python 2 and Python 3;
# the assert turns the expectation above into an enforced check.
print('%d %d' % (df.donor_id.nunique(), donor_data.donor_id.nunique()))
assert df.donor_id.nunique() == donor_data.donor_id.nunique(), \
    'df and donor_data have different numbers of distinct donors'

46986 46986


In [11]:
# Lengths of the per-donor list columns: number of multi-year bins and number of
# distinct active years, for donations and contributions respectively.
donor_data['num_donation_year_bins'] = donor_data['donation_year_bins'].map(len)
donor_data['num_contribution_year_bins'] = donor_data['contribution_year_bins'].map(len)
donor_data['num_donation_years'] = donor_data['donation_years'].map(len)
donor_data['num_contribution_years'] = donor_data['contribution_years'].map(len)

In [12]:
# Add per-donor aggregates computed from df: counts of rows flagged as recurring, and
# summed amounts split by the charitable flag. Each joined column is renamed straight
# after its join, so the two 'amount' sums never collide. Donors with no rows of a
# given kind get NaN for that amount (left join).
donor_data = (
    donor_data.set_index('donor_id')
    .join(df.groupby(['donor_id'])['is_recurring_donation'].sum())
    .rename(columns={'is_recurring_donation': 'num_recurring_donations'})
    .join(df.groupby(['donor_id'])['is_recurring_contribution'].sum())
    .rename(columns={'is_recurring_contribution': 'num_recurring_contributions'})
    .join(df[df['charitable'] == False].groupby(['donor_id'])['amount'].sum(), how='left')
    .rename(columns={'amount': 'contribution_amount'})
    .join(df[df['charitable'] == True].groupby(['donor_id'])['amount'].sum(), how='left')
    .rename(columns={'amount': 'donation_amount'})
    .reset_index()
)

In [13]:
# Spot-check the engineered features for a single donor.
donor_data.loc[donor_data['donor_id'] == '_1D50SWTKX']

Unnamed: 0,donor_id,city,state,county,zipcode,latitude,longitude,timezone,contribution_years,donation_years,contribution_year_bins,donation_year_bins,num_donation_year_bins,num_contribution_year_bins,num_donation_years,num_contribution_years,num_recurring_donations,num_recurring_contributions,contribution_amount,donation_amount
33035,_1D50SWTKX,Kenmore,WA,King County,98028,47.75,-122.24,America/Los_Angeles,"[2005, 2008, 2009, 2010, 2011, 2012, 2013, 201...","[2004, 2005, 2006, 2007, 2008, 2009, 2010, 201...","[(2008, 2015)]","[(2004, 2015)]",1,1,12,9,69,35,24596,32097


In [21]:
# Persist both tables as pickles under out/2.
# The IPython shell escape creates the directory if it does not exist yet
# (`-p` makes mkdir a no-op when it already does); it relies on a POSIX shell.
!mkdir -p out/2
# Row-level table, including the joined year features and the is_recurring_* flags.
df.to_pickle('out/2/donation_feature_engineered.pkl')
# Donor-level table with one row per donor_id.
donor_data.to_pickle('out/2/donor_feature_engineered.pkl')
donor_data.to_pickle('out/2/donor_feature_engineered.pkl')