In [2]:
import pandas as pd
import numpy as np
import seaborn as sns  #library that uses matplotlib to make styled plots
import matplotlib.pyplot as plt #basic plotting library
import hypertools as hyp #library for visualizing 'high-dimensional' data
from glob import glob as lsdir 
import os 
import re #regular expression operations
import datetime as dt #manipulating dates and times

from sklearn import linear_model #sklearn is a python machine learning library
from sklearn.neural_network import MLPRegressor #multilayer perceptron regressor
from sklearn.model_selection import train_test_split 
#this function splits arrays or matrices into random train and test subsets

%matplotlib inline



# Read in data

In [3]:
# shows how to read ALL of the data in
data_readers = {'xlsx': pd.read_excel, 'xls': pd.read_excel, 'dta': pd.read_stata}
get_extension = lambda x: x.split('.')[-1]

In [4]:
def read_data(datadir, readers):
    files = lsdir(os.path.join(datadir, '*')) # path is path to the file?
    readable_files = []
    data = []
    for f in files:
        ext = get_extension(f)
        if ext in readers.keys():
            readable_files.append(f)
            data.append(data_readers[ext](f))
    return readable_files, data

In [5]:
fnames, data = read_data('data', data_readers)



In [6]:
fnames

['data/UVLTdata_individual.dta',
 'data/UVLTdata_final.dta',
 'data/Direct Mailing Analysis.xlsx',
 'data/UVLTDataAnalysis.xls',
 'data/TownLevelData.xlsx',
 'data/TownLevelData.dta',
 'data/CensusInfoUpperValley2015JH.xlsx']

In [7]:
data

[       ContactID State  TownID                Town  LandOwnerTownID  \
 0         1544.0    NC    60.0     All Other Towns              0.0   
 1         1545.0    NH    20.0             Lebanon              0.0   
 2         1546.0    NH    61.0  All Other Towns NH              0.0   
 3         1547.0    VT    41.0       Weathersfield             41.0   
 4         1548.0    VT    34.0         Springfield              0.0   
 5         1549.0    VT    13.0             Fairlee             13.0   
 6         1550.0    NH    28.0          Plainfield              0.0   
 7         1551.0    VT    36.0            Thetford             36.0   
 8         1552.0    NH    20.0             Lebanon              0.0   
 9         1553.0    VT    40.0            Vershire              0.0   
 10        1554.0    NH    16.0             Hanover              0.0   
 11        1555.0    NH    16.0             Hanover              0.0   
 12        1556.0    MA    62.0  All Other Towns MA             

## Human readable descriptions (guessed from examining the data)

In [8]:
descriptions = ['population', 'population_expanded', 'ads', 'population_raw', 'stats_by_town', 'stats_by_town (excel)', 'census']

##### Question
^ what is the point of this code?

## Examining some of the raw data

In [9]:
data[0].head()

Unnamed: 0,ContactID,State,TownID,Town,LandOwnerTownID,DeceasedDateYN,U_Tot_Amt,U_Tot_Cnt,U200001,U200102,...,E201112,E201213,E201314,E201415,E201516,E201617,E201718,E201819,DeceasedDate,ConservedOwner
0,1544.0,NC,60.0,All Other Towns,0.0,0.0,571.95,6.0,0.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
1,1545.0,NH,20.0,Lebanon,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
2,1546.0,NH,61.0,All Other Towns NH,0.0,0.0,600.0,7.0,25.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
3,1547.0,VT,41.0,Weathersfield,41.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0
4,1548.0,VT,34.0,Springfield,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016.0,0.0


##### Question
^ How did he get these indices?

In [10]:
data[0].columns.values # columns is also a dataframe; returns a 

array(['ContactID', 'State', 'TownID', 'Town', 'LandOwnerTownID',
       'DeceasedDateYN', 'U_Tot_Amt', 'U_Tot_Cnt', 'U200001', 'U200102',
       'U200203', 'U200304', 'U200405', 'U200506', 'U200607', 'U200708',
       'U200809', 'U200910', 'U201011', 'U201112', 'U201213', 'U201314',
       'U201415', 'U201516', 'U201617', 'U201718', 'U201819', 'RTotAmt',
       'RTotCnt', 'R200001', 'R200102', 'R200203', 'R200304', 'R200405',
       'R200506', 'R200607', 'R200708', 'R200809', 'R200910', 'R201011',
       'R201112', 'R201213', 'R201314', 'R201415', 'R201516', 'R201617',
       'R201718', 'R201819', 'VTotCnt', 'V200001', 'V200102', 'V200203',
       'V200304', 'V200405', 'V200506', 'V200607', 'V200708', 'V200809',
       'V200910', 'V201011', 'V201112', 'V201213', 'V201314', 'V201415',
       'V201516', 'V201617', 'V201718', 'V201819', 'ETotCnt', 'E200001',
       'E200102', 'E200203', 'E200304', 'E200405', 'E200506', 'E200607',
       'E200708', 'E200809', 'E200910', 'E201011', 'E20111

In [11]:
data[1].head()

Unnamed: 0,ContactID,State,TownID,Town,LandOwnerTownID,DeceasedDateYN,U_Tot_Amt,U_Tot_Cnt,U200001,U200102,...,E201819,DeceasedDate,ConservedOwner,Nprojects,Nacres,Nmembers,MedianHHIncome,MeanHHIncome,PercBAplus,PercAge55Plus
0,2903.0,VT,2.0,Bradford,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,,0.0,20.0,2258.32,7.0,48056.0,58716.0,30.6,32.912306
1,11472.0,VT,2.0,Bradford,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,,0.0,20.0,2258.32,7.0,48056.0,58716.0,30.6,32.912306
2,9206.0,VT,2.0,Bradford,0.0,0.0,75.0,2.0,0.0,0.0,...,0.0,,0.0,20.0,2258.32,7.0,48056.0,58716.0,30.6,32.912306
3,12910.0,VT,2.0,Bradford,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,,0.0,20.0,2258.32,7.0,48056.0,58716.0,30.6,32.912306
4,5029.0,VT,2.0,Bradford,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,,0.0,20.0,2258.32,7.0,48056.0,58716.0,30.6,32.912306


In [48]:
data[1].columns.values # prints out a numpy array of the column heads

Index(['ContactID', 'State', 'TownID', 'Town', 'LandOwnerTownID',
       'DeceasedDateYN', 'U_Tot_Amt', 'U_Tot_Cnt', 'U200001', 'U200102',
       'U200203', 'U200304', 'U200405', 'U200506', 'U200607', 'U200708',
       'U200809', 'U200910', 'U201011', 'U201112', 'U201213', 'U201314',
       'U201415', 'U201516', 'U201617', 'U201718', 'U201819', 'RTotAmt',
       'RTotCnt', 'R200001', 'R200102', 'R200203', 'R200304', 'R200405',
       'R200506', 'R200607', 'R200708', 'R200809', 'R200910', 'R201011',
       'R201112', 'R201213', 'R201314', 'R201415', 'R201516', 'R201617',
       'R201718', 'R201819', 'VTotCnt', 'V200001', 'V200102', 'V200203',
       'V200304', 'V200405', 'V200506', 'V200607', 'V200708', 'V200809',
       'V200910', 'V201011', 'V201112', 'V201213', 'V201314', 'V201415',
       'V201516', 'V201617', 'V201718', 'V201819', 'ETotCnt', 'E200001',
       'E200102', 'E200203', 'E200304', 'E200405', 'E200506', 'E200607',
       'E200708', 'E200809', 'E200910', 'E201011', 'E20111

In [13]:
data[2].head()

Unnamed: 0,ID,Town,DATE,AMOUNT,CODE,LIST,NOTES,Gave Again FY 19
0,14661.0,28.0,2017-12-01 00:00:00,50.0,170930.0,The New Yorker,,
1,1838.0,16.0,2017-12-04 00:00:00,25.0,170924.0,NWF,,
2,14664.0,9.0,2017-12-04 00:00:00,20.0,170929.0,Sierra Club,,X
3,13889.0,12.0,2017-12-04 00:00:00,50.0,170924.0,NWF,previous-Smith Pond,X
4,14667.0,15.0,2017-12-06 00:00:00,25.0,170926.0,TNC,,X


In [14]:
data[2].columns.values

array(['ID', 'Town', 'DATE', 'AMOUNT', 'CODE', 'LIST', 'NOTES',
       'Gave Again FY 19'], dtype=object)

In [15]:
data[3].head()

Unnamed: 0,ContactID,FirstName,LastName,City,State,ZipCode,TownID,Town,LandOwnerTownID,DeceasedDate,...,E-2010-11,E-2011-12,E-2012-13,E-2013-14,E-2014-15,E-2015-16,E-2016-17,E-2017-18,E-2018-19,E-2019-20
0,1544,Charles ...,Pitman,Chapel Hill,NC,27517,60,All Other Towns,0,NaT,...,0,0,0,0,0,0,0,0,0,0
1,1545,Pat,Reed,Lebanon,NH,3766,20,Lebanon,0,NaT,...,0,0,0,0,0,0,0,0,0,0
2,1546,David,Davenport,New London,NH,3257,61,All Other Towns NH,0,NaT,...,0,0,0,0,0,0,0,0,0,0
3,1547,Patricia,Stevens,Ascutney,VT,5030,41,Weathersfield,41,NaT,...,0,0,0,0,0,0,0,0,0,0
4,1548,Margaret Elizabeth,Stevens,Springfield,VT,5156,34,Springfield,0,2016-05-08,...,0,0,0,0,0,0,0,0,0,0


In [16]:
data[3].columns.values

array(['ContactID', 'FirstName', 'LastName', 'City', 'State', 'ZipCode',
       'TownID', 'Town', 'LandOwnerTownID', 'DeceasedDate',
       'DeceasedDateYN', 'U_Tot_Amt', 'U_Tot_Cnt', 'U-2000-01',
       'U-2001-02', 'U-2002-03', 'U-2003-04', 'U-2004-05', 'U-2005-06',
       'U-2006-07', 'U-2007-08', 'U-2008-09', 'U-2009-10', 'U-2010-11',
       'U-2011-12', 'U-2012-13', 'U-2013-14', 'U-2014-15', 'U-2015-16',
       'U-2016-17', 'U-2017-18', 'U-2018-19', 'U-2019-20', 'R-Tot-Amt',
       'R-Tot-Cnt', 'R-2000-01', 'R-2001-02', 'R-2002-03', 'R-2003-04',
       'R-2004-05', 'R-2005-06', 'R-2006-07', 'R-2007-08', 'R-2008-09',
       'R-2009-10', 'R-2010-11', 'R-2011-12', 'R-2012-13', 'R-2013-14',
       'R-2014-15', 'R-2015-16', 'R-2016-17', 'R-2017-18', 'R-2018-19',
       'R-2019-20', 'V-Tot-Cnt', 'V-2000-01', 'V-2001-02', 'V-2002-03',
       'V-2003-04', 'V-2004-05', 'V-2005-06', 'V-2006-07', 'V-2007-08',
       'V-2008-09', 'V-2009-10', 'V-2010-11', 'V-2011-12', 'V-2012-13',
       'V-2

In [17]:
data[4].head()

Unnamed: 0,TownID,Town,Nprojects,Nacres,Nmembers,MedianHHIncome,MeanHHIncome,PercBAplus,PercAge55Plus
0,1,Bath,9,746.0,0,47386,60413,25.274725,48.844538
1,2,Bradford,20,2258.32,7,48056,58716,30.6,32.912306
2,3,Canaan,9,1676.35,10,58333,68870,25.3,33.86352
3,4,Cavendish,0,0.0,0,48750,69230,31.001727,40.359043
4,5,Charlestown,2,408.9,2,42693,50823,14.3,36.050905


In [18]:
data[4].columns.values

array(['TownID', 'Town', 'Nprojects', 'Nacres', 'Nmembers',
       'MedianHHIncome', 'MeanHHIncome', 'PercBAplus', 'PercAge55Plus'],
      dtype=object)

In [19]:
data[5].head()

Unnamed: 0,TownID,Town,Nprojects,Nacres,Nmembers,MedianHHIncome,MeanHHIncome,PercBAplus,PercAge55Plus
0,1,Bath,9,746.0,0,47386,60413,25.274725,48.844538
1,2,Bradford,20,2258.32,7,48056,58716,30.6,32.912306
2,3,Canaan,9,1676.35,10,58333,68870,25.3,33.86352
3,4,Cavendish,0,0.0,0,48750,69230,31.001727,40.359043
4,5,Charlestown,2,408.9,2,42693,50823,14.3,36.050905


In [20]:
data[5].columns.values

array(['TownID', 'Town', 'Nprojects', 'Nacres', 'Nmembers',
       'MedianHHIncome', 'MeanHHIncome', 'PercBAplus', 'PercAge55Plus'],
      dtype=object)

In [21]:
data[6].head()

Unnamed: 0,Town #,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Total Housing Units,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,1.0,Bath,366,10,3,589.0,,,
1,2.0,Bradford,822,13,0,1407.0,,,
2,3.0,Canaan,1140,8,0,1867.0,,,
3,4.0,Cavendish,510,40,3,1013.0,,,
4,5.0,Charlestown,1738,0,0,2338.0,,,


In [22]:
data[6].columns.values

array(['Town #', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
       'Total Housing Units', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8'],
      dtype=object)

## How much data are we working with?

In [23]:
list(map(np.shape, data)) 
# list of tuples that include the (rows, columns) of each of the datasets in question
# `map reduce` carves a task into a bunch of bite-sized chunks so that it's faster, searches run in parallel
# map takes a list of data and applies whatever function you give to each element of data

[(13927, 90), (13934, 97), (27, 8), (13273, 97), (45, 9), (45, 9), (48, 9)]

In [49]:
np.shape(data[0]) 
# any numpy function will work on a dataframe
# shape command prints how many rows and how many columns

(13927, 90)

In [50]:
[np.shape(x) for x in data] #does the same thing as above

[(13927, 90), (13934, 97), (27, 8), (13273, 97), (45, 9), (45, 9), (48, 9)]

# Prediction analyses

We'll try to predict four things:
- Unrestricted donations in the next year given past history
- Restricted donations in the next year given past history
- Volunteering in the next year given past history
- Event attendance in the next year given past history

We will form these predictions using the following information from `data/UVLTDataAnalysis.xls`:
- Past history of unrestricted donations, restricted donations, volunteering, and event attendance for all years prior to the to-be-predicted year
- Whether or not the person owns conserved land
- Where the person's conserved land is located (if applicable)
- Whether the person is still alive (otherwise they won't be donating!)

In [24]:
years = np.arange(2001, 2021) #years to include in analysis
def get_start_year(x):
    try:
        return int(re.search('-\d{4}-', x).group()[1:-1])
    except:
        return np.nan #no year found 

#data cleanup
def get_deceased_year(x, maxyear=np.inf): #second argument will be infinity if maxyear not passed
    if type(x) == pd._libs.tslib.Timestamp: #if some special library timestamp object, get the year
        y = x.year
    elif np.isscalar(x): # scalar - simple type, anything that is not an object
        try:
            y = int(x)
        except:
            return np.nan
        
    try:
        if y < maxyear:
            return y
        else:            
            return np.nan
    except:        
        return np.nan #not a timestamp

In [25]:
#preprocessing
n = np.where(np.array([x == 'data/UVLTDataAnalysis.xls' for x in fnames]))[0][0]  #fnames is file names
x = data[n].copy()
x.set_index('ContactID', inplace=True)
x.drop(['FirstName', 'LastName', 'City', 'TownID', 'Town', 'DeceasedDateYN'], axis=1, inplace=True)
x.drop(data[n].columns.values[np.where(np.array(['Tot' in x for x in data[n].columns.values]))[0]], axis=1, inplace=True)
x['DeceasedDate'] = x['DeceasedDate'].apply(get_deceased_year)

In [26]:
x.head(20)

Unnamed: 0_level_0,State,ZipCode,LandOwnerTownID,DeceasedDate,U-2000-01,U-2001-02,U-2002-03,U-2003-04,U-2004-05,U-2005-06,...,E-2010-11,E-2011-12,E-2012-13,E-2013-14,E-2014-15,E-2015-16,E-2016-17,E-2017-18,E-2018-19,E-2019-20
ContactID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1544,NC,27517,0,,0.0,50.0,21.95,100.0,200.0,100.0,...,0,0,0,0,0,0,0,0,0,0
1545,NH,3766,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1546,NH,3257,0,,25.0,25.0,50.0,100.0,100.0,100.0,...,0,0,0,0,0,0,0,0,0,0
1547,VT,5030,41,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1548,VT,5156,0,2016.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1549,VT,5045,13,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1550,NH,3770,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1551,VT,5043,36,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1552,NH,3766,0,2014.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1553,VT,5079,0,2014.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
def get_training_and_test_data(df, year):
    #data to predict from
    x = df[['State', 'ZipCode', 'LandOwnerTownID', 'DeceasedDate']].copy()
    x['DeceasedDate'] = x['DeceasedDate'].apply(lambda i: get_deceased_year(i, maxyear=year)) #remove all deaths before the given year    
    
    inds = np.array(list(map(get_start_year, df.columns.values))) < (year - 1)
    x[df.columns.values[inds]] = df.iloc[:, inds] 
    
    #data to predict
    inds = np.array(list(map(get_start_year, df.columns.values))) == year
    y = df[df.columns.values[inds]].copy()
    
    return x, y

In [28]:
train, test = get_training_and_test_data(x, 2018)

In [29]:
train.head()

Unnamed: 0_level_0,State,ZipCode,LandOwnerTownID,DeceasedDate,U-2000-01,U-2001-02,U-2002-03,U-2003-04,U-2004-05,U-2005-06,...,E-2007-08,E-2008-09,E-2009-10,E-2010-11,E-2011-12,E-2012-13,E-2013-14,E-2014-15,E-2015-16,E-2016-17
ContactID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1544,NC,27517,0,,0.0,50.0,21.95,100.0,200.0,100.0,...,0,0,0,0,0,0,0,0,0,0
1545,NH,3766,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1546,NH,3257,0,,25.0,25.0,50.0,100.0,100.0,100.0,...,0,0,0,0,0,0,0,0,0,0
1547,VT,5030,41,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1548,VT,5156,0,2016.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
test.head()

Unnamed: 0_level_0,U-2018-19,R-2018-19,V-2018-19,E-2018-19
ContactID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1544,0.0,0.0,0,0
1545,0.0,0.0,0,0
1546,0.0,0.0,0,0
1547,0.0,0.0,0,0
1548,0.0,0.0,0,0


### Formatting data to facilitate predictions
When we formulate predictions, we'll want to be able to cleanly separate out one group of columns where every row in that column 

We'll create a dataframe where each contactID is repeated nyears-1 times.  The `U*`, `R*`, `V*`, and `E*` columns should be renamed to `U-10`, `U-9`, etc. indicated the number of years *prior* to the prediction year (everything before the earliest year with data should be set to nans).  Also include `U`, `R`, `V`, and `E` columns indicating the values of those columns on the prediction year.

In [31]:
def to_relative_years(df):
    years = np.array(list(map(get_start_year, df.columns.values)))
    maxyear = np.nanmax(years)
    minyear = np.nanmin(years)
    
    mapper = {}
    df = df.copy()
    if minyear == maxyear: #only one year; drop all years
        for c in df.columns.values[~np.isnan(years)]:
            mapper[c] = c[:-8]        
    else:
        for i, y in enumerate(years):
            if not np.isnan(y):
                c = df.columns.values[i]
                mapper[c] = c[:-7] + str(int(maxyear - y + 1))
    
    df.rename(mapper, inplace=True, axis=1)
    return df

In [32]:
columns = ['State', 'ZipCode', 'LandOwnerTownID', 'DeceasedDate']
categories = ['U', 'R', 'V', 'E']
for c in categories:
    columns.extend(list(map(lambda x: f'{c}-{x}', np.arange(len(years), 0, -1))))
columns.extend(categories)

In [33]:
df = pd.DataFrame(index=['ContactID'], columns=columns, data = [])

In [34]:
for y in years:
    next_train, next_test = get_training_and_test_data(x, y)
    next_merged = pd.concat([to_relative_years(train), to_relative_years(test)], axis=1)    
    df = pd.concat([df, next_merged], axis=0, copy=True)

In [35]:
#re-order columns
df = df[columns]

#drop nans in first row
df.drop(index=['ContactID'], inplace=True)

#rename index 
df.index.names = ['ContactID']

#set all nan values in 4th column on to 0s
df.iloc[:, 4:] = df.iloc[:, 4:].fillna(value=0)

In [36]:
df.head(20)

Unnamed: 0_level_0,State,ZipCode,LandOwnerTownID,DeceasedDate,U-20,U-19,U-18,U-17,U-16,U-15,...,E-6,E-5,E-4,E-3,E-2,E-1,U,R,V,E
ContactID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1544,NC,27517,0,,0,0,0,0.0,50.0,21.95,...,0,0,0,0,0,0,0.0,0.0,0,0
1545,NH,3766,0,,0,0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0,0
1546,NH,3257,0,,0,0,0,25.0,25.0,50.0,...,0,0,0,0,0,0,0.0,0.0,0,0
1547,VT,5030,41,,0,0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0,0
1548,VT,5156,0,2016.0,0,0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0,0
1549,VT,5045,13,,0,0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0,0
1550,NH,3770,0,,0,0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0,0
1551,VT,5043,36,,0,0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0,0
1552,NH,3766,0,2014.0,0,0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0,0
1553,VT,5079,0,2014.0,0,0,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0,0


# Prediction
We'll use the prior history of unrestricted donations, restricted donations, volunteering, and event attendance for all years prior to the to-be-predicted year to attempt to predict unrestricted donations, restricted donations, volunteering, and event attendance in the to-be-predicted year.

For this analysis, we'll ignore the State, ZipCode, and LandOwnerTownID variables.  We'll also drop any rows where the given individual is deceased (we'll assume that the probability of donating after an individual dies is 0, even though this is not strictly true 100% of the time-- e.g. people could hypothetically donate after death using wills and trusts).

We will use four types of classifiers:
- [Linear regression](https://en.wikipedia.org/wiki/Linear_regression)
- [Ridge regression](https://en.wikipedia.org/wiki/Tikhonov_regularization)
- [Multilayer perceptron (deep learning)](https://en.wikipedia.org/wiki/Multilayer_perceptron)


**What is linear regression?**
* `y = mx + b`
* If you have `x`, you can get `y` by multiplying it by one scalar `m` and then adding another scalar `b`.
* You can **compute / predict** a given value of `x` you can the `y` value. You're finding the slope and the intercept to a line with the minimum sum squared error (that's not the right name) i.e. the distance between a point and the line.
* With more than 2 dimensions, you can also use linear regression. y = b0 + b1x1 + b2x2... there's something to do here with matrix multiplication.

**What is a multiple regression?**
* Look it up online... this was confusing.

**What is ridge regression?**
* Rounds the answer a little toward zero - values the line being flat (useful properties) -- don't want to be too specifically fit to the training dataset, what it to be able to predict the future not only describe the past (overfitting = fitting the data really well) -- combats that problem by sacrificing some of the ability to describe the data in order to be able to predict well.

**What is deep learning?**
* puts x into abstract representational spaces and eventually the last space maps onto the data
* essentially what it is doing is a series of regression problems
* you get better performance, but the cost is *understanding* what happened

_Look for Prof. Pfister's resources on regression_

In [54]:
#ignore warning message
import warnings
warnings.simplefilter('ignore')

#drop dead donors and the DeceasedDate column
living_donors = df.loc[df['DeceasedDate'].apply(np.isnan), :]
living_donors.drop(labels=['DeceasedDate'], axis=1, inplace=True)

In [55]:
history = living_donors.iloc[:, 3:-4]
donations = living_donors.iloc[:, -4:]

In [56]:
history.head() # column heads are number of years before the year being predicted; observations up to the year of interest
#things we're predicting from

Unnamed: 0_level_0,U-20,U-19,U-18,U-17,U-16,U-15,U-14,U-13,U-12,U-11,...,E-10,E-9,E-8,E-7,E-6,E-5,E-4,E-3,E-2,E-1
ContactID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1544,0,0,0,0.0,50.0,21.95,100.0,200.0,100.0,100.0,...,0,0,0,0,0,0,0,0,0,0
1545,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1546,0,0,0,25.0,25.0,50.0,100.0,100.0,100.0,200.0,...,0,0,0,0,0,0,0,0,0,0
1547,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1549,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
donations.head() # in the year after the corresponding row of the history, what did that person do
# things we're predicting

Unnamed: 0_level_0,U,R,V,E
ContactID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1544,0.0,0.0,0,0
1545,0.0,0.0,0,0
1546,0.0,0.0,0,0
1547,0.0,0.0,0,0
1549,0.0,0.0,0,0


In [58]:
#split dataset into training and testing sets
history_train, history_test, donations_train, donations_test = train_test_split(
    history.values, donations.values, test_size=0.2, random_state=0) 

#0.2 is the proportion of data the training set doesn't see

In [59]:
models = [linear_model.LinearRegression(), linear_model.RidgeCV(), MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20, 20, 20))]
model_names = ['Linear regression', 'Ridge regression', 'Deep learning']

In [60]:
for i, m in enumerate(models): # this code runs the models
    m.fit(history_train, donations_train)
    print(f'{model_names[i]} variance explained: {np.round(100*m.score(history_test, donations_test), 2)}%')

Linear regression variance explained: 44.1%
Ridge regression variance explained: 44.08%
Deep learning variance explained: 93.48%


# What's next?

## Questions to consider
- What are some potential confounds in the analyses above?
- What have we learned, if anything?
- What does it mean when some types of models explain the data better than other types of models?
- What is the "story" here, and how could we tell it in the simplest, cleanest way?
- What figures might we want to create?

## Future directions
- Correct some confounds and re-run the prediction analyses
- Explore the data and/or results
- Create some figures
- Tell a story!

### Class Notes Friday, April 12
* Weighted combination of corresponding values of x to predict y
* We need a way to represent the future: reorganize dataset; for every entry, let's look back at the prior years leading up to that time; what happened?
* What does it mean to explain variance? *Basically, accuracy of prediction!*

Weakness: 
* Hasn't controlled for whether the same person shows up in training & test sets ~people show up multiple times~

Over the weekend: 
* 