# Predict Blood Donation

_Spencer Pease_

## Setup

In [1]:
# Load packages

import pandas as pd
from sklearn import tree
from __future__ import division
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np



## Load Data

In [2]:
# Read both raw CSV files from the local data/ directory.
# The *_raw frames are kept as loaded; later cells work on derived frames.
df_test_raw = pd.read_csv("data/test_data.csv")
df_train_raw = pd.read_csv("data/training_data.csv")

## Explore Data

In [3]:
# Preview the first rows of the raw training data (original CSV column names)
df_train_raw.head()

Unnamed: 0.1,Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,Made Donation in March 2007
0,619,2,50,12500,98,1
1,664,0,13,3250,28,1
2,441,1,16,4000,35,1
3,160,2,20,5000,45,1
4,358,1,24,6000,77,0


In [4]:
# Dimensions of the raw training frame, as (rows, columns)
df_train_raw.shape

(576, 6)

## Prep Data

In [15]:
# Start from a copy so df_train_raw is left exactly as loaded
df_train = df_train_raw.copy()

# Swap the verbose CSV headers for short names (assigned by position)
df_train.columns = ['id', 'last_donate', 'num_donate', 'total_vol', 'first_donate', 'march_donate']

# Remove the id column, then add the volume donated per donation as an extra metric
df_train = (
    df_train
    .drop('id', axis=1)
    .assign(donate_per_visit=lambda frame: frame.total_vol / frame.num_donate)
)

df_train.head()

Unnamed: 0,last_donate,num_donate,total_vol,first_donate,march_donate,donate_per_visit
0,2,50,12500,98,1,250.0
1,0,13,3250,28,1,250.0
2,1,16,4000,35,1,250.0
3,2,20,5000,45,1,250.0
4,1,24,6000,77,0,250.0


## Machine Learning

In [16]:
# Separate data into features and outcomes

features = df_train.drop('march_donate', axis = 1)
outcomes = df_train.march_donate

# Split into train (80%) and validation (20%).
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same split, and therefore the same validation accuracy, on every run.
train_features, val_features, train_outcome, val_outcome = train_test_split(
    features, outcomes, test_size=0.20, random_state=42
)

### KNN

In [17]:
# k-nearest-neighbours classifier that votes over the 9 closest training rows
knn_clf = KNeighborsClassifier(n_neighbors=9)

# Train on the training split; the fitted model is kept under knn_fit,
# which is the name the later cells use for prediction
knn_fit = knn_clf.fit(train_features, train_outcome)

In [18]:
# Accuracy on the held-out validation split (kept under the name test_acc)
knn_preds = knn_fit.predict(val_features)

# Fraction of validation rows whose predicted label equals the true label
is_correct = (knn_preds == val_outcome)
test_acc = is_correct.mean()
test_acc

0.7931034482758621

## Test Predictions

In [25]:
# Format test data
# Work on a copy so the column rename below does not mutate df_test_raw
df_test = df_test_raw.copy()
df_test.columns = ['id', 'last_donate', 'num_donate', 'total_vol', 'first_donate']

# Derive the metric from the test rows' own columns. Using df_train here would
# index-align training values onto the test rows instead of describing them.
df_test = df_test.assign(donate_per_visit = df_test.total_vol / df_test.num_donate)

# Generate test probabilities
# Dropping 'id' leaves the feature columns in the same order used for training.
# predict_proba returns one column per class; with 0/1 labels, column 1 is the
# probability of class 1 (a donation was made).
test_preds = knn_fit.predict_proba(df_test.drop('id', axis = 1))[:, 1]

# The id column keeps an empty header in the output file
result = pd.DataFrame({'': df_test.id, 'Made Donation in March 2007': test_preds})

result.to_csv('data/result.csv', index = False)