In [21]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
# Blood donations Dataset
test_data= pd.read_csv('BloodDonationPredictions/TestDataBlood.csv')
training_data = pd.read_csv('BloodDonationPredictions/TrainingDataBlood.csv')
table1= pd.DataFrame(test_data)
table2= pd.DataFrame(training_data)

In [24]:

table1.head(10)


Unnamed: 0,Donor Id,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation
0,659,2,12,3000,52
1,276,21,7,1750,38
2,263,4,1,250,4
3,303,11,11,2750,38
4,83,4,12,3000,34
5,500,3,21,5250,42
6,530,4,2,500,4
7,244,14,1,250,14
8,249,23,2,500,87
9,728,14,4,1000,64


In [25]:
table2.head(10)

Unnamed: 0,Donor Id,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,Made Donation in March 2007
0,619,2,50,12500,98,1
1,664,0,13,3250,28,1
2,441,1,16,4000,35,1
3,160,2,20,5000,45,1
4,358,1,24,6000,77,0
5,335,4,4,1000,4,0
6,47,2,7,1750,14,1
7,164,1,12,3000,35,0
8,736,5,46,11500,98,1
9,436,0,3,750,4,0


In [26]:
print('Test Data columns with null values:\n', table1.isnull().sum())
print("-"*10)
print('Train Data columns with null values:\n', table2.isnull().sum())
print("-"*10)


Test Data columns with null values:
 Donor Id                       0
Months since Last Donation     0
Number of Donations            0
Total Volume Donated (c.c.)    0
Months since First Donation    0
dtype: int64
----------
Train Data columns with null values:
 Donor Id                       0
Months since Last Donation     0
Number of Donations            0
Total Volume Donated (c.c.)    0
Months since First Donation    0
Made Donation in March 2007    0
dtype: int64
----------


In [28]:
table2.describe()

Unnamed: 0,Donor Id,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,Made Donation in March 2007
count,576.0,576.0,576.0,576.0,576.0,576.0
mean,374.034722,9.439236,5.427083,1356.770833,34.050347,0.239583
std,216.947773,8.175454,5.74001,1435.002556,24.227672,0.4272
min,0.0,0.0,1.0,250.0,2.0,0.0
25%,183.75,2.0,2.0,500.0,16.0,0.0
50%,375.5,7.0,4.0,1000.0,28.0,0.0
75%,562.5,14.0,7.0,1750.0,49.25,0.0
max,747.0,74.0,50.0,12500.0,98.0,1.0


In [32]:

table2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576 entries, 0 to 575
Data columns (total 6 columns):
Donor Id                       576 non-null int64
Months since Last Donation     576 non-null int64
Number of Donations            576 non-null int64
Total Volume Donated (c.c.)    576 non-null int64
Months since First Donation    576 non-null int64
Made Donation in March 2007    576 non-null int64
dtypes: int64(6)
memory usage: 27.1 KB


In [37]:
# merge the data between the 2 tables
table1['Made Donation in March 2007'] = 'NaN'
data = table2.append(table1)
data.head(10)

# feature engineering



Unnamed: 0,Donor Id,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,Made Donation in March 2007
0,619,2,50,12500,98,1
1,664,0,13,3250,28,1
2,441,1,16,4000,35,1
3,160,2,20,5000,45,1
4,358,1,24,6000,77,0
5,335,4,4,1000,4,0
6,47,2,7,1750,14,1
7,164,1,12,3000,35,0
8,736,5,46,11500,98,1
9,436,0,3,750,4,0


In [49]:
#I have renamed the "Unnamed: 0" to Donor ID and thus, 
# for now we assume that it has no connection with the donation
# lets drop it for now 
data.drop("Donor Id",axis=1, inplace=True)

KeyError: "['Donor Id'] not found in axis"

In [50]:
data.info()
data.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 776 entries, 0 to 199
Data columns (total 5 columns):
Months since Last Donation     776 non-null int64
Number of Donations            776 non-null int64
Total Volume Donated (c.c.)    776 non-null int64
Months since First Donation    776 non-null int64
Made Donation in March 2007    776 non-null object
dtypes: int64(4), object(1)
memory usage: 56.4+ KB


Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,Made Donation in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [51]:
# calculating months that the user has been donating
data['Months Donating'] = data['Months since First Donation'] - data['Months since Last Donation']
# calculating donations per month for the user
data['Donations per Months Donating'] = data["""Total Volume Donated (c.c.)"""]/data['Months Donating']
data['Donations per Months Donating'] = data['Donations per Months Donating'].replace(np.inf, 999)
# calculating donations per month since the first donation for the user
data['Donations per Months since First Donation'] = data["""Total Volume Donated (c.c.)"""]/data['Months since First Donation']

data['Donation Counts per Months Donating'] = data['Number of Donations']/data['Months Donating']
data['Donation Counts per Months Donating'] = data['Donation Counts per Months Donating'].replace(np.inf, 999)
data['Donation Counts per Months since First Donating'] = data['Number of Donations']/data['Months since First Donation']
data['Donation Counts per Months since First Donating'] = data['Donation Counts per Months since First Donating'].replace(np.inf, 999)
# 
data['Donation Volume per Donation'] = (data["""Total Volume Donated (c.c.)"""]/data['Number of Donations']).replace(np.inf, 999)


In [54]:
test = data[data['Made Donation in March 2007'] == 'NaN']
test.head(5)


Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,Made Donation in March 2007,Months Donating,Donations per Months Donating,Donations per Months since First Donation,Donation Counts per Months Donating,Donation Counts per Months since First Donating,Donation Volume per Donation
0,2,12,3000,52,,50,60.0,57.692308,0.24,0.230769,250.0
1,21,7,1750,38,,17,102.941176,46.052632,0.411765,0.184211,250.0
2,4,1,250,4,,0,999.0,62.5,999.0,0.25,250.0
3,11,11,2750,38,,27,101.851852,72.368421,0.407407,0.289474,250.0
4,4,12,3000,34,,30,100.0,88.235294,0.4,0.352941,250.0


In [55]:
test.drop(["Made Donation in March 2007"], axis = 1)
train = data[data['Made Donation in March 2007'] != 'NaN']
train.head(5)

Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,Made Donation in March 2007,Months Donating,Donations per Months Donating,Donations per Months since First Donation,Donation Counts per Months Donating,Donation Counts per Months since First Donating,Donation Volume per Donation
0,2,50,12500,98,1,96,130.208333,127.55102,0.520833,0.510204,250.0
1,0,13,3250,28,1,28,116.071429,116.071429,0.464286,0.464286,250.0
2,1,16,4000,35,1,34,117.647059,114.285714,0.470588,0.457143,250.0
3,2,20,5000,45,1,43,116.27907,111.111111,0.465116,0.444444,250.0
4,1,24,6000,77,0,76,78.947368,77.922078,0.315789,0.311688,250.0


In [60]:
final_train = train.drop(['Made Donation in March 2007'], axis = 1)
final_train.head(10)


Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,Months Donating,Donations per Months Donating,Donations per Months since First Donation,Donation Counts per Months Donating,Donation Counts per Months since First Donating,Donation Volume per Donation
0,2,50,12500,98,96,130.208333,127.55102,0.520833,0.510204,250.0
1,0,13,3250,28,28,116.071429,116.071429,0.464286,0.464286,250.0
2,1,16,4000,35,34,117.647059,114.285714,0.470588,0.457143,250.0
3,2,20,5000,45,43,116.27907,111.111111,0.465116,0.444444,250.0
4,1,24,6000,77,76,78.947368,77.922078,0.315789,0.311688,250.0
5,4,4,1000,4,0,999.0,250.0,999.0,1.0,250.0
6,2,7,1750,14,12,145.833333,125.0,0.583333,0.5,250.0
7,1,12,3000,35,34,88.235294,85.714286,0.352941,0.342857,250.0
8,5,46,11500,98,93,123.655914,117.346939,0.494624,0.469388,250.0
9,0,3,750,4,4,187.5,187.5,0.75,0.75,250.0
