# Import library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

In [2]:
import warnings
warnings.simplefilter('ignore')

In [3]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.svm import SVC

# Read Files

In [4]:
train = pd.read_csv('blood-train.csv')
test = pd.read_csv('blood-test.csv')

In [5]:
print("Train shape :", train.shape)
print("Test shape:", test.shape)

Train shape : (576, 6)
Test shape: (200, 5)


In [6]:
train.head()
#unnamed:0 마지막 헌혈 후 지난 기간 헌혈 횟수 총 량 첫 도네 이후로 지난 달 2007년3월에 도네

Unnamed: 0.1,Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,Made Donation in March 2007
0,619,2,50,12500,98,1
1,664,0,13,3250,28,1
2,441,1,16,4000,35,1
3,160,2,20,5000,45,1
4,358,1,24,6000,77,0


In [7]:
train['Made Donation in March 2007'].value_counts()

0    438
1    138
Name: Made Donation in March 2007, dtype: int64

In [8]:
test.head()

Unnamed: 0.1,Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation
0,659,2,12,3000,52
1,276,21,7,1750,38
2,263,4,1,250,4
3,303,11,11,2750,38
4,83,4,12,3000,34


In [9]:
y = train.iloc[:, -1]
y.head()

0    1
1    1
2    1
3    1
4    0
Name: Made Donation in March 2007, dtype: int64

In [10]:
#마지막 열이 제거되지 않은 모습
train.tail()

Unnamed: 0.1,Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,Made Donation in March 2007
571,698,23,1,250,23,0
572,433,16,3,750,86,0
573,360,21,2,500,52,0
574,541,39,1,250,39,0
575,74,72,1,250,72,0


In [11]:
#unnamed와 마지막 라벨 제거
old_train = train
train = train.iloc[:,1:5]
test = test.iloc[:, 1:5]

In [12]:
train.head()

Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation
0,2,50,12500,98
1,0,13,3250,28
2,1,16,4000,35
3,2,20,5000,45
4,1,24,6000,77


In [13]:
test.head()

Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation
0,2,12,3000,52
1,21,7,1750,38
2,4,1,250,4
3,11,11,2750,38
4,4,12,3000,34


In [14]:
#트레이닝 데이터와 테스트 데이터 합침
df = pd.merge(train, test)
df.head()

Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation
0,2,2,500,4
1,2,2,500,4
2,2,2,500,4
3,2,2,500,4
4,2,2,500,4


In [15]:
x = df.iloc[:,:]
x.head()

Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation
0,2,2,500,4
1,2,2,500,4
2,2,2,500,4
3,2,2,500,4
4,2,2,500,4


# **Data Exploration**

In [16]:
train.describe()

Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation
count,576.0,576.0,576.0,576.0
mean,9.439236,5.427083,1356.770833,34.050347
std,8.175454,5.74001,1435.002556,24.227672
min,0.0,1.0,250.0,2.0
25%,2.0,2.0,500.0,16.0
50%,7.0,4.0,1000.0,28.0
75%,14.0,7.0,1750.0,49.25
max,74.0,50.0,12500.0,98.0


In [17]:
plt.figure(figsize =(20,10))
sns.boxplot(y="Months since Last Donation", data = old_train)
#최소는 0, 위에 점 4개는 이상치

<AxesSubplot:ylabel='Months since Last Donation'>

In [18]:
corrmat = x.corr()
f, ax = plt.subplots(figsize=(9,8))
#cmap으로 색깔 번경 가능
sns.heatmap(corrmat, ax=ax, cmap="YlGnBu", linewidths=0.1, fmt = ".2f", annot = True)

<AxesSubplot:>

In [19]:
train['Months since Last Donation'].unique()

array([ 2,  0,  1,  4,  5,  3, 12,  6, 11,  9, 10, 13, 14,  8,  7, 16, 23,
       21, 18, 22, 26, 35, 74, 15, 20, 17, 25, 39, 72], dtype=int64)

# **Feature Engineering**

In [20]:
x['Donating for'] = x['Months since First Donation']-x['Months since Last Donation']

In [21]:
x.head()

Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,Donating for
0,2,2,500,4,2
1,2,2,500,4,2
2,2,2,500,4,2
3,2,2,500,4,2
4,2,2,500,4,2


In [22]:
corrmat = x.corr()
f, ax = plt.subplots(figsize=(9,8))
#cmap으로 색깔 번경 가능
sns.heatmap(corrmat, ax=ax, cmap="YlGnBu", linewidths=0.1, fmt = ".2f", annot = True)

<AxesSubplot:>

In [23]:
x.drop('Total Volume Donated (c.c.)', axis = 1, inplace = True)
x.head()

Unnamed: 0,Months since Last Donation,Number of Donations,Months since First Donation,Donating for
0,2,2,4,2
1,2,2,4,2
2,2,2,4,2
3,2,2,4,2
4,2,2,4,2


In [24]:
x.shape

(810, 4)

# **Feature Transformation**

In [25]:
#x변수들의 값이 0보다 크고 한쪽에 치우쳐져있으니 scaling이 필요하다
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()

x = scale.fit_transform(x)

In [26]:
train = x[:576]
train.shape

(576, 4)

In [27]:
test = x[576:]
test.shape

(234, 4)

In [28]:
y = y[:576]

In [29]:
y.shape

(576,)

# **Modeling**

In [30]:
xtrain,xtest,ytrain,ytest = train_test_split(train,y,test_size = 0.2, random_state = 0)

## LogisticRegression

In [31]:
#이름은 LogisticRegression이지만 classify에 가깝다. 0과 1사이에 급격한 증가로 0아니면 1분류
logreg = LogisticRegression(random_state = 7)
logreg.fit(xtrain,ytrain)

LogisticRegression(random_state=7)

In [32]:
pred = logreg.predict(xtest)

In [33]:
accuracy_score(pred,ytest)

0.7155172413793104

In [34]:
roc_auc_score(pred,ytest)

0.7321428571428571

## Support Vector Machine

In [35]:
#SVMC는 probability=True를 사용하면 불확실성을 추정하는 predict_proba사용 가능
SVMC = SVC(probability=True)
SVMC.fit(train,y)

SVC(probability=True)

In [36]:
pred = SVMC.predict(xtest)

In [37]:
accuracy_score(pred,ytest)

0.7844827586206896

In [38]:
confusion_matrix(pred,ytest)
#t/f는 맞는지 아닌지, p/n은 예측
#tn 80, fn 24
#fp 1   tp 11

array([[80, 24],
       [ 1, 11]], dtype=int64)

In [39]:
roc_auc_score(pred,ytest)

0.8429487179487177

## Random Forest

In [40]:
RFC = RandomForestClassifier()
RFC.fit(xtrain,ytrain)

RandomForestClassifier()

In [41]:
pred = RFC.predict(xtest)

In [42]:
confusion_matrix(pred,ytest)

array([[79, 27],
       [ 2,  8]], dtype=int64)

In [43]:
accuracy_score(pred,ytest)

0.75

In [44]:
roc_auc_score(pred,ytest)

0.7726415094339623

## Decision Tree

In [45]:
model=DecisionTreeClassifier(max_leaf_nodes = 4, max_features = 3, max_depth = 15)
model.fit(xtrain, ytrain)

DecisionTreeClassifier(max_depth=15, max_features=3, max_leaf_nodes=4)

In [46]:
pred = model.predict(xtest)

In [47]:
accuracy_score(pred, ytest)

0.7327586206896551

In [48]:
confusion_matrix(pred, ytest)

array([[80, 30],
       [ 1,  5]], dtype=int64)

In [49]:
roc_auc_score(pred,ytest)

0.7803030303030304

## MLP Classifier

In [50]:
clf_neural = MLPClassifier(solver = 'lbfgs', alpha = 1e-5, hidden_layer_sizes=(25,), random_state=1)
clf_neural.fit(train,y)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(25,), random_state=1,
              solver='lbfgs')

In [51]:
predicted = clf_neural.predict(xtest)

In [52]:
confusion_matrix(predicted, ytest)

array([[80, 24],
       [ 1, 11]], dtype=int64)

In [53]:
accuracy_score(predicted, ytest)

0.7844827586206896

In [54]:
roc_auc_score(predicted, ytest)

0.8429487179487177