In [1]:
import pandas as pd

In [2]:
# Load the blood-transfusion dataset from CSV (path is relative to the notebook's working directory).
tf = pd.read_csv('datasets/transfusion.data')

In [3]:
# Preview the first 8 rows to eyeball the column names and typical values.
tf.head(8)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
5,4,4,1000,4,0
6,2,7,1750,14,1
7,1,12,3000,35,0


In [4]:
# Print the frame summary: row count, column dtypes, and non-null counts per column.
tf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [7]:
# The label column ships with a long descriptive header; give it the short
# name 'target' so it can be referenced easily in the rest of the notebook.
target_rename = {'whether he/she donated blood in March 2007': 'target'}
tf = tf.rename(columns=target_rename)

In [8]:
# Show the first 4 rows to confirm the label column is now called 'target'.
tf.head(4)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),target
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1


In [12]:
# Class balance of the label, expressed as proportions (normalize=True)
# rather than raw counts, rounded to 4 decimals for display.
target_shares = tf['target'].value_counts(normalize=True)
target_shares.round(4)

0    0.762
1    0.238
Name: target, dtype: float64

In [13]:
from sklearn.model_selection import train_test_split

In [16]:
# Separate the label from the predictors, then hold out 25% of the rows for
# testing. The label Series is reused for `stratify` so both splits keep the
# same class proportions; random_state pins the shuffle for reproducibility.
labels = tf['target']
features = tf.drop(columns='target')

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.25, random_state=42, stratify=labels
)

In [17]:
# Peek at the first 4 training rows (predictor columns only; 'target' was dropped before the split).
X_train.head(4)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
334,16,2,500,16
99,5,7,1750,26
116,2,7,1750,46
661,16,2,500,16


In [18]:
from tpot import TPOTClassifier 
from sklearn.metrics import roc_auc_score

In [19]:
# Settings for the TPOT pipeline search, kept in one place so they are easy to
# tune: a deliberately small budget (5 generations, population of 6), candidate
# pipelines scored by ROC AUC, a fixed seed, and the 'TPOT light' configuration.
tpot_settings = dict(
    generations=5,
    population_size=6,
    verbosity=2,
    scoring='roc_auc',
    random_state=42,
    disable_update_check=True,
    config_dict='TPOT light',
)

tpot = TPOTClassifier(**tpot_settings)
# Run the search on the training split only; the test split stays untouched.
tpot.fit(X_train, y_train)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=36.0, style=ProgressStyle(des…


Generation 1 - Current best internal CV score: 0.73470755823697

Generation 2 - Current best internal CV score: 0.7361682470642797

Generation 3 - Current best internal CV score: 0.7361682470642797

Generation 4 - Current best internal CV score: 0.7361682470642797

Generation 5 - Current best internal CV score: 0.7415395528117826

Best pipeline: LogisticRegression(input_matrix, C=0.01, dual=False, penalty=l2)


TPOTClassifier(config_dict='TPOT light', disable_update_check=True,
               generations=5, population_size=6, random_state=42,
               scoring='roc_auc', verbosity=2)

In [20]:
# Score the TPOT pipeline on the held-out test split. ROC AUC needs a
# continuous score, so take column 1 of predict_proba (probability of class 1).
tpot_test_proba = tpot.predict_proba(X_test)[:, 1]
tpot_auc_score = roc_auc_score(y_test, tpot_test_proba)
print(f'\nAUC score: {tpot_auc_score:.4f}')


AUC score: 0.7858


In [21]:
# List the steps of the pipeline TPOT selected, numbered from 1.
print('\nBest pipeline steps:')
for position, step in enumerate(tpot.fitted_pipeline_.steps, start=1):
    # Each step is a (name, estimator) pair; only the estimator repr is shown.
    estimator = step[1]
    print(f'{position}. {estimator}')


Best pipeline steps:
1. LogisticRegression(C=0.01, random_state=42)


In [22]:
# Compare per-feature variances on the training split only. A feature whose
# variance dwarfs the others is a candidate for rescaling before fitting a
# linear model.
X_train.var().round(3)

Recency (months)              66.929
Frequency (times)             33.830
Monetary (c.c. blood)    2114363.700
Time (months)                611.147
dtype: float64

In [24]:
import numpy as np

# Column whose variance is far larger than the rest and gets log-transformed.
col_to_normalize = "Monetary (c.c. blood)"


def _with_log_column(frame):
    """Return a new frame with `col_to_normalize` replaced by its natural log.

    The transformed values are stored in a column named 'monetary_log', which
    is appended as the last column; the input frame is left unmodified.
    """
    logged = frame.assign(monetary_log=np.log(frame[col_to_normalize]))
    return logged.drop(columns=col_to_normalize)


# Apply the identical transform to both splits so their columns stay aligned.
X_train_normed = _with_log_column(X_train)
X_test_normed = _with_log_column(X_test)

# Variance of the log-normalized training features
X_train_normed.var().round(3)

Recency (months)      66.929
Frequency (times)     33.830
Time (months)        611.147
monetary_log           0.837
dtype: float64

In [25]:
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB

# Two candidate models, both trained on the log-normalized training features.
multiNB = MultinomialNB()
logreg = linear_model.LogisticRegression(solver='liblinear', random_state=42)

multiNB.fit(X_train_normed, y_train)
logreg.fit(X_train_normed, y_train)


def _test_auc(model):
    """ROC AUC of a fitted `model` on the log-normalized test split.

    Uses column 1 of predict_proba, i.e. the predicted probability of class 1.
    """
    return roc_auc_score(y_test, model.predict_proba(X_test_normed)[:, 1])


# AUC score for the multinomial naive Bayes model
multiNB_auc_score = _test_auc(multiNB)
print(f'\nMAUC score: {multiNB_auc_score:.4f}')

# AUC score for the logistic regression model
logreg_auc_score = _test_auc(logreg)
print(f'\nAUC score: {logreg_auc_score:.4f}')


MAUC score: 0.7638

AUC score: 0.7891


In [28]:
from operator import itemgetter

# Rank the three models by their test-set AUC, best first. sorted() orders
# ascending by default, so reverse=True is required to get the
# highest-to-lowest ranking this cell is meant to show.
sorted(
    [('tpot', tpot_auc_score), ('logreg', logreg_auc_score), ('multiNB', multiNB_auc_score)],
    key=itemgetter(1),
    reverse=True,
)

[('logreg', 0.7890972663699937),
 ('tpot', 0.7857596948506039),
 ('multiNB', 0.7638270820089001)]