## MedTourEasy Internship project
### Predicting whether or not the same donor will give blood the next time the vehicle comes to campus

### Task-1
#### Inspecting the dataset files

In [10]:
# Project data directory: list the contents of the current working
# directory through an IPython shell escape to see which files are available.
!ls

Data Analyst.zip
MTE_internship_project.ipynb
notebook.ipynb
transfusion.data


In [12]:
# Peek at the raw file before parsing it: print the first 5 lines of
# transfusion.data (the header row plus the first four records) through a
# shell escape.
! head -n 5 transfusion.data

Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),"whether he/she donated blood in March 2007"
2 ,50,12500,98 ,1
0 ,13,3250,28 ,1
1 ,16,4000,35 ,1
2 ,20,5000,45 ,1


### Task-2
#### Load the dataset

In [9]:
# Importing pandas module
import pandas as pd

# Parse the comma-separated data file into a DataFrame, then preview the
# first five rows as the cell output
transfusion = pd.read_csv(filepath_or_buffer='transfusion.data')
transfusion.head(n=5)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


### Task-3
#### Summary of the dataframe

In [13]:
# Summary of the dataframe: prints the index range, every column's
# non-null count and dtype, and the memory usage.
transfusion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


### Task-4
#### Renaming columns

In [15]:
# Give the verbose label column a short name; the other columns keep theirs
target_rename = {'whether he/she donated blood in March 2007': 'target'}
transfusion.rename(columns=target_rename, inplace=True)

# Preview the first two rows to confirm the new column name
transfusion.head(2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),target
0,2,50,12500,98,1
1,0,13,3250,28,1


### Task-5
#### Checking target incidence

In [18]:
# Relative frequency of each class in the label column, to three decimals
target_incidence = transfusion['target'].value_counts(normalize=True)
target_incidence.round(3)

0    0.762
1    0.238
Name: target, dtype: float64

### Task-6
#### Splitting the dataframe into train and test datasets

In [25]:
# import the module
from sklearn.model_selection import train_test_split
# Separate the predictors from the label, then hold out 25% of the rows for
# testing. The split is stratified on the label and seeded for repeatability.
features = transfusion.drop(columns='target')
labels = transfusion['target']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
    stratify=labels,
)

# Preview the first two rows of the training predictors
X_train.head(2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
334,16,2,500,16
99,5,7,1750,26


### Task-7
#### TPOT library for machine learning pipeline

In [31]:
# importing modules
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score

# Settings for the automated pipeline search
tpot_settings = {
    'generations': 5,
    'population_size': 20,
    'verbosity': 2,
    'scoring': 'roc_auc',
    'random_state': 42,
    'disable_update_check': True,
    'config_dict': 'TPOT light',
}
tpot = TPOTClassifier(**tpot_settings)

# Run the search on the training split; the value returned by fit is left
# as the last expression so it is shown as the cell output
tpot.fit(X_train, y_train)

HBox(children=(HTML(value='Optimization Progress'), FloatProgress(value=0.0, max=120.0), HTML(value='')))


Generation 1 - Current best internal CV score: 0.7422459184429089

Generation 2 - Current best internal CV score: 0.7422459184429089

Generation 3 - Current best internal CV score: 0.7422459184429089

Generation 4 - Current best internal CV score: 0.7422459184429089

Generation 5 - Current best internal CV score: 0.7456308339276876

Best pipeline: MultinomialNB(Normalizer(input_matrix, norm=l2), alpha=0.001, fit_prior=True)


TPOTClassifier(config_dict='TPOT light', disable_update_check=True,
               generations=5, population_size=20, random_state=42,
               scoring='roc_auc', verbosity=2)

In [32]:
# Score the TPOT pipeline on the held-out split, using column index 1 of
# predict_proba as the predicted score for each test row
tpot_test_proba = tpot.predict_proba(X_test)[:, 1]
tpot_auc_score = roc_auc_score(y_test, tpot_test_proba)
print(f'\nAUC score: {tpot_auc_score:.4f}')


AUC score: 0.7637


In [45]:
# List the steps of the best pipeline found, numbered from 1
print('\n Best pipline steps:')
for position, (_, estimator) in enumerate(tpot.fitted_pipeline_.steps, start=1):
    # Each step is a (name, estimator) pair; only the estimator is printed
    print(f'{position}. {estimator}')


 Best pipline steps:
1. Normalizer()
2. MultinomialNB(alpha=0.001)


### Task-8
#### Checking the variance

In [38]:
# Variance of each training feature column, rounded to three decimals
X_train.var().round(3)

Recency (months)              66.929
Frequency (times)             33.830
Monetary (c.c. blood)    2114363.700
Time (months)                611.147
dtype: float64

In [40]:
# Free-text observation about the variances computed in the previous cell.
# As the cell's only expression, the string is echoed back as the cell output.
''' We see that the feature 'Monetary' has high variance and need to normalize the features for better model performance'''

" We see that the feature 'Monetary' has high variance and need to normalize the features for better model performance"

### Task-9
#### Correcting for high variance with log normalization

In [43]:
# Import numpy
import numpy as np

# Column whose values are replaced by their natural logarithm
col_to_normalize = 'Monetary (c.c. blood)'


def _with_log_column(frame):
    """Return a new frame with ``col_to_normalize`` swapped for its log.

    The log-transformed values are appended as a trailing ``monetary_log``
    column and the original column is dropped; ``frame`` itself is not
    modified.
    """
    return (
        frame
        .assign(monetary_log=np.log(frame[col_to_normalize]))
        .drop(columns=col_to_normalize)
    )


# Build transformed copies of both splits, leaving X_train / X_test untouched
X_train_normed = _with_log_column(X_train)
X_test_normed = _with_log_column(X_test)

# Variance of each training column after the transformation
X_train_normed.var().round(3)

Recency (months)      66.929
Frequency (times)     33.830
Time (months)        611.147
monetary_log           0.837
dtype: float64

### Task-10
#### Train a logistic regression model

In [48]:
# importing logistic regression module
from sklearn.linear_model import LogisticRegression

# Logistic regression using the liblinear solver and a fixed seed
logreg = LogisticRegression(random_state=42, solver='liblinear')

# Fit on the log-transformed training features; the value returned by fit
# is left as the last expression so it is shown as the cell output
logreg.fit(X=X_train_normed, y=y_train)

LogisticRegression(random_state=42, solver='liblinear')

In [49]:
# Score the logistic regression model on the log-transformed test split,
# using column index 1 of predict_proba as the predicted score for each row
logreg_test_proba = logreg.predict_proba(X_test_normed)[:, 1]
logreg_auc_score = roc_auc_score(y_test, logreg_test_proba)
print(f'\nAUC score: {logreg_auc_score:.4f}')


AUC score: 0.7891


### Task-11
#### Sorting models by AUC score

In [50]:
# Importing itemgetter module
from operator import itemgetter

# Pair each model label with its test AUC, then rank the pairs by the score
# (element 1 of each pair), highest first
model_scores = [('tpot', tpot_auc_score), ('logreg', logreg_auc_score)]
sorted(model_scores, key=itemgetter(1), reverse=True)

[('logreg', 0.7890972663699937), ('tpot', 0.7637476160203432)]