# Wrap Up Quiz

Quiz based on the dataset `blood_transfusion.csv`

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the blood transfusion dataset from the local data folder and preview
# the first rows to check the columns that were loaded.
csv_path = 'data/blood_transfusion.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Recency,Frequency,Monetary,Time,Class
0,2,50,12500,98,donated
1,0,13,3250,28,donated
2,1,16,4000,35,donated
3,2,20,5000,45,donated
4,1,24,6000,77,not donated


In [12]:
# Split the frame into the label ("Class") and the feature matrix.
target = df['Class']
# NOTE(review): the df.head() preview shows an "Unnamed: 0" column holding
# 0, 1, 2, ... which looks like a row index saved into the CSV — confirm.
# If it is really a column of df, it is a row counter rather than a donor
# measurement and must not be used as a feature (after scaling it would weigh
# as much as any real feature in the k-NN distance). errors='ignore' makes the
# second drop a no-op when the column is absent.
data = df.drop(columns='Class').drop(columns='Unnamed: 0', errors='ignore')

In [11]:
# Q1: number of samples in each target class
class_counts = target.value_counts()
class_counts

not donated    570
donated        178
Name: Class, dtype: int64

Q1) We are solving a binary classification problem and, as the counts show, the classes are imbalanced: the `not donated` class (570 samples) is about 3.2 times larger than the `donated` class (178 samples), i.e. roughly 76% versus 24% of the data.

In [5]:
# Q2)
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier

In [6]:
# Q2: baseline that always predicts the most frequent class, evaluated with
# 10-fold cross-validation using the estimator's default score.
model = DummyClassifier(strategy="most_frequent")
res = cross_val_score(
    model,
    data,
    target,
    cv=10,
)

In [10]:
# Report the mean of the per-fold scores, rounded to three decimals.
mean_score = res.mean()
print(f" Accuracy score: {mean_score:.3f}")

 Accuracy score: 0.762


In [13]:
# Re-evaluate the same majority-class baseline with balanced accuracy instead
# of plain accuracy, again with 10-fold cross-validation.
res = cross_val_score(model, data, target, cv=10,
                      scoring='balanced_accuracy')
# The label names the metric actually computed, so this number is not
# mistaken for the plain accuracy printed by the previous cell.
print(f" Balanced accuracy score: {res.mean():.3f}")

 Accuracy score: 0.500


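k-nearest neighbors makes its predictions from the distances between samples. Features therefore need to be normalized (here with `StandardScaler`) so that each one contributes approximately equally to the distance computation, instead of the feature with the largest numeric range dominating it.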

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [17]:
# Chain feature standardization with a k-nearest-neighbors classifier (both
# with default settings), then list every parameter of the resulting pipeline.
scaler = StandardScaler()
knn = KNeighborsClassifier()
model = make_pipeline(scaler, knn)
model.get_params()

{'memory': None,
 'steps': [('standardscaler', StandardScaler()),
  ('kneighborsclassifier', KNeighborsClassifier())],
 'verbose': False,
 'standardscaler': StandardScaler(),
 'kneighborsclassifier': KNeighborsClassifier(),
 'standardscaler__copy': True,
 'standardscaler__with_mean': True,
 'standardscaler__with_std': True,
 'kneighborsclassifier__algorithm': 'auto',
 'kneighborsclassifier__leaf_size': 30,
 'kneighborsclassifier__metric': 'minkowski',
 'kneighborsclassifier__metric_params': None,
 'kneighborsclassifier__n_jobs': None,
 'kneighborsclassifier__n_neighbors': 5,
 'kneighborsclassifier__p': 2,
 'kneighborsclassifier__weights': 'uniform'}

In [18]:
from sklearn.model_selection import cross_validate

In [19]:
# 10-fold cross-validation of the pipeline on two parallel jobs, also keeping
# the training scores so train and test accuracy can be compared.
results = cross_validate(
    model,
    data,
    target,
    cv=10,
    n_jobs=2,
    return_train_score=True,
)

In [20]:
# Display the raw cross_validate output: per-fold fit/score times and the
# per-fold train and test scores.
results

{'fit_time': array([0.01668286, 0.01418018, 0.02106309, 0.01938605, 0.01530194,
        0.01153898, 0.01627898, 0.01691222, 0.01842213, 0.01834989]),
 'score_time': array([0.01613402, 0.01446986, 0.01264787, 0.01443505, 0.01490402,
        0.01437187, 0.01268482, 0.02036095, 0.00993109, 0.01615596]),
 'test_score': array([0.37333333, 0.4       , 0.56      , 0.70666667, 0.73333333,
        0.76      , 0.68      , 0.78666667, 0.74324324, 0.78378378]),
 'train_score': array([0.84249629, 0.81575037, 0.80534918, 0.82763744, 0.82020802,
        0.82763744, 0.82169391, 0.8127786 , 0.82195846, 0.81750742])}

In [22]:
# Summarize the per-fold scores of each split as "mean +/- standard deviation".
for split in ("train", "test"):
    fold_scores = results[f"{split}_score"]
    print(f"Accuracy {split} score: {fold_scores.mean():.3f} +/- {fold_scores.std():.3f}")


Accuracy train score: 0.821 +/- 0.010
Accuracy test score: 0.653 +/- 0.147


The training accuracy has a much higher mean and a much smaller standard deviation than the testing accuracy. This gap means that the model overfits: it already captures some noise from the training data. However, the training score itself is not that high, so the model's predictions are not especially good. Note also that the mean test accuracy (0.653) is lower than the accuracy of the majority-class dummy baseline (0.762).

We will now study the effect of the parameter `n_neighbors` on the train and test score using a validation curve. We will use the following parameter range:

In [23]:
# Candidate values of `n_neighbors` for the validation curve.
param_range = [
    1, 2, 5,
    10, 20, 50,
    100, 200, 500,
]