In [64]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, LeaveOneOut,\
RepeatedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the income data. The file writes unknown values as ' ?' (with a leading
# space), so that token is mapped to NaN while reading. Column names keep their
# leading space as well (e.g. ' income').
df = pd.read_csv(
    'data/income_evaluation.csv',
    na_values=[' ?'],
)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
df.isna().sum()

age                   0
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week       0
 native-country     583
 income               0
dtype: int64

In [4]:
# Replace every NaN with the literal category 'missing' (rebinding instead of
# mutating in place; the resulting frame is the same).
df = df.fillna('missing')

In [5]:
df.isna().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [6]:
# Split the frame into the target (' income', leading space included) and the
# remaining predictor columns.
target_col = ' income'
y = df[target_col]
X = df.drop(columns=[target_col])

In [7]:
X.shape

(32561, 14)

In [8]:
y.value_counts()

 <=50K    24720
 >50K      7841
Name:  income, dtype: int64

# KFold

In [10]:
# Plain 5-fold splitter. Shuffling is left at its default (off), so each fold is
# a contiguous run of rows in file order.
kf = KFold(n_splits=5)

In [13]:
# Expected number of rows per fold: 32561 rows split five ways.
32561/5

6512.2

In [14]:
# Approximate training-set size when one fold (~6512 rows) is held out.
6512*4

26048

In [15]:
# For every fold, show the row positions assigned to the training side and the
# test side, together with how many rows each side holds.
for i, (train_set, test_set) in enumerate(kf.split(X=X), start=1):
    print("iteration ", i)
    for fold_rows in (train_set, test_set):
        print(fold_rows, " having :" , len(fold_rows))
    print("-------------------------")

iteration  1
[ 6513  6514  6515 ... 32558 32559 32560]  having : 26048
[   0    1    2 ... 6510 6511 6512]  having : 6513
-------------------------
iteration  2
[    0     1     2 ... 32558 32559 32560]  having : 26049
[ 6513  6514  6515 ... 13022 13023 13024]  having : 6512
-------------------------
iteration  3
[    0     1     2 ... 32558 32559 32560]  having : 26049
[13025 13026 13027 ... 19534 19535 19536]  having : 6512
-------------------------
iteration  4
[    0     1     2 ... 32558 32559 32560]  having : 26049
[19537 19538 19539 ... 26046 26047 26048]  having : 6512
-------------------------
iteration  5
[    0     1     2 ... 26046 26047 26048]  having : 26049
[26049 26050 26051 ... 32558 32559 32560]  having : 6512
-------------------------


In [19]:
# Names of the numeric predictor columns.
num_cols = X.select_dtypes(include='number').columns
num_cols

Index(['age', ' fnlwgt', ' education-num', ' capital-gain', ' capital-loss',
       ' hours-per-week'],
      dtype='object')

In [20]:
# Names of the non-numeric (categorical) predictor columns.
cat_cols = X.select_dtypes(exclude='number').columns
cat_cols

Index([' workclass', ' education', ' marital-status', ' occupation',
       ' relationship', ' race', ' sex', ' native-country'],
      dtype='object')

In [28]:
# Preprocessing: robust-scale the numeric columns and one-hot encode the
# categorical ones (dense output; categories unseen during fit are ignored at
# transform time).
# NOTE(review): `sparse` was renamed `sparse_output` in newer scikit-learn
# releases - revisit this argument when upgrading.
numeric_step = ('rob', RobustScaler(), num_cols)
categorical_step = (
    'ohe',
    OneHotEncoder(handle_unknown='ignore', sparse=False),
    cat_cols,
)
ct = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [29]:
# Full model: the column preprocessing followed by a small, seeded random forest.
rf_clf = RandomForestClassifier(n_estimators=10, random_state=0)
pipe = Pipeline(steps=[('ct_step', ct), ('model', rf_clf)])

In [25]:
# Peek at the rows from label 6513 onward - the first training index of fold 1
# in the KFold printout above.
X.loc[6513:].head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
6513,29,Private,280344,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States
6514,45,Private,202496,Bachelors,13,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,37,United-States
6515,61,Self-emp-inc,134768,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States
6516,40,Private,175686,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
6517,24,Private,194748,HS-grad,9,Never-married,Transport-moving,Not-in-family,White,Female,0,0,49,United-States


In [31]:
# Manual 5-fold cross-validation: refit the pipeline on each training split and
# record its accuracy on the matching held-out fold.
# KFold.split yields *positional* row numbers, so rows are selected with .iloc.
# Label-based lookups (X.loc[...], y[...]) only happen to work while the index
# is the default 0..n-1 range and would pick wrong rows (or fail) otherwise.
scores = []
for i, (train_set, test_set) in enumerate(kf.split(X), start=1):
    pipe.fit(X.iloc[train_set], y.iloc[train_set])
    sco = pipe.score(X.iloc[test_set], y.iloc[test_set])
    scores.append(sco)
    print("iteration ", i)

iteration  1
iteration  2
iteration  3
iteration  4
iteration  5


In [33]:
np.array(scores)

array([0.84784278, 0.84520885, 0.84613022, 0.84858722, 0.85165848])

In [34]:
# Average accuracy over the five folds.
np.mean(scores)

0.8478855085142512

In [35]:
# Spread (population standard deviation) of the fold accuracies.
np.std(scores)

0.0022349531977626388

# Stratified KFold

In [36]:
y.value_counts()

 <=50K    24720
 >50K      7841
Name:  income, dtype: int64

In [37]:
# '>50K' rows each fold should hold if the class ratio is preserved.
7841/5

1568.2

In [38]:
# '<=50K' rows each fold should hold if the class ratio is preserved.
24720/5

4944.0

In [46]:
# '<=50K' rows expected in each training split (four of the five folds).
4944*4

19776

In [40]:
# 5-fold splitter that keeps the class proportions of y roughly equal in every
# fold; it therefore needs y when splitting.
skf = StratifiedKFold(n_splits=5)

In [42]:
# Manual stratified 5-fold cross-validation: refit the pipeline on each training
# split and record its accuracy on the matching held-out fold.
# StratifiedKFold.split yields *positional* row numbers, so rows are selected
# with .iloc. Label-based lookups (X.loc[...], y[...]) only happen to work while
# the index is the default 0..n-1 range.
scores_skf = []
for i, (train_set, test_set) in enumerate(skf.split(X, y), start=1):
    pipe.fit(X.iloc[train_set], y.iloc[train_set])
    sco = pipe.score(X.iloc[test_set], y.iloc[test_set])
    scores_skf.append(sco)
    print("iteration ", i)

iteration  1
iteration  2
iteration  3
iteration  4
iteration  5


In [43]:
scores_skf

[0.8473821587594043,
 0.8432125307125307,
 0.8421375921375921,
 0.8425982800982801,
 0.8536547911547911]

In [45]:
# For every stratified fold, show the row positions on each side of the split
# and the class counts of the target on each side.
# The split yields positional row numbers, so the targets are looked up with
# .iloc rather than label-based y[...], which is only correct on a default
# 0..n-1 index.
for i, (train_set, test_set) in enumerate(skf.split(X=X, y=y), start=1):
    print("iteration ", i)
    print(train_set, " having :" , len(train_set))
    print(test_set, " having :" , len(test_set))
    print()
    print("y train counts: \n", y.iloc[train_set].value_counts())
    print("y test counts: \n", y.iloc[test_set].value_counts())
    print("-------------------------")

iteration  1
[ 6499  6500  6512 ... 32558 32559 32560]  having : 26048
[   0    1    2 ... 6514 6515 6516]  having : 6513

y train counts: 
  <=50K    19776
 >50K      6272
Name:  income, dtype: int64
y test counts: 
  <=50K    4944
 >50K     1569
Name:  income, dtype: int64
-------------------------
iteration  2
[    0     1     2 ... 32558 32559 32560]  having : 26049
[ 6499  6500  6512 ... 13121 13123 13125]  having : 6512

y train counts: 
  <=50K    19776
 >50K      6273
Name:  income, dtype: int64
y test counts: 
  <=50K    4944
 >50K     1568
Name:  income, dtype: int64
-------------------------
iteration  3
[    0     1     2 ... 32558 32559 32560]  having : 26049
[12997 12999 13000 ... 19727 19729 19733]  having : 6512

y train counts: 
  <=50K    19776
 >50K      6273
Name:  income, dtype: int64
y test counts: 
  <=50K    4944
 >50K     1568
Name:  income, dtype: int64
-------------------------
iteration  4
[    0     1     2 ... 32558 32559 32560]  having : 26049
[19482 1948

In [48]:
# An integer cv with a classifier makes cross_val_score use StratifiedKFold (not
# plain KFold), which is why these scores equal scores_skf above despite the name.
result_kf = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy', cv=5)

In [49]:
result_kf

array([0.84738216, 0.84321253, 0.84213759, 0.84259828, 0.85365479])

In [50]:
# Time a 10-fold (unshuffled) KFold cross-validation of the pipeline.
# The bare `result_kf10` expression that used to sit mid-cell displayed nothing
# (only a cell's last expression is shown), so it has been dropped; the scores
# are displayed in the next cell.
start = time.time()
result_kf10 = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy', cv=KFold(n_splits=10))
print("time taken: ", time.time()-start)

time taken:  18.22358274459839


In [51]:
result_kf10

array([0.83880872, 0.85165848, 0.84981572, 0.84367322, 0.85135135,
       0.84613022, 0.84520885, 0.84797297, 0.8544226 , 0.84459459])

# LOO CV

In [55]:
# Leave-one-out fits one model per row, so only the first 100 rows are used
# here to keep the timing demonstration short.
n_demo_rows = 100
start = time.time()
result_loocv = cross_val_score(
    estimator=pipe,
    X=X.iloc[:n_demo_rows],
    y=y.iloc[:n_demo_rows],
    scoring='accuracy',
    cv=LeaveOneOut(),
)
print("time taken: ", time.time()-start)

time taken:  7.897573947906494


Note: the row count used in the next cell should technically be 32561 (not 32531), but the effect on the estimate is negligible.
I've left it unchanged to stay in sync with the video tutorial, which you can watch here:
https://www.youtube.com/watch?v=ZnSJgIULMVY

In [56]:
# How many 100-row chunks the full dataset amounts to.
# NOTE: the dataset has 32561 rows; 32531 is a typo kept to match the tutorial.
32531/100

325.31

In [57]:
# Rough full-data LOO runtime in seconds: ~8 s per 100 rows (timed above) x 325 chunks.
# NOTE(review): this assumes cost grows linearly with rows; on the full data every
# fit would also train on ~32k rows instead of 99, so treat it as a lower bound.
325*8

2600

In [58]:
# The same rough estimate expressed in minutes.
325*8/60

43.333333333333336

In [59]:
result_loocv

array([1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.,
       0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1.])

In [60]:
result_loocv.mean()

0.8

# Repeated KFold

In [62]:
# Time a repeated K-fold run: 5 folds x 5 repeats = 25 fits of the pipeline.
# RepeatedKFold reshuffles the rows on every repeat, so a fixed random_state is
# passed to make the 25 scores reproducible across notebook runs (0 matches the
# seed used for the model and the train/test split elsewhere in this notebook).
# The stray mid-cell `result_rkf` expression displayed nothing and was removed;
# the scores are shown in the next cell.
start = time.time()
result_rkf = cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy',
                              cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0))
print("time taken: ", time.time()-start)

time taken:  42.56800842285156


In [63]:
result_rkf

array([0.85920467, 0.8519656 , 0.84490172, 0.85304054, 0.84351966,
       0.85429142, 0.8507371 , 0.84613022, 0.83983415, 0.84781941,
       0.85229541, 0.8544226 , 0.84413391, 0.84136978, 0.84305897,
       0.84983878, 0.84751229, 0.84474816, 0.84183047, 0.84536241,
       0.85091356, 0.85534398, 0.84720516, 0.84566953, 0.83507371])

#### So far we've run cross-validation on the entire dataset, which isn't wrong.
#### But we can also run it on a training set only, and then verify the results on a separate test set to check that they generalize properly.

In [65]:
# Hold out 20% of the rows as a final test set; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [66]:
# Cross-validate on the training split only (5 unshuffled folds) and time it;
# the held-out test split is not touched here.
cv_train = KFold(n_splits=5)
start = time.time()
result_tts = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=cv_train,
)
print("time taken: ", time.time()-start)

time taken:  6.2574238777160645


In [67]:
result_tts

array([0.85547025, 0.84702495, 0.84184261, 0.84661163, 0.84968324])

In [69]:
# cross_val_score fits clones of the estimator, so fit `pipe` itself on the whole
# training split before scoring it on the held-out test set.
pipe.fit(X_train, y_train)

Pipeline(steps=[('ct_step',
                 ColumnTransformer(transformers=[('rob', RobustScaler(),
                                                  Index(['age', ' fnlwgt', ' education-num', ' capital-gain', ' capital-loss',
       ' hours-per-week'],
      dtype='object')),
                                                 ('ohe',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  Index([' workclass', ' education', ' marital-status', ' occupation',
       ' relationship', ' race', ' sex', ' native-country'],
      dtype='object'))])),
                ('model',
                 RandomForestClassifier(n_estimators=10, random_state=0))])

In [70]:
# Accuracy of the fitted pipeline on the held-out 20% test split.
pipe.score(X_test, y_test)

0.8446184553968985

#### By comparing this score with the 5 CV scores on the training set, we see the results are quite similar, which is a good sign

#### To see the list of available scoring parameters, we can either remember this piece of code:
import sklearn
sorted(sklearn.metrics.SCORERS.keys())
#### or we can use this little hack: set the scoring parameter to an arbitrary invalid string, and scikit-learn's error message tells us where to find the list of available metrics!

In [71]:
# Deliberately pass an invalid scorer name: scikit-learn's error message says
# where the list of valid names can be found. The error is caught and printed so
# the notebook still survives Restart & Run All.
try:
    cross_val_score(estimator=pipe, X=X_train, y=y_train,
                    scoring='kemfi', cv=KFold(n_splits=5))
except ValueError as err:
    print(err)

ValueError: 'kemfi' is not a valid scoring value. Use sorted(sklearn.metrics.SCORERS.keys()) to get valid options.

In [72]:
# List every built-in scorer name accepted by the `scoring` parameter.
# Import the submodule explicitly: a bare `import sklearn` does not by itself
# guarantee that `sklearn.metrics` is loaded.
import sklearn.metrics

# Newer scikit-learn exposes get_scorer_names() and no longer ships the SCORERS
# dict; older releases only have SCORERS. Try the new API first and fall back.
try:
    scorer_names = sklearn.metrics.get_scorer_names()
except AttributeError:
    scorer_names = sorted(sklearn.metrics.SCORERS.keys())
scorer_names

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']