# Cross-Validation Purpose and Applications 
### Author: Shachi Kaul

In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,KFold,cross_val_score,cross_val_predict,StratifiedKFold,GridSearchCV
import warnings
warnings.filterwarnings('ignore')
seed=0

  from numpy.core.umath_tests import inner1d


# Milestones
- Load sklearn's breast_cancer dataset
- Ways of Data Partitioning, Training & Evaluation
    - train_test_split 
    - Cross-Validation
- Applications of Cross-Validation   
    - Choose your model
    - Hyper-parameter Tuning

In [2]:
loaded_data = load_breast_cancer()

In [3]:
df = pd.DataFrame(data= loaded_data.data, columns=[loaded_data.feature_names])

In [4]:
#Framing target column in our dataframe
df['target'] = pd.Series(data=loaded_data.target, index=df.index)

In [5]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [6]:
df.shape

(569, 31)

## Ways of Data Partitioning, Training & Evaluation
For building the model, data is partitioned into train/test. There are various ways of doing it, each having its own limitation

### train_test_split 

In [7]:
#Not using random_state by default generates random integer every time when code is run. Thsi results into different results.
xtrain, xtest, ytrain, ytest = train_test_split(df.drop(['target'],axis=1), df['target'],test_size=0.2,random_state=seed)

In [8]:
lg_model = LogisticRegression()
lg_model.fit(xtrain,ytrain)
lg_model.score(xtest,ytest)

0.956140350877193

### Cross-Validation

In [9]:
def return_score(model,X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    return score

In [10]:
X = df.drop(['target'],axis=1)
y = df['target']

#### K-Fold & Stratified K-Fold

##### Manually

In [11]:
model = LogisticRegression()

scores = []
#cv = StratifiedKFold(n_splits=5, random_state=0, shuffle=False)
cv = KFold(n_splits=5, random_state=seed, shuffle=False) 
for train_index, test_index in cv.split(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
    score = return_score(model,X_train, X_test, y_train, y_test)    
    scores.append(score)

print("Accuracy score in each iteration: {}".format(scores))
print("K-Fold Score: {}".format(np.mean(scores)))

Accuracy score in each iteration: [0.9736842105263158, 0.9649122807017544, 0.9736842105263158, 0.9385964912280702, 0.9736842105263158]
K-Fold Score: 0.9649122807017545


#### Scikit-learn library:
    - cross_val_score: To compute score of each test fold. 
    - cross_val_predict: To predict and returns predicted score of each test fold.
Default, integer cv parameter assumes StratifiedK-Fold. As shown below, it is Stratified 5-folds implemented via cross_val_Score but you can also set cv to K-Fold commented.

In [12]:
model = LogisticRegression()

cv_scores_5_folds = cross_val_score(model,X,y,cv=5)
#cv_scores_5_folds = cross_val_score(model,X,y,cv=KFold(n_splits=5))

cv_predicts_5_folds = cross_val_predict(model,X,y,cv=5)

print("Accuracy score in each iteration: {}".format(cv_scores_5_folds))
print("Predicted class for each record: {}".format(cv_predicts_5_folds))
print("K-Fold Score: {}".format(np.mean(cv_scores_5_folds)))
print("Total records: {}, Total predicted values: {}".format(df.shape[0],len(cv_predicts_5_folds)))

Accuracy score in each iteration: [0.93043478 0.93913043 0.97345133 0.94690265 0.96460177]
Predicted class for each record: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 1 1 1 0 0 1 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1
 1 0 1 0 0 1 1 1 0 0 1 0 1 0 1 1 1 1 0 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 1 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 1 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 0 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1
 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 

## Applications of Cross-Validation

### Choose your model
    Aim: To compare classifier using cross_val_score. 
    Conclusion: Not much difference in scores but RandomForest performed best among three

In [13]:
rf_cv_scores_5_folds = cross_val_score(RandomForestClassifier(random_state=seed),X,y,cv=5)
print("K-Fold Score of RandomForest: {}".format(np.mean(rf_cv_scores_5_folds)))

dt_cv_scores_5_folds = cross_val_score(DecisionTreeClassifier(random_state=seed),X,y,cv=5)
print("K-Fold Score of Decision Tree: {}".format(np.mean(dt_cv_scores_5_folds)))

log_cv_scores_5_folds = cross_val_score(LogisticRegression(random_state=seed),X,y,cv=5)
print("K-Fold Score of Logistic Regression: {}".format(np.mean(log_cv_scores_5_folds)))

K-Fold Score of RandomForest: 0.9561831473643709
K-Fold Score of Decision Tree: 0.9174297806848788
K-Fold Score of Logistic Regression: 0.9509041939207385


### Hyper-parameter Tuning of RandomForest 

- Model which was simply trained on manually estimated parameters may perform better if its parameters get more tuned as per data. This tuning is necessary to improvise your model which is about testing cartesian product of parameters using Grid Search or Random Search. 

- <b>GridSearchCV: </b>
Scikit-learn library to define ranges of parameters, perform CrossValidation with each combination of cartesian product of hyperparameters.

In [14]:
rf_model = RandomForestClassifier(random_state=seed)

In [15]:
n_estimators = list(range(5,20))
maxdepth = list(range(2,10)) 

hyperparas = dict(n_estimators=n_estimators,max_depth=maxdepth)
grid_rf_model = GridSearchCV(estimator=rf_model, param_grid=hyperparas, cv=5)
improved_model = grid_rf_model.fit(X,y)

In [16]:
print("Best score:",improved_model.best_score_)
print("Best parameters for best score:",improved_model.best_params_)

Best score: 0.9648506151142355
Best parameters for best score: {'max_depth': 6, 'n_estimators': 19}


#### Testing best parameters returned by GridSearchCV
    Conclusion: Score is improved in an accuracy of approx 1%

In [17]:
rf_model = RandomForestClassifier(max_depth=6, n_estimators= 19, random_state=seed)
score = cross_val_score(rf_model,X,y,cv=5)
print("CV Score:",np.mean(score))

CV Score: 0.9649403616775682


# END