# K-Fold Cross-Validation

![alt text](https://github.com/sara-kassani/Advanced-Predictive-Techniques-with-Scikit-Learn-and-TensorFlow/blob/master/data/k-fold.jpg?raw=true "K-fold")

![alt text](https://github.com/sara-kassani/Advanced-Predictive-Techniques-with-Scikit-Learn-and-TensorFlow/blob/master/data/k-fold2.jpg?raw=true "K-fold")

![alt text](https://github.com/sara-kassani/Advanced-Predictive-Techniques-with-Scikit-Learn-and-TensorFlow/blob/master/data/k-fold3.jpg?raw=true "K-fold")

![alt text](https://github.com/sara-kassani/Advanced-Predictive-Techniques-with-Scikit-Learn-and-TensorFlow/blob/master/data/k-fold4.jpg?raw=true "K-fold")

### Importing libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

### Data preparation

In [None]:
# Load the diamonds data and one-hot encode its categorical columns.
data_path = 'data/diamonds.csv'
categorical_columns = ['cut', 'color', 'clarity']
# With `columns=`, get_dummies replaces each listed column by indicator
# columns prefixed with the column's own name; drop_first=True leaves out
# the first level of every category.
diamonds = pd.get_dummies(
    pd.read_csv(data_path),
    columns=categorical_columns,
    drop_first=True,
)

## Diamonds dataset

### Preparing objects for modelling

In [None]:
from sklearn.preprocessing import RobustScaler
# Build the target vector y and the robustly scaled feature matrix X.
target_name = 'price'
y = diamonds[target_name]
robust_scaler = RobustScaler()
# No train/test split is made here, so the scaler is fitted on every row;
# the data is divided into folds later, by cross-validation.
X = robust_scaler.fit_transform(diamonds.drop(target_name, axis=1))

### Training our model

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor used for the diamonds cross-validation.
RF = RandomForestRegressor(
    n_estimators=50,
    max_depth=16,
    random_state=123,  # fixed seed so repeated runs build the same forest
    n_jobs=-1,
)

In [None]:
# cross_validate is available from scikit-learn 0.19 onwards. If this import
# fails, upgrade scikit-learn: $ conda upgrade scikit-learn
from sklearn.model_selection import cross_validate

In [None]:
# 10-fold cross-validation of RF on the diamonds data, scored on MSE and R^2.
#
# scikit-learn's scorer for mean squared error is registered as
# 'neg_mean_squared_error'; the bare 'mean_squared_error' name is rejected by
# current releases. Passing a dict maps that scorer to the key
# 'mean_squared_error', so the result keeps the keys
# 'test_mean_squared_error' / 'train_mean_squared_error'. The values are
# negated MSE (higher is better), so multiply by -1 to obtain the MSE.
#
# return_train_score=True is required for the 'train_*' entries to be present
# in the result; train scores are not returned by default in current releases.
scores = cross_validate(estimator=RF, X=X, y=y,
                        scoring={'mean_squared_error': 'neg_mean_squared_error',
                                 'r2': 'r2'},
                        cv=10, n_jobs=-1,
                        return_train_score=True)

In [None]:
# Tabulate the cross-validation results, then multiply the MSE columns by -1
# so that they hold positive mean squared errors.
scores = pd.DataFrame(scores)
for mse_column in ('test_mean_squared_error', 'train_mean_squared_error'):
    scores[mse_column] = -scores[mse_column]
scores

In [None]:
# Average the test metrics over all folds and report them.
mean_test_mse = scores['test_mean_squared_error'].mean()
mean_test_r2 = scores['test_r2'].mean()
print("Mean test MSE:", round(mean_test_mse))
print("Mean test R-squared:", mean_test_r2)

## Credit card default dataset 

### Preparing the data

In [None]:
# Load the credit-card default data and normalise two column names.
default = pd.read_csv('data/credit-card-default.csv').rename(
    columns={'pay_0': 'pay_1', 'default payment next month': 'default'})

# 0/1 indicator columns derived from coded categorical columns, given as
# (new column, source column, code) triples. Codes without an indicator form
# the base level: female, other_education, not_married.
indicator_specs = [
    ('grad_school', 'education', 1),
    ('university', 'education', 2),
    ('high_school', 'education', 3),
    ('male', 'sex', 1),
    ('married', 'marriage', 1),
]
for new_column, source_column, code in indicator_specs:
    default[new_column] = (default[source_column] == code).astype('int')
# The coded source columns are no longer needed once the indicators exist.
default.drop(['education', 'sex', 'marriage'], axis=1, inplace=True)

# A pay_n value above 0 means the customer's payment was delayed in that
# month; reduce each pay_n column to a 0/1 "delayed" flag.
pay_features = ['pay_%d' % month for month in range(1, 7)]
for pay_column in pay_features:
    default[pay_column] = default[pay_column].gt(0).astype(int)

### Preparing objects for modelling

In [None]:
# Build the target vector and the robustly scaled feature matrix for the
# credit data, keeping the feature column names for later reference.
target_name = 'default'
y_credit = default[target_name]
credit_features = default.drop(target_name, axis=1)
feature_names = credit_features.columns
robust_scaler = RobustScaler()
X_credit = robust_scaler.fit_transform(credit_features)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest classifier used for the credit-default cross-validation.
RF_credit = RandomForestClassifier(
    n_estimators=35,
    max_depth=20,
    random_state=55,  # fixed seed so repeated runs build the same forest
    max_features='sqrt',
    n_jobs=-1,
)

In [None]:
# 10-fold cross-validation of RF_credit, scored on accuracy, precision and
# recall.
scores_credit = cross_validate(
    estimator=RF_credit,
    X=X_credit,
    y=y_credit,
    scoring=['accuracy', 'precision', 'recall'],
    cv=10,
    n_jobs=-1,
)

In [None]:
# Tabulate the cross-validation results and display the table.
scores_credit = pd.DataFrame(data=scores_credit)
scores_credit

In [None]:
# Mean of each test metric across the folds.
scores_credit.loc[:, ['test_accuracy', 'test_precision', 'test_recall']].mean()

In [None]:
# Standard deviation of each test metric across the folds.
scores_credit.loc[:, ['test_accuracy', 'test_precision', 'test_recall']].std()