<a href="https://colab.research.google.com/github/rnrios/IntroML/blob/master/cross_validation/cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Location of the car-prices CSV (public gist, pinned to a specific revision).
DATA_URL = 'https://gist.githubusercontent.com/guilhermesilveira/4d1d4a16ccbf6ea4e0a64a38a24ec884/raw/afd05cb0c796d18f3f5a6537053ded308ba94bf7/car-prices.csv'

# Load the whole table into memory; every later cell works on this frame.
data = pd.read_csv(DATA_URL)

# Pre-processing

In [None]:
# Preview the first five rows of the freshly loaded table.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,mileage_per_year,model_year,price,sold
0,0,21801,2000,30941.02,yes
1,1,7843,1998,40557.96,yes
2,2,7109,2006,89627.5,no
3,3,26823,2015,95276.14,no
4,4,7935,2014,117384.68,yes


In [None]:
# Encode the 'sold' target as integers: 'no' -> 0, 'yes' -> 1.
# Caution: Series.map turns any value missing from the dict into NaN, so
# running this cell a second time (on the already encoded 0/1 column)
# would blank the column out.
swap = {'no': 0, 'yes': 1}
data['sold'] = data['sold'].map(swap)

In [None]:
# Check that 'sold' now holds 0/1 instead of 'no'/'yes'.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,mileage_per_year,model_year,price,sold
0,0,21801,2000,30941.02,1
1,1,7843,1998,40557.96,1
2,2,7109,2006,89627.5,0
3,3,26823,2015,95276.14,0
4,4,7935,2014,117384.68,1


In [None]:
# Car age is measured against a fixed reference year instead of the wall
# clock, so re-running the notebook at a later date yields the same
# 'model_age' column. 2021 matches the outputs recorded in this notebook
# (model_year 2000 -> model_age 21).
REFERENCE_YEAR = 2021

data['model_age'] = REFERENCE_YEAR - data['model_year']
data.head()

Unnamed: 0.1,Unnamed: 0,mileage_per_year,model_year,price,sold,model_age
0,0,21801,2000,30941.02,1,21
1,1,7843,1998,40557.96,1,23
2,2,7109,2006,89627.5,0,15
3,3,26823,2015,95276.14,0,6
4,4,7935,2014,117384.68,1,7


In [None]:
# Drop the leftover CSV index column and the raw year, which has been
# replaced by 'model_age'.
columns_to_discard = ['Unnamed: 0', 'model_year']
data = data.drop(columns=columns_to_discard)
data.head()

Unnamed: 0,mileage_per_year,price,sold,model_age
0,21801,30941.02,1,21
1,7843,40557.96,1,23
2,7109,89627.5,0,15
3,26823,95276.14,0,6
4,7935,117384.68,1,7


# Validating model with K-fold

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np

SEED = 13

# Feature matrix and target vector shared by the validation cells below.
X = data[['mileage_per_year', 'price', 'model_age']]
Y = data['sold']

# Single stratified hold-out split: 75% train / 25% test.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=.25, random_state=SEED, stratify=Y
)

# Shallow decision tree used as the baseline classifier.
model = DecisionTreeClassifier(random_state=SEED, max_depth=2)
model.fit(X_train, Y_train)
predictions = model.predict(X_test)

acc = accuracy_score(Y_test, predictions)
print("Accuracy on test dataset: %.2f" % acc)

Accuracy on test dataset: 0.77


In [None]:
from sklearn.model_selection import cross_validate

# No random_state is passed to the estimator below, so the global NumPy
# seed is set to keep the run repeatable.
SEED = 13
np.random.seed(SEED)

# 5-fold cross-validation of the shallow tree on the full dataset.
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, X, Y, cv=5, return_train_score=False)

# Summarise the per-fold test accuracy as mean +/- 2 standard deviations.
fold_scores = results['test_score']
mean = fold_scores.mean()
std = fold_scores.std()
lower, upper = mean - 2 * std, mean + 2 * std
print('Mean accuracy: {:.3f}'.format(mean))
print('Accuracy with 5-fold (IC=.95): [{:.3f} {:.3f}]'.format(lower, upper))

Mean accuracy: 0.758
Accuracy with 5-fold (IC=.95): [0.752 0.764]


# KFold with data shuffling

In [None]:
from sklearn.model_selection import KFold

# Neither the splitter nor the tree receives a random_state, so the global
# NumPy seed is set before they are used.
SEED = 13
np.random.seed(SEED)

# Same 5-fold evaluation, but rows are shuffled before being cut into folds.
cv = KFold(n_splits=5, shuffle=True)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, X, Y, cv=cv, return_train_score=False)

# Mean test accuracy and the mean +/- 2 std band across folds.
fold_scores = results['test_score']
mean = fold_scores.mean()
std = fold_scores.std()
lower, upper = mean - 2 * std, mean + 2 * std
print('Mean accuracy: {:.3f}'.format(mean))
print('Accuracy with 5-fold (IC=.95): [{:.3f} {:.3f}]'.format(lower, upper))

Mean accuracy: 0.758
Accuracy with 5-fold (IC=.95): [0.745 0.770]


In [None]:
from sklearn.model_selection import StratifiedKFold

# Neither the splitter nor the tree receives a random_state, so the global
# NumPy seed is set before they are used.
SEED = 13
np.random.seed(SEED)

# Shuffled 5-fold split built with StratifiedKFold on the target Y.
cv = StratifiedKFold(n_splits=5, shuffle=True)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, X, Y, cv=cv, return_train_score=False)

# Mean test accuracy and the mean +/- 2 std band across folds.
fold_scores = results['test_score']
mean = fold_scores.mean()
std = fold_scores.std()
lower, upper = mean - 2 * std, mean + 2 * std
print('Mean accuracy: {:.3f}'.format(mean))
print('Accuracy with 5-fold (IC=.95): [{:.3f} {:.3f}]'.format(lower, upper))

Mean accuracy: 0.758
Accuracy with 5-fold (IC=.95): [0.747 0.769]


# Generalization ability

In [None]:
# Number of rows (cars) in the dataset.
data.shape[0]

10000

In [None]:
# Build a synthetic car-model identifier to test whether the classifier
# generalizes to models it has never seen: each car gets its age plus an
# integer jitter in [-3, 1] (a draw from 0..4, shifted down by 3).
np.random.seed(SEED)

random_offset = np.random.randint(low=0, high=5, size=len(data))
model_id = data['model_age'] + random_offset - 3

In [None]:
# Store the synthetic identifier on the frame so it can be used as the
# grouping key, then preview the result.
data['model_id'] = model_id
data.head(n=5)

Unnamed: 0,mileage_per_year,price,sold,model_age,model_id
0,21801,30941.02,1,21,20
1,7843,40557.96,1,23,20
2,7109,89627.5,0,15,14
3,26823,95276.14,0,6,3
4,7935,117384.68,1,7,6


In [None]:
from sklearn.model_selection import GroupKFold

# Set the global NumPy seed, since the tree below is created without a
# random_state.
SEED = 13
np.random.seed(SEED)

# Group the folds by model id so that cars sharing a model_id are kept
# together on one side of each train/test split.
cv = GroupKFold(n_splits=5)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(
    model, X, Y,
    cv=cv,
    groups=data.model_id,
    return_train_score=False,
)

# Mean test accuracy and the mean +/- 2 std band across folds.
fold_scores = results['test_score']
mean = fold_scores.mean()
std = fold_scores.std()
lower, upper = mean - 2 * std, mean + 2 * std
print('Mean accuracy: {:.3f}'.format(mean))
print('Accuracy with 5-fold (IC=.95): [{:.3f} {:.3f}]'.format(lower, upper))

Mean accuracy: 0.758
Accuracy with 5-fold (IC=.95): [0.735 0.781]


# Validating with data scaling

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

SEED = 13
np.random.seed(SEED)

# The scaler must be fitted on the training portion only and then applied
# to both train and test:
#   scaler.fit(train) => scaler.transform(train), scaler.transform(test)
# Chaining scaler and estimator in a Pipeline lets cross_validate do this
# for every split instead of scaling the whole dataset up front.
scaler = StandardScaler()
model = SVC()
pipeline = Pipeline([('transform', scaler), ('estimator', model)])

# Same grouped 5-fold scheme as for the decision tree.
cv = GroupKFold(n_splits=5)
results = cross_validate(
    pipeline, X, Y,
    cv=cv,
    groups=data.model_id,
    return_train_score=False,
)

# Mean test accuracy and the mean +/- 2 std band across folds.
fold_scores = results['test_score']
mean = fold_scores.mean()
std = fold_scores.std()
lower, upper = mean - 2 * std, mean + 2 * std
print('Mean accuracy: {:.3f}'.format(mean))
print('Accuracy with 5-fold (IC=.95): [{:.3f} {:.3f}]'.format(lower, upper))

Mean accuracy: 0.766
Accuracy with 5-fold (IC=.95): [0.743 0.790]


# Biased (on purpose) data

In [None]:
# Deliberately order the rows by label (all 0s first, then all 1s) so that
# un-shuffled folds see a heavily skewed class mix.
biased_data = data.sort_values(by="sold")

biased_feature_columns = ["price", "model_age", "mileage_per_year"]
x_biased = biased_data[biased_feature_columns]
y_biased = biased_data["sold"]
biased_data.head()

Unnamed: 0,mileage_per_year,price,sold,model_age,model_id
4999,15418,74023.29,0,15,12
5322,14351,84843.49,0,16,13
5319,22519,83100.27,0,22,22
5316,20039,87932.13,0,19,18
5315,17656,77937.01,0,18,18


In [None]:
SEED = 13
np.random.seed(SEED)

# Plain KFold without shuffling on the label-sorted data: the folds are cut
# in row order, so they inherit the ordering by 'sold'.
cv = KFold(n_splits=5)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x_biased, y_biased, cv=cv, return_train_score=False)

# Mean test accuracy and the mean +/- 2 std band across folds.
fold_scores = results['test_score']
mean = fold_scores.mean()
std = fold_scores.std()
lower, upper = mean - 2 * std, mean + 2 * std
print('Mean accuracy: {:.3f}'.format(mean))
print('Accuracy with 5-fold (IC=.95): [{:.3f} {:.3f}]'.format(lower, upper))

Mean accuracy: 0.597
Accuracy with 5-fold (IC=.95): [0.322 0.872]


In [None]:
SEED = 13
np.random.seed(SEED)

# Shuffling before splitting breaks up the label-sorted row order.
cv = KFold(n_splits=5, shuffle=True)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x_biased, y_biased, cv=cv, return_train_score=False)

# Mean test accuracy and the mean +/- 2 std band across folds.
fold_scores = results['test_score']
mean = fold_scores.mean()
std = fold_scores.std()
lower, upper = mean - 2 * std, mean + 2 * std
print('Mean accuracy: {:.3f}'.format(mean))
print('Accuracy with 5-fold (IC=.95): [{:.3f} {:.3f}]'.format(lower, upper))

Mean accuracy: 0.758
Accuracy with 5-fold (IC=.95): [0.747 0.769]


In [None]:
SEED = 13
np.random.seed(SEED)

# Grouped 5-fold on the label-sorted data.
# The group labels must come from `biased_data`, i.e. in the same row order
# as x_biased / y_biased: scikit-learn matches `groups` to the rows by
# position, not by pandas index, so passing the unsorted `data.model_id`
# would attach each car to some other car's model id.
cv = GroupKFold(n_splits=5)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(
    model, x_biased, y_biased,
    groups=biased_data.model_id,
    cv=cv,
    return_train_score=False,
)

# Mean test accuracy and the mean +/- 2 std band across folds.
mean = results['test_score'].mean()
std = results['test_score'].std()
print('Mean accuracy: {:.3f}'.format(mean))
print('Accuracy with 5-fold (IC=.95): [{:.3f} {:.3f}]'.format(mean-2*std, mean+2*std))

Mean accuracy: 0.758
Accuracy with 5-fold (IC=.95): [0.731 0.785]
