# Cross-Validation Explained with Student Exam Analogy | Machine Learning Made Easy

**1. Train-Test Split (Hold-Out Validation)**

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
import pandas as pd

# Build a synthetic classification dataset: 100 samples with 10 features each
X, y = make_classification(n_samples=100, n_features=10, random_state=42)

# Hold out 20% of the samples for testing; the other 80% are used for training
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Create the Random Forest and fit it on the training portion in one expression
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out samples and compare them with the true labels
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test data: {accuracy * 100:.2f}%")


Accuracy on test data: 100.00%


**2. K-Fold Cross Validation**

1. Split data into k equal parts (folds)

2. Train on k-1 folds, test on 1 fold

3. Repeat k times, average results

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Load the iris data as a feature matrix X and a label vector y
X, y = load_iris(return_X_y=True)

# Set up 5-fold cross-validation; shuffling with a fixed seed keeps the
# fold assignment reproducible between runs
kf = KFold(n_splits=5, shuffle=True, random_state=42)

models = []
scores = []

# Loop through each fold: train on k-1 folds, test on the remaining one
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train a fresh model so nothing carries over from the previous fold
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Predict and evaluate on the held-out fold
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    models.append(model)
    scores.append(acc)

# Average the fold scores: the mean (with its spread) is the actual
# cross-validation estimate of how well the model generalizes
mean_score = np.mean(scores)
std_score = np.std(scores)
print("Accuracy per fold: " + ", ".join(f"{s:.4f}" for s in scores))
print(f"Mean CV accuracy: {mean_score:.4f} (+/- {std_score:.4f})")

# Find the best score and corresponding model.
# NOTE: the highest single-fold score is an optimistic number because the
# fold was picked *after* seeing its test result; report the mean above as
# the performance estimate and treat this only as an illustration.
best_index = np.argmax(scores)
best_model = models[best_index]

print(f"Best model is from Fold {best_index + 1} with accuracy: {scores[best_index]:.4f}")



Best model is from Fold 1 with accuracy: 1.0000


**3. Stratified K-Fold**

1. Same as K-Fold, but each fold preserves approximately the same class distribution as the full dataset.

2. Best for classification problems with class imbalance.

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Load the iris data as a feature matrix X and a label vector y
X, y = load_iris(return_X_y=True)

# Set up stratified 5-fold cross-validation; shuffling with a fixed seed
# keeps the fold assignment reproducible between runs
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

models = []
scores = []

# Loop through each fold; the labels y are passed to split() because the
# stratification is based on them
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train a fresh model so nothing carries over from the previous fold
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Predict and evaluate on the held-out fold
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    models.append(model)
    scores.append(acc)

# Average the fold scores: the mean (with its spread) is the actual
# cross-validation estimate of how well the model generalizes
mean_score = np.mean(scores)
std_score = np.std(scores)
print("Accuracy per fold: " + ", ".join(f"{s:.4f}" for s in scores))
print(f"Mean CV accuracy: {mean_score:.4f} (+/- {std_score:.4f})")

# Find the best score and corresponding model.
# NOTE: the highest single-fold score is an optimistic number because the
# fold was picked *after* seeing its test result; report the mean above as
# the performance estimate and treat this only as an illustration.
best_index = np.argmax(scores)
best_model = models[best_index]

print(f"Best model is from Fold {best_index + 1} with accuracy: {scores[best_index]:.4f}")


Best model is from Fold 1 with accuracy: 0.9667


**4. Leave-One-Out CV (LOOCV)**

1. Extreme case of K-Fold where k = number of data points

2. Train on all data except 1 sample; repeat for every sample

3. Not practical for large datasets (very slow), but very thorough

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Leave-One-Out splitter: every sample is used as the test set once
loo = LeaveOneOut()

# Classifier to be evaluated
model = RandomForestClassifier(random_state=42)

# Small synthetic two-class dataset; kept to 30 samples because LOOCV
# fits one model per sample and becomes slow on larger data
X, y = make_classification(
    n_samples=30,
    n_features=5,
    n_informative=3,
    n_redundant=0,
    n_classes=2,
    random_state=42,
)

# Run LOOCV; each entry of `scores` is the accuracy on one left-out sample
scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=loo)

# Report the number of iterations, the individual results and their mean
print("Number of LOOCV iterations:", scores.size)
print("LOOCV Accuracy for each iteration:", scores)
print("Average LOOCV Accuracy:", scores.mean())


Number of LOOCV iterations: 30
LOOCV Accuracy for each iteration: [0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 1.]
Average LOOCV Accuracy: 0.8


**5. TimeSeriesSplit**

1. Use for time-dependent data (like stock prices)

2. Ensures training data is always earlier than test data



In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Show which row indices land in the train and test part of each split.
# X is the dataset left over from the previous cell.
tscv = TimeSeriesSplit(n_splits=5)
for split in tscv.split(X):
    train_index, test_index = split
    print("Train:", train_index, "Test:", test_index)


Train: [0 1 2 3 4] Test: [5 6 7 8 9]
Train: [0 1 2 3 4 5 6 7 8 9] Test: [10 11 12 13 14]
Train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14] Test: [15 16 17 18 19]
Train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] Test: [20 21 22 23 24]
Train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24] Test: [25 26 27 28 29]


In [None]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create a synthetic time-series dataset (10 data points, 1 feature).
# A seeded generator is used so the features, and therefore the accuracies
# printed below, are identical on every run, in line with the fixed
# random_state used for the model.
rng = np.random.default_rng(42)
X = rng.random((10, 1))
y = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])  # Alternating 0s and 1s

# NOTE: the single feature is random noise that is unrelated to the labels,
# so the accuracies only illustrate the splitting mechanics, not a real signal.

# Initialize TimeSeriesSplit with 5 splits.
# With only 10 samples each test fold holds a single sample, so the
# per-fold accuracy can only be 0.00 or 1.00.
tscv = TimeSeriesSplit(n_splits=5)

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Loop over each split and train/test the model
for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model on the earlier samples only
    model.fit(X_train, y_train)

    # Make predictions for the later, held-out sample(s)
    y_pred = model.predict(X_test)

    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test indices: {test_index}, Accuracy: {accuracy:.2f}")


Test indices: [5], Accuracy: 0.00
Test indices: [6], Accuracy: 0.00
Test indices: [7], Accuracy: 0.00
Test indices: [8], Accuracy: 0.00
Test indices: [9], Accuracy: 1.00
