# 08 — Cross-Validation and Validation Strategies

A single train-test split can give a misleading estimate of model performance, because the score depends on which rows happen to land in the test set.
Cross-validation evaluates models across multiple data splits for more reliable comparison.

Methods:
- K-Fold Cross Validation
- Stratified K-Fold for classification
- Why CV helps detect overfitting and guards against false confidence in performance


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the Titanic CSV from the public URL and discard the columns that are
# not used as features in this notebook.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url).drop(columns=["Cabin", "Ticket", "Name", "PassengerId"])

# Target vector and feature matrix.
y = df["Survived"]
X = df.drop(columns=["Survived"])

# Hold out 20% of the rows, stratified on the target so both parts keep the
# same class balance; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Split the feature columns by kind. Selecting on the generic "number" kind
# and its complement (instead of the exact dtypes int64 / float64 / object)
# guarantees every column of X_train lands in exactly one list. This matters
# because ColumnTransformer, by default, drops any column that is not routed
# to a transformer: a column stored as int32, float32, category or a
# dedicated string dtype would otherwise disappear from the model inputs
# without raising an error.
numeric_features = X_train.select_dtypes(include="number").columns
categorical_features = X_train.select_dtypes(exclude="number").columns

# Numeric columns: fill missing values with the column median, then
# standardise to zero mean / unit variance.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Non-numeric columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform from failing
# on a category that was not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])


In [8]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC

# Candidate classifiers, keyed by the label shown in the results table.
# NOTE(review): probability=True is not needed for F1 scoring and, per the
# scikit-learn docs, slows down fitting; consider dropping it if no later
# cell calls predict_proba on this estimator.
models = {
    "LogReg": LogisticRegression(max_iter=2000),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "SVC": SVC(probability=True),
}

# Five shuffled, class-stratified folds with a fixed seed, shared by every
# model so the comparison uses identical splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): the folds are drawn from the full X / y, not from
# X_train / y_train, so the held-out test rows take part in this comparison.
# If X_test is used later for a final evaluation, switch to the training
# split here. TODO confirm intent.
results = []
for name, clf in models.items():
    # Preprocessing lives inside the pipeline handed to cross_val_score, so it
    # is part of the estimator that gets evaluated on each fold.
    pipe = Pipeline(steps=[("preprocess", preprocessor), ("model", clf)])
    scores = cross_val_score(pipe, X, y, scoring="f1", cv=cv)
    results.append([name, scores.mean(), scores.std()])

# One row per model, best mean F1 first; the last expression is displayed.
summary = pd.DataFrame(results, columns=["Model", "Mean F1", "Std Dev"])
summary.sort_values("Mean F1", ascending=False)


Unnamed: 0,Model,Mean F1,Std Dev
2,SVC,0.756235,0.035112
1,RandomForest,0.755505,0.017659
0,LogReg,0.724233,0.023676
