<a href="https://colab.research.google.com/github/robin-ochieng/Machine-Learning/blob/main/Cross_validation_By_ROO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

# Fetch the Iris bunch once, then unpack its feature matrix and label vector
iris = load_iris()
X, y = iris.data, iris.target

# Baseline classifier reused by the cross-validation cells below;
# max_iter=1000 caps the number of solver iterations
model = LogisticRegression(max_iter=1000)

In [13]:
# Evaluate the model on five folds using the estimator's default scorer
cv_results = cross_validate(estimator=model, X=X, y=y, cv=5)

# Dump the raw result dict (fit times, score times, per-fold test scores)
print(cv_results)


{'fit_time': array([0.03019261, 0.01997304, 0.01447153, 0.01541185, 0.01439929]), 'score_time': array([0.00088167, 0.00075364, 0.00074482, 0.00075626, 0.00075078]), 'test_score': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])}


In [14]:
# Same dict as printed in the previous cell, shown with the notebook's formatted repr
cv_results

{'fit_time': array([0.03019261, 0.01997304, 0.01447153, 0.01541185, 0.01439929]),
 'score_time': array([0.00088167, 0.00075364, 0.00074482, 0.00075626, 0.00075078]),
 'test_score': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])}

## Customizing the cross-validation function

In [15]:
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score

# Build one weighted-average scorer per metric; the dict keys become the
# 'test_<key>' entries of the cross_validate result
scoring = {
    metric_name: make_scorer(metric_fn, average='weighted')
    for metric_name, metric_fn in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
}

# Run 5-fold cross-validation, scoring every fold with all three metrics
cv_results = cross_validate(estimator=model, X=X, y=y, cv=5, scoring=scoring)

# Dump the raw result dict
print(cv_results)


{'fit_time': array([0.13886809, 0.2440176 , 0.12440538, 0.11226869, 0.02253056]), 'score_time': array([0.04680824, 0.03698468, 0.02481008, 0.01582527, 0.00556302]), 'test_precision': array([0.96969697, 1.        , 0.94444444, 0.96969697, 1.        ]), 'test_recall': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ]), 'test_f1_score': array([0.96658312, 1.        , 0.93265993, 0.96658312, 1.        ])}


In [16]:
# Same multi-metric dict as printed in the previous cell, shown with the notebook's formatted repr
cv_results

{'fit_time': array([0.13886809, 0.2440176 , 0.12440538, 0.11226869, 0.02253056]),
 'score_time': array([0.04680824, 0.03698468, 0.02481008, 0.01582527, 0.00556302]),
 'test_precision': array([0.96969697, 1.        , 0.94444444, 0.96969697, 1.        ]),
 'test_recall': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ]),
 'test_f1_score': array([0.96658312, 1.        , 0.93265993, 0.96658312, 1.        ])}

In [17]:
# Request training-fold scores as well, so the result also carries a
# 'train_score' array next to 'test_score'
cv_results = cross_validate(
    estimator=model,
    X=X,
    y=y,
    cv=5,
    return_train_score=True,
)
print(cv_results)


{'fit_time': array([0.02227259, 0.01907682, 0.01372123, 0.01506138, 0.11353731]), 'score_time': array([0.00084591, 0.000741  , 0.00071073, 0.00075865, 0.00076842]), 'test_score': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ]), 'train_score': array([0.96666667, 0.96666667, 0.98333333, 0.98333333, 0.975     ])}


In [18]:
# Same dict as printed in the previous cell (now including 'train_score'), shown with the formatted repr
cv_results

{'fit_time': array([0.02227259, 0.01907682, 0.01372123, 0.01506138, 0.11353731]),
 'score_time': array([0.00084591, 0.000741  , 0.00071073, 0.00075865, 0.00076842]),
 'test_score': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ]),
 'train_score': array([0.96666667, 0.96666667, 0.98333333, 0.98333333, 0.975     ])}

In [19]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Two alternative classifiers, both seeded with random_state=42
svm = SVC(kernel='linear', C=1, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Evaluate each classifier with the same 5-fold setup (SVM first, then forest)
cv_results_svm, cv_results_rf = (
    cross_validate(estimator, X, y, cv=5) for estimator in (svm, rf)
)

# Report both result dicts, labelled by model
print("SVM:", cv_results_svm)
print("Random Forest:", cv_results_rf)


SVM: {'fit_time': array([0.00186467, 0.00117731, 0.00104427, 0.00108218, 0.0010407 ]), 'score_time': array([0.00071931, 0.00059843, 0.0005796 , 0.00057554, 0.00056434]), 'test_score': array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])}
Random Forest: {'fit_time': array([0.15817285, 0.14139223, 0.13883209, 0.15164566, 0.15378237]), 'score_time': array([0.00943923, 0.00977015, 0.0092957 , 0.00973392, 0.0095427 ]), 'test_score': array([0.96666667, 0.96666667, 0.93333333, 0.96666667, 1.        ])}


In [20]:
# Random-forest result dict on its own, shown with the notebook's formatted repr
cv_results_rf

{'fit_time': array([0.15817285, 0.14139223, 0.13883209, 0.15164566, 0.15378237]),
 'score_time': array([0.00943923, 0.00977015, 0.0092957 , 0.00973392, 0.0095427 ]),
 'test_score': array([0.96666667, 0.96666667, 0.93333333, 0.96666667, 1.        ])}

In [21]:
# SVM result dict on its own, shown with the notebook's formatted repr
cv_results_svm

{'fit_time': array([0.00186467, 0.00117731, 0.00104427, 0.00108218, 0.0010407 ]),
 'score_time': array([0.00071931, 0.00059843, 0.0005796 , 0.00057554, 0.00056434]),
 'test_score': array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])}

## Handling imbalanced data with `cross_validate`

In [22]:
from sklearn.model_selection import StratifiedKFold

# Explicit splitter: 5 stratified folds, shuffled with a fixed seed
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hand the splitter object to cross_validate instead of an integer fold count
cv_results = cross_validate(estimator=model, X=X, y=y, cv=stratified_cv)

# Dump the raw result dict
print(cv_results)


{'fit_time': array([0.02661228, 0.02330875, 0.01565742, 0.01685929, 0.02469563]), 'score_time': array([0.00118136, 0.00078869, 0.00074458, 0.00072408, 0.00076032]), 'test_score': array([1.        , 0.96666667, 0.93333333, 1.        , 0.93333333])}


In [23]:
# Same dict as printed in the previous cell, shown with the notebook's formatted repr
cv_results

{'fit_time': array([0.02661228, 0.02330875, 0.01565742, 0.01685929, 0.02469563]),
 'score_time': array([0.00118136, 0.00078869, 0.00074458, 0.00072408, 0.00076032]),
 'test_score': array([1.        , 0.96666667, 0.93333333, 1.        , 0.93333333])}

## Nested cross-validation for model selection



In [24]:

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Nested cross-validation: the inner folds are used only to choose C, and the
# outer folds score the tuned model on data it was not tuned or fitted on.
# NOTE: this cell rebinds the notebook-level name `model`.

# Define the outer and inner cross-validation strategies
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Candidate values for LogisticRegression's C parameter
C_values = [0.001, 0.01, 0.1, 1, 10, 100]

# Held-out accuracy of the selected model, one entry per outer fold
outer_scores = []

for train_index, val_index in outer_cv.split(X, y):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Start below any attainable accuracy so the first candidate is always
    # accepted and best_C cannot be left as None when every score is 0.
    best_score = -np.inf
    best_C = None

    for C in C_values:
        model = LogisticRegression(C=C, max_iter=1000)
        inner_scores = cross_validate(model, X_train, y_train, cv=inner_cv, scoring='accuracy')['test_score']
        score = np.mean(inner_scores)

        # Strict '>' keeps the earliest candidate in C_values on ties
        if score > best_score:
            best_score = score
            best_C = C

    # Train the model with the best C value on the full outer training set,
    # then score it on the outer validation fold
    model = LogisticRegression(C=best_C, max_iter=1000)
    model.fit(X_train, y_train)
    val_score = accuracy_score(y_val, model.predict(X_val))
    outer_scores.append(val_score)

# Print the average accuracy across the outer folds
print("Average accuracy:", np.mean(outer_scores))


Average accuracy: 0.9733333333333334


In [26]:
# Per-outer-fold validation accuracies appended by the nested-CV loop
outer_scores

[1.0, 1.0, 0.9333333333333333, 0.9666666666666667, 0.9666666666666667]

In [27]:
# NOTE(review): best_score is reassigned inside every outer fold, so this shows only
# the best inner-CV mean accuracy from the last outer fold, not an overall score.
best_score

0.95

In [28]:
# Echo the outer splitter's configuration (its repr)
outer_cv

KFold(n_splits=5, random_state=42, shuffle=True)

In [29]:
# Echo the inner splitter's configuration (its repr)
inner_cv

KFold(n_splits=5, random_state=42, shuffle=True)

In [31]:
# NOTE(review): `rain` looks like a typo for `train` — confirm intent.
# Binds the object returned by outer_cv.split(X, y); nothing iterates it here.
rain = outer_cv.split(X, y)

In [32]:
# Displays only the generator object's repr; the index splits are not materialized here
rain


<generator object _BaseKFold.split at 0x7ff815b00cf0>