<a href="https://colab.research.google.com/github/rakshitarajan/Basic-ML/blob/main/KFold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

# Load the Iris dataset once. The returned Bunch exposes the feature matrix
# (.data), the integer-encoded labels (.target) and the class names
# (.target_names), so there is no need to call load_iris() a second time.
iris = load_iris()
X, y = iris.data, iris.target

# X contains the features, y contains the target variable (species)
print(f"Shape of features (X): {X.shape}")
print(f"Shape of target (y): {y.shape}")

# Optionally, view X and y together as a Pandas DataFrame.
# Short snake_case names are used for the four feature columns of X.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
iris_df = pd.DataFrame(X, columns=feature_columns)
iris_df['target'] = y

print("\nFirst 5 rows of the Iris dataset:")
print(iris_df.head())

print("\nTarget names (species):")
print(iris.target_names)  # To see what 0, 1, 2 in 'y' correspond to

Shape of features (X): (150, 4)
Shape of target (y): (150,)

First 5 rows of the Iris dataset:
   sepal_length  sepal_width  petal_length  petal_width  target
0           5.1          3.5           1.4          0.2       0
1           4.9          3.0           1.4          0.2       0
2           4.7          3.2           1.3          0.2       0
3           4.6          3.1           1.5          0.2       0
4           5.0          3.6           1.4          0.2       0

Target names (species):
['setosa' 'versicolor' 'virginica']


In [3]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler # Good practice for Logistic Regression

# Initialize KFold
# n_splits: The number of folds (k) - Iris has 150 samples, 5 folds means 30 samples per fold
# shuffle: Whether to shuffle the data before splitting
# random_state: Seed for shuffling for reproducibility
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold results: one accuracy and one fitted model per fold.
# NOTE: each stored model was trained on data scaled by that fold's scaler,
# and `scaler` is overwritten on every iteration. To reuse an entry of
# `models` on new data, re-fit a StandardScaler on the matching training split.
accuracy_scores = []
models = []

# Loop through each fold
print("Starting K-Fold Cross-Validation on Iris Dataset:")
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"\n--- Fold {fold+1} ---")

    # Split data into training and testing sets for the current fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    print(f"Train set size: {len(X_train)} samples")
    print(f"Test set size: {len(X_test)} samples")

    # --- Feature Scaling (Good practice for Logistic Regression) ---
    # The scaler is fitted ONLY on the training split so that no statistics
    # from the held-out fold leak into training; the same fitted transform is
    # then applied to the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize the Logistic Regression model.
    # With solver='lbfgs' and more than two classes, scikit-learn fits a
    # multinomial (softmax) model by default, so `multi_class` is not passed:
    # that parameter is deprecated since scikit-learn 1.5 (FutureWarning) and
    # is scheduled for removal.
    # max_iter is raised above the default of 100 to leave headroom for
    # convergence. random_state is kept for parity with the original call;
    # per the scikit-learn docs it only affects the 'sag', 'saga' and
    # 'liblinear' solvers.
    model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=42)

    # Train the model on the (scaled) training data
    model.fit(X_train_scaled, y_train)
    print("Model trained successfully.")

    # Make predictions on the (scaled) test data
    y_pred = model.predict(X_test_scaled)

    # Calculate accuracy for the current fold
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    models.append(model)

    print(f"Accuracy for Fold {fold+1}: {accuracy:.4f}")

# Summarise across folds. np.std uses its default ddof=0 (population std).
print("\n--- K-Fold Cross-Validation Results on Iris ---")
print(f"Individual fold accuracies: {np.round(accuracy_scores, 4)}")
print(f"Mean accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Standard deviation of accuracy: {np.std(accuracy_scores):.4f}")

Starting K-Fold Cross-Validation on Iris Dataset:

--- Fold 1 ---
Train set size: 120 samples
Test set size: 30 samples
Model trained successfully.
Accuracy for Fold 1: 1.0000

--- Fold 2 ---
Train set size: 120 samples
Test set size: 30 samples
Model trained successfully.
Accuracy for Fold 2: 0.9667

--- Fold 3 ---
Train set size: 120 samples
Test set size: 30 samples
Model trained successfully.
Accuracy for Fold 3: 0.9333

--- Fold 4 ---
Train set size: 120 samples
Test set size: 30 samples
Model trained successfully.
Accuracy for Fold 4: 0.9000

--- Fold 5 ---
Train set size: 120 samples
Test set size: 30 samples
Model trained successfully.
Accuracy for Fold 5: 0.9667

--- K-Fold Cross-Validation Results on Iris ---
Individual fold accuracies: [1.     0.9667 0.9333 0.9    0.9667]
Mean accuracy: 0.9533
Standard deviation of accuracy: 0.0340




In [None]:
# Mount Google Drive when running inside Google Colab.
# The google.colab package only exists in the Colab runtime, so the import is
# guarded to keep "Restart & Run All" working in a local Jupyter kernel.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')