<a href="https://colab.research.google.com/github/rakshithasetty41/machine-learning/blob/main/K_fold_cross_validation_lab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np


In [4]:

# Read the comma-separated diabetes dataset into a DataFrame.
# The location is kept in one named constant so it is easy to repoint.
DATA_PATH = "/content/diab_scores.txt"
df = pd.read_csv(DATA_PATH)

# Print column names, non-null counts and dtypes as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Age              20 non-null     int64  
 1   BMI              20 non-null     float64
 2   Blood Pressure   20 non-null     int64  
 3   Glucose Level    20 non-null     int64  
 4   Insulin Level    20 non-null     int64  
 5   Diabetic Status  20 non-null     object 
 6   diabetic_score   20 non-null     float64
dtypes: float64(2), int64(4), object(1)
memory usage: 1.2+ KB


In [5]:


# Convert 'Diabetic Status' from 'Yes'/'No' strings to binary labels (1/0).
#
# Series.map() turns every value that is missing from the mapping into NaN, so
# running a bare .map() a second time on the already-converted column would
# replace every label with NaN. The dtype guard below makes this cell safe to
# re-run: the conversion is applied only while the column is still non-numeric.
status_to_label = {'Yes': 1, 'No': 0}
if not pd.api.types.is_numeric_dtype(df['Diabetic Status']):
    status_labels = df['Diabetic Status'].map(status_to_label)
    # Fail loudly on unexpected spellings or missing entries instead of
    # silently handing NaN labels to the model.
    if status_labels.isna().any():
        raise ValueError("'Diabetic Status' contains values other than 'Yes'/'No'")
    df['Diabetic Status'] = status_labels.astype('int64')

# Display the encoded column.
df['Diabetic Status']


Unnamed: 0,Diabetic Status
0,1
1,1
2,0
3,1
4,1
5,0
6,0
7,1
8,0
9,1


In [6]:
# Build the model inputs: a feature matrix X from the five measurement
# columns and a label vector y from the encoded 'Diabetic Status' column.
feature_columns = ['Age', 'BMI', 'Blood Pressure', 'Glucose Level', 'Insulin Level']
X = df.loc[:, feature_columns].values
y = df.loc[:, 'Diabetic Status'].values

# Show the feature matrix dimensions as (rows, columns).
X.shape


(20, 5)

In [7]:
# Show the shape of the target array.
y.shape

(20,)

In [8]:

# Standardize every feature column with scikit-learn's StandardScaler:
#     z = (x - column mean) / column standard deviation
# NOTE: the scaler in this cell is fitted on every row of X. When a model is
# evaluated with cross-validation, fit the scaler on each training fold only,
# otherwise the held-out rows influence the scaling statistics.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Display the standardized matrix.
X_scaled


array([[-0.11632626, -0.62884247, -0.42953131,  0.08787149,  0.04136063],
       [ 0.46530506,  0.4350602 ,  0.8526517 ,  1.02935178,  0.73070439],
       [-0.93061011, -1.31277989, -1.07062281, -1.79508907, -1.33732691],
       [ 1.6285677 ,  1.61295243,  1.4937432 ,  1.34317854,  1.42004816],
       [ 1.04693638,  1.0810011 ,  0.5962151 ,  0.71552502,  1.0064419 ],
       [-1.6285677 , -1.61675208, -1.71171431, -2.10891584, -2.02667068],
       [-0.46530506, -0.17288418, -0.68596791, -0.22595527, -0.64798314],
       [ 0.23265253,  0.13108801,  0.2115602 ,  0.40169825,  0.45496689],
       [-1.27958891, -1.08480075, -1.32705941, -1.16743555, -0.92372065],
       [ 1.39591517,  1.34697676,  1.1090883 ,  1.21764783,  1.14431065],
       [-0.58163132, -0.43885985, -0.30131301, -0.03765921, -0.37224563],
       [ 0.11632626,  0.01709844,  0.3397785 ,  0.3389329 ,  0.31709813],
       [-1.16326264, -0.78082856, -1.19884111, -1.04190485, -1.19945816],
       [ 0.81428385,  0.81502543,  0.8

In [9]:

# Configure k-fold cross-validation.
# shuffle=True shuffles the rows before they are cut into folds; a fixed
# random_state seeds that shuffle so the same folds are produced on every run.
k = 5
kfold_options = dict(n_splits=k, shuffle=True, random_state=42)
kf = KFold(**kfold_options)

# Display the configured splitter.
kf


KFold(n_splits=5, random_state=42, shuffle=True)

In [10]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Per-fold metric values, appended in the order the folds are produced by `kf`.
accuracy_scores = []
precisions = []
recalls = []
f1s = []

# Perform k-fold cross-validation.
# KFold derives its splits from the number of rows only, so splitting the raw
# matrix X yields the same train/test indices as splitting a scaled copy of it.
for train_index, test_index in kf.split(X):
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler on the training rows of this fold only, then apply it to
    # both parts. Scaling the full dataset once before splitting would let the
    # held-out rows influence the mean/std used for training (data leakage)
    # and make the reported scores optimistic.
    fold_scaler = StandardScaler().fit(X[train_index])
    X_train = fold_scaler.transform(X[train_index])
    X_test = fold_scaler.transform(X[test_index])

    # Train a fresh logistic regression for every fold.
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Predict on the held-out fold and evaluate.
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # With very small test folds a fold may contain no predicted (or no actual)
    # positives; zero_division=0 scores that case as 0 without a warning.
    p = precision_score(y_test, y_pred, zero_division=0)
    r = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    accuracy_scores.append(acc)
    precisions.append(p)
    recalls.append(r)
    f1s.append(f1)

# Print the per-fold values followed by their mean for each metric.
print(f"Cross-validation accuracy scores: {accuracy_scores}")
print(f"Mean accuracy: {np.mean(accuracy_scores):.4f}")
print("-"*60)
print(f" precisions : {precisions}")
print(f"Overall precision {np.mean(precisions)}")
print("-"*60)
print(f" recalls : {recalls}")
print(f"Overall recall {np.mean(recalls)}")
print("-"*60)
print(f"f1 scores {f1s}")
print(f"Overall f1 score {np.mean(f1s)}")


Cross-validation accuracy scores: [0.75, 1.0, 1.0, 1.0, 0.75]
Mean accuracy: 0.9000
------------------------------------------------------------
 precisions : [1.0, 1.0, 1.0, 1.0, 0.5]
Overall precision 0.9
------------------------------------------------------------
 recalls : [0.75, 1.0, 1.0, 1.0, 1.0]
Overall recall 0.95
------------------------------------------------------------
f1 scores [0.8571428571428571, 1.0, 1.0, 1.0, 0.6666666666666666]
Overall f1 score 0.9047619047619048


In [11]:
# Materialize the splitter's output to inspect the (train indices, test indices)
# pair generated for each fold.
list(kf.split(X_scaled))

[(array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 16, 18, 19]),
  array([ 0,  1, 15, 17])),
 (array([ 0,  1,  2,  4,  6,  7,  9, 10, 12, 13, 14, 15, 16, 17, 18, 19]),
  array([ 3,  5,  8, 11])),
 (array([ 0,  1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 14, 15, 17, 19]),
  array([ 2, 13, 16, 18])),
 (array([ 0,  1,  2,  3,  5,  6,  7,  8, 10, 11, 13, 14, 15, 16, 17, 18]),
  array([ 4,  9, 12, 19])),
 (array([ 0,  1,  2,  3,  4,  5,  8,  9, 11, 12, 13, 15, 16, 17, 18, 19]),
  array([ 6,  7, 10, 14]))]