# 5-fold cross-validation using the KFold method
#### https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database
#### Pregnancies: Number of times pregnant
#### Glucose: Plasma glucose concentration a 2 hours in an oral glucose tolerance test
#### BloodPressure: Diastolic blood pressure (mm Hg)
#### SkinThickness: Triceps skin fold thickness (mm)
#### Insulin: 2-Hour serum insulin (mu U/ml)
#### BMI: Body mass index (weight in kg/(height in m)^2)
#### DiabetesPedigreeFunction: Diabetes pedigree function
#### Age: Age (years)
#### Outcome: Class variable (0 or 1)

In [1]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
# Read the diabetes CSV (path is relative to the notebook's working directory)
diabetes_df = pd.read_csv(filepath_or_buffer="diabetes.csv")

In [3]:
# Separate the target column ('Outcome') from the remaining predictor columns
y = diabetes_df['Outcome']
X = diabetes_df.drop(columns=['Outcome'])

In [4]:
# Preview the first five rows of the loaded frame
diabetes_df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Define the classifiers to compare, keyed by the display name printed in the
# cross-validation report.
classifiers = {
    "SVM": SVC(),
    # max_iter is raised well above the library default to give the solver
    # more iterations to converge.
    "Logistic Regression": LogisticRegression(max_iter=4000),
    "Naive Bayes": GaussianNB(),
    # Random forests are stochastic (bootstrap samples, random feature
    # subsets). Pin the seed so a fresh "Restart & Run All" reproduces the
    # reported scores; 42 matches the seed used for the KFold shuffle.
    "Random Forest": RandomForestClassifier(random_state=42)
}


In [6]:
# Number of folds the KFold splitter divides the data into
n_splits: int = 5

In [7]:
# Perform n_splits-fold cross-validation (5 folds as configured) for each
# classifier and print the fold-averaged precision, recall and f1-score of
# the positive class (label 1) together with the overall accuracy.

# The splitter does not depend on the classifier, so it is built once.
# shuffle=True with a fixed integer random_state makes every kf.split(X) call
# produce the same folds, so all classifiers see identical partitions.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for clf_name, clf in classifiers.items():
    print(f"Classifier: {clf_name}")
    print("-" * 30)

    # Per-fold scores, reset for every classifier
    precision_list, recall_list, f1score_list, accuracy_list = [], [], [], []

    for train_index, test_index in kf.split(X):
        # kf.split yields positional indices, hence .iloc
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Refit on this fold's training rows, then predict the held-out rows
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # With output_dict=True the report is a nested dict; the entry for
        # the positive class is looked up under the string key "1".
        report = classification_report(y_test, y_pred, output_dict=True)
        positive_scores = report["1"]
        precision_list.append(positive_scores["precision"])
        recall_list.append(positive_scores["recall"])
        f1score_list.append(positive_scores["f1-score"])
        accuracy_list.append(report["accuracy"])

    # Average each metric over the folds that were actually evaluated
    avg_precision = sum(precision_list) / len(precision_list)
    avg_recall = sum(recall_list) / len(recall_list)
    avg_f1score = sum(f1score_list) / len(f1score_list)
    avg_accuracy = sum(accuracy_list) / len(accuracy_list)

    print(f"Average precision: {avg_precision:.2f}")
    print(f"Average recall: {avg_recall:.2f}")
    print(f"Average f1-score: {avg_f1score:.2f}")
    print(f"Average accuracy: {avg_accuracy:.2f}")
    print("-" * 30)

Classifier: SVM
------------------------------
Average precision: 0.72
Average recall: 0.45
Average f1-score: 0.55
Average accuracy: 0.75
------------------------------
Classifier: Logistic Regression
------------------------------
Average precision: 0.71
Average recall: 0.58
Average f1-score: 0.64
Average accuracy: 0.77
------------------------------
Classifier: Naive Bayes
------------------------------
Average precision: 0.66
Average recall: 0.60
Average f1-score: 0.63
Average accuracy: 0.75
------------------------------
Classifier: Random Forest
------------------------------
Average precision: 0.70
Average recall: 0.58
Average f1-score: 0.63
Average accuracy: 0.77
------------------------------
