In [11]:
import os

import numpy as np
import pandas as pd

In [118]:
def stratified_data(data, n_folds=10, random_state=None):
    """Split ``data`` into ``n_folds`` stratified folds based on its "class" column.

    The rows are shuffled and grouped by class label. Each class is then dealt
    out as evenly as possible: every fold receives ``len(class) // n_folds``
    rows of that class, and the ``len(class) % n_folds`` leftover rows are
    handed out one each to the first folds.

    Parameters
    ----------
    data : pandas.DataFrame
        Input rows; must contain a column named "class".
    n_folds : int, default 10
        Number of folds to produce.
    random_state : int or None, default None
        Seed for both shuffles. With ``None`` the NumPy global random state is
        used, so results are only repeatable if the caller seeded it.

    Returns
    -------
    list of pandas.DataFrame
        ``n_folds`` frames. Every input row appears in exactly one of them.
        Row labels come from the reset index of the shuffled frame, not from
        the caller's original index.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 1.
    KeyError
        If a non-empty ``data`` has no "class" column.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be a positive integer")

    # Nothing to stratify: return empty folds instead of letting pd.concat
    # fail on an empty list further down.
    if len(data) == 0:
        return [data.copy() for _ in range(n_folds)]

    # A dedicated RandomState makes seeded runs repeatable; without a seed we
    # keep using the global NumPy state exactly as before.
    rng = None if random_state is None else np.random.RandomState(random_state)
    shuffle = np.random.shuffle if rng is None else rng.shuffle

    data = data.sample(frac=1, random_state=rng).reset_index(drop=True)
    classes = data["class"].unique()
    class_folds = {}

    for c in classes:
        class_rows = data[data["class"] == c]
        class_size = len(class_rows)
        fold_size, extra_samples = divmod(class_size, n_folds)

        indices = np.arange(class_size)
        shuffle(indices)

        # The first n_folds * fold_size positions are cut into equal chunks;
        # the remaining tail holds the leftovers, one per leading fold.
        tail_start = n_folds * fold_size
        folds_for_class = []
        for i in range(n_folds):
            fold_indices = indices[i * fold_size:(i + 1) * fold_size]
            if i < extra_samples:
                fold_indices = np.append(fold_indices, indices[tail_start + i])
            folds_for_class.append(class_rows.iloc[fold_indices])
        class_folds[c] = folds_for_class

    # Fold i is the concatenation of every class's i-th share.
    return [
        pd.concat([class_folds[c][i] for c in classes])
        for i in range(n_folds)
    ]

In [114]:
# Load pima.csv and split it into stratified folds.
pima = pd.read_csv("pima.csv")
pima_folds = stratified_data(data=pima)

In [115]:
# Dump all folds into one text file: a "foldN" line, then that fold's rows as
# header-less CSV, then one extra newline after each fold.
with open("pima-fold.csv", "w") as f:
    for fold_number, fold in enumerate(pima_folds, start=1):
        rows_csv = fold.to_csv(index=False, header=False)
        f.write(f"fold{fold_number}\n")
        f.write(f"{rows_csv}\n")

In [122]:
# Export the cross-validation splits: for every fold, write the held-out rows
# (without the class column), their class labels, and the rows of all other
# folds (class column included) to separate header-less CSV text files.
out_dir = 'numerical_cv'
# to_csv does not create missing directories; without this a fresh checkout
# fails on the first write.
os.makedirs(out_dir, exist_ok=True)

for i, held_out in enumerate(pima_folds):
    # Get the training and validation data for this fold
    label = held_out['class']
    val_data = held_out.drop(columns=['class'])
    train_data = pd.concat([fold for j, fold in enumerate(pima_folds) if j != i])

    # Write the train and test data to separate text files
    train_file = f'{out_dir}/train_fold_{i+1}.txt'
    test_file = f'{out_dir}/test_fold_{i+1}.txt'
    label_file = f'{out_dir}/label_fold_{i+1}.txt'
    train_data.to_csv(train_file, index=False, header=False)
    val_data.to_csv(test_file, index=False, header=False)
    label.to_csv(label_file, index=False, header=False)

In [121]:
# Load pima-CFS.csv and split it into stratified folds.
pima_cfs = pd.read_csv("pima-CFS.csv")
pima_cfs_folds = stratified_data(data=pima_cfs)

# Dump all folds into one text file: a "foldN" line, then that fold's rows as
# header-less CSV, then one extra newline after each fold.
with open("pima-cfs-fold.csv", "w") as f:
    for fold_number, fold in enumerate(pima_cfs_folds, start=1):
        rows_csv = fold.to_csv(index=False, header=False)
        f.write(f"fold{fold_number}\n")
        f.write(f"{rows_csv}\n")

In [123]:
# Export the cross-validation splits: for every fold, write the held-out rows
# (without the class column), their class labels, and the rows of all other
# folds (class column included) to separate header-less CSV text files.
out_dir = 'numerical_cv_cfs'
# to_csv does not create missing directories; without this a fresh checkout
# fails on the first write.
os.makedirs(out_dir, exist_ok=True)

for i, held_out in enumerate(pima_cfs_folds):
    # Get the training and validation data for this fold
    label = held_out['class']
    val_data = held_out.drop(columns=['class'])
    train_data = pd.concat([fold for j, fold in enumerate(pima_cfs_folds) if j != i])

    # Write the train and test data to separate text files
    train_file = f'{out_dir}/train_fold_{i+1}.txt'
    test_file = f'{out_dir}/test_fold_{i+1}.txt'
    label_file = f'{out_dir}/label_fold_{i+1}.txt'
    train_data.to_csv(train_file, index=False, header=False)
    val_data.to_csv(test_file, index=False, header=False)
    label.to_csv(label_file, index=False, header=False)

In [124]:
# Load pima-indians-diabetes-discrete.csv and split it into stratified folds.
pima_discrete = pd.read_csv("pima-indians-diabetes-discrete.csv")
pima_discrete_folds = stratified_data(data=pima_discrete)

# Dump all folds into one text file: a "foldN" line, then that fold's rows as
# header-less CSV, then one extra newline after each fold.
with open("pima-discrete-fold.csv", "w") as f:
    for fold_number, fold in enumerate(pima_discrete_folds, start=1):
        rows_csv = fold.to_csv(index=False, header=False)
        f.write(f"fold{fold_number}\n")
        f.write(f"{rows_csv}\n")

In [126]:
# Export the cross-validation splits: for every fold, write the held-out rows
# (without the class column), their class labels, and the rows of all other
# folds (class column included) to separate header-less CSV text files.
out_dir = 'nominal_cv'
# to_csv does not create missing directories; without this a fresh checkout
# fails on the first write.
os.makedirs(out_dir, exist_ok=True)

for i, held_out in enumerate(pima_discrete_folds):
    # Get the training and validation data for this fold
    label = held_out['class']
    val_data = held_out.drop(columns=['class'])
    train_data = pd.concat([fold for j, fold in enumerate(pima_discrete_folds) if j != i])

    # Write the train and test data to separate text files
    train_file = f'{out_dir}/train_fold_{i+1}.txt'
    test_file = f'{out_dir}/test_fold_{i+1}.txt'
    label_file = f'{out_dir}/label_fold_{i+1}.txt'
    train_data.to_csv(train_file, index=False, header=False)
    val_data.to_csv(test_file, index=False, header=False)
    label.to_csv(label_file, index=False, header=False)

In [127]:
# Load pima-discretised-CFS.csv and split it into stratified folds.
pima_discrete_cfs = pd.read_csv("pima-discretised-CFS.csv")
pima_discrete_cfs_folds = stratified_data(data=pima_discrete_cfs)

# Dump all folds into one text file: a "foldN" line, then that fold's rows as
# header-less CSV, then one extra newline after each fold.
with open("pima-discrete-fold-cfs.csv", "w") as f:
    for fold_number, fold in enumerate(pima_discrete_cfs_folds, start=1):
        rows_csv = fold.to_csv(index=False, header=False)
        f.write(f"fold{fold_number}\n")
        f.write(f"{rows_csv}\n")

In [128]:
# Export the cross-validation splits: for every fold, write the held-out rows
# (without the class column), their class labels, and the rows of all other
# folds (class column included) to separate header-less CSV text files.
out_dir = 'nominal_cv_cfs'
# to_csv does not create missing directories; without this a fresh checkout
# fails on the first write.
os.makedirs(out_dir, exist_ok=True)

for i, held_out in enumerate(pima_discrete_cfs_folds):
    # Get the training and validation data for this fold
    label = held_out['class']
    val_data = held_out.drop(columns=['class'])
    train_data = pd.concat([fold for j, fold in enumerate(pima_discrete_cfs_folds) if j != i])

    # Write the train and test data to separate text files
    train_file = f'{out_dir}/train_fold_{i+1}.txt'
    test_file = f'{out_dir}/test_fold_{i+1}.txt'
    label_file = f'{out_dir}/label_fold_{i+1}.txt'
    train_data.to_csv(train_file, index=False, header=False)
    val_data.to_csv(test_file, index=False, header=False)
    label.to_csv(label_file, index=False, header=False)