# Repeated k-folds
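This notebook evaluates several classifiers with repeated k-fold cross-validation (scikit-learn's `RepeatedKFold`): a shuffled k-fold split is repeated several times, with a different randomization in each repetition. Note that `RepeatedKFold` does not keep site groups separate across folds.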

**Step 1**: Load the dataset into a pandas dataframe and collect the SITE_ID value of every subject.

In [1]:
#Import modules for this step
from nilearn import datasets
import pandas as pd
import os

#Fetch the ABIDE data (C-PAC pipeline, quality-checked subjects) using nilearn.datasets.fetch
#The original os.path.join(os.sep, "/home/ubuntu/nai") contained a redundant absolute
#component; building the path from its parts yields the same location on POSIX.
abide = datasets.fetch_abide_pcp(data_dir=os.path.join(os.sep, "home", "ubuntu", "nai"),
                                 pipeline="cpac",
                                 quality_checked=True)

#Load phenotypic data into pandas dataframe
abide_pheno = pd.DataFrame(abide.phenotypic)

#Collect the site name of every subject (one entry per row, not de-duplicated).
#SITE_ID values may arrive as bytes (they are decoded here); values that are
#already str are kept as they are instead of raising AttributeError.
groups = [
    site.decode() if isinstance(site, bytes) else site
    for site in abide_pheno.SITE_ID
]

  output = genfromtxt(fname, **kwargs)


**Step 2**: Define the dataset split using built-in scikit-learn methods. In this case, I am using sklearn.model_selection.RepeatedKFold.

In [3]:
#Import modules 
import numpy as np 
from sklearn.model_selection import RepeatedKFold
import prepare_data
import os
from sklearn.svm import LinearSVC
import statistics

#Define data and output directories
#(os.path.join(os.sep, "/home/ubuntu/nai") had a redundant absolute component;
# this builds the same POSIX path from its parts)
data_dir = os.path.join(os.sep, "home", "ubuntu", "nai")
output_dir = data_dir

#NOTE(review): the cell output suggests prepare_data runs PCA before the data is
#split -- confirm this does not leak test-fold information into training.
X, y = prepare_data.prepare_data(data_dir, output_dir)

#10 folds repeated twice with a fixed seed -> 20 train/test partitions.
#RepeatedKFold shuffles individual samples and ignores group labels, so the
#fold count is requested without passing any (it is n_splits * n_repeats).
logo = RepeatedKFold(n_splits=10, n_repeats=2, random_state=2652124)
logo.get_n_splits()

Loading dataset...


  output = genfromtxt(fname, **kwargs)


Feature file found.
Running PCA...


20

**Step 3:** Choosing which machine learning classifier to use. We will try four different classifiers in this script.

**Step 3.1:** Support Vector Machines (SVM) - LinearSVC

In [4]:
# Banner identifying this experiment in the cell output.
print(
    "----------------------------------------------------",
    "RepeatedKFold with Linear Support Vector Classification",
    "----------------------------------------------------",
    sep="\n",
)

l_svc = LinearSVC(max_iter=10000)

# Fit and score the classifier on every train/test partition yielded by `logo`.
accuracy = []
for count, (train_index, test_index) in enumerate(logo.split(X, y, groups), start=1):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print("Training model ", count)
    l_svc.fit(X_train, y_train)
    accuracy.append(l_svc.score(X_test, y_test))

print("Finished training.\n")

# score() gives the mean accuracy of the predictions on the held-out fold.
for model_number, fold_accuracy in enumerate(accuracy, start=1):
    print("Accuracy score for model", model_number, " ", fold_accuracy)

# Summary statistics over all trained models.
for label, value in (
    ("\nAverage accuracy score for all models: ", statistics.mean(accuracy)),
    ("Maximum accuracy score of all models: ", max(accuracy)),
    ("Minimum accuracy score of all models: ", min(accuracy)),
):
    print(label, value)

----------------------------------------------------
RepeatedKFold with Linear Support Vector Classification
----------------------------------------------------
Training model  1
Training model  2
Training model  3
Training model  4
Training model  5
Training model  6
Training model  7
Training model  8
Training model  9
Training model  10
Training model  11
Training model  12
Training model  13
Training model  14
Training model  15
Training model  16
Training model  17
Training model  18
Training model  19
Training model  20
Finished training.

Accuracy score for model 1   0.7045454545454546
Accuracy score for model 2   0.632183908045977
Accuracy score for model 3   0.6666666666666666
Accuracy score for model 4   0.5862068965517241
Accuracy score for model 5   0.6896551724137931
Accuracy score for model 6   0.7241379310344828
Accuracy score for model 7   0.5172413793103449
Accuracy score for model 8   0.632183908045977
Accuracy score for model 9   0.6666666666666666
Accuracy score fo

In [6]:
from sklearn.model_selection import GridSearchCV
# Function returning the best estimator

def best_estimator(model, param_grid, X, y, cv):
    """Run a grid search and return its winning estimator and parameters.

    Parameters
    ----------
    model : estimator handed to GridSearchCV.
    param_grid : parameter grid handed to GridSearchCV.
    X, y : data the grid search is fitted on.
    cv : cross-validation setting forwarded to GridSearchCV as ``cv``.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_)`` read from the fitted search.
    """
    search = GridSearchCV(model, param_grid, cv=cv)
    search.fit(X, y)
    return search.best_estimator_, search.best_params_

In [7]:
# Candidate regularisation strengths for the linear SVM.
param_grid_SVC = {
    'C': [0.01, 0.1, 1, 1.2, 1.3, 1.4, 1.5, 2, 3, 4, 5, 10],
}

# Last expression of the cell, so the (estimator, parameters) pair is displayed.
best_estimator(
    model=LinearSVC(max_iter=10000),
    param_grid=param_grid_SVC,
    X=X,
    y=y,
    cv=10,
)

(LinearSVC(C=0.01, max_iter=10000), {'C': 0.01})

In [5]:
from sklearn.svm import LinearSVC
import statistics
# Banner identifying this experiment in the cell output.
print(
    "----------------------------------------------------------------------------------",
    "RepeatedKFold with Linear Support Vector Classification with Hyperparameter Tuning",
    "----------------------------------------------------------------------------------",
    sep="\n",
)

# Linear SVM with C=0.01 (the value reported by the grid-search cell); every
# other argument is spelled out exactly as before.
l_svc = LinearSVC(
    C=0.01,
    class_weight=None,
    dual=True,
    fit_intercept=True,
    intercept_scaling=1,
    loss='squared_hinge',
    max_iter=10000,
    multi_class='ovr',
    penalty='l2',
    random_state=None,
    tol=0.0001,
    verbose=0,
)

# Fit and score the classifier on every train/test partition yielded by `logo`.
accuracy = []
for count, (train_index, test_index) in enumerate(logo.split(X, y, groups), start=1):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print("Training model ", count)
    l_svc.fit(X_train, y_train)
    accuracy.append(l_svc.score(X_test, y_test))

print("Finished training.\n")

# score() gives the mean accuracy of the predictions on the held-out fold.
for model_number, fold_accuracy in enumerate(accuracy, start=1):
    print("Accuracy score for model", model_number, " ", fold_accuracy)

# Summary statistics over all trained models.
for label, value in (
    ("\nAverage accuracy score for all models: ", statistics.mean(accuracy)),
    ("Maximum accuracy score of all models: ", max(accuracy)),
    ("Minimum accuracy score of all models: ", min(accuracy)),
):
    print(label, value)

----------------------------------------------------------------------------------
RepeatedKFold with Linear Support Vector Classification with Hyperparameter Tuning
----------------------------------------------------------------------------------
Training model  1
Training model  2
Training model  3
Training model  4
Training model  5
Training model  6
Training model  7
Training model  8
Training model  9
Training model  10
Training model  11
Training model  12
Training model  13
Training model  14
Training model  15
Training model  16
Training model  17
Training model  18
Training model  19
Training model  20
Finished training.

Accuracy score for model 1   0.7272727272727273
Accuracy score for model 2   0.7011494252873564
Accuracy score for model 3   0.6666666666666666
Accuracy score for model 4   0.632183908045977
Accuracy score for model 5   0.7126436781609196
Accuracy score for model 6   0.7241379310344828
Accuracy score for model 7   0.5862068965517241
Accuracy score for model 

**Step 3.2:** k-Nearest Neighbors - KNeighborsClassifier

In [9]:
from sklearn.neighbors import KNeighborsClassifier
import statistics
# Banner identifying this experiment in the cell output.
print(
    "--------------------------------------------------",
    "RepeatedKFold with K-Nearest Neighbors Classification",
    "--------------------------------------------------",
    sep="\n",
)

knn = KNeighborsClassifier()

# Fit and score the classifier on every train/test partition yielded by `logo`.
accuracy = []
for count, (train_index, test_index) in enumerate(logo.split(X, y, groups), start=1):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print("Training model ", count)
    knn.fit(X_train, y_train)
    accuracy.append(knn.score(X_test, y_test))

print("Finished training.\n")

# score() gives the mean accuracy of the predictions on the held-out fold.
for model_number, fold_accuracy in enumerate(accuracy, start=1):
    print("Accuracy score for model", model_number, " ", fold_accuracy)

# Summary statistics over all trained models.
for label, value in (
    ("\nAverage accuracy score for all models: ", statistics.mean(accuracy)),
    ("Maximum accuracy score of all models: ", max(accuracy)),
    ("Minimum accuracy score of all models: ", min(accuracy)),
):
    print(label, value)

--------------------------------------------------
RepeatedKFold with K-Nearest Neighbors Classification
--------------------------------------------------
Training model  1
Training model  2
Training model  3
Training model  4
Training model  5
Training model  6
Training model  7
Training model  8
Training model  9
Training model  10
Finished training.

Accuracy score for model 1   0.5872093023255814
Accuracy score for model 2   0.4883720930232558
Accuracy score for model 3   0.48717948717948717
Accuracy score for model 4   0.5443037974683544
Accuracy score for model 5   0.6266666666666667
Accuracy score for model 6   0.5694444444444444
Accuracy score for model 7   0.5633802816901409
Accuracy score for model 8   0.6666666666666666
Accuracy score for model 9   0.4939759036144578
Accuracy score for model 10   0.4883720930232558

Average accuracy score for all models:  0.5515570736102311
Maximum accuracy score of all models:  0.6666666666666666
Minimum accuracy score of all models:  0.48

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbourhood sizes and search algorithms for k-nearest neighbours.
param_grid = {
    'n_neighbors': [1, 2, 3, 4, 6, 10, 15, 20, 25, 30, 50, 100],
    'algorithm': ['auto', 'kd_tree'],
}
model_KNN = KNeighborsClassifier()

# Last expression of the cell, so the (estimator, parameters) pair is displayed.
best_estimator(model=model_KNN, param_grid=param_grid, X=X, y=y, cv=10)

(KNeighborsClassifier(n_neighbors=25),
 {'algorithm': 'auto', 'n_neighbors': 25})

In [11]:
from sklearn.neighbors import KNeighborsClassifier
import statistics
# Banner identifying this experiment in the cell output.
print(
    "--------------------------------------------------------------------------------",
    "RepeatedKFold with K-Nearest Neighbors Classification with Hyperparameter Tuning",
    "--------------------------------------------------------------------------------",
    sep="\n",
)

# k-NN with n_neighbors=25 and algorithm='auto' (the values reported by the
# grid-search cell); every other argument is spelled out exactly as before.
knn = KNeighborsClassifier(
    algorithm='auto',
    leaf_size=30,
    metric='minkowski',
    metric_params=None,
    n_jobs=None,
    n_neighbors=25,
    p=2,
    weights='uniform',
)

# Fit and score the classifier on every train/test partition yielded by `logo`.
accuracy = []
for count, (train_index, test_index) in enumerate(logo.split(X, y, groups), start=1):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print("Training model ", count)
    knn.fit(X_train, y_train)
    accuracy.append(knn.score(X_test, y_test))

print("Finished training.\n")

# score() gives the mean accuracy of the predictions on the held-out fold.
for model_number, fold_accuracy in enumerate(accuracy, start=1):
    print("Accuracy score for model", model_number, " ", fold_accuracy)

# Summary statistics over all trained models.
for label, value in (
    ("\nAverage accuracy score for all models: ", statistics.mean(accuracy)),
    ("Maximum accuracy score of all models: ", max(accuracy)),
    ("Minimum accuracy score of all models: ", min(accuracy)),
):
    print(label, value)

--------------------------------------------------------------------------------
RepeatedKFold with K-Nearest Neighbors Classification with Hyperparameter Tuning
--------------------------------------------------------------------------------
Training model  1
Training model  2
Training model  3
Training model  4
Training model  5
Training model  6
Training model  7
Training model  8
Training model  9
Training model  10
Finished training.

Accuracy score for model 1   0.563953488372093
Accuracy score for model 2   0.47674418604651164
Accuracy score for model 3   0.6025641025641025
Accuracy score for model 4   0.569620253164557
Accuracy score for model 5   0.6533333333333333
Accuracy score for model 6   0.5972222222222222
Accuracy score for model 7   0.5774647887323944
Accuracy score for model 8   0.5362318840579711
Accuracy score for model 9   0.5421686746987951
Accuracy score for model 10   0.6046511627906976

Average accuracy score for all models:  0.5723954095982678
Maximum accuracy

**Step 3.3:** Decision Tree - DecisionTreeClassifier

In [6]:
from sklearn.tree import DecisionTreeClassifier
import statistics
# Banner identifying this experiment in the cell output.
print(
    "--------------------------------------------",
    "RepeatedKFold with Decision Tree Classification",
    "--------------------------------------------",
    sep="\n",
)

dt = DecisionTreeClassifier()

# Fit and score the classifier on every train/test partition yielded by `logo`.
accuracy = []
for count, (train_index, test_index) in enumerate(logo.split(X, y, groups), start=1):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print("Training model ", count)
    dt.fit(X_train, y_train)
    accuracy.append(dt.score(X_test, y_test))

print("Finished training.\n")

# score() gives the mean accuracy of the predictions on the held-out fold.
for model_number, fold_accuracy in enumerate(accuracy, start=1):
    print("Accuracy score for model", model_number, " ", fold_accuracy)

# Summary statistics over all trained models.
for label, value in (
    ("\nAverage accuracy score for all models: ", statistics.mean(accuracy)),
    ("Maximum accuracy score of all models: ", max(accuracy)),
    ("Minimum accuracy score of all models: ", min(accuracy)),
):
    print(label, value)

--------------------------------------------
RepeatedKFold with Decision Tree Classification
--------------------------------------------
Training model  1
Training model  2
Training model  3
Training model  4
Training model  5
Training model  6
Training model  7
Training model  8
Training model  9
Training model  10
Training model  11
Training model  12
Training model  13
Training model  14
Training model  15
Training model  16
Training model  17
Training model  18
Training model  19
Training model  20
Finished training.

Accuracy score for model 1   0.3977272727272727
Accuracy score for model 2   0.47126436781609193
Accuracy score for model 3   0.45977011494252873
Accuracy score for model 4   0.5632183908045977
Accuracy score for model 5   0.4482758620689655
Accuracy score for model 6   0.5632183908045977
Accuracy score for model 7   0.47126436781609193
Accuracy score for model 8   0.4942528735632184
Accuracy score for model 9   0.5977011494252874
Accuracy score for model 10   0.5287

In [13]:
from sklearn.tree import DecisionTreeClassifier
#Hyperparameter tuning for the DecisionTree classifier: candidate tree depths
param_grid = {
    'max_depth': [1, 2, 3, 4, 5, 6, 10],
}
model_tree = DecisionTreeClassifier()

#Last expression of the cell, so the (estimator, parameters) pair is displayed
best_estimator(model=model_tree, param_grid=param_grid, X=X, y=y, cv=10)

(DecisionTreeClassifier(max_depth=5), {'max_depth': 5})

In [7]:
from sklearn.tree import DecisionTreeClassifier
import statistics
# Banner identifying this experiment in the cell output.
print("-------------------------------------------------------------------------")
print("RepeatedKFold with Decision Tree Classification with Hyperparameter Tuning")
print("--------------------------------------------------------------------------")

# Decision tree with max_depth=5 (the value reported by the grid-search cell).
# The previous constructor call spelled out every default, including
# `presort='deprecated'` and `min_impurity_split=None`; later scikit-learn
# releases removed both parameters, so that call raises TypeError there.
# All the other arguments were the library defaults, so passing only
# max_depth builds the same model and works across versions.
dt = DecisionTreeClassifier(max_depth=5)

# Fit and score the classifier on every train/test partition yielded by `logo`.
accuracy = []
for count, (train_index, test_index) in enumerate(logo.split(X, y, groups), start=1):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print("Training model ", count)
    dt.fit(X_train, y_train)
    accuracy.append(dt.score(X_test, y_test))

print("Finished training.\n")

# score() gives the mean accuracy of the predictions on the held-out fold.
for model_number, fold_accuracy in enumerate(accuracy, start=1):
    print("Accuracy score for model", model_number, " ", fold_accuracy)

# Summary statistics over all trained models.
print("\nAverage accuracy score for all models: ", statistics.mean(accuracy))
print("Maximum accuracy score of all models: ", max(accuracy))
print("Minimum accuracy score of all models: ", min(accuracy))

-------------------------------------------------------------------------
RepeatedKFold with Decision Tree Classification with Hyperparameter Tuning
--------------------------------------------------------------------------
Training model  1
Training model  2
Training model  3
Training model  4
Training model  5
Training model  6
Training model  7
Training model  8
Training model  9
Training model  10
Training model  11
Training model  12
Training model  13
Training model  14
Training model  15
Training model  16
Training model  17
Training model  18
Training model  19
Training model  20
Finished training.

Accuracy score for model 1   0.5
Accuracy score for model 2   0.5172413793103449
Accuracy score for model 3   0.4827586206896552
Accuracy score for model 4   0.4827586206896552
Accuracy score for model 5   0.5172413793103449
Accuracy score for model 6   0.5517241379310345
Accuracy score for model 7   0.47126436781609193
Accuracy score for model 8   0.5402298850574713
Accuracy score 

**Step 3.4:** Random Forests - RandomForestClassifier

In [8]:
from sklearn.ensemble import RandomForestClassifier
import statistics
# Banner identifying this experiment in the cell output.
print(
    "--------------------------------------------",
    "RepeatedKFold with Random Forest Classification",
    "--------------------------------------------",
    sep="\n",
)

rf = RandomForestClassifier()

# Fit and score the classifier on every train/test partition yielded by `logo`.
accuracy = []
for count, (train_index, test_index) in enumerate(logo.split(X, y, groups), start=1):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print("Training model ", count)
    rf.fit(X_train, y_train)
    accuracy.append(rf.score(X_test, y_test))

print("Finished training.\n")

# score() gives the mean accuracy of the predictions on the held-out fold.
for model_number, fold_accuracy in enumerate(accuracy, start=1):
    print("Accuracy score for model", model_number, " ", fold_accuracy)

# Summary statistics over all trained models.
for label, value in (
    ("\nAverage accuracy score for all models: ", statistics.mean(accuracy)),
    ("Maximum accuracy score of all models: ", max(accuracy)),
    ("Minimum accuracy score of all models: ", min(accuracy)),
):
    print(label, value)

--------------------------------------------
RepeatedKFold with Random Forest Classification
--------------------------------------------
Training model  1
Training model  2
Training model  3
Training model  4
Training model  5
Training model  6
Training model  7
Training model  8
Training model  9
Training model  10
Training model  11
Training model  12
Training model  13
Training model  14
Training model  15
Training model  16
Training model  17
Training model  18
Training model  19
Training model  20
Finished training.

Accuracy score for model 1   0.5568181818181818
Accuracy score for model 2   0.632183908045977
Accuracy score for model 3   0.5517241379310345
Accuracy score for model 4   0.5057471264367817
Accuracy score for model 5   0.47126436781609193
Accuracy score for model 6   0.5747126436781609
Accuracy score for model 7   0.5517241379310345
Accuracy score for model 8   0.5517241379310345
Accuracy score for model 9   0.5977011494252874
Accuracy score for model 10   0.5977011

In [None]:
from sklearn.ensemble import RandomForestClassifier
import statistics
# Banner identifying this experiment in the cell output.
print(
    "-------------------------------------------------------------------------",
    "RepeatedKFold with Random Forest Classification with Hyperparameter Tuning",
    "-------------------------------------------------------------------------",
    sep="\n",
)

# NOTE(review): n_estimators=400 is hard-coded here; no grid search for the
# random forest is visible in this notebook -- confirm where the value came from.
rf = RandomForestClassifier(n_estimators=400)

# Fit and score the classifier on every train/test partition yielded by `logo`.
accuracy = []
for count, (train_index, test_index) in enumerate(logo.split(X, y, groups), start=1):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print("Training model ", count)
    rf.fit(X_train, y_train)
    accuracy.append(rf.score(X_test, y_test))

print("Finished training.\n")

# score() gives the mean accuracy of the predictions on the held-out fold.
for model_number, fold_accuracy in enumerate(accuracy, start=1):
    print("Accuracy score for model", model_number, " ", fold_accuracy)

# Summary statistics over all trained models.
for label, value in (
    ("\nAverage accuracy score for all models: ", statistics.mean(accuracy)),
    ("Maximum accuracy score of all models: ", max(accuracy)),
    ("Minimum accuracy score of all models: ", min(accuracy)),
):
    print(label, value)

-------------------------------------------------------------------------
RepeatedKFold with Random Forest Classification with Hyperparameter Tuning
-------------------------------------------------------------------------
Training model  1
Training model  2
Training model  3
Training model  4
Training model  5
Training model  6
Training model  7
Training model  8
Training model  9
Training model  10
Training model  11
Training model  12
Training model  13
Training model  14
Training model  15
Training model  16
Training model  17
Training model  18
