# K-Fold Cross Validation

In [4]:
import pandas as pd
import numpy as np

#Disable SettingWithCopyWarning
pd.options.mode.chained_assignment = None

#Split data into 5 sets
admissions = pd.read_csv("admissions.csv")
admissions["actual_label"] = admissions["admit"]
admissions = admissions.drop("admit", axis=1)
shuffled_index = np.random.permutation(admissions.index)
shuffled_admissions = admissions.loc[shuffled_index]
admissions = shuffled_admissions.reset_index()
admissions['fold'] = 1
admissions['fold'][0:129] = 1
admissions['fold'][129:258] = 2
admissions['fold'][258:387] = 3
admissions['fold'][387:515] = 4
admissions['fold'][515:645] = 5

print (admissions.head())
print (admissions.tail())

   index  col       gpa         gre  actual_label  fold
0    473  473  3.348772  605.968197           0.0     1
1    271  271  3.183203  595.195508           1.0     1
2    472  472  3.092351  656.186812           0.0     1
3    129  129  2.177100  623.303559           0.0     1
4     21   21  3.105242  562.939715           0.0     1
     index  col       gpa         gre  actual_label  fold
995    650  650  3.298076  771.875487           1.0     1
996    369  369  3.268196  691.260976           0.0     1
997    640  640  3.581406  559.764878           1.0     1
998    116  116  3.310185  505.381002           0.0     1
999    671  671  3.690988  631.754900           0.0     1


In [7]:
#First iteration
#Use set 1 as Test set, set 2-5 as Training sets

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
train_data_index = admissions['fold']!=1
train_data = admissions[train_data_index]
test_data = admissions[admissions['fold']==1]
lr.fit (train_data[['gpa']], train_data['actual_label'])
label = lr.predict(test_data[['gpa']])
correct_predict_i= (label == test_data['actual_label'])
correct_predict = test_data[correct_predict_i]

iteration_one_accuracy = len(correct_predict)/len(test_data)
print (iteration_one_accuracy)

0.7603305785123967


In [12]:
fold_ids = [1,2,3,4,5]


def train_and_test (admissions, folds):
    #Inputs: admissions data frame, list of folds to use for k-fold validation
    accuracies = []
    for fold in folds:
        lr = LogisticRegression()
        train_data_index = admissions['fold']!=fold
        train_data = admissions[train_data_index]
        test_data = admissions[admissions['fold']==fold]
        lr.fit (train_data[['gpa']], train_data['actual_label'])
        label = lr.predict(test_data[['gpa']])
        correct_predict_i= (label == test_data['actual_label'])
        correct_predict = test_data[correct_predict_i]
        iteration_accuracy = len(correct_predict)/len(test_data)
        accuracies.append(iteration_accuracy)
    #Output: List of accuracies of each fold of validation
    return accuracies

accuracies = train_and_test(admissions, [1,2,3,4,5])
average_accuracy = sum(accuracies)/5

print(accuracies)
print(average_accuracy)

[0.7603305785123967, 0.7286821705426356, 0.7674418604651163, 0.7421875, 0.7692307692307693]
0.7535745757501836


In [13]:
#Utilize sklearn to perform K-fold Validation
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
admissions = pd.read_csv("admissions.csv")
admissions["actual_label"] = admissions["admit"]
admissions = admissions.drop("admit", axis=1)

kf = KFold(len(admissions), 5, shuffle=True, random_state=8)
lr = LogisticRegression()

accuracies = cross_val_score(lr, admissions[['gpa']], admissions['actual_label'], scoring=None, cv=kf)
average_accuracy = sum(accuracies)/5

print (accuracies)
print(average_accuracy)


[ 0.755  0.765  0.735  0.775  0.75 ]
0.756
