In [1]:
# k-Fold Cross Validation

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_table('data.txt')
X = dataset.values[:,:3]
r_y = dataset.values[:,3]
y = [1.0 if i == 1 else 0.0 for i in r_y]
y = np.array(y)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

In [5]:
# Applying k-Fold Cross Validation
#from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut
# Applying metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import precision_score

In [6]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression
accuracies = []
log_losses = []
precision_1 = []
precision_0 = []

for i in range(10):
    skf = StratifiedKFold(n_splits = 10, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        classifier = LogisticRegression()
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        y_prob = classifier.predict_proba(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        precision_1.append(precision_score(y_test, y_pred, pos_label=1))
        precision_0.append(precision_score(y_test, y_pred, pos_label=0))
        log_losses.append(log_loss(y_test, y_prob))

print('Accuracy')
print('Mean:' + str((np.array(accuracies)).mean()))
print('Std:' + str((np.array(accuracies)).std()))
print('Precision 1')
print('Mean:' + str((np.array(precision_1)).mean()))
print('Std:' + str((np.array(precision_1)).std()))
print('Precision 0')
print('Mean:' + str((np.array(precision_0)).mean()))
print('Std:' + str((np.array(precision_0)).std()))
print('Log Loss')
print('Mean:' + str((np.array(log_losses)).mean()))
print('Std:' + str((np.array(log_losses)).std()))

Accuracy
Mean:0.763586729363
Std:0.0406335848581
Precision 1
Mean:0.700337470806
Std:0.0927077646486
Precision 0
Mean:0.78760694513
Std:0.0308306171335
Log Loss
Mean:0.499330831712
Std:0.0434182538695


In [8]:
from sklearn.ensemble import RandomForestClassifier

# RandomForest Classifier Proba
accuracies = []
precision_1 = []
precision_0 = []
log_losses = []

for i in range(10):
    skf = StratifiedKFold(n_splits = 10, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        classifier = RandomForestClassifier(n_estimators=100)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        y_prob = classifier.predict_proba(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        precision_1.append(precision_score(y_test, y_pred, pos_label=1))
        precision_0.append(precision_score(y_test, y_pred, pos_label=0))
        log_losses.append(log_loss(y_test, y_prob))

print('Accuracy')
print('Mean:' + str((np.array(accuracies)).mean()))
print('Std:' + str((np.array(accuracies)).std()))
print('Precision 1')
print('Mean:' + str((np.array(precision_1)).mean()))
print('Std:' + str((np.array(precision_1)).std()))
print('Precision 0')
print('Mean:' + str((np.array(precision_0)).mean()))
print('Std:' + str((np.array(precision_0)).std()))
print('Log Loss')
print('Mean:' + str((np.array(log_losses)).mean()))
print('Std:' + str((np.array(log_losses)).std()))

Accuracy
Mean:0.73940830721
Std:0.0410954916226
Precision 1
Mean:0.634325571451
Std:0.0819389157608
Precision 0
Mean:0.781898582715
Std:0.0295963471122
Log Loss
Mean:0.610145251334
Std:0.162693799109


In [9]:
from sklearn.ensemble import RandomForestClassifier

# RandomForest Classifier Proba 10 arvores
accuracies = []
precision_1 = []
precision_0 = []
log_losses = []

for i in range(10):
    skf = StratifiedKFold(n_splits = 10, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        classifier = RandomForestClassifier(n_estimators=10)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        y_prob = classifier.predict_proba(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        precision_1.append(precision_score(y_test, y_pred, pos_label=1))
        precision_0.append(precision_score(y_test, y_pred, pos_label=0))
        log_losses.append(log_loss(y_test, y_prob))

print('Accuracy')
print('Mean:' + str((np.array(accuracies)).mean()))
print('Std:' + str((np.array(accuracies)).std()))
print('Precision 1')
print('Mean:' + str((np.array(precision_1)).mean()))
print('Std:' + str((np.array(precision_1)).std()))
print('Precision 0')
print('Mean:' + str((np.array(precision_0)).mean()))
print('Std:' + str((np.array(precision_0)).std()))
print('Log Loss')
print('Mean:' + str((np.array(log_losses)).mean()))
print('Std:' + str((np.array(log_losses)).std()))

Accuracy
Mean:0.731950104493
Std:0.0451358450069
Precision 1
Mean:0.627021994301
Std:0.0902791977131
Precision 0
Mean:0.770186541997
Std:0.0344192643607
Log Loss
Mean:1.50602361132
Std:0.674368153953


In [15]:
from sklearn.ensemble import RandomForestRegressor

# RandomForest Regressor
accuracies = []
precision_1 = []
precision_0 = []
log_losses = []

for i in range(10):
    skf = StratifiedKFold(n_splits = 10, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        classifier = RandomForestRegressor(n_estimators=100)
        classifier.fit(X_train, y_train)
        y_prob = classifier.predict(X_test)
        #y_prob = classifier.predict_proba(X_test)
        #accuracies.append(accuracy_score(y_test, y_pred))
        #precision_1.append(precision_score(y_test, y_pred, pos_label=1))
        #precision_0.append(precision_score(y_test, y_pred, pos_label=0))
        log_losses.append(log_loss(y_test, y_prob))

#print('Accuracy')
#print('Mean:' + str((np.array(accuracies)).mean()))
#print('Std:' + str((np.array(accuracies)).std()))
#print('Precision 1')
#print('Mean:' + str((np.array(precision_1)).mean()))
#print('Std:' + str((np.array(precision_1)).std()))
#print('Precision 0')
print('Mean:' + str((np.array(precision_0)).mean()))
print('Std:' + str((np.array(precision_0)).std()))
print('Log Loss')
print('Mean:' + str((np.array(log_losses)).mean()))
print('Std:' + str((np.array(log_losses)).std()))

Mean:nan
Std:nan
Log Loss
Mean:0.73423247103
Std:0.320294418836


  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


In [16]:
from sklearn.neighbors import KNeighborsRegressor

# RandomForest Regressor
accuracies = []
precision_1 = []
precision_0 = []
log_losses = []

for i in range(10):
    skf = StratifiedKFold(n_splits = 10, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        classifier = KNeighborsRegressor(n_neighbors=5)
        classifier.fit(X_train, y_train)
        y_prob = classifier.predict(X_test)
        #y_prob = classifier.predict_proba(X_test)
        #accuracies.append(accuracy_score(y_test, y_pred))
        #precision_1.append(precision_score(y_test, y_pred, pos_label=1))
        #precision_0.append(precision_score(y_test, y_pred, pos_label=0))
        log_losses.append(log_loss(y_test, y_prob))

#print('Accuracy')
#print('Mean:' + str((np.array(accuracies)).mean()))
#print('Std:' + str((np.array(accuracies)).std()))
#print('Precision 1')
#print('Mean:' + str((np.array(precision_1)).mean()))
#print('Std:' + str((np.array(precision_1)).std()))
#print('Precision 0')
print('Mean:' + str((np.array(precision_0)).mean()))
print('Std:' + str((np.array(precision_0)).std()))
print('Log Loss')
print('Mean:' + str((np.array(log_losses)).mean()))
print('Std:' + str((np.array(log_losses)).std()))

Mean:nan
Std:nan
Log Loss
Mean:2.03531856591
Std:0.774988313977


  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


In [19]:
from sklearn.neighbors import KNeighborsRegressor

# RandomForest Regressor
accuracies = []
precision_1 = []
precision_0 = []
log_losses = []

for i in range(10):
    skf = StratifiedKFold(n_splits = 10, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        classifier = KNeighborsRegressor(n_neighbors=100)
        classifier.fit(X_train, y_train)
        y_prob = classifier.predict(X_test)
        #y_prob = classifier.predict_proba(X_test)
        #accuracies.append(accuracy_score(y_test, y_pred))
        #precision_1.append(precision_score(y_test, y_pred, pos_label=1))
        #precision_0.append(precision_score(y_test, y_pred, pos_label=0))
        log_losses.append(log_loss(y_test, y_prob))

#print('Accuracy')
#print('Mean:' + str((np.array(accuracies)).mean()))
#print('Std:' + str((np.array(accuracies)).std()))
#print('Precision 1')
#print('Mean:' + str((np.array(precision_1)).mean()))
#print('Std:' + str((np.array(precision_1)).std()))
#print('Precision 0')
print('Mean:' + str((np.array(precision_0)).mean()))
print('Std:' + str((np.array(precision_0)).std()))
print('Log Loss')
print('Mean:' + str((np.array(log_losses)).mean()))
print('Std:' + str((np.array(log_losses)).std()))

Mean:nan
Std:nan
Log Loss
Mean:0.502671628721
Std:0.0363030712132


  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


In [21]:
from sklearn.neighbors import KNeighborsRegressor

# RandomForest Regressor
accuracies = []
precision_1 = []
precision_0 = []
log_losses = []

for i in range(10):
    skf = StratifiedKFold(n_splits = 10, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        classifier = KNeighborsRegressor(n_neighbors=50)
        classifier.fit(X_train, y_train)
        y_prob = classifier.predict(X_test)
        #y_prob = classifier.predict_proba(X_test)
        #accuracies.append(accuracy_score(y_test, y_pred))
        #precision_1.append(precision_score(y_test, y_pred, pos_label=1))
        #precision_0.append(precision_score(y_test, y_pred, pos_label=0))
        log_losses.append(log_loss(y_test, y_prob))

#print('Accuracy')
#print('Mean:' + str((np.array(accuracies)).mean()))
#print('Std:' + str((np.array(accuracies)).std()))
#print('Precision 1')
#print('Mean:' + str((np.array(precision_1)).mean()))
#print('Std:' + str((np.array(precision_1)).std()))
#print('Precision 0')
print('Mean:' + str((np.array(precision_0)).mean()))
print('Std:' + str((np.array(precision_0)).std()))
print('Log Loss')
print('Mean:' + str((np.array(log_losses)).mean()))
print('Std:' + str((np.array(log_losses)).std()))

Mean:nan
Std:nan
Log Loss
Mean:0.569737228294
Std:0.185511702196


  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


In [23]:
from sklearn.neighbors import KNeighborsRegressor

# RandomForest Regressor
accuracies = []
precision_1 = []
precision_0 = []
log_losses = []

for i in range(10):
    skf = StratifiedKFold(n_splits = 10, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        classifier = KNeighborsRegressor(n_neighbors=60)
        classifier.fit(X_train, y_train)
        y_prob = classifier.predict(X_test)
        #y_prob = classifier.predict_proba(X_test)
        #accuracies.append(accuracy_score(y_test, y_pred))
        #precision_1.append(precision_score(y_test, y_pred, pos_label=1))
        #precision_0.append(precision_score(y_test, y_pred, pos_label=0))
        log_losses.append(log_loss(y_test, y_prob))

#print('Accuracy')
#print('Mean:' + str((np.array(accuracies)).mean()))
#print('Std:' + str((np.array(accuracies)).std()))
#print('Precision 1')
#print('Mean:' + str((np.array(precision_1)).mean()))
#print('Std:' + str((np.array(precision_1)).std()))
#print('Precision 0')
print('Mean:' + str((np.array(precision_0)).mean()))
print('Std:' + str((np.array(precision_0)).std()))
print('Log Loss')
print('Mean:' + str((np.array(log_losses)).mean()))
print('Std:' + str((np.array(log_losses)).std()))

Mean:nan
Std:nan
Log Loss
Mean:0.499613891314
Std:0.0458849867937


  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


In [24]:
from sklearn.neighbors import KNeighborsClassifier

# RandomForest Classifier Proba 10 arvores
accuracies = []
precision_1 = []
precision_0 = []
log_losses = []

for i in range(10):
    skf = StratifiedKFold(n_splits = 10, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        classifier = KNeighborsClassifier(n_neighbors=5)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        y_prob = classifier.predict_proba(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        precision_1.append(precision_score(y_test, y_pred, pos_label=1))
        precision_0.append(precision_score(y_test, y_pred, pos_label=0))
        log_losses.append(log_loss(y_test, y_prob))

print('Accuracy')
print('Mean:' + str((np.array(accuracies)).mean()))
print('Std:' + str((np.array(accuracies)).std()))
print('Precision 1')
print('Mean:' + str((np.array(precision_1)).mean()))
print('Std:' + str((np.array(precision_1)).std()))
print('Precision 0')
print('Mean:' + str((np.array(precision_0)).mean()))
print('Std:' + str((np.array(precision_0)).std()))
print('Log Loss')
print('Mean:' + str((np.array(log_losses)).mean()))
print('Std:' + str((np.array(log_losses)).std()))

Accuracy
Mean:0.743625914316
Std:0.0462547863131
Precision 1
Mean:0.634109091577
Std:0.0866712663298
Precision 0
Mean:0.789812081183
Std:0.0399834060066
Log Loss
Mean:2.06513546466
Std:0.742933642952


In [25]:
from sklearn.neighbors import KNeighborsClassifier

# RandomForest Classifier Proba 10 arvores
accuracies = []
precision_1 = []
precision_0 = []
log_losses = []

for i in range(10):
    skf = StratifiedKFold(n_splits = 10, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        classifier = KNeighborsClassifier(n_neighbors=10)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        y_prob = classifier.predict_proba(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        precision_1.append(precision_score(y_test, y_pred, pos_label=1))
        precision_0.append(precision_score(y_test, y_pred, pos_label=0))
        log_losses.append(log_loss(y_test, y_prob))

print('Accuracy')
print('Mean:' + str((np.array(accuracies)).mean()))
print('Std:' + str((np.array(accuracies)).std()))
print('Precision 1')
print('Mean:' + str((np.array(precision_1)).mean()))
print('Std:' + str((np.array(precision_1)).std()))
print('Precision 0')
print('Mean:' + str((np.array(precision_0)).mean()))
print('Std:' + str((np.array(precision_0)).std()))
print('Log Loss')
print('Mean:' + str((np.array(log_losses)).mean()))
print('Std:' + str((np.array(log_losses)).std()))

Accuracy
Mean:0.741351880878
Std:0.0412602826437
Precision 1
Mean:0.678066548469
Std:0.107866621115
Precision 0
Mean:0.761787847054
Std:0.0292295316255
Log Loss
Mean:1.02687083922
Std:0.509688125399


In [35]:
from sklearn.neighbors import KNeighborsClassifier

# RandomForest Classifier Proba 10 arvores
accuracies = []
precision_1 = []
precision_0 = []
log_losses = []

for i in range(10):
    skf = StratifiedKFold(n_splits = 10, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        classifier = KNeighborsClassifier(n_neighbors=72)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        y_prob = classifier.predict_proba(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        precision_1.append(precision_score(y_test, y_pred, pos_label=1))
        precision_0.append(precision_score(y_test, y_pred, pos_label=0))
        log_losses.append(log_loss(y_test, y_prob))

print('Accuracy')
print('Mean:' + str((np.array(accuracies)).mean()))
print('Std:' + str((np.array(accuracies)).std()))
print('Precision 1')
print('Mean:' + str((np.array(precision_1)).mean()))
print('Std:' + str((np.array(precision_1)).std()))
print('Precision 0')
print('Mean:' + str((np.array(precision_0)).mean()))
print('Std:' + str((np.array(precision_0)).std()))
print('Log Loss')
print('Mean:' + str((np.array(log_losses)).mean()))
print('Std:' + str((np.array(log_losses)).std()))

Accuracy
Mean:0.747202194357
Std:0.0401366584078
Precision 1
Mean:0.667949321112
Std:0.0906502005048
Precision 0
Mean:0.776779702972
Std:0.0335054297453
Log Loss
Mean:0.501311403932
Std:0.0487741105876


In [42]:
from sklearn.neural_network import MLPRegressor

# RandomForest Classifier Proba 10 arvores
accuracies = []
precision_1 = []
precision_0 = []
log_losses = []

for i in range(10):
    skf = StratifiedKFold(n_splits = 10, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        classifier = MLPRegressor(hidden_layer_sizes=(6, ))
        classifier.out_activation_ = 'logistic'
        classifier.fit(X_train, y_train)
        y_prob = classifier.predict(X_test)
        #y_prob = classifier.predict_proba(X_test)
        #accuracies.append(accuracy_score(y_test, y_pred))
        #precision_1.append(precision_score(y_test, y_pred, pos_label=1))
        #precision_0.append(precision_score(y_test, y_pred, pos_label=0))
        log_losses.append(log_loss(y_test, y_prob))

#print('Accuracy')
#print('Mean:' + str((np.array(accuracies)).mean()))
#print('Std:' + str((np.array(accuracies)).std()))
#print('Precision 1')
#print('Mean:' + str((np.array(precision_1)).mean()))
#print('Std:' + str((np.array(precision_1)).std()))
#print('Precision 0')
#print('Mean:' + str((np.array(precision_0)).mean()))
#print('Std:' + str((np.array(precision_0)).std()))
print('Log Loss')
print('Mean:' + str((np.array(log_losses)).mean()))
print('Std:' + str((np.array(log_losses)).std()))



Log Loss
Mean:1.01420618673
Std:0.629717813713


In [44]:
from sklearn.svm import SVC

# RandomForest Classifier Proba 10 arvores
accuracies = []
precision_1 = []
precision_0 = []
log_losses = []

for i in range(10):
    skf = StratifiedKFold(n_splits = 10, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        classifier = SVC(probability=True, kernel='sigmoid')
        classifier.fit(X_train, y_train)
        y_prob = classifier.predict(X_test)
        #y_prob = classifier.predict_proba(X_test)
        #accuracies.append(accuracy_score(y_test, y_pred))
        #precision_1.append(precision_score(y_test, y_pred, pos_label=1))
        #precision_0.append(precision_score(y_test, y_pred, pos_label=0))
        log_losses.append(log_loss(y_test, y_prob))

#print('Accuracy')
#print('Mean:' + str((np.array(accuracies)).mean()))
#print('Std:' + str((np.array(accuracies)).std()))
#print('Precision 1')
#print('Mean:' + str((np.array(precision_1)).mean()))
#print('Std:' + str((np.array(precision_1)).std()))
#print('Precision 0')
#print('Mean:' + str((np.array(precision_0)).mean()))
#print('Std:' + str((np.array(precision_0)).std()))
print('Log Loss')
print('Mean:' + str((np.array(log_losses)).mean()))
print('Std:' + str((np.array(log_losses)).std()))

Log Loss
Mean:11.6945909633
Std:1.64776885385


In [53]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression
accuracies = []
log_losses = []
log_losses_in = []
precision_1 = []
precision_0 = []

for i in range(10):
    skf = StratifiedKFold(n_splits = 10, shuffle=True)
    for train, test in skf.split(X, y):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        classifier = LogisticRegression(penalty="l2")
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        y_prob = classifier.predict_proba(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        precision_1.append(precision_score(y_test, y_pred, pos_label=1))
        precision_0.append(precision_score(y_test, y_pred, pos_label=0))
        log_losses.append(log_loss(y_test, y_prob))
        log_losses_in.append(log_loss(y_train, classifier.predict_proba(X_train)))
        #print('\nLogisticRegression coefficients:')
        #print(classifier.coef_.tolist()[0])

print('Accuracy')
print('Mean:' + str((np.array(accuracies)).mean()))
print('Std:' + str((np.array(accuracies)).std()))
print('Precision 1')
print('Mean:' + str((np.array(precision_1)).mean()))
print('Std:' + str((np.array(precision_1)).std()))
print('Precision 0')
print('Mean:' + str((np.array(precision_0)).mean()))
print('Std:' + str((np.array(precision_0)).std()))
print('Log Loss Eout')
print('Mean:' + str((np.array(log_losses)).mean()))
print('Std:' + str((np.array(log_losses)).std()))
print('Log Loss Ein')
print('Mean:' + str((np.array(log_losses_in)).mean()))
print('Std:' + str((np.array(log_losses_in)).std()))

Accuracy
Mean:0.761097178683
Std:0.0374549938713
Precision 1
Mean:0.692947106674
Std:0.0817494116209
Precision 0
Mean:0.785610421232
Std:0.0299304075909
Log Loss Eout
Mean:0.4997627706
Std:0.0464986130751
Log Loss Ein
Mean:0.495281806528
Std:0.00518479112053
