In [33]:
import numpy as np
from matplotlib import pyplot as plt
from numpy import genfromtxt
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LassoLars
from sklearn.linear_model import LassoLarsCV
from sklearn.linear_model import Lars
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectPercentile
from sklearn.neural_network import MLPClassifier

from sklearn.feature_selection import SelectKBest

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn.cluster import KMeans

from brew.base import Ensemble
from brew.base import EnsembleClassifier
from brew.combination.combiner import Combiner


# from sklearn.model_selection import KFold

from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error


from sklearn import svm, datasets


%matplotlib inline

In [4]:
def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=' ')

In [5]:
training_data = np.array(load_data("training_data.txt", skiprows = 1))

y_train = training_data[:, 0]
X_train = training_data[:, 1:]

In [31]:
X_tr, X_tst, y_tr, y_tst = train_test_split(X_train, y_train, random_state=0, test_size=.25)

fold = KFold(len(y_tr), n_folds=6, shuffle=True, random_state=777)
log_reg_CV = LogisticRegressionCV(
            Cs=list(np.power(10.0, np.arange(-10, 10)))
            ,penalty='l2'
            ,scoring='f1'
            ,cv=fold
            ,random_state=777
            ,max_iter=15000
            ,fit_intercept=True
            ,solver='newton-cg'
            ,tol=10
        )
    

select = SelectPercentile(percentile=80)
select.fit(X_tr, y_tr)
X_train_selected = select.transform(X_tr)
log_reg_CV.fit(X_train_selected, y_tr)


y_pred = np.sign(log_reg_CV.predict(X_train_selected))
binarytrain_error = (y_pred!=y_tr).sum()
binary_train_error = binarytrain_error/15000

X_test_selected = select.transform(X_tst)

y_test_prediction = np.sign(log_reg_CV.predict(X_test_selected))
binarytest_error = (y_test_prediction!=y_tst).sum()

binarytest_error = binarytest_error/5000

In [32]:
print(binarytrain_error, binarytest_error)

1892 0.1504


In [34]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

clf.fit(X_train_selected, y_tr)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [35]:
y_pred = np.sign(clf.predict(X_train_selected))
binarytrain_error = (y_pred!=y_tr).sum()
binarytrain_error/15000

X_test_selected = select.transform(X_tst)

y_test_prediction = np.sign(clf.predict(X_test_selected))
binarytest_error = (y_test_prediction!=y_tst).sum()



binarytest_error/5000

0.185

In [36]:
print(binarytrain_error/15000)

0.04633333333333333


In [38]:
print(binarytest_error/5000)

0.185
