In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier  # bagging aka bootstrapping
from sklearn.tree import DecisionTreeClassifier  # bagging
from util import getData_tmp
from sklearn.metrics import accuracy_score  # finding scores from different classifiers
from sklearn.utils import shuffle
from sklearn.decomposition import PCA


def main():
    X, Y = getData_tmp()  # X is image, Y is labels
    X, Y = shuffle(X, Y)
    N, D = X.shape
    Ntrain = int(N * 0.8)
    X_Train, Y_Train = X[:Ntrain], Y[:Ntrain]  # sets training set
    X_Test, Y_Test = X[Ntrain:], Y[Ntrain:]  # test set

    # feature reduction PCA w/o reducing dimensionality that computes min number of dimensions req to preserve 95% of training set variance
    pca = PCA(n_components=0.95)
    pca.fit(X_Train)
    xtrain_pca = pca.transform(X_Train)  # changes size of xtrain
    xtest_pca = pca.transform(X_Test)


    log_clf = LogisticRegression()
    rnd_clf = RandomForestClassifier(n_jobs=-1)
    svm_clf = SVC(kernel='rbf',gamma='auto')
    bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, max_samples=800, bootstrap=True,
                                n_jobs=-1)  # boot, n_estimators: number of trees or rounds

    voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf',   rnd_clf), ('svc', svm_clf), ('bag', bag_clf)],
                                  voting='hard')  # can change to soft voting to get higher performance because gives weight to highly confident nvotes
    voting_clf.fit(xtrain_pca, Y_Train)

    # examining each classifier's accuracy on test set
    for clf in (log_clf, rnd_clf, svm_clf, voting_clf, bag_clf):
        clf.fit(X, Y)  # represents x train and y train
        y_pred = clf.predict(X_Test)
        print(clf.__class__.__name__, accuracy_score(Y_Test, y_pred))  # supposed to be y_test, y_pred
        # whatever has the highest value is the best




if __name__ == '__main__':
    main()


LogisticRegression 0.879120879121
RandomForestClassifier 0.995604395604
SVC 0.873626373626
VotingClassifier 0.921978021978
BaggingClassifier 0.934065934066
