In [7]:
from scipy import sparse
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [8]:
def load_data(dataX_path, dataY_path, data_root='data/processed', dense=True):
    """Load a feature matrix and its target vector from ``data_root``.

    Parameters
    ----------
    dataX_path : str
        File name of a SciPy sparse matrix stored in ``.npz`` format
        (read with ``scipy.sparse.load_npz``).
    dataY_path : str
        File name of a header-less CSV file holding the targets.
    data_root : str, optional
        Directory that both file names are joined onto.
    dense : bool, optional
        When True (the default, matching the previous behaviour) the features
        are returned as a dense ``numpy.ndarray``. Pass False to get the SciPy
        sparse matrix back unchanged and skip the dense copy.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the feature matrix (dense or sparse, see
        ``dense``) and ``y`` is the flattened 1-D array of target values.
    """
    dataX = sparse.load_npz(os.path.join(data_root, dataX_path))
    dataY = pd.read_csv(os.path.join(data_root, dataY_path), header=None)

    print('Input Matrix Shape {0} x {1}, Target Shape {2} x {3}.'.format(dataX.shape[0], dataX.shape[1],
                                                                         dataY.shape[0], dataY.shape[1]))
    # Densifying is opt-out: large matrices can stay sparse when the consumer
    # accepts sparse input.
    features = dataX.toarray() if dense else dataX
    return features, dataY.values.ravel()

In [9]:
def start_lr():
    """Compare LogisticRegression solvers and report the best one's test accuracy.

    Loads the train, validation and test splits through ``load_data``, fits one
    ``LogisticRegression(random_state=0)`` per solver on the training split and
    prints each solver's validation accuracy. The solver with the highest
    validation accuracy (the first one wins ties) is then scored on the test
    split and that accuracy is printed. Nothing is returned.
    """
    trainX, trainY = load_data('trainsetInputVector_sparse.npz', 'trainsetResult.csv')
    validX, validY = load_data('validsetInputVector_sparse.npz', 'validsetResult.csv')
    testX, testY = load_data('testsetInputVector_sparse.npz', 'testsetResult.csv')

    solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    best_score = 0
    best_solver = 'liblinear'
    best_model = None

    for sol in solvers:
        lr = LogisticRegression(random_state=0, solver=sol)
        lr.fit(trainX, trainY)
        score = lr.score(validX, validY)
        if score > best_score:
            best_score = score
            best_solver = sol
            # Keep the fitted estimator so the winner does not have to be
            # trained a second time after the loop.
            best_model = lr
        print("Validation accuracy using {0}: {1}".format(sol, score))

    if best_model is None:
        # No solver scored above 0: fall back to the default solver, which is
        # what the refit after the loop used to do in this situation.
        best_model = LogisticRegression(random_state=0, solver=best_solver)
        best_model.fit(trainX, trainY)
    test_score = best_model.score(testX, testY)

    print("Test set accuracy using {0}: {1}".format(best_solver, test_score))
    

In [None]:
# Run the logistic-regression solver comparison: loads the train/validation/test
# splits and prints the validation accuracy per solver plus the final test accuracy.
start_lr()

Input Matrix Shape 2028742 x 226, Target Shape 2028742 x 1.
Input Matrix Shape 1521558 x 226, Target Shape 1521558 x 1.


In [None]:
from scipy import sparse
import os
import pandas as pd

In [None]:
def start_naive_bayes():
    """Fit three naive Bayes classifiers and print their accuracies.

    Loads the train, validation and test splits through ``load_data``, then for
    each of ``GaussianNB``, ``MultinomialNB`` and ``BernoulliNB`` (in that
    order, with default parameters) fits the model on the training split and
    prints its validation accuracy followed by its test accuracy. Nothing is
    returned.
    """
    # Imported up front so a missing dependency fails before the data is loaded.
    from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

    trainX, trainY = load_data('trainsetInputVector_sparse.npz', 'trainsetResult.csv')
    validX, validY = load_data('validsetInputVector_sparse.npz', 'validsetResult.csv')
    testX, testY = load_data('testsetInputVector_sparse.npz', 'testsetResult.csv')

    # (label used in the printed messages, estimator class), in reporting order.
    classifiers = (
        ('Gaussian', GaussianNB),
        ('Multinomial', MultinomialNB),
        ('Bernoulli', BernoulliNB),
    )

    for label, estimator_cls in classifiers:
        model = estimator_cls()
        model.fit(trainX, trainY)

        valid_score = model.score(validX, validY)
        print("Validation accuracy using %s NB: %f" % (label, valid_score))

        test_score = model.score(testX, testY)
        print("Test set accuracy using %s NB: %f" % (label, test_score))
