In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [16]:
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB

In [95]:
def load_train_data(path, seed=None):
    """Read the training CSV and return scaled features and encoded labels.

    The first column of the file is skipped (presumably a row id -- confirm
    against the data), the last column is used as the class label, and every
    column in between is treated as a numeric feature.

    Parameters
    ----------
    path : str
        Location of the training CSV file.
    seed : int or None, optional
        Seed for the row shuffle. With the default ``None`` the global NumPy
        random state is used, exactly as before, so the row order differs
        between runs. Pass an integer to get a reproducible order.

    Returns
    -------
    X : ndarray of float32
        Features after ``StandardScaler.fit_transform``.
    y : ndarray of int32
        Labels encoded by ``LabelEncoder``.
    encoder : LabelEncoder
        Fitted encoder; ``encoder.classes_`` holds the original label values.
    scaler : StandardScaler
        Fitted scaler, to be reused on the test features.
    """
    df = pd.read_csv(path)
    X = df.values.copy()

    # Shuffle whole rows in place, before features and labels are split, so
    # each label stays attached to its own feature row.
    if seed is None:
        np.random.shuffle(X)
    else:
        np.random.RandomState(seed).shuffle(X)

    X, labels = X[:, 1:-1].astype(np.float32), X[:, -1]

    encoder = LabelEncoder()
    y = encoder.fit_transform(labels).astype(np.int32)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    print(y)
    return X, y, encoder, scaler

In [96]:
def load_test_data(path, scaler):
    """Read the test CSV and return scaled features plus the row ids.

    The first column of the file is taken as the id and every remaining
    column as a numeric feature.

    Parameters
    ----------
    path : str
        Location of the test CSV file.
    scaler : object
        Anything with a ``transform(X)`` method, e.g. the scaler returned by
        ``load_train_data``.

    Returns
    -------
    X : ndarray
        Result of ``scaler.transform`` applied to the float32 feature matrix.
    ids : ndarray of str
        First column of the file, converted to strings.
    """
    df = pd.read_csv(path)
    # Pull ids and features from their own columns. Slicing both out of
    # ``df.values`` first upcasts the whole frame to one common dtype, so a
    # single float feature column would turn integer ids into '1.0', '2.0', ...
    ids = df.iloc[:, 0].values.astype(str)
    X = df.iloc[:, 1:].values.astype(np.float32)
    X = scaler.transform(X)
    print(X," ",ids)
    return X, ids

In [97]:


def make_submission(clf, X_test, ids, encoder, name='bayes_submission.csv'):
    """Write per-class probabilities for ``X_test`` to a CSV file.

    The file starts with a header line ``id,<class 1>,<class 2>,...`` built
    from ``encoder.classes_``, followed by one line per sample holding its id
    and the probabilities returned by ``clf.predict_proba``.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict_proba(X)``.
    X_test : array-like
        Feature matrix handed to ``clf.predict_proba``.
    ids : iterable
        One identifier per row of ``X_test``, in the same order.
    encoder : object
        Object whose ``classes_`` attribute lists the class labels in the
        order used for the header.
    name : str, optional
        Output file path; an existing file is overwritten.
    """
    y_prob = clf.predict_proba(X_test)
    with open(name, 'w') as f:
        f.write('id,')
        f.write(','.join(str(label) for label in encoder.classes_))
        f.write('\n')
        for sample_id, probs in zip(ids, y_prob):
            # Build a real list: on Python 3 ``map`` returns an iterator and
            # ``[x] + map(...)`` raises TypeError. str() on the id also lets
            # non-string ids through ``join``.
            fields = [str(sample_id)] + [str(p) for p in probs.tolist()]
            f.write(','.join(fields))
            f.write('\n')
    print("Wrote submission to file {}.".format(name))



In [98]:
# Load and preprocess the training set. The fitted encoder and scaler are kept
# because later cells pass them to load_test_data and make_submission.
X, y, encoder, scaler = load_train_data('train.csv')

[1 2 4 ..., 8 5 0]


In [93]:
# Load the test set and scale it with the scaler fitted on the training data.
X_test, ids = load_test_data('test.csv', scaler)

(array([[-0.25352421, -0.21006599, -0.30717084, ..., -0.12956464,
        -0.38694605, -0.10497177],
       [ 1.05776191,  1.38699305,  4.46326447, ..., -0.12956464,
         1.6489749 , -0.10497177],
       [-0.25352421,  0.58846354,  3.78177357, ..., -0.12956464,
        -0.38694605,  0.72724378],
       ..., 
       [-0.25352421,  0.58846354, -0.30717084, ...,  0.35946801,
        -0.38694605, -0.10497177],
       [-0.25352421, -0.21006599, -0.30717084, ..., -0.12956464,
         0.63101441, -0.10497177],
       [-0.25352421, -0.21006599, -0.30717084, ..., -0.12956464,
        -0.38694605, -0.10497177]], dtype=float32), ' ', array(['1', '2', '3', ..., '144366', '144367', '144368'], 
      dtype='|S24'))


In [94]:
# Problem dimensions: number of distinct labels seen by the encoder and number
# of feature columns in the training matrix.
num_classes, num_features = len(encoder.classes_), X.shape[1]
print(num_classes," ",num_features)

(9, ' ', 93)


In [23]:

# Gaussian naive Bayes classifier, constructed with no arguments.
gnb = GaussianNB()

In [27]:
# Fit on the scaled training features and encoded labels; the result is the
# object make_submission later calls predict_proba on.
bayes = gnb.fit(X,y)

In [28]:
# Write test-set class probabilities to the default output file,
# bayes_submission.csv.
make_submission(bayes, X_test, ids, encoder)

Wrote submission to file bayes_submission.csv.
