In [1]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import keras
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
def preprocess_data(X, scaler=None):
    '''Standardize features by removing the mean and scaling to unit variance.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to the scaler's ``fit`` / ``transform``.
    scaler : object, optional
        An already-fitted scaler exposing ``transform``. When ``None``, a new
        ``StandardScaler`` is created and fitted on ``X``.

    Returns
    -------
    tuple
        ``(scaler.transform(X), scaler)`` so the same scaler can be reused
        on other data splits.
    '''
    # Compare against None explicitly: a truthiness test would throw away a
    # caller-supplied scaler whose type happens to evaluate as falsy
    # (for instance one that defines __len__).
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(X)
    return scaler.transform(X), scaler

def preprocess_labels(y, encoder=None, categorical=True):
    '''Encode labels with value between 0 and n_classes-1.

    Parameters
    ----------
    y : array-like
        Raw class labels.
    encoder : object, optional
        An already-fitted label encoder exposing ``transform`` (and
        ``classes_`` when ``categorical`` is true). When ``None``, a new
        ``LabelEncoder`` is created and fitted on ``y``.
    categorical : bool, default True
        If true, the integer labels are expanded to a one-hot matrix with
        ``np_utils.to_categorical``.

    Returns
    -------
    tuple
        ``(labels, encoder)`` where ``labels`` is either the int32 label
        vector or its one-hot expansion.
    '''
    # Compare against None explicitly so that a caller-supplied encoder is
    # never discarded just because its type evaluates as falsy.
    if encoder is None:
        encoder = LabelEncoder()
        encoder.fit(y)
    labels = encoder.transform(y).astype(np.int32)
    if categorical:
        # Take the one-hot width from the encoder rather than letting
        # to_categorical infer it from max(labels) + 1; otherwise a reused
        # encoder applied to a subset that lacks the highest class would
        # yield a narrower matrix. The count is passed positionally.
        labels = np_utils.to_categorical(labels, len(encoder.classes_))
    return labels, encoder

def load_data(path, seed=None):
    '''Load a CSV file and return scaled train / validation / test splits.

    The first column is skipped (presumably a row id -- confirm against the
    CSV), the last column is taken as the class label, and the columns in
    between are cast to float32 features. Labels are encoded with
    ``preprocess_labels`` using its defaults, and features are standardized
    with a scaler fitted on the training split only.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed to ``pandas.read_csv``.
    seed : int, optional
        Seed for the row shuffle and both splits. With the default ``None``
        the global NumPy random state is used, so results vary between runs.

    Returns
    -------
    tuple
        ``((x_train, y_train), (x_val, y_val), (x_test, y_test))``. 30% of
        the rows are held out and then halved into test and validation.
    '''
    df = pd.read_csv(path)
    data = df.values

    # A dedicated generator makes the shuffle and the splits repeatable when
    # a seed is given; without one, fall back to the global NumPy state.
    rng = None if seed is None else np.random.RandomState(seed)
    if rng is None:
        np.random.shuffle(data)
    else:
        rng.shuffle(data)

    X = data[:, 1:-1].astype(np.float32)
    y = data[:, -1]
    y, _ = preprocess_labels(y)

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=rng)
    x_test, x_val, y_test, y_val = train_test_split(
        x_test, y_test, test_size=0.5, random_state=rng)

    # Fit the scaler on the training data only, then reuse it for the other
    # splits so no statistics from held-out data leak into the
    # normalization.
    x_train, scaler = preprocess_data(x_train)
    x_test, _ = preprocess_data(x_test, scaler)
    x_val, _ = preprocess_data(x_val, scaler)
    return ((x_train, y_train),
            (x_val, y_val),
            (x_test, y_test))

In [3]:
# Location of the training CSV, relative to the notebook's working directory.
path = "data/train.csv"

# load_data returns three (features, labels) pairs: train, validation, test.
(
    (x_train, y_train),
    (x_val, y_val),
    (x_test, y_test),
) = load_data(path=path)

In [4]:
# Report the second-axis sizes of the training features and labels.
n_features = x_train.shape[1]
n_classes = y_train.shape[1]
print("Number of dimensions =", n_features)
print("Number of classes =", n_classes)

Number of dimensions = 93
Number of classes = 9
