Built neural nets from scratch to implement output encoding using a vector of categorical-variable combinations. The method is described in R. K. Brouwer, "A feed-forward network for input that is both categorical and quantitative", Neural Networks 15 (2002) 881–890 (www.elsevier.com/locate/neunet). It is compared against 1-hot encoding and other simple ways of using categorical inputs.

In [None]:
"""Sharath - Machine Learning - Re-engineering neural nets"""
import theano as th
from theano import tensor as T
import pandas as pd
import numpy as np

In [None]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import auc, accuracy_score
from sklearn.preprocessing import LabelBinarizer
from itertools import permutations, combinations_with_replacement, product

Regression on Car dataset:

Comparing different methods developed for categorical variable usage in neural nets.
The following code implements the separation method described in the paper below, which is designed to use categorical input features effectively in feed-forward neural nets, and compares it with existing techniques.
Original paper "A feed-forward network for input that is both categorical and quantitative Roelof K. Brouwer"
Neural Networks 15 (2002) 881–890
www.elsevier.com/locate/neunet

I have used car dataset from https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data to demonstrate the method

Features in car dataset:
buying       v-high, high, med, low
maint        v-high, high, med, low
doors        2, 3, 4, 5-more
persons      2, 4, more
lug_boot     small, med, big
safety       low, med, high

Method1: Convert categories to integers

Method2: 1-hot encoding or dummy variables

Method3: Separation method - Encode the output using the combination of categorical variables

In [None]:
# Load the car-evaluation table. The file is read with header=None, so the
# seven column names (six features plus the target) are assigned afterwards.
car = pd.read_csv("car.csv", header=None, index_col=False)
car.columns = [
    "buying", "maint", "doors", "persons", "lug_boot", "safety",
    "target",
]
car.describe()

The variables are read in as strings. We first convert every column to integer codes for easy handling.

In [None]:
for col in car.columns:
    car[col], indexer = pd.factorize(car[col])
    car[col] = car[col]
print car.dtypes

In [None]:
#Packages for neural net
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.optimizers import RMSprop, Adam
from keras.utils import np_utils
from keras.callbacks import EarlyStopping

In [337]:
#data prep
X = car.ix[:,:6]
y = car.ix[:,6]
Xtr, Xts, ytr, yts = train_test_split(X,y, test_size = 0.3)
print Xtr.shape, ytr.shape
print y.astype(object).describe()

(1209, 6) (1209,)
count     1728
unique       4
top          0
freq      1210
Name: target, dtype: int64


In [None]:
# #Binarize the target variable for later use
# lb = LabelBinarizer()
# lb.fit(y)

In [338]:
# nb_classes = 4
# ytr = np_utils.to_categorical(ytr,nb_classes)
# yts = np_utils.to_categorical(yts, nb_classes)
ytr = np.array(ytr, dtype=np.int32)
yts = np.array(yts, dtype=np.int32)

Xtr = np.array(Xtr, dtype=np.float32)
Xts = np.array(Xts, dtype=np.float32)
print Xtr.shape, ytr.shape

(1209, 6) (1209,)


In [339]:
def my_rmse(y, y_hat):
    """Return the root of the mean squared difference over the last axis.

    Both arguments are combined with Theano tensor ops, so the result is a
    symbolic expression with the last axis reduced away.

    NOTE(review): if the last axis has length 1 this reduces to the absolute
    error per sample rather than an RMSE over the batch -- confirm intent.
    """
    squared_error = T.square(y - y_hat)
    mean_squared = T.mean(squared_error, axis=-1)
    return T.sqrt(mean_squared)
# Network Architecture
# Layer stack: 6 inputs -> Dense(10) + ReLU -> Dropout(0.1)
# -> Dense(20) + ReLU -> Dense(1) with no explicit activation.
model = Sequential()
model.add(Dense(10, input_dim = 6))
model.add(Activation("relu"))
model.add(Dropout(0.1))
model.add(Dense(20))
model.add(Activation("relu"))

# Single output unit: the integer-coded target is treated as a regression value.
model.add(Dense(1))
adam = Adam(lr=1e-2)
# Optimise the custom my_rmse loss with Adam.
model.compile(optimizer = adam, loss = my_rmse)
#model.compile(loss = "categorical_crossentropy", optimizer = rms)

In [340]:
#training the network
batch_size = 10
nb_epoch = 100
# Early stopping watches 'val_loss' with a patience of 5.
# NOTE(review): validation_data below is the test split (Xts, yts), which is
# also the data the MSE is reported on later, so the stopping point is chosen
# on the evaluation set.
earlyStopping = EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')
# NOTE(review): show_accuracy is presumably a classification metric and may
# not be meaningful for this single-output regression -- confirm.
fit = model.fit(Xtr, ytr,
         nb_epoch = nb_epoch,
         batch_size = batch_size,
         show_accuracy = True,
         verbose = 0, callbacks = [earlyStopping],
         validation_data = (Xts, yts)) #achieved accuracy on test = 0.94

In [341]:
#prediction and validation
# Predictions of the method-1 model on the test features.
yts_pred = model.predict(Xts, batch_size=batch_size, verbose=1)
#print ("test accuracy is: %.3f" %(np.sqrt(np.mean(np.square(yts-t), axis=-1))))



In [342]:
# Test-set mean squared error for method 1.
# The predictions are flattened so they line up element-wise with the 1-D
# target vector; this assumes one prediction per test row. Without the
# flattening, a column of predictions would broadcast against yts.
mse1 = np.mean(np.square(np.ravel(yts_pred) - yts))

In [344]:
# Report the test-set MSE of method 1 (integer-coded categorical inputs).
print "mse in method1:", mse1

mse in method1: 0.119865217343


Method 2: Use binarized form for categorical variables as inputs

In [345]:
car_num = car.ix[:,["doors","persons"]]
car_cat = car[["buying","maint","lug_boot","safety"]]

#Create dummy variable from categorical features
car_cat = pd.get_dummies(car_cat.astype(str))
#add back to main df
car_with_dummies = pd.concat([car_num, car_cat], axis=1)
print car_with_dummies.shape

(1728, 16)


In [346]:
# 70/30 split of the dummy-encoded features against the same target y.
# NOTE(review): no random_state is passed, so this is presumably a different
# random split from the one used for method 1 -- confirm before comparing MSEs.
Xtr, Xts, ytr, yts = train_test_split(car_with_dummies,y, test_size = 0.3)
print Xtr.shape, ytr.shape

(1209, 16) (1209,)


In [347]:
# Prepare data for a Network
ytr = np.array(ytr, dtype=np.int32)
yts = np.array(yts, dtype=np.int32)

Xtr = np.array(Xtr, dtype=np.float32)
Xts = np.array(Xts, dtype=np.float32)
print Xtr.shape, ytr.shape

(1209, 16) (1209,)


In [348]:
def my_rmse(y, y_hat):
    """Return the root of the mean squared difference over the last axis.

    Both arguments are combined with Theano tensor ops, so the result is a
    symbolic expression with the last axis reduced away.

    NOTE(review): if the last axis has length 1 this reduces to the absolute
    error per sample rather than an RMSE over the batch -- confirm intent.
    """
    squared_error = T.square(y - y_hat)
    mean_squared = T.mean(squared_error, axis=-1)
    return T.sqrt(mean_squared)
# Network Architecture
# Layer stack: 16 inputs -> Dense(10) + ReLU -> Dropout(0.1)
# -> Dense(20) + ReLU -> Dense(1) with no explicit activation.
model = Sequential()
model.add(Dense(10, input_dim = 16))
model.add(Activation("relu"))
model.add(Dropout(0.1))
model.add(Dense(20))
model.add(Activation("relu"))

# Single output unit: the integer-coded target is treated as a regression value.
model.add(Dense(1))
adam = Adam(lr=1e-2)
# Optimise the custom my_rmse loss with Adam.
model.compile(optimizer = adam, loss = my_rmse)
#model.compile(loss = "categorical_crossentropy", optimizer = rms)

In [349]:
#training the network
batch_size = 10
nb_epoch = 100
# Early stopping watches 'val_loss' with a patience of 5.
# NOTE(review): validation_data below is the test split (Xts, yts), which is
# also the data the MSE is reported on later, so the stopping point is chosen
# on the evaluation set.
earlyStopping = EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')
# NOTE(review): show_accuracy is presumably a classification metric and may
# not be meaningful for this single-output regression -- confirm.
fit = model.fit(Xtr, ytr,
         nb_epoch = nb_epoch,
         batch_size = batch_size,
         show_accuracy = True,
         verbose = 0, callbacks = [earlyStopping],
         validation_data = (Xts, yts)) #achieved accuracy on test = 0.965

In [350]:
#prediction and validation
# Predictions of the method-2 model on the dummy-encoded test features.
yts_pred2 = model.predict(Xts, batch_size=batch_size, verbose=1)



In [351]:
# Test-set mean squared error for method 2.
# The predictions are flattened so they line up element-wise with the 1-D
# target vector; this assumes one prediction per test row. Without the
# flattening, a column of predictions would broadcast against yts.
mse2 = np.mean(np.square(np.ravel(yts_pred2) - yts))

In [352]:
# Report the test-set MSE of method 2 (dummy-encoded categorical inputs).
print "mse in method 2 is:", mse2

mse in method 2 is: 0.0848446252746


Method 3: Separation Method As described in the paper.
Let us look at the levels of the categorical variables. In the separation method, the output layer is encoded with a vector that has one element per combination of levels, so here it needs 4 × 4 × 3 × 3 = 144 elements.
s vector = 144 length

number of nodes in output layer = 144

In [353]:
# Create numerical and categorical feature sets
# Plain column selection replaces the deprecated .ix indexer.
car_num = car[["doors","persons"]]
car_cat = car[["buying","maint","lug_boot","safety"]]

Creating svector: 
1. There can be 144 unique combinations from categorical variables
2. We need to encode each row in categorical data with one of the 144 unique combinations
3. Create a 144 length binarized vector from each encoded row
4. Need to come up with much simpler method later.

In [354]:
#Creating s-vector as described in the paper
levels = []
for col in car_cat.columns:
    levels.append(list(set(car_cat[col])))
levels
svector = list(product(levels[0],levels[1], levels[2], levels[3]))
print "Unique combinations of categorical variables are %d" %len(svector)

#converting it into a dictionary
svec = {}
for idx, val in enumerate(svector):
    svec[val] = idx

#encoding each row in categorical data with one of the 144 unique comb
combined = []
for idx, row in car_cat.iterrows():
    combined.append(svec[tuple([row[0], row[1], row[2], row[3]])])
print "%d rows encoded" %len(combined)

Unique combinations of categorical variables are 144
1728 rows encoded


In [355]:
# We need to binarize the svector
lb_svec = LabelBinarizer()
lb_svec.fit(svec.values())
svec = lb_svec.fit_transform(combined)

print svec.shape
print "for every row, categorical value encoder of %d length created" %(svec.shape[1])

(1728, 144)
for every row, categorical value encoder of 144 length created


Preparing the data for method 3

In [356]:
X = car_num
X = np.array(X, dtype=np.float32)
y = car.ix[:,"target"]
y = np.array(y)
print X.shape, y.shape

(1728, 2) (1728,)


In [357]:
# Prepare data for method 3
sample = np.random.permutation(X.shape[0])
t = 0.7*len(sample)
Xtr, Xts = X[sample[:t],], X[sample[t:],]
ytr, yts = y[sample[:t]], y[sample[t:]]
print "train and test splits: ", Xtr.shape, Xts.shape, ytr.shape, yts.shape
#Categorical encoder
s_tr, s_ts = svec[sample[:t,]], svec[sample[t:,]]
print "svector for train and test:",s_tr.shape, s_ts.shape

train and test splits:  (1209, 2) (519, 2) (1209,) (519,)
svector for train and test: (1209, 144) (519, 144)


IMPORTANT: Run the cells further below that define the NeuralNetwork class and the activation functions before running the training cells that follow.

Network Architecture: 
Input layer with 2 neurons
2 hidden layers with 10 and 20 neurons.
output layer with 144 neurons

Trained for 200 epochs with a learning rate of 0.1

In [360]:
# One tuple per layer: (number of neurons, activation, activation derivative).
# The input layer's activation entries (0, 0) are placeholders: NeuralNetwork
# only applies activations and derivatives from layer 1 onwards.
param=((2,0,0),(10, logistic, logistic_prime),(20, logistic, logistic_prime),(144,identity, identity_prime))
#Set learning rate.
# NOTE(review): `rates` is not referenced by any visible code; the learning
# rate actually used is the one passed to net.train().
rates=[0.05]
net = NeuralNetwork(Xtr,ytr,s_tr,param)

In [369]:
# 200 passes over the training data with a learning rate of 0.1.
net.train(200,0.1)

In [370]:
# net.predict returns one row per sample: the network output multiplied
# element-wise by that sample's s-vector row. Summing across the columns
# collapses each row to one number (with a one-hot s-vector row, presumably
# the single active output).
pred = net.predict(Xts, s_ts).sum(axis=1)

In [371]:
# Test-set mean squared error for method 3, computed on whole arrays.
squared_errors = np.square(pred - yts)
mse3 = np.mean(squared_errors)

In [372]:
# Report the test-set MSE of method 3 (separation method).
print "mse in method 3 is:", mse3

mse in method 3 is: 0.0768177149449


In [358]:
#Most of this I learned from https://triangleinequality.wordpress.com/2014/03/31/neural-networks-part-2/.
#But built the code as per Separation methodology mentioned in www.elsevier.com/locate/neunet
class NeuralNetwork(object):
    """Feed-forward network trained per sample with an s-vector-masked output.

    The raw network output is multiplied element-wise by a per-sample vector
    ``s`` (one entry per output neuron). The training error is the difference
    between that masked output and ``s * y``, so only the output neurons
    selected by ``s`` contribute to the weight updates.

    ``parameters`` is a sequence with one ``(size, activation,
    activation_derivative)`` tuple per layer, input layer first. The
    activation entries of the input layer are never called. Derivatives are
    always evaluated on a layer's pre-activation input and may return either
    a scalar or an array shaped like that input.
    """

    def __init__(self, X, y, svec, parameters):
        #Input data, one sample per row.
        self.X = X
        #Output data, one target value per sample.
        self.y = y
        #Categorical vector for output encoding, one row per sample.
        self.svec = svec

        #Expect parameters to be a tuple of (size, f, fprime) tuples.
        self.n_layers = len(parameters)
        #Counts number of neurons without bias neurons in each layer.
        self.sizes = [layer[0] for layer in parameters]
        #Activation functions for each layer.
        self.fs = [layer[1] for layer in parameters]
        #Derivatives of activation functions for each layer.
        self.fprimes = [layer[2] for layer in parameters]
        #Default step size (same default as train()) so that update_weights()
        #can be called before train() has set a learning rate.
        self.learning_rate = 1
        self.build_network()

    def build_network(self):
        """Allocate weights, biases and per-layer work vectors."""
        #List of weight matrices taking the output of one layer to the input of the next.
        self.weights = []
        #Bias vector for each layer.
        self.biases = []
        #Input vector for each layer.
        self.inputs = []
        #Output vector for each layer.
        self.outputs = []
        #Vector of errors at each layer.
        self.errors = []
        #Weights and biases are drawn from N(0, 1); the other vectors start as zeros.
        for layer in range(self.n_layers - 1):
            n = self.sizes[layer]
            m = self.sizes[layer + 1]
            self.weights.append(np.random.normal(0, 1, (m, n)))
            self.biases.append(np.random.normal(0, 1, (m, 1)))
            self.inputs.append(np.zeros((n, 1)))
            self.outputs.append(np.zeros((n, 1)))
            self.errors.append(np.zeros((n, 1)))
        #There are only n-1 weight matrices, so we do the last case separately.
        n = self.sizes[-1]
        self.inputs.append(np.zeros((n, 1)))
        self.outputs.append(np.zeros((n, 1)))
        self.errors.append(np.zeros((n, 1)))

    def feedforward(self, x):
        """Propagate one sample through the network.

        ``x`` is a length-k vector (or a (k, 1) column). It is reshaped into
        a column without modifying the caller's array. Returns the raw,
        unmasked output of the last layer as a column vector.
        """
        column = np.reshape(x, (len(x), 1))
        self.inputs[0] = column
        self.outputs[0] = column
        for i in range(1, self.n_layers):
            self.inputs[i] = self.weights[i-1].dot(self.outputs[i-1]) + self.biases[i-1]
            self.outputs[i] = self.fs[i](self.inputs[i])
        return self.outputs[-1]

    def update_weights(self, x, y, s):
        """Take one gradient step for input ``x``, scalar target ``y`` and mask ``s``."""
        output = self.feedforward(x)

        #Encode the output with the categorical vector. Everything is kept as
        #a column so an array-valued output derivative multiplies element-wise.
        mask = np.reshape(s, (-1, 1))
        residual = mask * output - mask * y
        #The derivative is evaluated on the layer input, as for hidden layers.
        self.errors[-1] = self.fprimes[-1](self.inputs[-1]) * residual

        #Walk back through the hidden layers. errors[i] is computed from
        #weights[i] before that matrix is updated.
        for i in range(self.n_layers - 2, 0, -1):
            self.errors[i] = self.fprimes[i](self.inputs[i]) * self.weights[i].T.dot(self.errors[i+1])
            self.weights[i] = self.weights[i] - self.learning_rate * np.outer(self.errors[i+1], self.outputs[i])
            self.biases[i] = self.biases[i] - self.learning_rate * self.errors[i+1]
        self.weights[0] = self.weights[0] - self.learning_rate * np.outer(self.errors[1], self.outputs[0])
        self.biases[0] = self.biases[0] - self.learning_rate * self.errors[1]

    def train(self, n_iter, learning_rate=1):
        """Run ``n_iter`` passes over the stored data, one update per sample.

        The sample order is reshuffled at the start of every pass.
        """
        self.learning_rate = learning_rate
        n = self.X.shape[0]
        for repeat in range(n_iter):
            #We shuffle the order in which we go through the inputs on each iter.
            index = list(range(n))
            np.random.shuffle(index)
            for row in index:
                self.update_weights(self.X[row], self.y[row], self.svec[row])

    def predict_x(self, x):
        """Return the raw (unmasked) network output for a single sample."""
        return self.feedforward(x)

    def predict(self, X, svec):
        """Return an (n_samples, n_outputs) array of masked outputs.

        Row ``i`` is the network output for ``X[i]`` multiplied element-wise
        by ``svec[i]``.
        """
        n = len(X)
        m = self.sizes[-1]
        ret = np.ones((n, m))
        for i, x in enumerate(X):
            ret[i, :] = self.feedforward(x).ravel() * svec[i]
        return ret

In [359]:
def logistic(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), element-wise for arrays."""
    denominator = 1 + np.exp(-x)
    return 1.0 / denominator
 
def logistic_prime(x):
    """Return the derivative of the logistic sigmoid at x, element-wise.

    The derivative exp(-x) / (1 + exp(-x))**2 is an even function of x, so it
    is evaluated at -|x|. The exponential then never exceeds 1, which avoids
    the overflow to inf / inf = nan that exp(-x) causes for large negative x.
    """
    ex = np.exp(-np.abs(x))
    return ex / (1 + ex) ** 2
 
def identity(x):
    """Identity activation: return the input unchanged."""
    return x
 
def identity_prime(x):
    """Derivative of the identity activation.

    Returns the scalar 1 whatever the shape of ``x``, so callers multiply by
    it through broadcasting rather than receiving an array shaped like ``x``.
    """
    return 1