#### Distributed learning on the census dataset using neural network
This notebook contains the code that was used to produce the results in the paper "Weight Exchange in Distributed Learning" by Julian Dorner, Samuel Favrichon and Arif Selcuk Ogrenci. The data can be found on this link : https://archive.ics.uci.edu/ml/datasets/Census+Income

As we can see in the following link the problem of using the census dataset to preict the income of households as already been solved: http://www.mis.nsysu.edu.tw/db-book/DMProject2007Spring/6/project.pdf. However we'll use this as an example to compare differnt methods of exchanging weights between training neural networks to achieve the best accuracy possible. 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit

#Keras library to build neural networks
from keras.models import Sequential, Graph
from keras.utils import np_utils
from keras.layers.core import Dense, Activation, Dropout
from keras.callbacks import EarlyStopping

from keras.optimizers import SGD, RMSprop, Adagrad

### Preprocesing:
Steps : 
* collect the data from the files
* remove unused features
* convert categorical to numeric features (one hot encoding)
* scale numeric attributes
* randomly divide data and balance the classes 


In [2]:
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
                'hours-per-week', 'native-country', 'income']
#We won't be using the following features in our analysis 
drop_columns = ['fnlwgt', 'education-num']
#We concatenate the test and training dataset to have a bigger input dataset
df_1 = pd.read_csv('./adult.data', na_values='?', names=column_names)
df_2 = pd.read_csv('./adult.test', na_values='?', names=column_names)
df = pd.concat((df_1, df_2))
df = df[df.age != '|1x3 Cross validator']

In [3]:
df_numeric = df[['age', 'capital-gain', 'capital-loss', 'hours-per-week']]
df_categoric = df[['workclass', 'education', 'marital-status','occupation', 'relationship', 'race', 'sex', 'native-country', 'income']]

In [4]:
#Scaling of the input numeric features
scaler = StandardScaler().fit(df_numeric)
df_transformed = scaler.transform(df_numeric)


In [5]:
#One hot encoding to convert categorical feature to multiple columns with numeric features
#https://en.wikipedia.org/wiki/One-hot
dummies = []
for col in df_categoric:
    dummies.append(pd.get_dummies(df[col]))
df_encoded = pd.concat(dummies, axis=1)

enc = OneHotEncoder(categorical_features='all', dtype='float',handle_unknown='error', n_values='auto', sparse=False)
encoded_data = enc.fit_transform(df_encoded)


In [6]:
processed_array = np.concatenate((df_transformed, encoded_data), axis=1)
input_dim = processed_array.shape[1] - 2

In [7]:
# We now have an array containing 216 numeric features normalized and almost 50K samples 
processed_array.shape

(48842, 216)

In [8]:
#We split the data into a training and testing set We'll use 40K samples in our training and the rest for testing
rng = np.random.RandomState(11)  # reproducible results with a fixed seed
indices = np.arange(len(processed_array))
rng.shuffle(indices)
data_shuffled = processed_array[indices]
test_size = 40000
train, test = processed_array[:test_size,:], processed_array[test_size:,:]

In [9]:
def init_model(i):
    """initialise a keras multilayer neural network
    the parameter i is used to create a different set of initialization for each neurla network"""
    model = Sequential()
    #100 neurons in first hidden layer with sigmoid activation function
    model.add(Dense(100, input_dim=input_dim, init='uniform', activation='sigmoid'))
    #Dropout of half the neurons to avoid overfitting http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf
    model.add(Dropout(0.5))
    #2nd layer with 2 neurons
    model.add(Dense(2, init='uniform', activation='softmax'))
    ada = Adagrad()
    #We store the initials weight of each model
    model.compile(loss='mean_squared_error', optimizer=ada, class_mode='categorical')
    name = 'initial_weights_{}_census.h5'.format(i)
    init_weights = model.save_weights(name, overwrite=True)
    
    return model

In [10]:
#initialise model
model = init_model(0)
#Divide data between testing and training set
X_train, y_train = train[:, :-2], train[:, :-3:-1]
X_test, y_test = test[:, :-2], test[:, :-3:-1]
#run the model for 150 epochs on the whole dataset
model.fit(X_train, y_train, nb_epoch=150, shuffle=True, batch_size=1000, verbose=0, validation_split=0.1)
print(model.evaluate(X_test, y_test, batch_size=1000))
preds = model.predict_classes(X_test, batch_size=1000)
print("Confusion matrix :")
print(confusion_matrix(y_test[:,1], preds))

0.0162087071093
[[2092   11]
 [   0 6739]]


In [11]:
a = np.arange(5)
np.random.shuffle(a)

In [12]:
split =  8000
validation = 4000
X_test, y_test = test[validation:, :-2], test[validation:, :-3:-1]
X_val, y_val = test[:validation, :-2], test[:validation, :-3:-1]

In [13]:
#run the model for a dataset divided in 5 subsets
for i in a:
    X_train_1, y_train_1 = train[i*split:(i+1)*split, :-2], train[i*split:(i+1)*split, :-3:-1]
    model = init_model(0)
    model.fit(X_train_1, y_train_1, nb_epoch=100, shuffle=True, batch_size=1000, verbose=0, validation_split=0.1)#, callbacks=[early_stopping])
    #print(model.evaluate(X_test, y_test, batch_size=1000, verbose=0))
    preds = model.predict_classes(X_test, batch_size=1000, verbose=0)
    print("confusion matrix {}:".format(i))
    print(confusion_matrix(y_test[:,1], preds))

0.237373633942
[[   0 1158]
 [   0 3684]]
0.00585100859906
[[1156    2]
 [   0 3684]]
0.237198954227
[[   0 1158]
 [   0 3684]]
0.237167034592
[[   0 1158]
 [   0 3684]]
0.237286793932
[[   0 1158]
 [   0 3684]]


In [None]:
MAX_EPOCH = 100
nb_block = 5
block_size = 8000
min_index = 0
epoch = 0
results = []
loss_evolution = np.empty((100,5))

#create a list of models
models = []
for i in range(nb_block):
    models.append(init_model(i+1))
#runs the collage testing method
while epoch < MAX_EPOCH:
    if epoch%10==0:
        print("epoch n° :{}".format(epoch))

    for i, model in enumerate(models):    
        if min_index == 0 and epoch == 0:
            model.load_weights('initial_weights.h5')
        else:
            model.load_weights('weigths_{}_census.h5'.format(min_index+1))
        
        X_train, y_train = train[i*block_size:(i+1)*block_size,:-2], train[i*block_size:(i+1)*block_size, :-3:-1]
        model.fit(X_train, y_train, nb_epoch=1, shuffle=True, batch_size=1000, validation_split=0.1, verbose=0)
        name = "weigths_{}_census.h5".format(i+1)
        model.save_weights(name, overwrite=True)
        results.append(model.evaluate(X_val, y_val, batch_size=1000, verbose=0))

    #print(results)    
    loss_evolution[i] = results
    min_index = results.index(min(results))
    results = []
    epoch +=1

In [15]:
for model in models:
    preds = model.predict_classes(X_test, batch_size=1000, verbose=0)
    print(confusion_matrix(y_test[:,1], preds))

[[1051  107]
 [   0 3684]]
[[1050  108]
 [   0 3684]]
[[1049  109]
 [   0 3684]]
[[1050  108]
 [   0 3684]]
[[1158    0]
 [   0 3684]]


In [16]:
def save_weights(model, indice):
    name = "weigths_{}_census.h5".format(indice+1)
    model.save_weights(name, overwrite=True)

In [17]:
def most_common(lst):
    return max(set(lst), key=lst.count)

In [18]:
X1, y1 = train[0:8000,:-2], train[0:8000, :-3:-1]
X2, y2 = train[8000:16000,:-2], train[8000:16000, :-3:-1]
X3, y3 = train[16000:24000,:-2], train[16000:24000, :-3:-1]
X4, y4 = train[24000:32000,:-2], train[24000:32000, :-3:-1]
X5, y5 = train[32000:,:-2], train[32000:, :-3:-1]

In [19]:
#creates the subsets for training
X_train, X_test, y_train, y_test = train_test_split(train[:,:-2], train[:, :-3:-1], test_size=0.5, random_state=42)
X1, X2, y1, y2 = train_test_split(X_train, y_train, test_size=0.5, random_state=2)
X3, X4, y3, y4 = train_test_split(X_test, y_test, test_size=0.5, random_state=2)


In [20]:
def best_weights_2(models, Xs, ys, method="average"):
    """finds the best set of weights for the method chosen in cross testing i.e. average or majority"""
    loss_array =  np.empty((4,4))
    for i, model in enumerate(models):
        loss_array[i] = test_model_2(model, 4, Xs, ys)
    if method=="majority":
        best_indice = most_common(list(np.argmin(loss_array, axis=1)))
    else:
        best_indice = np.argmin(np.mean(loss_array, axis=1))
        
    return best_indice

In [23]:
def test_model_2(model, nb_block, Xs, ys):
    """test one of the models against all the others subsets"""
    loss_values = []
    for i in range(nb_block):
        loss_values.append(model.evaluate(Xs[i], ys[i], batch_size=1000, verbose=0))

    return loss_values

In [24]:
Xs = [X1,X2,X3,X4]
ys = [y1,y2,y3,y4]

In [25]:
MAX_EPOCH = 200

min_index = 0
epoch = 0
results = []
models = []

for i in range(4):
    models.append(init_model(i+1))
#runs the majority method crosstesting  
while epoch < MAX_EPOCH:
    for i, model in enumerate(models):
        
        if epoch != 0:
            model.load_weights('weigths_{}_census.h5'.format(min_index+1))
        model.fit(Xs[i], ys[i], nb_epoch=1, shuffle=True, batch_size=1000, validation_split=0.1, verbose=0)
        save_weights(model, i)
        results.append(model.evaluate(X_test, y_test, batch_size=1000, verbose=0))
        
    if epoch%10 == 0:
        print(results)
    min_index = best_weights_2(models, Xs, ys, "majority")
    #print("min {}".format(min_index))
    results = []
    epoch +=1

[0.042611388675868514, 0.043410747870802881, 0.042639494314789771, 0.042770495824515822]
[0.041904450580477717, 0.041895999945700167, 0.041897670365869999, 0.041864950023591516]
[0.041633573919534685, 0.041596679203212264, 0.041591840982437137, 0.041580136865377426]
[0.041264043375849727, 0.041212228871881959, 0.041201391816139223, 0.041177860461175442]
[0.040716387145221231, 0.040652271453291179, 0.040648943837732078, 0.040613694768399003]
[0.040034050494432448, 0.039964877907186745, 0.039969194307923318, 0.039911045413464311]
[0.039230609964579347, 0.03914747331291437, 0.039163721725344658, 0.03908301517367363]
[0.03835273291915655, 0.038255922216922048, 0.038259987346827985, 0.038191731367260215]
[0.037412532605230808, 0.037290447205305097, 0.037317358888685703, 0.03723915433511138]
[0.036351911444216969, 0.036237919330596925, 0.036273089889436963, 0.036178752034902575]
[0.035189119447022674, 0.035017394274473188, 0.035076319426298144, 0.03499798225238919]
[0.033735078573226926, 0.0

In [26]:
for model in models:
    preds = model.predict_classes(test[:,:-2], batch_size=1000, verbose=0)
    print(confusion_matrix(test[:, -2], preds))

[[1779  324]
 [   0 6739]]
[[1844  259]
 [   0 6739]]
[[1870  233]
 [   0 6739]]
[[1856  247]
 [   0 6739]]


In [27]:
MAX_EPOCH = 200

min_index = 0
epoch = 0
results = []
models = []

for i in range(4):
    models.append(init_model(i+1))
#collage testing example
while epoch < MAX_EPOCH:
    for i, model in enumerate(models):
        
        if epoch != 0:
            model.load_weights('weigths_{}_census.h5'.format(min_index+1))
        model.fit(Xs[i], ys[i], nb_epoch=1, shuffle=True, batch_size=1000, validation_split=0.1, verbose=0)
        save_weights(model, i)
        results.append(model.evaluate(X_val, y_val, batch_size=1000, verbose=0))
        
    if epoch%10 == 0:
        print(results)
        
    min_index = results.index(min(results))
    results = []
    epoch +=1

[0.20750543475151062, 0.21002118289470673, 0.20901274681091309, 0.20803317427635193]
[0.21854540333151817, 0.2182774543762207, 0.21846534684300423, 0.21805630624294281]
[0.21552538871765137, 0.21520137041807175, 0.21514877304434776, 0.21484673395752907]
[0.21012190729379654, 0.21021044254302979, 0.21033564954996109, 0.20919886603951454]
[0.20379667729139328, 0.20408762246370316, 0.20423764735460281, 0.20287972316145897]
[0.19699989631772041, 0.1965523436665535, 0.19751847162842751, 0.19602303951978683]
[0.18924149498343468, 0.18962305411696434, 0.1895323172211647, 0.18762225285172462]
[0.18136759847402573, 0.18113085255026817, 0.18187719210982323, 0.17867915332317352]
[0.17163272947072983, 0.16931698471307755, 0.16978064551949501, 0.16909613460302353]
[0.16127610951662064, 0.15921389311552048, 0.15864903852343559, 0.15342552214860916]
[0.14095119386911392, 0.13481224328279495, 0.14046195894479752, 0.13878185302019119]
[0.1194979902356863, 0.11122330464422703, 0.11752458289265633, 0.113

In [28]:
for model in models:
    preds = model.predict_classes(test[:,:-2], batch_size=1000, verbose=0)
    print(confusion_matrix(test[:, -2], preds))

[[2102    1]
 [   0 6739]]
[[2102    1]
 [   0 6739]]
[[2102    1]
 [   0 6739]]
[[2102    1]
 [   0 6739]]


Here we train each of the models separately to determine our ground performance :

In [29]:
MAX_EPOCH = 200
nb_block = 4
block_size = 8000
min_index = 0
epoch = 0
results = []
loss_evolution = np.empty((100,4))

models = []
for i in range(nb_block):
    models.append(init_model(i+1))

while epoch < MAX_EPOCH:
    for i, model in enumerate(models):    
        model.fit(Xs[i], ys[i], nb_epoch=1, shuffle=True, batch_size=1000, validation_split=0.1, verbose=0)
        results.append(model.evaluate(X_val, y_val, batch_size=1000, verbose=0))

    loss_evolution[i] = results
    min_index = results.index(min(results))
    results = []
    epoch +=1

In [30]:
for model in models:
    preds = model.predict_classes(test[:,:-2], batch_size=1000, verbose=0)
    print(confusion_matrix(test[:, -2], preds))

[[1854  249]
 [   0 6739]]
[[2035   68]
 [   0 6739]]
[[1854  249]
 [   0 6739]]
[[2068   35]
 [   0 6739]]


In this notebok we explore the different ways of sharing weights to increase accuracy of prediction of the census dataset. The methods compared are crosstesting and collage.