In [39]:
import numpy as np
import pandas
import matplotlib.pyplot as plt

from keras import backend as K
from keras import layers, optimizers, models
from sklearn.preprocessing import LabelEncoder

In [15]:
np.random.seed(7)
train_data = pandas.read_csv("dataset/adult.data", encoding="UTF-8", header=None)
test_data = pandas.read_csv("dataset/adult.test", encoding="UTF-8", header=None, skiprows=1)
full_data = pandas.concat([train_data, test_data])

In [16]:
print(full_data.head(5))

   0                  1       2           3   4                    5   \
0  39          State-gov   77516   Bachelors  13        Never-married   
1  50   Self-emp-not-inc   83311   Bachelors  13   Married-civ-spouse   
2  38            Private  215646     HS-grad   9             Divorced   
3  53            Private  234721        11th   7   Married-civ-spouse   
4  28            Private  338409   Bachelors  13   Married-civ-spouse   

                   6               7       8        9     10  11  12  \
0        Adm-clerical   Not-in-family   White     Male  2174   0  40   
1     Exec-managerial         Husband   White     Male     0   0  13   
2   Handlers-cleaners   Not-in-family   White     Male     0   0  40   
3   Handlers-cleaners         Husband   Black     Male     0   0  40   
4      Prof-specialty            Wife   Black   Female     0   0  40   

               13      14  
0   United-States   <=50K  
1   United-States   <=50K  
2   United-States   <=50K  
3   United-State

In [17]:
del train_data
del test_data

transform_needed = [False,
           True,
           False,
           True,
           False,
           True,
           True,
           True,
           True,
           True,
           False,
           False,
           False,
           True,
           True]


def replaceSpaceAndDot(text: str):
    return text.replace(' ', '').replace('.', '')


full_data[14] = full_data[14].apply(replaceSpaceAndDot)

In [18]:
data = np.zeros(shape=(full_data.shape[0], full_data.shape[1]), dtype=np.float32)

for i in range(len(transform_needed)):
    if transform_needed[i]:
        tmp_data = full_data.iloc[:, i].tolist()
        encoder = LabelEncoder()
        encoder.fit(tmp_data)
        data[:, i] = encoder.transform(tmp_data)
    else:
        data[:, i] = full_data.iloc[:, i].tolist()

In [None]:
#print(pandas.unique(full_data.iloc[:, 14]))
train_size = int(len(data) * .8)

x_train = data[:train_size, :13]
y_train = data[:train_size, 14]

x_test = data[train_size:, :13]
y_test = data[train_size:, 14]

total_class = np.unique(data[:train_size, 14]).shape[0]

model = models.Sequential()
model.add(layers.Dense(64, input_shape=(x_train.shape[1],), activation="sigmoid"))
model.add(layers.Dense(32, activation="relu"))
model.add(layers.Dense(32, activation="relu"))
model.add(layers.Dropout(0.05))
model.add(layers.Dense(16, activation="relu"))
model.add(layers.Dense(1, activation="sigmoid"))

opt = optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
history = model.fit(x_train, y_train, validation_split=0.33, epochs=10, batch_size=16)

loss, accuracy = model.evaluate(x_test, y_test)
print("Test Acc : " + str(accuracy))
print("Test Loss : " + str(loss))

plt.plot(history.history['acc'])
plt.plot(history.history['loss'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['acc', 'loss'], loc='upper left')
plt.show()

Train on 26178 samples, validate on 12895 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
 1776/26178 [=>............................] - ETA: 7s - loss: 0.5401 - acc: 0.7686