In [1]:
import pandas as pd
import tensorflow as tf
import keras
import os.path
import numpy as np

from keras.utils import np_utils
from datetime import datetime
from sklearn.model_selection import train_test_split

# Record the library versions this notebook was run with.
# `tf.__version__` is used instead of `tf.VERSION` because the latter only
# exists in TensorFlow 1.x; `__version__` is available there and in later releases.
print(tf.__version__)
print(keras.__version__)

Using TensorFlow backend.


1.9.0
2.2.0


# Read-in data

In [23]:
# Load the training set (first CSV row is the header) and drop redundant columns:
# SQBage is identical to agesq, and tamhog / hogar_total are identical to hhsize,
# so only one column of each identical group is kept.
train_pd = (
    pd.read_csv('./datasets/train.csv', sep=',', header=0)
    .drop(columns=['SQBage', 'tamhog', 'hogar_total'])
)

# Number of columns remaining after the drop.
train_pd.shape[1]

140

## Getting to know the data

Check whether every household has exactly one household head

Group by *idhogar* (household key) and then check column *parentesco1*

In [4]:
# Sum the parentesco1 column within each household (idhogar is the household key).
idhogar_aggregated = train_pd.groupby('idhogar')['parentesco1'].sum()

In [8]:
# Show only the households whose parentesco1 total is anything other than 1.
idhogar_aggregated.loc[idhogar_aggregated.ne(1)]

idhogar
03c6bdf85    0
09b195e7a    0
1367ab31d    0
1bc617b23    0
374ca5a19    0
61c10e099    0
6b1b2405f    0
896fe6d3e    0
a0812ef17    0
ad687ad89    0
b1f4d89d7    0
bfd5067c2    0
c0c8a5013    0
d363d9183    0
f2bfa75c4    0
Name: parentesco1, dtype: int64

There are 15 households with no household head, and none with more than one.

Check if the poverty label is the same for all members of a household

In [12]:
# Lowest and highest Target value recorded within each household.
target_aggregated = train_pd.groupby('idhogar')['Target'].agg(['min', 'max'])

# Flag households whose members do not all share the same Target, then list them.
target_aggregated['poverty_different'] = target_aggregated['min'].ne(target_aggregated['max'])
target_aggregated[target_aggregated['poverty_different']]

Unnamed: 0_level_0,min,max,poverty_different
idhogar,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0172ab1d9,2,3,True
03f4e5f4d,1,2,True
0511912b6,3,4,True
078a0b6e2,1,2,True
09e25d616,1,2,True
0f3e65c83,1,2,True
0f9494d3a,2,3,True
15a891635,1,2,True
17fb04a62,1,2,True
18832b840,2,3,True


It is not, meaning that members of a given household can have different poverty levels.

In [13]:
# Inspect every member row of one household whose Target values disagree.
train_pd[train_pd['idhogar'].eq('0172ab1d9')]

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
7651,ID_a5e2b0639,,0,5,0,1,1,0,,0,...,49,196,25,36,4,2.777778,0.444444,58.777775,196,3
7652,ID_5dfdf4ebe,,0,5,0,1,1,0,,0,...,100,289,25,36,4,2.777778,0.444444,58.777775,289,2
7653,ID_762e1fd96,,0,5,0,1,1,0,,0,...,36,2601,25,36,4,2.777778,0.444444,58.777775,2601,3
7654,ID_c76b87004,,0,5,0,1,1,0,,0,...,36,2304,25,36,4,2.777778,0.444444,58.777775,2304,3
7655,ID_1dffe3dcf,,0,5,0,1,1,0,,0,...,121,441,25,36,4,2.777778,0.444444,58.777775,441,2


# Define the network

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization
from keras.optimizers import RMSprop, Adam
from keras.utils import vis_utils

def instantiate_model(input_dim=139):
    """Build and compile the feed-forward network used for the poverty target.

    Architecture: Dense(40, relu) -> Dense(20, relu) -> Dropout(0.3) ->
    Dense(1, linear). All dense layers use Glorot-normal weight initialisation.
    The model is compiled with mean absolute error as both loss and metric,
    using the Adam optimizer with its default settings.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features per sample. Defaults to 139, the width that
        was previously hard-coded, so existing zero-argument calls are unchanged.

    Returns
    -------
    keras.models.Sequential
        A freshly initialised, compiled (untrained) model.
    """
    # Every layer passes `kernel_initializer` (the output layer already did);
    # `init` is the legacy Keras 1 name for the same argument.
    m = Sequential([
        Dense(units=40, input_shape=(input_dim,), activation='relu', use_bias=True,
              kernel_initializer='glorot_normal'),
        Dense(units=20, activation='relu', use_bias=True,
              kernel_initializer='glorot_normal'),
        # Randomly zero 30% of the hidden activations during training.
        Dropout(rate=0.3),
        # Single linear unit: the network emits one unbounded real value.
        Dense(units=1, activation='linear', use_bias=True,
              kernel_initializer='glorot_normal')
    ])
    m.compile(loss='mae', optimizer='adam', metrics=['mae'])
    return m
