In [109]:
import pandas as pd
import tensorflow as tf
import keras
import os.path
import numpy as np

from keras.utils import np_utils
from datetime import datetime
from sklearn.model_selection import train_test_split

print(tf.VERSION)
print(keras.__version__)

1.9.0
2.2.2


# Read-in data

In [110]:
train_pd = pd.read_csv('./datasets/train.csv', sep=',', header=0)
# the columns (SQBage, agesq) and (tamhog, hhsize, hogar_total) are identical - REMOVE THEM
train_pd.drop(columns=['SQBage', 'tamhog', 'hogar_total'], inplace=True)
# drop interaction columns ['edjefe', 'edjefa']
train_pd.drop(columns=['edjefe', 'edjefa', 'SQBedjefe'], inplace=True)
# drop dependency and its square - mixed numerical and alphanumeric values, unclear contribution
train_pd.drop(columns=['dependency', 'SQBdependency'], inplace=True)

len(train_pd.columns)

135

## Getting to know the data

Check if there is more than 1 household head per household

Group by *idhogar* (household key) and then check column *parentesco1*

In [3]:
idhogar_aggregated = train_pd.groupby('idhogar').parentesco1.sum()

In [4]:
idhogar_aggregated[idhogar_aggregated != 1]

idhogar
03c6bdf85    0
09b195e7a    0
1367ab31d    0
1bc617b23    0
374ca5a19    0
61c10e099    0
6b1b2405f    0
896fe6d3e    0
a0812ef17    0
ad687ad89    0
b1f4d89d7    0
bfd5067c2    0
c0c8a5013    0
d363d9183    0
f2bfa75c4    0
Name: parentesco1, dtype: int64

There are 15 households with no household head

Check if the poverty label is the same for all members of a household

In [5]:
target_aggregated = train_pd.groupby('idhogar').Target.aggregate(['min', 'max'])

target_aggregated['poverty_different'] = target_aggregated['min'] != target_aggregated['max']
target_aggregated.loc[target_aggregated['poverty_different']].head(10)

Unnamed: 0_level_0,min,max,poverty_different
idhogar,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0172ab1d9,2,3,True
03f4e5f4d,1,2,True
0511912b6,3,4,True
078a0b6e2,1,2,True
09e25d616,1,2,True
0f3e65c83,1,2,True
0f9494d3a,2,3,True
15a891635,1,2,True
17fb04a62,1,2,True
18832b840,2,3,True


It is not, meaning that members of a given household can have different poverty levels.

In [13]:
train_pd.loc[train_pd['idhogar'] == '0172ab1d9']

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
7651,ID_a5e2b0639,,0,5,0,1,1,0,,0,...,49,196,25,36,4,2.777778,0.444444,58.777775,196,3
7652,ID_5dfdf4ebe,,0,5,0,1,1,0,,0,...,100,289,25,36,4,2.777778,0.444444,58.777775,289,2
7653,ID_762e1fd96,,0,5,0,1,1,0,,0,...,36,2601,25,36,4,2.777778,0.444444,58.777775,2601,3
7654,ID_c76b87004,,0,5,0,1,1,0,,0,...,36,2304,25,36,4,2.777778,0.444444,58.777775,2304,3
7655,ID_1dffe3dcf,,0,5,0,1,1,0,,0,...,121,441,25,36,4,2.777778,0.444444,58.777775,441,2


Check if houses that have been fully paid have a montly payment - they should not.

In [7]:
train_pd.loc[(train_pd['tipovivi1'] == 1) & (~np.isnan(train_pd['v2a1']))] # GOOD

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,age,SQBescolari,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target


In [77]:
len(train_pd.columns)

138

# Define the network

In [116]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization
from keras.optimizers import RMSprop, Adam
from keras.utils import vis_utils

def instantiate_model():
    m = Sequential([
        Dense(units=60, input_shape=(132,), activation='relu', use_bias=True, init='glorot_normal'),
        Dense(units=40, activation='sigmoid', use_bias=True, init='glorot_normal'),
        Dropout(rate=0.3),
        Dense(units=10, activation='relu', use_bias=True, init='glorot_normal'),
        Dropout(rate=0.1),
        Dense(units=5, activation='softmax', use_bias=True, kernel_initializer='glorot_normal')
    ])
    m.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    return m


In [117]:
# prepare the data

# convert all NaNs to 0 - big assumption
train_pd = train_pd.fillna(0)

# split 
# remove the id columns also
input_train, input_test, \
target_train, target_test = train_test_split(train_pd.loc[:, ~train_pd.columns.isin(['idhogar', 'Id', 'Target'])].values,
                                             train_pd['Target'].values,
                                             train_size=0.7)



In [118]:
model = instantiate_model()

model.fit(input_train, np_utils.to_categorical(target_train), nb_epoch = 100, verbose = 2)



  
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  This is separate from the ipykernel package so we can avoid doing imports until


Epoch 1/100
 - 1s - loss: 1.2095 - acc: 0.5750
Epoch 2/100
 - 0s - loss: 1.0968 - acc: 0.6240
Epoch 3/100
 - 0s - loss: 1.0615 - acc: 0.6292
Epoch 4/100
 - 0s - loss: 1.0494 - acc: 0.6295
Epoch 5/100
 - 0s - loss: 1.0330 - acc: 0.6322
Epoch 6/100
 - 0s - loss: 1.0197 - acc: 0.6348
Epoch 7/100
 - 0s - loss: 1.0188 - acc: 0.6363
Epoch 8/100
 - 0s - loss: 1.0046 - acc: 0.6393
Epoch 9/100
 - 0s - loss: 1.0008 - acc: 0.6391
Epoch 10/100
 - 0s - loss: 1.0027 - acc: 0.6427
Epoch 11/100
 - 0s - loss: 0.9990 - acc: 0.6428
Epoch 12/100
 - 0s - loss: 0.9832 - acc: 0.6437
Epoch 13/100
 - 1s - loss: 0.9740 - acc: 0.6475
Epoch 14/100
 - 0s - loss: 0.9813 - acc: 0.6422
Epoch 15/100
 - 0s - loss: 0.9724 - acc: 0.6428
Epoch 16/100
 - 1s - loss: 0.9657 - acc: 0.6479
Epoch 17/100
 - 0s - loss: 0.9634 - acc: 0.6503
Epoch 18/100
 - 0s - loss: 0.9582 - acc: 0.6481
Epoch 19/100
 - 0s - loss: 0.9501 - acc: 0.6541
Epoch 20/100
 - 0s - loss: 0.9544 - acc: 0.6536
Epoch 21/100
 - 1s - loss: 0.9556 - acc: 0.6505
E

<keras.callbacks.History at 0x1b11c748>