In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np

# Data Loading

In [2]:
data = pd.read_csv("../../data/processed/data.csv",
                   dtype = {
                            "expensa_id": "object",
                            "unidad_id": "object",
                            "propietario_id": "object",
                            "inquilino_id": "object",
                            "consorcio_id": "object",
                            "expensa_padre_id": "object",
                            "unidad_prop": "object",
                            "consorcio_id": "object",
                            "usuario_creador_id": "object",
                            "concepto_id": "object",
                            "mes": "object",
                            "anio": "object",
                            "unidad_denominacion":"object",
                            "consorcio_nombre": "object",
                            "version":"object",
                            "concepto_id": "object",
                            "unidad_tipo": "object",
                            "modo_pago": "object",
                            "forma_prorrateo": "object",
                            "tipo_prorrateo": "object",
                            "metodo_pago": "object",                       
                    },
                    parse_dates = ["fecha_vencimiento_1", "fecha_vencimiento_2"])
data.fecha_pago = pd.to_datetime(data.fecha_pago, errors="coerce")

In [3]:
data.shape

(264541, 27)

In [4]:
data.dtypes

expensa_id                     object
unidad_id                      object
propietario_id                 object
inquilino_id                   object
fecha_vencimiento_1    datetime64[ns]
fecha_vencimiento_2    datetime64[ns]
monto                         float64
monto_total                   float64
monto_parcial                 float64
expensa_padre_id               object
int_dia                       float64
int_mes                       float64
mes                            object
anio                           object
unidad_denominacion            object
unidad_prop                    object
consorcio_id                   object
consorcio_nombre               object
version                        object
concepto_id                    object
unidad_tipo                    object
modo_pago                      object
forma_prorrateo                object
tipo_prorrateo                 object
fecha_pago             datetime64[ns]
monto_pago                    float64
metodo_pago 

# Generate Posible Target Variables

## Impago

In [5]:
data["target"] = ~((data.fecha_pago.dt.month == data.fecha_vencimiento_1.dt.month) & (data.fecha_pago.dt.year == data.fecha_vencimiento_1.dt.year)) 

In [6]:
data.target.sum()

81959

In [7]:
data.target.sum()/data.target.count()

0.30981586975175873

## Paga En el Mes?

In [8]:
data["fecha_expensa"] = data.anio + '-' + data.mes + '-01'
data.fecha_expensa = pd.to_datetime(data.fecha_expensa)
data["fecha_expensa_anterior"] = data.fecha_expensa - pd.DateOffset(months=1)

In [9]:
data_lstm = data[["unidad_id", "fecha_expensa", "target", "fecha_expensa_anterior"]]
data_mes_anterior = data.set_index(["unidad_id", "fecha_expensa"])

In [10]:
data_lstm = pd.merge(data_lstm,
         data_mes_anterior,
         left_on = ["unidad_id","fecha_expensa_anterior"],
         right_index = True,
         suffixes = ("", "_lag_1"))

for i in range(1,12):
    data_lstm = pd.merge(data_lstm,
                         data_mes_anterior,
                         left_on = ["unidad_id","fecha_expensa_anterior_lag_{}".format(i)],
                         right_index = True,
                         suffixes = ("", "_lag_{}".format(i+1)))

target_columns = ["fecha_expensa"] + [column for column in data_lstm.columns if column[0:6] == "target"]
data_lstm = data_lstm[target_columns]

In [11]:
X = data_lstm.drop(columns="target")
Y = data_lstm[["fecha_expensa", "target"]]

In [12]:
X = X.loc[data_lstm.fecha_expensa < "2019-01-01",:]
Y = Y.loc[data_lstm.fecha_expensa < "2019-01-01",:]

In [13]:
print(X.shape)
print(Y.shape)

(105280, 13)
(105280, 2)


In [14]:
target_columns2 = [column for column in X.columns if column[0:6] == "target"]
X_train = X.loc[X.fecha_expensa < '2018-09-01', target_columns2]
X_test = X.loc[X.fecha_expensa >= '2018-09-01', target_columns2]
Y_train = Y.loc[Y.fecha_expensa < '2018-09-01', "target"]
Y_test = Y.loc[Y.fecha_expensa >= '2018-09-01', "target"]
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

(79629, 12) (25651, 12)
(79629,) (25651,)


In [15]:
Y_train = Y_train.values.reshape(len(Y_train),1)

In [16]:
Y_test = Y_test.values.reshape(len(Y_test),1)

In [17]:
X_train_conv = X_train.values.reshape(*X_train.shape,1)
X_test_conv = X_test.values.reshape(*X_test.shape,1)

In [44]:
X_train.values.astype(float)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.]])

In [18]:
print(X_train_conv.shape, X_test_conv.shape)
print(Y_train.shape, Y_test.shape)

(79629, 12, 1) (25651, 12, 1)
(79629, 1) (25651, 1)


In [121]:
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.models import Sequential
from keras.layers import Conv1D, Embedding, LSTM

conv_model = Sequential()
conv_model.add(Conv1D(filters=2, kernel_size = 7, input_shape=(12,1), padding= "same"))
conv_model.add(Flatten())
conv_model.add(Dense(12, activation="relu"))
conv_model.add(Dense(1, activation='sigmoid'))
conv_model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['binary_accuracy'])

In [122]:
conv_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_15 (Conv1D)           (None, 12, 2)             16        
_________________________________________________________________
flatten_3 (Flatten)          (None, 24)                0         
_________________________________________________________________
dense_38 (Dense)             (None, 12)                300       
_________________________________________________________________
dense_39 (Dense)             (None, 1)                 13        
Total params: 329
Trainable params: 329
Non-trainable params: 0
_________________________________________________________________


In [123]:
conv_model.fit(X_train_conv, Y_train, epochs = 5, batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x189233989b0>

In [124]:
pred = conv_model.predict_classes(X_train_conv)

In [125]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
print("accuracy", accuracy_score(Y_train, pred))
print("f1", f1_score(Y_train, pred))
print("precision", precision_score(Y_train, pred))
print("recall", recall_score(Y_train, pred))

accuracy 0.8801190521041329
f1 0.7800663533314901
precision 0.8247186632240464
recall 0.7400008742405035


In [126]:
from sklearn.metrics import confusion_matrix

confusion_matrix(Y_train, pred)

array([[53154,  3598],
       [ 5948, 16929]], dtype=int64)

In [127]:
pred_test = conv_model.predict_classes(X_test_conv)

In [128]:
print("accuracy", accuracy_score(Y_test, pred_test))
print("f1", f1_score(Y_test, pred_test))
print("precision", precision_score(Y_test, pred_test))
print("recall", recall_score(Y_test, pred_test))

accuracy 0.8593037308486998
f1 0.6921961620469083
precision 0.7710431312939389
recall 0.6279789538842464


## LSTM

In [25]:
X_train_lstm = X_train.values.reshape(*X_train.shape,1)
X_test_lstm = X_test.values.reshape(*X_test.shape,1)

In [28]:
X_train_lstm = X_train_lstm[9:,:,:]
X_test_lstm  = X_test_lstm[6:,:,:]
Y_train = Y_train[9:]
Y_test  = Y_test[6:]
print(X_train_lstm.shape, X_test_lstm.shape)
print(Y_train.shape, Y_test.shape)

(122710, 6, 1) (34670, 6, 1)
(122710,) (34670,)


In [43]:
from keras.layers.core import Dense, Dropout, Activation
from keras.models import Sequential
from keras.layers import LSTM, Embedding

lstm_model = Sequential()
lstm_model.add(LSTM(128, batch_input_shape=(10,6,1), dropout=0.0, recurrent_dropout=0.0, stateful=True, kernel_initializer='random_uniform'))
lstm_model.add(Dropout(0.3))
lstm_model.add(Dense(64,activation='relu'))
lstm_model.add(Dense(1,activation='softmax'))
lstm_model.compile(loss='binary_crossentropy', optimizer="adam")

In [44]:
lstm_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (10, 128)                 66560     
_________________________________________________________________
dropout_4 (Dropout)          (10, 128)                 0         
_________________________________________________________________
dense_7 (Dense)              (10, 64)                  8256      
_________________________________________________________________
dense_8 (Dense)              (10, 1)                   65        
Total params: 74,881
Trainable params: 74,881
Non-trainable params: 0
_________________________________________________________________


In [45]:
lstm_model.fit(X_train_lstm, Y_train, epochs=2, verbose=3, batch_size=10, shuffle=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1678c9c7c18>

In [47]:
pred = lstm_model.predict_classes(X_train_lstm, batch_size=10)
print("accuracy", accuracy_score(Y_train, pred))
print("f1", f1_score(Y_train, pred))
print("precision", precision_score(Y_train, pred))
print("recall", recall_score(Y_train, pred))

accuracy 0.7278461413087768
f1 0.8424895294872279
precision 0.7278461413087768
recall 1.0


In [48]:
pred = lstm_model.predict_classes(X_test_lstm, batch_size=10)
print("accuracy", accuracy_score(Y_test, pred))
print("f1", f1_score(Y_test, pred))
print("precision", precision_score(Y_test, pred))
print("recall", recall_score(Y_test, pred))

accuracy 0.7469570233631382
f1 0.855152145557812
precision 0.7469570233631382
recall 1.0
