In [4]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np

# Data Loading

In [5]:
data = pd.read_csv("../../data/processed/data.csv",
                   dtype = {
                            "expensa_id": "object",
                            "unidad_id": "object",
                            "propietario_id": "object",
                            "inquilino_id": "object",
                            "consorcio_id": "object",
                            "expensa_padre_id": "object",
                            "unidad_prop": "object",
                            "consorcio_id": "object",
                            "usuario_creador_id": "object",
                            "concepto_id": "object",
                            "mes": "object",
                            "anio": "object",
                            "unidad_denominacion":"object",
                            "consorcio_nombre": "object",
                            "version":"object",
                            "concepto_id": "object",
                            "unidad_tipo": "object",
                            "modo_pago": "object",
                            "forma_prorrateo": "object",
                            "tipo_prorrateo": "object",
                            "metodo_pago": "object",                       
                    },
                    parse_dates = ["fecha_vencimiento_1", "fecha_vencimiento_2"])
data.fecha_pago = pd.to_datetime(data.fecha_pago, errors="coerce")

In [6]:
data.shape

(264541, 27)

In [7]:
data.dtypes

expensa_id                     object
unidad_id                      object
propietario_id                 object
inquilino_id                   object
fecha_vencimiento_1    datetime64[ns]
fecha_vencimiento_2    datetime64[ns]
monto                         float64
monto_total                   float64
monto_parcial                 float64
expensa_padre_id               object
int_dia                       float64
int_mes                       float64
mes                            object
anio                           object
unidad_denominacion            object
unidad_prop                    object
consorcio_id                   object
consorcio_nombre               object
version                        object
concepto_id                    object
unidad_tipo                    object
modo_pago                      object
forma_prorrateo                object
tipo_prorrateo                 object
fecha_pago             datetime64[ns]
monto_pago                    float64
metodo_pago 

# Generate Posible Target Variables

## Impago

In [8]:
data["target"] = (data.fecha_pago.dt.month == data.fecha_vencimiento_1.dt.month) & (data.fecha_pago.dt.year == data.fecha_vencimiento_1.dt.year)  

## Paga En el Mes?

In [9]:
data["fecha_expensa"] = data.anio + '-' + data.mes + '-01'
data.fecha_expensa = pd.to_datetime(data.fecha_expensa)
data["fecha_expensa_anterior"] = data.fecha_expensa - pd.DateOffset(months=1)

In [10]:
data_lstm = data[["unidad_id", "fecha_expensa", "target", "fecha_expensa_anterior"]]
data_mes_anterior = data.set_index(["unidad_id", "fecha_expensa"])

In [11]:
data_lstm = pd.merge(data_lstm,
         data_mes_anterior,
         left_on = ["unidad_id","fecha_expensa_anterior"],
         right_index = True,
         suffixes = ("", "_lag_1"))

for i in range(1,6):
    data_lstm = pd.merge(data_lstm,
                         data_mes_anterior,
                         left_on = ["unidad_id","fecha_expensa_anterior_lag_{}".format(i)],
                         right_index = True,
                         suffixes = ("", "_lag_{}".format(i+1)))

target_columns = ["fecha_expensa"] + [column for column in data_lstm.columns if column[0:6] == "target"]
data_lstm = data_lstm[target_columns]

In [12]:
data_lstm.groupby(target_columns[1:6]).count().reset_index().sort_values("fecha_expensa", ascending = False)

Unnamed: 0,target,target_lag_1,target_lag_2,target_lag_3,target_lag_4,fecha_expensa,target_lag_5,target_lag_6
31,True,True,True,True,True,94244,94244,94244
0,False,False,False,False,False,23857,23857,23857
15,False,True,True,True,True,8108,8108,8108
30,True,True,True,True,False,5990,5990,5990
23,True,False,True,True,True,4748,4748,4748
29,True,True,True,False,True,4591,4591,4591
27,True,True,False,True,True,4551,4551,4551
16,True,False,False,False,False,2364,2364,2364
1,False,False,False,False,True,2280,2280,2280
7,False,False,True,True,True,2225,2225,2225


In [13]:
X = data_lstm.drop(columns="target")
Y = data_lstm[["fecha_expensa", "target"]]

In [14]:
X = X.loc[data_lstm.fecha_expensa < "2019-01-01",:]
Y = Y.loc[data_lstm.fecha_expensa < "2019-01-01",:]

In [15]:
print(X.shape)
print(Y.shape)

(157395, 7)
(157395, 2)


In [16]:
target_columns2 = [column for column in X.columns if column[0:6] == "target"]
X_train = X.loc[X.fecha_expensa < '2018-09-01', target_columns2]
X_test = X.loc[X.fecha_expensa >= '2018-09-01', target_columns2]
Y_train = Y.loc[Y.fecha_expensa < '2018-09-01', "target"]
Y_test =Y.loc[Y.fecha_expensa >= '2018-09-01', "target"]
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

(122719, 6) (34676, 6)
(122719,) (34676,)


## XGBoost

In [18]:
from xgboost import XGBClassifier
xgb = XGBClassifier(
   max_depth = 3,
   n_estimators = 256,
   learning_rate = 0.01)

In [20]:
xgb.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=256,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [22]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
pred = xgb.predict(X_train)
print("accuracy", accuracy_score(Y_train, pred))
print("f1", f1_score(Y_train, pred))
print("precision", precision_score(Y_train, pred))
print("recall", recall_score(Y_train, pred))

accuracy 0.8729862531474344
f1 0.9153997709546631
precision 0.8884112937210282
recall 0.9440793524624117


In [23]:
pred = xgb.predict(X_test)
print("accuracy", accuracy_score(Y_test, pred))
print("f1", f1_score(Y_test, pred))
print("precision", precision_score(Y_test, pred))
print("recall", recall_score(Y_test, pred))

accuracy 0.8588649209828123
f1 0.9093099103105775
precision 0.8743451765795944
recall 0.9471875844496777


## LSTM

In [25]:
X_train_lstm = X_train.values.reshape(*X_train.shape,1)
X_test_lstm = X_test.values.reshape(*X_test.shape,1)

In [28]:
X_train_lstm = X_train_lstm[9:,:,:]
X_test_lstm  = X_test_lstm[6:,:,:]
Y_train = Y_train[9:]
Y_test  = Y_test[6:]
print(X_train_lstm.shape, X_test_lstm.shape)
print(Y_train.shape, Y_test.shape)

(122710, 6, 1) (34670, 6, 1)
(122710,) (34670,)


In [43]:
from keras.layers.core import Dense, Dropout, Activation
from keras.models import Sequential
from keras.layers import LSTM, Embedding

lstm_model = Sequential()
lstm_model.add(LSTM(128, batch_input_shape=(10,6,1), dropout=0.0, recurrent_dropout=0.0, stateful=True, kernel_initializer='random_uniform'))
lstm_model.add(Dropout(0.3))
lstm_model.add(Dense(64,activation='relu'))
lstm_model.add(Dense(1,activation='softmax'))
lstm_model.compile(loss='binary_crossentropy', optimizer="adam")

In [44]:
lstm_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (10, 128)                 66560     
_________________________________________________________________
dropout_4 (Dropout)          (10, 128)                 0         
_________________________________________________________________
dense_7 (Dense)              (10, 64)                  8256      
_________________________________________________________________
dense_8 (Dense)              (10, 1)                   65        
Total params: 74,881
Trainable params: 74,881
Non-trainable params: 0
_________________________________________________________________


In [45]:
lstm_model.fit(X_train_lstm, Y_train, epochs=2, verbose=3, batch_size=10, shuffle=True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1678c9c7c18>

In [47]:
pred = lstm_model.predict_classes(X_train_lstm, batch_size=10)
print("accuracy", accuracy_score(Y_train, pred))
print("f1", f1_score(Y_train, pred))
print("precision", precision_score(Y_train, pred))
print("recall", recall_score(Y_train, pred))

accuracy 0.7278461413087768
f1 0.8424895294872279
precision 0.7278461413087768
recall 1.0


In [48]:
pred = lstm_model.predict_classes(X_test_lstm, batch_size=10)
print("accuracy", accuracy_score(Y_test, pred))
print("f1", f1_score(Y_test, pred))
print("precision", precision_score(Y_test, pred))
print("recall", recall_score(Y_test, pred))

accuracy 0.7469570233631382
f1 0.855152145557812
precision 0.7469570233631382
recall 1.0
