In [1]:
import pandas as pd
import numpy as np
from plotnine import *

%matplotlib inline

In [44]:
data_lag_n = pd.read_csv("../../data/processed/pagos_por_unidad_lag_6.csv",
                  dtype = {
                                "unidad_id": "object",
                            },
                  parse_dates = ["expensa_mes_pago"])

In [6]:
data = pd.read_csv("../../data/processed/expensas_full_processed_vis_nona.csv",
                  dtype = {
                                "expensa_id": "object",
                                "unidad_id": "object",
                                "consorcio_id": "object",
                                "expensa_mes":"object",
                            },
                  parse_dates = ["expensa_fecha", "expensa_mes_pago", "expensa_mes_pago_anterior"])

In [45]:
data.target.mean()

0.2056626173102312

In [46]:
data_lag_n.shape

(151480, 9)

In [47]:
final_data = pd.merge(
    data[["expensa_id", "unidad_id", "unidad_tipo", "expensa_mes", "expensa_mes_pago"]],
    data_lag_n,
    left_on = ["unidad_id", "expensa_mes_pago"],
    right_on = ["unidad_id", "expensa_mes_pago"],
    how = "inner"
)

In [48]:
final_data.shape

(151480, 12)

In [49]:
final_data.target.mean()

0.18820306311064167

In [50]:
model_columns = ['unidad_tipo', 'expensa_mes', 'pago_metodo_lag_6', 'pago_metodo_lag_5',
       'pago_metodo_lag_4', 'pago_metodo_lag_3', 'pago_metodo_lag_2',
       'pago_metodo_lag_1'] 

final_data = final_data.sort_values("expensa_mes_pago")
min_cv_id = final_data.loc[final_data.expensa_mes_pago == "2018-06-01", "expensa_mes_pago"].idxmin()
min_test_id = final_data.loc[final_data.expensa_mes_pago == "2018-09-01", "expensa_mes_pago"].idxmin()

In [51]:
def get_train_test_split(X_columns):
    X_train = pd.get_dummies(final_data.loc[0:min_test_id, X_columns])
    X_test = pd.get_dummies(final_data.loc[min_test_id:, X_columns])
    Y_train = final_data.loc[0:min_test_id, "target"]
    Y_test = final_data.loc[min_test_id:, "target"]
    
    drop_test_columns = [column for column in X_test.columns if column not in X_train.columns]
    X_test = X_test.drop(columns=drop_test_columns)

    for column in [column for column in X_train.columns if column not in X_test.columns]:
        X_test[column] = 0
    
    X_test = X_test[X_train.columns]
    print(X_train.shape, Y_train.shape)
    print(X_test.shape, Y_test.shape)
    return X_train, Y_train, X_test, Y_test

In [52]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def get_train_test_meassures(model, X_train, Y_train, X_test, Y_test):
    train_preds = model.predict(X_train)
    print("accuracy:", accuracy_score(Y_train, train_preds))
    print("precision:", precision_score(Y_train, train_preds))
    print("recall:", recall_score(Y_train, train_preds))
    print("f1:", f1_score(Y_train, train_preds))
    print("--------------------------------------------------")
    test_preds = model.predict(X_test)
    print("accuracy:", accuracy_score(Y_test, test_preds))
    print("precision:", precision_score(Y_test, test_preds))
    print("recall:", recall_score(Y_test, test_preds))
    print("f1:", f1_score(Y_test, test_preds))

In [53]:
X_train, Y_train, X_test, Y_test = get_train_test_split(model_columns)

(118262, 51) (118262,)
(33219, 51) (33219,)


In [54]:
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression(
    C = 1000000,
    solver = "lbfgs",
    penalty = "l2",
    max_iter = 100,
)
logReg.fit(X_train, Y_train) 
get_train_test_meassures(logReg, X_train, Y_train, X_test, Y_test)

accuracy: 0.8861595440631818
precision: 0.7639719261762412
recall: 0.6221949360657126
f1: 0.6858329638531725
--------------------------------------------------
accuracy: 0.8874439326891237
precision: 0.6548482406661295
recall: 0.4983646770237122
f1: 0.5659895531050494


## Extreme Gradient Boosting

In [56]:
from xgboost import XGBClassifier

xgb = XGBClassifier( 
    learning_rate = 0.0026674284564537237,
    max_depth = 2,
    n_estimators = 1667,
    reg_alpha = 0.9)

In [57]:
xgb.fit(X_train, Y_train)

KeyboardInterrupt: 