In [1]:
import os
import copy
from collections import OrderedDict
from collections import defaultdict
import pickle

import joblib
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn as sk
import matplotlib.pyplot as plt

%matplotlib inline
# Seed NumPy's global RNG: train_test_split shuffles the contract ids with
# np.random.shuffle, so a fixed seed keeps the split the same on a fresh run.
np.random.seed(42)

In [2]:
# Static table (semicolon-separated CSV); the CLIENT_ID column is dropped on load.
data_static = pd.read_csv('raw_data/Static.csv', sep=';').drop('CLIENT_ID', axis=1)
# Repayments table (semicolon-separated CSV); merged with the static table on
# CONTRACT_ID in make_source_datatable.
data_repay  = pd.read_csv('raw_data/Repayments.csv', sep=';')

In [3]:
# API for DataTable and feature engineering.

class DataTable:
    """Bundle a feature matrix with the names of its column groups.

    ``matrix`` starts as ``None`` and is assigned by the caller. Every other
    attribute is a column name or a list of column names; the feature
    engineering helpers in this notebook append to the lists as they add
    columns to ``matrix``. "id" lists hold untransformed columns and "log"
    lists hold their ``LOG_``-prefixed counterparts.
    """

    def __init__(self):
        # The frame itself; filled in after construction.
        self.matrix = None

        # Payment-derived features: raw / log, and the same pair for the
        # 'RELATIVE_' (percent of CONTRACT_SUM) variants.
        self.log_payment_features = list()
        self.id_payment_features = ['REPAYMENT_SCHEDULED']
        self.id_relative_payment_features = list()
        self.log_relative_payment_features = list()

        # Names of the four target columns.
        self.id_target_exact_key = 'REPAYMENT_ACTUAL'
        self.log_target_exact_key = 'LOG_REPAYMENT_ACTUAL'
        self.id_target_relative_key = 'PERCENT_ACTUAL'
        self.log_target_relative_key = 'LOG_PERCENT_ACTUAL'

        # All target names together, exact pair first, then relative pair.
        self.explicit_target_keys = [
            self.id_target_exact_key,
            self.log_target_exact_key,
            self.id_target_relative_key,
            self.log_target_relative_key,
        ]

        # 0/1 indicator columns.
        self.indicator_features = ['GENDER']

        # Numeric columns, and (initially empty) their log-transformed names.
        self.id_numeric_features = [
            'PERIOD_ID',
            'TERM',
            'CONTRACT_SUM',
            'AGE',
            'LOAN_TO_INCOME',
            'PAYMENT_TO_INCOME',
            'DOWNPAYMENT',
            'CAR_CATEGORY',
            'GRACE_PERIOD',
            'RATE_CHANGE_AFTER_GRACE',
        ]
        self.log_numeric_features = list()
        

def _make_payment_cumulative_and_average_features(data_table):
    """Add per-contract running totals and a running mean of earlier payments.

    Three columns are added to ``data_table.matrix`` in place, each computed
    inside its ``CONTRACT_ID`` group following the rows' order in the frame:

    * ``CUMSUM_REPAYMENT_SCHEDULED`` - running sum of ``REPAYMENT_SCHEDULED``
      up to and including the current row.
    * ``CUMSUM_REPAYMENT_ACTUAL`` - running sum of ``REPAYMENT_ACTUAL`` minus
      the current row's value, i.e. the total over the preceding rows.
    * ``AVERAGE_REPAYMENT_ACTUAL`` - the running mean of ``REPAYMENT_ACTUAL``
      as of the previous row; ``0`` on the first row of every contract.

    The three names are appended to ``data_table.id_payment_features``.

    The values are produced by index-aligned groupby operations, so each row
    receives its own contract's numbers regardless of how the frame is
    ordered. (Concatenating per-group lists and assigning them positionally is
    only correct when the rows already follow the sorted group order.)

    Parameters
    ----------
    data_table : object with a ``matrix`` DataFrame and an
        ``id_payment_features`` list; both are modified in place.
    """
    data = data_table.matrix
    contract_ids = data['CONTRACT_ID']
    grouped = data.groupby('CONTRACT_ID')

    scheduled_total = grouped['REPAYMENT_SCHEDULED'].cumsum()
    actual_total = grouped['REPAYMENT_ACTUAL'].cumsum()
    # 0-based position of each row inside its contract.
    position = grouped.cumcount()

    # Mean over rows 0..i, then pushed down one row so that row i only sees
    # rows 0..i-1. The first row of a contract has no history and gets 0;
    # NaNs that come from the data itself are left untouched.
    running_mean = actual_total / (position + 1)
    previous_mean = running_mean.groupby(contract_ids).shift(1).where(position > 0, 0.0)

    data['CUMSUM_REPAYMENT_SCHEDULED'] = scheduled_total
    data['CUMSUM_REPAYMENT_ACTUAL'] = actual_total - data['REPAYMENT_ACTUAL']
    data['AVERAGE_REPAYMENT_ACTUAL'] = previous_mean

    data_table.id_payment_features += [
        'CUMSUM_REPAYMENT_SCHEDULED', 'CUMSUM_REPAYMENT_ACTUAL', 'AVERAGE_REPAYMENT_ACTUAL']
        

def _make_payment_history_features(data_table, shifts=(1, 2)):
    """Add lagged actual / scheduled repayment columns for every contract.

    For each lag ``s`` in ``shifts`` and each of ``REPAYMENT_ACTUAL`` and
    ``REPAYMENT_SCHEDULED``, a column ``HISTORY_SHIFT_<s>_<source>`` is added
    to ``data_table.matrix`` (in place) holding the source value from ``s``
    rows earlier within the same ``CONTRACT_ID``. The first ``s`` rows of a
    contract have no such history; they receive that row's own
    ``REPAYMENT_SCHEDULED`` value instead (for both source columns).

    Each new name is appended to ``data_table.id_payment_features`` in the
    order: lag 1 actual, lag 1 scheduled, lag 2 actual, lag 2 scheduled, ...

    The lag is computed with an index-aligned groupby shift, so the result is
    correct for any row order of the frame, and contracts with fewer rows than
    the lag are handled (all of their rows take the scheduled fallback).

    Parameters
    ----------
    data_table : object with a ``matrix`` DataFrame and an
        ``id_payment_features`` list; both are modified in place.
    shifts : iterable of int, optional
        Lags to generate. Defaults to ``(1, 2)``.
    """
    data = data_table.matrix
    contract_ids = data['CONTRACT_ID']
    scheduled = data['REPAYMENT_SCHEDULED']
    # 0-based position of each row inside its contract.
    position = data.groupby('CONTRACT_ID').cumcount()

    for shift in shifts:
        for source_key in ('REPAYMENT_ACTUAL', 'REPAYMENT_SCHEDULED'):
            new_key = 'HISTORY_SHIFT_{}_'.format(shift) + source_key
            lagged = data[source_key].groupby(contract_ids).shift(shift)
            # Rows without `shift` predecessors fall back to the scheduled amount;
            # NaNs that were already in the source column are kept as they are.
            data[new_key] = lagged.where(position >= shift, scheduled)
            data_table.id_payment_features.append(new_key)
        

def _make_payment_relative_features(data_table):
    """Express every payment feature as a percentage of ``CONTRACT_SUM``.

    For each name currently in ``data_table.id_payment_features`` a column
    ``RELATIVE_<name>`` is added to ``data_table.matrix`` (in place) and the
    new name is appended to ``data_table.id_relative_payment_features``.
    """
    frame = data_table.matrix
    contract_sum = frame['CONTRACT_SUM']
    # Iterate over a snapshot of the names taken before any column is added.
    for source_key in tuple(data_table.id_payment_features):
        target_key = 'RELATIVE_' + source_key
        frame[target_key] = frame[source_key] / contract_sum * 100
        data_table.id_relative_payment_features.append(target_key)

        
def _make_payment_log_features(data_table):
    """Add ``log1p`` versions of the exact and relative payment features.

    Every name in ``id_payment_features`` gets a ``LOG_<name>`` column whose
    name is recorded in ``log_payment_features``; every name in
    ``id_relative_payment_features`` is handled the same way and recorded in
    ``log_relative_payment_features``. The matrix and both log lists are
    modified in place.
    """
    frame = data_table.matrix
    # (names to transform, list that collects the new names) - exact first.
    name_groups = (
        (data_table.id_payment_features, data_table.log_payment_features),
        (data_table.id_relative_payment_features, data_table.log_relative_payment_features),
    )
    for raw_keys, log_keys in name_groups:
        for raw_key in raw_keys:
            transformed_key = 'LOG_' + raw_key
            frame[transformed_key] = np.log1p(frame[raw_key])
            log_keys.append(transformed_key)
    
    
def _make_regular_log_features(data_table):
    """Add a ``LOG_<name>`` (``log1p``) column for each numeric feature.

    Iterates over ``data_table.id_numeric_features`` as it is at call time,
    writes the new columns into ``data_table.matrix`` in place and appends
    their names to ``data_table.log_numeric_features``.
    """
    frame = data_table.matrix
    collected = data_table.log_numeric_features
    for raw_key in data_table.id_numeric_features:
        transformed_key = 'LOG_' + raw_key
        frame[transformed_key] = np.log1p(frame[raw_key])
        collected.append(transformed_key)
        
        
def _make_custom_features(data_table):
    """Add hand-crafted ratio and indicator columns to the matrix.

    Columns added to ``data_table.matrix`` (in place, in this order):

    * ``BEFORE_GRACE`` - ``PAYMENT_TO_INCOME`` as a percentage of
      ``LOAN_TO_INCOME``; registered in ``id_numeric_features``.
    * ``GRACE_ON`` - 1 where ``PERIOD_ID <= GRACE_PERIOD``, else 0.
    * ``RATIO_<t>`` for t in 5, 10, 20, 30, 40 - 1 where
      ``LOAN_TO_INCOME >= t``.
    * ``IS_PERIOD_<p>`` for p in 1..3 - 1 where ``PERIOD_ID == p``.
    * ``IS_CAR_<c>`` for c in 1..5 - 1 where ``CAR_CATEGORY == c``.

    All 0/1 columns are registered in ``data_table.indicator_features``. The
    names are collected next to the code that creates each column, so a name
    cannot be dropped or fused with its neighbour by a missing comma in a
    hand-written list (which previously produced the non-existent name
    ``'RATIO_40IS_PERIOD_1'``).
    """
    matrix = data_table.matrix
    matrix['BEFORE_GRACE'] = matrix['PAYMENT_TO_INCOME'] / matrix['LOAN_TO_INCOME'] * 100
    matrix['GRACE_ON'] = 1 * np.array(matrix['PERIOD_ID'] <= matrix['GRACE_PERIOD'])
    indicator_keys = ['GRACE_ON']

    # Loan-to-income threshold flags.
    for threshold in (5, 10, 20, 30, 40):
        key = 'RATIO_{}'.format(threshold)
        matrix[key] = 1 * np.array(matrix['LOAN_TO_INCOME'] >= threshold)
        indicator_keys.append(key)

    # One-hot flags for the first three periods.
    for period in (1, 2, 3):
        key = 'IS_PERIOD_{}'.format(period)
        matrix[key] = 1 * np.array(matrix['PERIOD_ID'] == period)
        indicator_keys.append(key)

    # One-hot flags for car categories 1..5.
    for category in (1, 2, 3, 4, 5):
        key = 'IS_CAR_{}'.format(category)
        matrix[key] = 1 * np.array(matrix['CAR_CATEGORY'] == category)
        indicator_keys.append(key)

    data_table.id_numeric_features.append('BEFORE_GRACE')
    data_table.indicator_features += indicator_keys


def _make_is_grace_constant_feature(data_table):
    """Flag contracts whose in-grace actual repayments take fewer than two values.

    For every ``CONTRACT_ID`` the ``REPAYMENT_ACTUAL`` values of the rows with
    ``GRACE_ON == 1`` are collected into a set. If the set has fewer than two
    elements (including the case of no grace rows at all), every row of that
    contract gets ``IS_GRACE_CONSTANT = 1``; otherwise 0. The column is added
    to ``data_table.matrix`` in place, with the dtype of ``REPAYMENT_ACTUAL``,
    and its name is appended to ``data_table.indicator_features``.

    Flags are written back through each group's row positions, so every row
    receives its own contract's flag whatever the row order of the frame is.
    (Concatenating per-group results in sorted-group order is only correct
    when the frame is already ordered that way.)

    NOTE(review): values are compared with Python set semantics, so each NaN
    in the grace rows counts as a separate value - confirm this is intended.
    """
    matrix = data_table.matrix
    grace_on = np.array(matrix['GRACE_ON'])
    actual = np.array(matrix['REPAYMENT_ACTUAL'])
    is_grace_constant = np.zeros_like(actual)

    # `.indices` maps each contract id to the integer positions of its rows.
    for positions in matrix.groupby('CONTRACT_ID').indices.values():
        in_grace = grace_on[positions] == 1
        grace_values = set(actual[positions][in_grace])
        if len(grace_values) < 2:
            is_grace_constant[positions] = 1

    matrix['IS_GRACE_CONSTANT'] = is_grace_constant
    data_table.indicator_features.append('IS_GRACE_CONSTANT')

    
def _make_target_columns(data_table):
    """Derive the three extra target columns from ``REPAYMENT_ACTUAL``.

    Adds to ``data_table.matrix`` in place, in this order:
    ``LOG_REPAYMENT_ACTUAL`` (``log1p`` of the actual repayment),
    ``PERCENT_ACTUAL`` (actual repayment as a percentage of ``CONTRACT_SUM``)
    and ``LOG_PERCENT_ACTUAL`` (``log1p`` of that percentage).
    """
    frame = data_table.matrix
    actual = frame['REPAYMENT_ACTUAL']
    percent_of_contract = actual / frame['CONTRACT_SUM'] * 100

    frame['LOG_REPAYMENT_ACTUAL'] = np.log1p(actual)
    frame['PERCENT_ACTUAL'] = percent_of_contract
    frame['LOG_PERCENT_ACTUAL'] = np.log1p(percent_of_contract)

        
def make_source_datatable(data_static, data_repay):
    """Join the two raw frames and run the full feature-engineering pipeline.

    The frames are merged on ``CONTRACT_ID``, ``GENDER`` is recoded to 1 for
    ``'M'`` and 0 otherwise, and the feature builders are then applied to the
    resulting ``DataTable`` in a fixed order.

    Returns
    -------
    DataTable
        Table whose ``matrix`` is the merged, feature-enriched frame.
    """
    joined = pd.merge(data_static, data_repay, on='CONTRACT_ID')
    joined['GENDER'] = 1 * (joined['GENDER'] == 'M')

    data_table = DataTable()
    data_table.matrix = joined

    # Order matters: the relative and log builders read the feature lists
    # filled by the builders before them, and the grace-constant flag reads
    # the GRACE_ON column created by the custom features.
    pipeline = (
        _make_payment_cumulative_and_average_features,
        _make_payment_history_features,
        _make_payment_relative_features,
        _make_payment_log_features,
        _make_regular_log_features,
        _make_custom_features,
        _make_is_grace_constant_feature,
        _make_target_columns,
    )
    for build_step in pipeline:
        build_step(data_table)
    return data_table

In [4]:
def train_test_split(data_table, test_ratio=0.1, random_state=None):
    """Split a table into train / test parts by contract, after dropping NaN rows.

    The unique ``CONTRACT_ID`` values of the full matrix are shuffled; the
    first ``int(n_contracts * test_ratio)`` of them form the test set and the
    rest the train set, so all rows of a contract land on the same side. Rows
    containing any missing value are removed before the split.

    Parameters
    ----------
    data_table : object with a ``matrix`` DataFrame holding ``CONTRACT_ID``.
        It is not modified.
    test_ratio : float, optional
        Fraction of contracts assigned to the test set, in ``[0, 1]``.
        Defaults to ``0.1``.
    random_state : int or None, optional
        If ``None`` (default) the shuffle uses NumPy's global random state, so
        ``np.random.seed`` controls it. Otherwise a private
        ``np.random.RandomState(random_state)`` is used and the global state
        is left untouched.

    Returns
    -------
    (train_table, test_table, dropna_table)
        Copies of ``data_table`` whose ``matrix`` holds the train rows, the
        test rows and all NaN-free rows respectively. Each copy owns
        independent copies of every other attribute.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError('test_ratio must be within [0, 1], got {!r}'.format(test_ratio))

    contracts = np.unique(np.array(data_table.matrix['CONTRACT_ID'].tolist()))
    if random_state is None:
        np.random.shuffle(contracts)
    else:
        np.random.RandomState(random_state).shuffle(contracts)

    n_test = int(contracts.shape[0] * test_ratio)
    test_contracts = contracts[:n_test].tolist()
    train_contracts = contracts[n_test:].tolist()

    # dropna returns a new frame, so the source matrix is never touched.
    data_learn = data_table.matrix.dropna(axis=0)
    contract_column = data_learn['CONTRACT_ID']
    data_train = data_learn[contract_column.isin(train_contracts)].copy()
    data_test = data_learn[contract_column.isin(test_contracts)].copy()

    def clone_with_matrix(matrix):
        # Deep-copy only the metadata attributes. Deep-copying the whole table
        # would duplicate the full matrix just to discard it immediately.
        clone = copy.copy(data_table)
        for attr_name, attr_value in vars(data_table).items():
            if attr_name != 'matrix':
                setattr(clone, attr_name, copy.deepcopy(attr_value))
        clone.matrix = matrix
        return clone

    return (
        clone_with_matrix(data_train),
        clone_with_matrix(data_test),
        clone_with_matrix(data_learn),
    )
    

In [5]:
# Build the full feature table from the two raw frames, then split it into
# train / test tables plus a table of all rows without missing values.
all_data_table = make_source_datatable(data_static, data_repay)
train_data_table, test_data_table, dropna_table = train_test_split(all_data_table)

# Make sure the output directory for the prepared artifacts exists.
if not os.path.exists('prepared_data'):
    os.makedirs('prepared_data')

def check_existence(list_of_paths):
    """Return True only if every path in ``list_of_paths`` exists.

    Stops at the first missing path; an empty input yields True.
    """
    return all(os.path.exists(path) for path in list_of_paths)

# One (table, csv path, joblib path) entry per artifact. The existence check
# and the writes below both read from this list. Previously the check looked
# for 'train_data.csv' / 'test_data.csv' while the code wrote
# 'train_series.csv' / 'test_series.csv', so the guard could never be
# satisfied by this cell's own output and everything was rewritten on every
# run. The written file names are kept unchanged.
prepared_outputs = [
    (train_data_table, 'prepared_data/train_series.csv', 'prepared_data/train_data.jbl'),
    (test_data_table, 'prepared_data/test_series.csv', 'prepared_data/test_data.jbl'),
    (dropna_table, 'prepared_data/all_data_dropped_na.csv', 'prepared_data/all_data_dropped_na.jbl'),
    (all_data_table, 'prepared_data/all_data_with_na.csv', 'prepared_data/all_data_with_na.jbl'),
]

paths_list = [
    path
    for _, csv_path, jbl_path in prepared_outputs
    for path in (csv_path, jbl_path)
]

# Write everything unless every artifact is already on disk.
if not check_existence(paths_list):
    for table, csv_path, jbl_path in prepared_outputs:
        table.matrix.to_csv(csv_path)
        joblib.dump(table, jbl_path)