In [None]:
from dataset import MysqlIO
import pandas as pd
# import pandas_profiling as pp
# from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils import add_prefix_to_colnames, onehot, summary_group_cate_data, summary_group_num_data, summary_df, day_to_int

%matplotlib inline

# Load Dataset

In [None]:
db = MysqlIO(host='relational.fit.cvut.cz', 
            database='financial', 
            user='guest', 
            password='relational')

print(db.execute('SHOW TABLES'))

loan_df = db.execute_to_df('SELECT * FROM loan')
order_df = db.execute_to_df('SELECT * FROM `order`')
trans_df = db.execute_to_df('''
    SELECT * FROM trans WHERE account_id IN 
    (SELECT account_id FROM loan)
   ''')
disp_df = db.execute_to_df('SELECT * FROM disp')
account_df = db.execute_to_df('SELECT * FROM account')
client_df = db.execute_to_df('SELECT * FROM client')
card_df = db.execute_to_df('SELECT * FROM card')
district_df = db.execute_to_df('SELECT * FROM district')

In [None]:
summary_df(loan_df, "loan")
summary_df(order_df, "order")
summary_df(trans_df, "trans")
summary_df(disp_df, "disp")
summary_df(account_df, "account")
summary_df(client_df, "client")
summary_df(card_df, "card")
summary_df(district_df, "district")

# Facts you need to know about Dataset

![title](img/Data_description.png)

Data Description
* relation account (4500 records) - each record describes static characteristics of an account,
* relation client (5369 records) - each record describes characteristics of a client,
* relation disposition (5369 records) - each record relates together a client with an account i.e. this relation describes the rights of clients to operate accounts,
* relation permanent order (6471 records) - each record describes characteristics of a payment order,
* relation transaction (1056320 records) - each record describes one transaction on an account,
* relation loan (682 records) - each record describes a loan granted for a given account,
* relation credit card (892 records) - each record describes a credit card issued to an account,
* relation demographic data (77 records) - each record describes demographic characteristics of a district.

Deeper (you don't need to know)
* 1 client can have multiple accounts.
Also, 1 account can have multiple clients (owner, desponent).
* sanction if negative balance. sometimes amount is 0.

# Data manipulation

## loan table

In [None]:
loan_df_ = add_prefix_to_colnames(loan_df, prefix='loan_')
print(loan_df_.shape)
loan_df_.head()

In [None]:
# loan_df_.to_csv('table_csv/loan.csv', index=False)

## account table
* Edit 'frequency' field to English.
* Create feature 'day_before_loan'.

In [None]:
account_df.loc[account_df['frequency']=="POPLATEK MESICNE", 'frequency'] = 'monthly'
account_df.loc[account_df['frequency']=="POPLATEK TYDNE", 'frequency'] = 'weekly'
account_df.loc[account_df['frequency']=="POPLATEK PO OBRATU", 'frequency'] = 'after_transaction'
account_df.head()

In [None]:
# account_df.to_csv('table_csv/account.csv', index=False)

In [None]:
account_df_ = add_prefix_to_colnames(account_df, prefix='acc_')
print(account_df_.shape)
account_df_.head()

In [None]:
df = pd.merge(loan_df_, account_df_, how='left', on=['account_id'])
df['day_before_loan'] = df['loan_date'] - df['acc_date']
df['day_before_loan'] = day_to_int(df['day_before_loan'])
df = df.drop(['acc_date'], axis=1)
print(df.shape)
df.head()

## order manipulation

each record describes characteristics of a payment order (monthly).

We can extract all of the features in this table from the transaction table.

So, we decided not to use this table.

In [None]:
order_df.head()

order_df there are "spacing". so, we add NaN.

In [None]:
# order_df['k_symbol'].value_counts()

In [None]:
# order_df = order_df.replace("", np.nan)
# order_df['k_symbol'].value_counts()

In [None]:
# order_df.loc[order_df['k_symbol']=="POJISTNE", 'k_symbol'] = 'insurrance'
# order_df.loc[order_df['k_symbol']=="SIPO", 'k_symbol'] = 'household'
# order_df.loc[order_df['k_symbol']=="LEASING", 'k_symbol'] = 'leasing'
# order_df.loc[order_df['k_symbol']=="UVER", 'k_symbol'] = 'loan'

# # print(order_df.shape)
# order_df.head()

In [None]:
# add_prefix_to_colnames(order_df, prefix='order_').to_csv('table_csv/order.csv', index=False)

drop k_symbol nan because we don't know what k_symbol it is.

In [None]:
# order_df = order_df[order_df['k_symbol'].notna()]
# order_df.head()

k_symbol onehot encoding

this tells us, each account have had monthly payment in each k_symbol.

In [None]:
# order_df_ = onehot(order_df, col_name='k_symbol', prefix='monthly_payment_for')
# order_df_.head()

multiple each k_symbol with its amount. because this tells us each mont the account need to pay how much. not just have had pay for what.

In [None]:
# order_df_['monthly_payment_for_household'] = order_df_['monthly_payment_for_household'] * order_df_['amount'] 
# order_df_['monthly_payment_for_insurrance'] = order_df_['monthly_payment_for_insurrance'] * order_df_['amount'] 
# order_df_['monthly_payment_for_leasing'] = order_df_['monthly_payment_for_leasing'] * order_df_['amount'] 
# order_df_['monthly_payment_for_loan'] = order_df_['monthly_payment_for_loan'] * order_df_['amount']
# order_df_.head()

Drop bank_to, account_to, k_symbol

In [None]:
# order_df_ = order_df_.drop(['order_id', 'bank_to', 'account_to', 'amount'], axis=1)
# order_df_.head()

In [None]:
# order_df_ = add_prefix_to_colnames(order_df_, prefix='order_')
# order_df_.head()

Order -> useless เพราะ k_symbol ของ transaction ละเอียดกว่า

In [None]:
# df = pd.merge(df, order_df_, how='left', on=['account_id'])
# df = pd.merge(df, order_ksymbol, how='left', on=['account_id'])
df.head()

## Transaction

* Replace None (null values), spacing with NaN
* Rename feature 'type', 'operation', 'k_symbol' to English.
* Money in, Money out -> separate amount to amount_in, amount_out
* Separate data to Before loan and After loan (we can use only Before loan data)

In [None]:
summary_df(trans_df)

replace None (null values), spacing with NaN

In [None]:
# check values not in list (show spacing)
trans_df.loc[~trans_df['k_symbol'].isin(["POJISTNE", "SLUZBY", "UROK", "SANKC. UROK", "SIPO", "DUCHOD", "UVER", np.nan]) , :].head()

In [None]:
# fill spacing with nan
trans_df = trans_df.replace(' ', np.nan)
# fill None with nan
trans_df = trans_df.fillna(value=np.nan)
# check
trans_df.loc[~trans_df['k_symbol'].isin(["POJISTNE", "SLUZBY", "UROK", "SANKC. UROK", "SIPO", "DUCHOD", "UVER", np.nan]) , :]

type have class "PRIJEM", "VYDAJ". Also "VYBER" (not show in https://web.archive.org/web/20180506035658/http://lisp.vse.cz/pkdd99/Challenge/berka.htm)

include "VYBER" to "VYDAJ" as withdrawal.

In [None]:
trans_df = trans_df.sort_values(by=['account_id', 'date'])
trans_df = trans_df.copy()
# type
trans_df.loc[trans_df['type']=="PRIJEM", 'type'] = 'credit'
trans_df.loc[trans_df['type']=="VYDAJ", 'type'] = 'withdrawal'
trans_df.loc[trans_df['type']=="VYBER", 'type'] = 'withdrawal'

# operation
trans_df.loc[trans_df['operation']=="VYBER KARTOU", 'operation'] = 'credit_card_withdrawal'
trans_df.loc[trans_df['operation']=="VKLAD", 'operation'] = 'credit_in_cash'
trans_df.loc[trans_df['operation']=="PREVOD Z UCTU", 'operation'] = 'collection_from_anotherbank'
trans_df.loc[trans_df['operation']=="VYBER", 'operation'] = 'withdrawal_in_cash'
trans_df.loc[trans_df['operation']=="PREVOD NA UCET", 'operation'] = 'remittance_to_anotherbank'

# k_symbol
trans_df.loc[trans_df['k_symbol']=="POJISTNE", 'k_symbol'] = 'insurrance'
trans_df.loc[trans_df['k_symbol']=="SLUZBY", 'k_symbol'] = 'payment_for_statement'
trans_df.loc[trans_df['k_symbol']=="UROK", 'k_symbol'] = 'interest_credited'
trans_df.loc[trans_df['k_symbol']=="SANKC. UROK", 'k_symbol'] = 'sanction_interest_neg_bal'
trans_df.loc[trans_df['k_symbol']=="SIPO", 'k_symbol'] = 'household'
trans_df.loc[trans_df['k_symbol']=="DUCHOD", 'k_symbol'] = 'old-age_pension'
trans_df.loc[trans_df['k_symbol']=="UVER", 'k_symbol'] = 'loan'

trans_df

In [None]:
# add_prefix_to_colnames(trans_df, prefix='trans_').to_csv('table_csv/transaction.csv', index=False)

### money in, money out

In [149]:
trans_df_in = trans_df.copy()
trans_df_in.head()

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,276,2,1993-02-26,credit,credit_in_cash,1100,1100,,,
1,279,2,1993-03-12,credit,collection_from_anotherbank,20236,21336,,ST,66487163.0
338,697,2,1993-03-28,credit,credit_in_cash,3700,25036,,,
408,3530483,2,1993-03-31,credit,,14,25050,interest_credited,,
2,280,2,1993-04-12,credit,collection_from_anotherbank,20236,45286,,ST,66487163.0


In [163]:
def money_in_out(df, focus_colname, cond_colname='type', in_colname='credit', out_colname='withdrawal'):
    money_in = df[df[cond_colname] == in_colname][focus_colname]
    money_out = df[df[cond_colname] == out_colname][focus_colname]

    return [money_in, money_out]

trans_df_in.loc[trans_df_in['type'] == 'credit', 'amount_in'] = money_in_out(trans_df_in, focus_colname='amount')[0] # trans_df_before_loan[trans_df_before_loan['type'] == 'credit']['amount']
trans_df_in.loc[trans_df_in['type'] == 'withdrawal', 'amount_out'] = money_in_out(trans_df_in, focus_colname='amount')[1]
trans_df_in.head()

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,amount_in,amount_out
0,276,2,1993-02-26,credit,credit_in_cash,1100,1100,,,,1100.0,
1,279,2,1993-03-12,credit,collection_from_anotherbank,20236,21336,,ST,66487163.0,20236.0,
338,697,2,1993-03-28,credit,credit_in_cash,3700,25036,,,,3700.0,
408,3530483,2,1993-03-31,credit,,14,25050,interest_credited,,,14.0,
2,280,2,1993-04-12,credit,collection_from_anotherbank,20236,45286,,ST,66487163.0,20236.0,


In [164]:
# replace amount in out nan with 0
trans_df_in.loc[:, ['amount_in', 'amount_out']] = trans_df_in.loc[:, ['amount_in', 'amount_out']].replace(np.nan, 0)
trans_df_in

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,amount_in,amount_out
0,276,2,1993-02-26,credit,credit_in_cash,1100,1100,,,,1100.0,0.0
1,279,2,1993-03-12,credit,collection_from_anotherbank,20236,21336,,ST,66487163.0,20236.0,0.0
338,697,2,1993-03-28,credit,credit_in_cash,3700,25036,,,,3700.0,0.0
408,3530483,2,1993-03-31,credit,,14,25050,interest_credited,,,14.0,0.0
2,280,2,1993-04-12,credit,collection_from_anotherbank,20236,45286,,ST,66487163.0,20236.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
191318,3424092,11362,1998-12-08,withdrawal,remittance_to_anotherbank,56,51420,,MN,78507822.0,0.0,56.0
191284,3424044,11362,1998-12-10,withdrawal,remittance_to_anotherbank,4780,46640,household,YZ,70641225.0,0.0,4780.0
191420,3424236,11362,1998-12-12,withdrawal,remittance_to_anotherbank,5392,41248,loan,MN,61540514.0,0.0,5392.0
191425,3424248,11362,1998-12-19,withdrawal,withdrawal_in_cash,2880,38368,,,,0.0,2880.0


### Focus only transaction before loan

In [165]:
trans_df_in.head()

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,amount_in,amount_out
0,276,2,1993-02-26,credit,credit_in_cash,1100,1100,,,,1100.0,0.0
1,279,2,1993-03-12,credit,collection_from_anotherbank,20236,21336,,ST,66487163.0,20236.0,0.0
338,697,2,1993-03-28,credit,credit_in_cash,3700,25036,,,,3700.0,0.0
408,3530483,2,1993-03-31,credit,,14,25050,interest_credited,,,14.0,0.0
2,280,2,1993-04-12,credit,collection_from_anotherbank,20236,45286,,ST,66487163.0,20236.0,0.0


In [166]:
# drop unnecessary features
trans_df_in = trans_df_in.drop(['trans_id', 'bank', 'account'], axis=1)

In [167]:
trans_df_before_loan = pd.merge(trans_df_in, loan_df_[['account_id', 'loan_date']], how='left', on='account_id')
trans_df_before_loan['day_after_loan'] = day_to_int(trans_df_before_loan['loan_date'] - trans_df_before_loan['date'])

# after loan
trans_df_after_loan = trans_df_before_loan[trans_df_before_loan['day_after_loan']<=0]
# before loan
trans_df_before_loan = trans_df_before_loan[trans_df_before_loan['day_after_loan']>0]

# trans_df_before_loan = trans_df_before_loan.drop(['day_after_loan', 'loan_date'], axis=1)
print(trans_df_before_loan.shape)
trans_df_before_loan.head()

(54694, 11)


Unnamed: 0,account_id,date,type,operation,amount,balance,k_symbol,amount_in,amount_out,loan_date,day_after_loan
0,2,1993-02-26,credit,credit_in_cash,1100,1100,,1100.0,0.0,1994-01-05,313.0
1,2,1993-03-12,credit,collection_from_anotherbank,20236,21336,,20236.0,0.0,1994-01-05,299.0
2,2,1993-03-28,credit,credit_in_cash,3700,25036,,3700.0,0.0,1994-01-05,283.0
3,2,1993-03-31,credit,,14,25050,interest_credited,14.0,0.0,1994-01-05,280.0
4,2,1993-04-12,credit,collection_from_anotherbank,20236,45286,,20236.0,0.0,1994-01-05,268.0


In [168]:
# trans_df_after_loan = trans_df_after_loan.drop(['day_after_loan', 'loan_date'], axis=1)
print(trans_df_after_loan.shape)
trans_df_after_loan.head()

(136862, 11)


Unnamed: 0,account_id,date,type,operation,amount,balance,k_symbol,amount_in,amount_out,loan_date,day_after_loan
54,2,1994-01-05,withdrawal,remittance_to_anotherbank,7266,20575,household,0.0,7266.0,1994-01-05,0.0
55,2,1994-01-12,credit,collection_from_anotherbank,20236,40811,,20236.0,0.0,1994-01-05,-7.0
56,2,1994-01-12,withdrawal,remittance_to_anotherbank,3373,35338,loan,0.0,3373.0,1994-01-05,-7.0
57,2,1994-01-12,withdrawal,withdrawal_in_cash,2100,38711,,0.0,2100.0,1994-01-05,-7.0
58,2,1994-01-18,withdrawal,withdrawal_in_cash,2400,32938,,0.0,2400.0,1994-01-05,-13.0


In [None]:
# print(trans_df_before_loan['type'].value_counts(), '\n')
# print(trans_df_before_loan['operation'].value_counts(), '\n')
# print(trans_df_before_loan['k_symbol'].value_counts(), '\n')

In [None]:
# print(trans_df_after_loan['type'].value_counts(), '\n')
# print(trans_df_after_loan['operation'].value_counts(), '\n')
# print(trans_df_after_loan['k_symbol'].value_counts(), '\n')

In [169]:
trans_df_out = trans_df_before_loan.copy()

In [None]:
# trans_df_before_loan.to_csv('data/trans_before_loan.csv', index=False)

In [None]:
# trans_df_after_loan.to_csv('data/trans_after_loan.csv', index=False)

### Explore Operation & k_symbol

In [170]:
trans_df_in = trans_df_out.copy()

In [171]:
trans_df_in['operation'].fillna('Others', inplace=True)
trans_df_in['k_symbol'].fillna('Others', inplace=True)
trans_df_in.head()

Unnamed: 0,account_id,date,type,operation,amount,balance,k_symbol,amount_in,amount_out,loan_date,day_after_loan
0,2,1993-02-26,credit,credit_in_cash,1100,1100,Others,1100.0,0.0,1994-01-05,313.0
1,2,1993-03-12,credit,collection_from_anotherbank,20236,21336,Others,20236.0,0.0,1994-01-05,299.0
2,2,1993-03-28,credit,credit_in_cash,3700,25036,Others,3700.0,0.0,1994-01-05,283.0
3,2,1993-03-31,credit,Others,14,25050,interest_credited,14.0,0.0,1994-01-05,280.0
4,2,1993-04-12,credit,collection_from_anotherbank,20236,45286,Others,20236.0,0.0,1994-01-05,268.0


In [None]:
sns.countplot(data=trans_df_in, x='type', hue='operation')
plt.show()

In [None]:
sns.countplot(data=trans_df_in, x='type', hue='k_symbol')
plt.show()

In [None]:
plt.figure(figsize=(20, 5))
sns.countplot(data=trans_df_in, x='operation', hue='k_symbol')
plt.show()

The operation categorical simply tells us how money enters and leaves.
As a result, it is unimportant. So we don't mind the operation.

In [172]:
trans_df_out = trans_df_in.copy()

### Monthly payment (permanent order)

focus only monthly transaction. \
We can see this using k_symbol.
1. one-hot encoding to k_symbol

2. Create features for total monthly payments 
(count sanction, money amount in each k_symbol) 

*This tells us how much each account has paid us and whether or not they have been sanctioned.*

3. Create features for current monthly payments by determining the previous month's payment for each account. (amount of money paid in each k symbol last month)

*This tells us what you have to pay each month and how much you have to pay now.*

4. merge Total monthly payment features and Current monthly payment.

In [173]:
trans_df_in = trans_df_out[trans_df_out['k_symbol']!='Others']
print(trans_df_in.shape)
trans_df_in.head()

(20272, 11)


Unnamed: 0,account_id,date,type,operation,amount,balance,k_symbol,amount_in,amount_out,loan_date,day_after_loan
3,2,1993-03-31,credit,Others,14,25050,interest_credited,14.0,0.0,1994-01-05,280.0
6,2,1993-04-30,credit,Others,110,34395,interest_credited,110.0,0.0,1994-01-05,250.0
9,2,1993-05-31,credit,Others,145,37176,interest_credited,145.0,0.0,1994-01-05,219.0
12,2,1993-06-30,credit,Others,160,45290,interest_credited,160.0,0.0,1994-01-05,189.0
17,2,1993-07-31,withdrawal,withdrawal_in_cash,15,39069,payment_for_statement,0.0,15.0,1994-01-05,158.0


In [174]:
trans_df_ = trans_df_in.copy()

In [175]:
# onehot
trans_df_ = onehot(trans_df_, col_name='k_symbol', prefix='out',drop=False)
trans_df_ = trans_df_.rename(columns={'out_interest_credited':'in_interest_credited'})
trans_df_.head()

Unnamed: 0,account_id,date,type,operation,amount,balance,k_symbol,amount_in,amount_out,loan_date,day_after_loan,out_household,out_insurrance,in_interest_credited,out_payment_for_statement,out_sanction_interest_neg_bal
3,2,1993-03-31,credit,Others,14,25050,interest_credited,14.0,0.0,1994-01-05,280.0,0,0,1,0,0
6,2,1993-04-30,credit,Others,110,34395,interest_credited,110.0,0.0,1994-01-05,250.0,0,0,1,0,0
9,2,1993-05-31,credit,Others,145,37176,interest_credited,145.0,0.0,1994-01-05,219.0,0,0,1,0,0
12,2,1993-06-30,credit,Others,160,45290,interest_credited,160.0,0.0,1994-01-05,189.0,0,0,1,0,0
17,2,1993-07-31,withdrawal,withdrawal_in_cash,15,39069,payment_for_statement,0.0,15.0,1994-01-05,158.0,0,0,0,1,0


In [176]:
out_trans = ['out_household', 'out_insurrance', 'out_payment_for_statement', 'out_sanction_interest_neg_bal']
# count sanction if neg balance. 
trans_df_['count_out_sanction_interest_neg_bal'] = trans_df_['out_sanction_interest_neg_bal']

trans_df_.loc[:, out_trans] = trans_df_[out_trans].multiply(trans_df_["amount_out"], axis=0)


trans_df_.loc[:, 'in_interest_credited'] = trans_df_['in_interest_credited'].multiply(trans_df_["amount_in"], axis=0)
trans_df_.head()

Unnamed: 0,account_id,date,type,operation,amount,balance,k_symbol,amount_in,amount_out,loan_date,day_after_loan,out_household,out_insurrance,in_interest_credited,out_payment_for_statement,out_sanction_interest_neg_bal,count_out_sanction_interest_neg_bal
3,2,1993-03-31,credit,Others,14,25050,interest_credited,14.0,0.0,1994-01-05,280.0,0.0,0.0,14.0,0.0,0.0,0
6,2,1993-04-30,credit,Others,110,34395,interest_credited,110.0,0.0,1994-01-05,250.0,0.0,0.0,110.0,0.0,0.0,0
9,2,1993-05-31,credit,Others,145,37176,interest_credited,145.0,0.0,1994-01-05,219.0,0.0,0.0,145.0,0.0,0.0,0
12,2,1993-06-30,credit,Others,160,45290,interest_credited,160.0,0.0,1994-01-05,189.0,0.0,0.0,160.0,0.0,0.0,0
17,2,1993-07-31,withdrawal,withdrawal_in_cash,15,39069,payment_for_statement,0.0,15.0,1994-01-05,158.0,0.0,0.0,0.0,15.0,0.0,0


In [177]:
trans_monthly_payment = trans_df_.loc[:, ['account_id', 'amount', 'amount_in', 'amount_out', 'out_household', 'out_insurrance', 'in_interest_credited', 'out_payment_for_statement', 'out_sanction_interest_neg_bal', 'count_out_sanction_interest_neg_bal']].copy()
trans_monthly_payment = trans_monthly_payment.groupby(by='account_id').sum().reset_index()

trans_monthly_payment['count_monthly_trans'] = trans_df_.groupby(by='account_id', as_index=False).count()['amount']

trans_monthly_payment = add_prefix_to_colnames(trans_monthly_payment, prefix='sum_monthly_payment_')
trans_monthly_payment = trans_monthly_payment.rename(columns={'sum_monthly_payment_count_monthly_trans':'count_monthly_payment_trans', 'sum_monthly_payment_count_out_sanction_interest_neg_bal':'count_out_sanction_interest_neg_bal'})
trans_monthly_payment

Unnamed: 0,account_id,sum_monthly_payment_amount,sum_monthly_payment_amount_in,sum_monthly_payment_amount_out,sum_monthly_payment_out_household,sum_monthly_payment_out_insurrance,sum_monthly_payment_in_interest_credited,sum_monthly_payment_out_payment_for_statement,sum_monthly_payment_out_sanction_interest_neg_bal,count_out_sanction_interest_neg_bal,count_monthly_payment_trans
0,2,37711,1291.0,36420.0,36330.0,0.0,1291.0,90.0,0.0,0,21
1,19,130355,2835.0,127520.0,127400.0,0.0,2835.0,120.0,0.0,0,40
2,25,102031,4087.0,97944.0,95438.0,2296.0,4087.0,210.0,0.0,0,57
3,37,31436,2303.0,29133.0,27804.0,1164.0,2303.0,165.0,0.0,1,50
4,38,29750,1114.0,28636.0,27604.0,912.0,1114.0,120.0,0.0,0,20
...,...,...,...,...,...,...,...,...,...,...,...
674,11327,24415,2167.0,22248.0,22158.0,0.0,2167.0,90.0,0.0,0,22
675,11328,3636,3411.0,225.0,0.0,0.0,3411.0,225.0,0.0,0,34
676,11349,11643,757.0,10886.0,10886.0,0.0,757.0,0.0,0.0,0,5
677,11359,80673,3597.0,77076.0,76806.0,0.0,3597.0,270.0,0.0,0,57


In [178]:
trans_df_['last_month'] = trans_df_['loan_date'] - pd.Timedelta(days=31)

In [179]:
current_monthly_payment = trans_df_[day_to_int(trans_df_['date']-trans_df_['last_month']) >= 0][['date', 'account_id', 'amount_out', 'out_household', 'out_insurrance', 'out_payment_for_statement', 'in_interest_credited']]
current_monthly_payment = current_monthly_payment.groupby(by='account_id', as_index=False).sum()
current_monthly_payment = add_prefix_to_colnames(current_monthly_payment, prefix='current_monthly_payment_')
current_monthly_payment

Unnamed: 0,account_id,current_monthly_payment_amount_out,current_monthly_payment_out_household,current_monthly_payment_out_insurrance,current_monthly_payment_out_payment_for_statement,current_monthly_payment_in_interest_credited
0,2,7281.0,7266.0,0.0,15.0,138.0
1,19,18215.0,18200.0,0.0,15.0,168.0
2,25,6996.0,6817.0,164.0,15.0,372.0
3,37,2429.0,2317.0,97.0,15.0,94.0
4,38,7159.0,6901.0,228.0,30.0,117.0
...,...,...,...,...,...,...
674,11327,3708.0,3693.0,0.0,15.0,272.0
675,11328,15.0,0.0,0.0,15.0,189.0
676,11349,10886.0,10886.0,0.0,0.0,263.0
677,11359,4533.0,4518.0,0.0,15.0,110.0


In [180]:
trans_monthly_payment_summary = pd.merge(trans_monthly_payment, current_monthly_payment, how='left', on=['account_id'])
trans_monthly_payment_summary

Unnamed: 0,account_id,sum_monthly_payment_amount,sum_monthly_payment_amount_in,sum_monthly_payment_amount_out,sum_monthly_payment_out_household,sum_monthly_payment_out_insurrance,sum_monthly_payment_in_interest_credited,sum_monthly_payment_out_payment_for_statement,sum_monthly_payment_out_sanction_interest_neg_bal,count_out_sanction_interest_neg_bal,count_monthly_payment_trans,current_monthly_payment_amount_out,current_monthly_payment_out_household,current_monthly_payment_out_insurrance,current_monthly_payment_out_payment_for_statement,current_monthly_payment_in_interest_credited
0,2,37711,1291.0,36420.0,36330.0,0.0,1291.0,90.0,0.0,0,21,7281.0,7266.0,0.0,15.0,138.0
1,19,130355,2835.0,127520.0,127400.0,0.0,2835.0,120.0,0.0,0,40,18215.0,18200.0,0.0,15.0,168.0
2,25,102031,4087.0,97944.0,95438.0,2296.0,4087.0,210.0,0.0,0,57,6996.0,6817.0,164.0,15.0,372.0
3,37,31436,2303.0,29133.0,27804.0,1164.0,2303.0,165.0,0.0,1,50,2429.0,2317.0,97.0,15.0,94.0
4,38,29750,1114.0,28636.0,27604.0,912.0,1114.0,120.0,0.0,0,20,7159.0,6901.0,228.0,30.0,117.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
674,11327,24415,2167.0,22248.0,22158.0,0.0,2167.0,90.0,0.0,0,22,3708.0,3693.0,0.0,15.0,272.0
675,11328,3636,3411.0,225.0,0.0,0.0,3411.0,225.0,0.0,0,34,15.0,0.0,0.0,15.0,189.0
676,11349,11643,757.0,10886.0,10886.0,0.0,757.0,0.0,0.0,0,5,10886.0,10886.0,0.0,0.0,263.0
677,11359,80673,3597.0,77076.0,76806.0,0.0,3597.0,270.0,0.0,0,57,4533.0,4518.0,0.0,15.0,110.0


In [181]:
# replace NaN with zero.
df = pd.merge(df, trans_monthly_payment_summary, how='left', on=['account_id'])
df = df.replace(np.nan, 0)
df.isna().any().any() # check that there is nan or not

False

In [None]:
# df.to_csv('incloud_monthly_trans.csv', index=False)

divide transaction amount sum, count with day before loan

In [None]:
# t = df.copy()

In [None]:
# t_list = ['sum_monthly_payment_amount', 'sum_monthly_payment_amount_in', 'sum_monthly_payment_amount_out', 'sum_monthly_payment_out_household','sum_monthly_payment_out_insurrance','sum_monthly_payment_in_interest_credited','sum_monthly_payment_out_payment_for_statement','sum_monthly_payment_out_sanction_interest_neg_bal','count_out_sanction_interest_neg_bal', 'count_monthly_payment_trans','current_monthly_payment_amount_out','current_monthly_payment_out_household','current_monthly_payment_out_insurrance','current_monthly_payment_out_payment_for_statement','current_monthly_payment_in_interest_credited']
# t.loc[:, t_list] = t[t_list].div(t['day_before_loan'], axis=0)

In [None]:
# t.to_csv('incloud_monthly_trans_normbyday.csv', index=False)

### Other transactions

Create features 
* sum other amount
* sum_other_amount_in, sum_other_amount_out
* count other trans

การใช้จ่ายอื่น ๆ (ที่ไม่ใช่จ่ายรายเดือนใน k_symbol)

In [182]:
trans_df_in = trans_df_out[trans_df_out['k_symbol']=='Others']
print(trans_df_in.shape)
trans_df_in.head()

(34422, 11)


Unnamed: 0,account_id,date,type,operation,amount,balance,k_symbol,amount_in,amount_out,loan_date,day_after_loan
0,2,1993-02-26,credit,credit_in_cash,1100,1100,Others,1100.0,0.0,1994-01-05,313.0
1,2,1993-03-12,credit,collection_from_anotherbank,20236,21336,Others,20236.0,0.0,1994-01-05,299.0
2,2,1993-03-28,credit,credit_in_cash,3700,25036,Others,3700.0,0.0,1994-01-05,283.0
4,2,1993-04-12,credit,collection_from_anotherbank,20236,45286,Others,20236.0,0.0,1994-01-05,268.0
5,2,1993-04-27,withdrawal,withdrawal_in_cash,11000,34286,Others,0.0,11000.0,1994-01-05,253.0


In [203]:
trans_orthers = trans_df_in.loc[:, ['account_id', 'amount', 'amount_in', 'amount_out']].groupby(by='account_id').sum().reset_index()
trans_orthers.loc[:, ['count_other_trans']] = trans_df_in.groupby(by='account_id', as_index=False).count()['amount']

trans_orthers = add_prefix_to_colnames(trans_orthers, prefix='sum_other_')
trans_orthers = trans_orthers.rename(columns={'sum_other_count_other_trans':'count_other_trans'})
trans_orthers

Unnamed: 0,account_id,sum_other_amount,sum_other_amount_in,sum_other_amount_out,count_other_trans
0,2,391823,227396.0,164427.0,33
1,19,338153,239345.0,98808.0,40
2,25,1884662,1005069.0,879593.0,107
3,37,846920,445057.0,401863.0,66
4,38,229641,147290.0,82351.0,35
...,...,...,...,...,...
677,11327,406396,244296.0,162100.0,32
678,11328,859072,443592.0,415480.0,72
679,11349,428034,231434.0,196600.0,13
680,11359,1199517,649336.0,550181.0,90


In [204]:
df = pd.merge(df, trans_orthers, how='left', on=['account_id'])
df.head()

Unnamed: 0,loan_loan_id,account_id,loan_date,loan_amount,loan_duration,loan_payments,loan_status,acc_district_id,acc_frequency,day_before_loan,...,count_monthly_payment_trans_y,current_monthly_payment_amount_out_y,current_monthly_payment_out_household_y,current_monthly_payment_out_insurrance_y,current_monthly_payment_out_payment_for_statement_y,current_monthly_payment_in_interest_credited_y,sum_other_amount,sum_other_amount_in,sum_other_amount_out,count_other_trans_y
0,4959,2,1994-01-05,80952,24,3373.0,A,1,monthly,313.0,...,21.0,7281.0,7266.0,0.0,15.0,138.0,391823,227396.0,164427.0,33
1,4961,19,1996-04-29,30276,12,2523.0,B,21,monthly,388.0,...,40.0,18215.0,18200.0,0.0,15.0,168.0,338153,239345.0,98808.0,40
2,4962,25,1997-12-08,30276,12,2523.0,A,68,monthly,498.0,...,57.0,6996.0,6817.0,164.0,15.0,372.0,1884662,1005069.0,879593.0,107
3,4967,37,1998-10-14,318480,60,5308.0,D,20,monthly,422.0,...,50.0,2429.0,2317.0,97.0,15.0,94.0,846920,445057.0,401863.0,66
4,4968,38,1998-04-19,110736,48,2307.0,C,19,weekly,254.0,...,20.0,7159.0,6901.0,228.0,30.0,117.0,229641,147290.0,82351.0,35


### balance latest

In [207]:
trans_df_before_loan.head()

Unnamed: 0,account_id,date,type,operation,amount,balance,k_symbol,amount_in,amount_out,loan_date,day_after_loan
0,2,1993-02-26,credit,credit_in_cash,1100,1100,,1100.0,0.0,1994-01-05,313.0
1,2,1993-03-12,credit,collection_from_anotherbank,20236,21336,,20236.0,0.0,1994-01-05,299.0
2,2,1993-03-28,credit,credit_in_cash,3700,25036,,3700.0,0.0,1994-01-05,283.0
3,2,1993-03-31,credit,,14,25050,interest_credited,14.0,0.0,1994-01-05,280.0
4,2,1993-04-12,credit,collection_from_anotherbank,20236,45286,,20236.0,0.0,1994-01-05,268.0


In [215]:
trans_df_before_loan = trans_df_before_loan.sort_values(by=['account_id', 'date'])
df['balance_latest'] = trans_df_before_loan.groupby('account_id').tail(1)['balance'].reset_index(drop=True)
df.head()

Unnamed: 0,loan_loan_id,account_id,loan_date,loan_amount,loan_duration,loan_payments,loan_status,acc_district_id,acc_frequency,day_before_loan,...,current_monthly_payment_amount_out_y,current_monthly_payment_out_household_y,current_monthly_payment_out_insurrance_y,current_monthly_payment_out_payment_for_statement_y,current_monthly_payment_in_interest_credited_y,sum_other_amount,sum_other_amount_in,sum_other_amount_out,count_other_trans_y,balance_latest
0,4959,2,1994-01-05,80952,24,3373.0,A,1,monthly,313.0,...,7281.0,7266.0,0.0,15.0,138.0,391823,227396.0,164427.0,33,27855
1,4961,19,1996-04-29,30276,12,2523.0,B,21,monthly,388.0,...,18215.0,18200.0,0.0,15.0,168.0,338153,239345.0,98808.0,40,15854
2,4962,25,1997-12-08,30276,12,2523.0,A,68,monthly,498.0,...,6996.0,6817.0,164.0,15.0,372.0,1884662,1005069.0,879593.0,107,31622
3,4967,37,1998-10-14,318480,60,5308.0,D,20,monthly,422.0,...,2429.0,2317.0,97.0,15.0,94.0,846920,445057.0,401863.0,66,16368
4,4968,38,1998-04-19,110736,48,2307.0,C,19,weekly,254.0,...,7159.0,6901.0,228.0,30.0,117.0,229641,147290.0,82351.0,35,37417


# Disp & Client & Card

In [216]:
summary_df(disp_df)
summary_df(client_df)
summary_df(card_df)


(5369, 4)


Unnamed: 0,disp_id,client_id,account_id,type
0,1,1,1,OWNER
1,2,2,2,OWNER
2,3,3,2,DISPONENT
3,4,4,3,OWNER
4,5,5,3,DISPONENT


missing value


disp_id       0
client_id     0
account_id    0
type          0
dtype: int64


(5369, 4)


Unnamed: 0,client_id,gender,birth_date,district_id
0,1,F,1970-12-13,18
1,2,M,1945-02-04,1
2,3,F,1940-10-09,1
3,4,M,1956-12-01,5
4,5,F,1960-07-03,5


missing value


client_id      0
gender         0
birth_date     0
district_id    0
dtype: int64


(892, 4)


Unnamed: 0,card_id,disp_id,type,issued
0,1,9,gold,1998-10-16
1,2,19,classic,1998-03-13
2,3,41,gold,1995-09-03
3,4,42,classic,1998-11-26
4,5,51,junior,1995-04-24


missing value


card_id    0
disp_id    0
type       0
issued     0
dtype: int64

In [217]:
dcc_df = pd.merge(disp_df, client_df, how='left', on='client_id')
dcc_df = pd.merge(dcc_df, card_df, how='left', on='disp_id')

dcc_df = dcc_df.rename(columns={'type_x': 'client_type', 'type_y':'card_type', 'issued':'card_issued'})
dcc_df.head()

Unnamed: 0,disp_id,client_id,account_id,client_type,gender,birth_date,district_id,card_id,card_type,card_issued
0,1,1,1,OWNER,F,1970-12-13,18,,,
1,2,2,2,OWNER,M,1945-02-04,1,,,
2,3,3,2,DISPONENT,F,1940-10-09,1,,,
3,4,4,3,OWNER,M,1956-12-01,5,,,
4,5,5,3,DISPONENT,F,1960-07-03,5,,,


birth_date -> age

In [218]:
dcc_df['birth_date'] = pd.to_datetime(dcc_df['birth_date'], format='%Y-%m-%d')
today = pd.to_datetime("1999-01-01", format='%Y-%m-%d')
dcc_df['age'] = [today.year - d.year for d in dcc_df['birth_date']]
dcc_df = dcc_df.drop(['disp_id', 'birth_date', 'district_id', 'card_id'], axis=1)
dcc_df

Unnamed: 0,client_id,account_id,client_type,gender,card_type,card_issued,age
0,1,1,OWNER,F,,,29
1,2,2,OWNER,M,,,54
2,3,2,DISPONENT,F,,,59
3,4,3,OWNER,M,,,43
4,5,3,DISPONENT,F,,,39
...,...,...,...,...,...,...,...
5364,13955,11349,OWNER,F,,,54
5365,13956,11349,DISPONENT,M,,,56
5366,13968,11359,OWNER,M,classic,1995-06-13,31
5367,13971,11362,OWNER,F,,,37


In [219]:
dcc_df['client_type'].value_counts()

OWNER        4500
DISPONENT     869
Name: client_type, dtype: int64

In [220]:
dcc_df['gender'].value_counts()

M    2724
F    2645
Name: gender, dtype: int64

client_id is unique. So, there is no more client in 1 account.

only have 1 account more client.

In [221]:
dcc_df[dcc_df.duplicated(['client_id'])]

Unnamed: 0,client_id,account_id,client_type,gender,card_type,card_issued,age


only owner can issue permanent orders and ask for a loan

In [222]:
dcc_df_ = dcc_df[dcc_df['client_type']=='OWNER'].reset_index(drop=True)

dcc_df_['all_client_mean_age'] = dcc_df.groupby(by='account_id')['age'].mean().reset_index(drop=True)
dcc_df_['all_client_count'] = dcc_df.groupby(by='account_id')['client_id'].count().reset_index(drop=True)
dcc_df_

Unnamed: 0,client_id,account_id,client_type,gender,card_type,card_issued,age,all_client_mean_age,all_client_count
0,1,1,OWNER,F,,,29,29.0,1
1,2,2,OWNER,M,,,54,56.5,2
2,4,3,OWNER,M,,,43,41.0,2
3,6,4,OWNER,M,,,80,80.0,1
4,7,5,OWNER,M,,,70,70.0,1
...,...,...,...,...,...,...,...,...,...
4495,13931,11333,OWNER,M,,,57,57.0,1
4496,13955,11349,OWNER,F,,,54,55.0,2
4497,13968,11359,OWNER,M,classic,1995-06-13,31,31.0,1
4498,13971,11362,OWNER,F,,,37,37.0,1


In [223]:
dcc_df_ = dcc_df_.drop(['client_id', 'client_type', 'card_issued'], axis=1)
dcc_df_ = dcc_df_.rename(columns={'gender':'client_gender', 'age':'client_age'})
dcc_df_

Unnamed: 0,account_id,client_gender,card_type,client_age,all_client_mean_age,all_client_count
0,1,F,,29,29.0,1
1,2,M,,54,56.5,2
2,3,M,,43,41.0,2
3,4,M,,80,80.0,1
4,5,M,,70,70.0,1
...,...,...,...,...,...,...
4495,11333,M,,57,57.0,1
4496,11349,F,,54,55.0,2
4497,11359,M,classic,31,31.0,1
4498,11362,F,,37,37.0,1


In [224]:
dcc_df_final = onehot(dcc_df_, col_name='card_type', prefix='card_type')
# dcc_df_final = onehot(dcc_df_, col_name='client_gender', prefix='client_gender')

dcc_df_final

Unnamed: 0,account_id,client_gender,client_age,all_client_mean_age,all_client_count,card_type_classic,card_type_gold,card_type_junior
0,1,F,29,29.0,1,0,0,0
1,2,M,54,56.5,2,0,0,0
2,3,M,43,41.0,2,0,0,0
3,4,M,80,80.0,1,0,0,0
4,5,M,70,70.0,1,0,0,0
...,...,...,...,...,...,...,...,...
4495,11333,M,57,57.0,1,0,0,0
4496,11349,F,54,55.0,2,0,0,0
4497,11359,M,31,31.0,1,1,0,0
4498,11362,F,37,37.0,1,0,0,0


In [225]:
df = pd.merge(df, dcc_df_final, how='left', on='account_id')

In [226]:
df

Unnamed: 0,loan_loan_id,account_id,loan_date,loan_amount,loan_duration,loan_payments,loan_status,acc_district_id,acc_frequency,day_before_loan,...,sum_other_amount_out,count_other_trans_y,balance_latest,client_gender,client_age,all_client_mean_age,all_client_count,card_type_classic,card_type_gold,card_type_junior
0,4959,2,1994-01-05,80952,24,3373.00,A,1,monthly,313.0,...,164427.0,33,27855,M,54,56.5,2,0,0,0
1,4961,19,1996-04-29,30276,12,2523.00,B,21,monthly,388.0,...,98808.0,40,15854,F,60,60.0,1,0,0,0
2,4962,25,1997-12-08,30276,12,2523.00,A,68,monthly,498.0,...,879593.0,107,31622,M,37,37.0,1,0,0,0
3,4967,37,1998-10-14,318480,60,5308.00,D,20,monthly,422.0,...,401863.0,66,16368,M,47,47.0,1,0,0,0
4,4968,38,1998-04-19,110736,48,2307.00,C,19,weekly,254.0,...,82351.0,35,37417,F,59,59.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
677,7294,11327,1998-09-27,39168,24,1632.00,C,7,monthly,347.0,...,162100.0,32,62119,F,20,20.0,1,0,0,0
678,7295,11328,1998-07-18,280440,60,4674.00,C,54,monthly,620.0,...,415480.0,72,31303,F,47,47.0,1,0,0,0
679,7304,11349,1995-10-29,419880,60,6998.00,C,1,weekly,156.0,...,196600.0,13,24704,F,54,55.0,2,0,0,0
680,7305,11359,1996-08-06,54024,12,4502.00,A,61,monthly,675.0,...,550181.0,90,25697,M,31,31.0,1,1,0,0


## district

In [227]:
district_df

Unnamed: 0,district_id,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.2,0.43,167,85677.0,99107
1,2,Benesov,central Bohemia,88884,80,26,6,2,5,46.7,8507,1.6,1.85,132,2159.0,2674
2,3,Beroun,central Bohemia,75232,55,26,4,1,5,41.7,8980,1.9,2.21,111,2824.0,2813
3,4,Kladno,central Bohemia,149893,63,29,6,2,6,67.4,9753,4.6,5.05,109,5244.0,5892
4,5,Kolin,central Bohemia,95616,65,30,4,1,6,51.4,9307,3.8,4.43,118,2616.0,3040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,73,Opava,north Moravia,182027,17,49,12,2,7,56.4,8746,3.3,3.74,90,4355.0,4433
73,74,Ostrava - mesto,north Moravia,323870,0,0,0,1,1,100.0,10673,4.7,5.44,100,18782.0,18347
74,75,Prerov,north Moravia,138032,67,30,4,2,5,64.6,8819,5.3,5.66,99,4063.0,4505
75,76,Sumperk,north Moravia,127369,31,32,13,2,7,51.2,8369,4.7,5.88,107,3736.0,2807


In [228]:
selected_district_feature = ['district_id', 'A4', 'A10', 'A11', 'A14']
district_df_ = district_df[selected_district_feature]
district_df_.loc[district_df_.index, ['unemploy_rate']] = district_df[['A12', 'A13']].mean(axis=1)
district_df_.loc[district_df_.index, ['number_crimes']] = district_df[['A15', 'A16']].mean(axis=1)
district_df_.loc[district_df_.index, ['A10']] = district_df_['A10'] / 100

district_df_ = district_df_.rename(columns={'A4':'num_inhabitants', 'A10':'urban_rate', 'A11':'avg_salary', 'A14':'num_enterpreneurs_per1000inhabitants'})
district_df_

Unnamed: 0,district_id,num_inhabitants,urban_rate,avg_salary,num_enterpreneurs_per1000inhabitants,unemploy_rate,number_crimes
0,1,1204953,1.0,12541,167,0.315,92392.0
1,2,88884,0.467,8507,132,1.725,2416.5
2,3,75232,0.417,8980,111,2.055,2818.5
3,4,149893,0.674,9753,109,4.825,5568.0
4,5,95616,0.514,9307,118,4.115,2828.0
...,...,...,...,...,...,...,...
72,73,182027,0.564,8746,90,3.520,4394.0
73,74,323870,1.0,10673,100,5.070,18564.5
74,75,138032,0.646,8819,99,5.480,4284.0
75,76,127369,0.512,8369,107,5.290,3271.5


In [229]:
df = df.rename(columns={'acc_district_id':'district_id'})

In [230]:
final_df = pd.merge(df, district_df_, how='left', on='district_id')
final_df

Unnamed: 0,loan_loan_id,account_id,loan_date,loan_amount,loan_duration,loan_payments,loan_status,district_id,acc_frequency,day_before_loan,...,all_client_count,card_type_classic,card_type_gold,card_type_junior,num_inhabitants,urban_rate,avg_salary,num_enterpreneurs_per1000inhabitants,unemploy_rate,number_crimes
0,4959,2,1994-01-05,80952,24,3373.00,A,1,monthly,313.0,...,2,0,0,0,1204953,1.0,12541,167,0.315,92392.0
1,4961,19,1996-04-29,30276,12,2523.00,B,21,monthly,388.0,...,1,0,0,0,103347,0.67,9104,123,1.785,2326.5
2,4962,25,1997-12-08,30276,12,2523.00,A,68,monthly,498.0,...,1,0,0,0,228848,0.572,9893,96,4.360,5755.0
3,4967,37,1998-10-14,318480,60,5308.00,D,20,monthly,422.0,...,1,0,0,0,70646,0.584,8547,120,3.120,1552.5
4,4968,38,1998-04-19,110736,48,2307.00,C,19,weekly,254.0,...,1,0,0,0,51428,0.527,8402,120,3.540,1049.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
677,7294,11327,1998-09-27,39168,24,1632.00,C,7,monthly,347.0,...,1,0,0,0,94725,0.634,9920,130,2.535,4567.5
678,7295,11328,1998-07-18,280440,60,4674.00,C,54,monthly,620.0,...,1,0,0,0,387570,1.0,9897,140,1.780,18708.5
679,7304,11349,1995-10-29,419880,60,6998.00,C,1,weekly,156.0,...,2,0,0,0,1204953,1.0,12541,167,0.315,92392.0
680,7305,11359,1996-08-06,54024,12,4502.00,A,61,monthly,675.0,...,1,1,0,0,117897,0.538,8814,107,5.220,2085.5


In [231]:
final_df.shape

(682, 62)

In [None]:
final_df.to_csv('transformed_data/prepared_data.csv', index=False)

# Add more feature

* All time balance (avg balance, min, max)
* last month, last 3 months balance min. mean, max, median, var
* average balance from first month, last month ,and last 3 months. 
* avg balance / loan payment

In [None]:
import pandas as pd

In [None]:
pd.read_csv('prepared_data_beforeloan_normbyday.csv').columns

In [None]:
df = pd.read_csv('prepared_data_beforeloan_normbyday.csv')
df.head()

In [None]:
trans_before_loan = pd.read_csv('data/trans_before_loan.csv')
trans_before_loan.head()

In [None]:
trans_before_loan.groupby('account_id')['balance'].min().hist()

In [None]:
trans_before_loan.groupby(['account_id'])['balance'].head(1).hist()

In [None]:
trans_before_loan[trans_before_loan['account_id']==25]

All time balance
average balance

In [None]:
trans_before_loan.groupby(['account_id'])['balance'].mean().hist()

In [None]:
trans_before_loan.groupby(['account_id'])['balance'].mean().reset_index(drop=True)

In [None]:
df['balance_mean'] = trans_before_loan.groupby(['account_id'])['balance'].mean().reset_index(drop=True)
df['balance_min'] = trans_before_loan.groupby(['account_id'])['balance'].min().reset_index(drop=True)
df['balance_max'] = trans_before_loan.groupby(['account_id'])['balance'].max().reset_index(drop=True)
# df['balance_median'] = trans_before_loan.groupby(['account_id'])['balance'].median().reset_index(drop=True)
# df['balance_var'] = trans_before_loan.groupby(['account_id'])['balance'].var().reset_index(drop=True)
df.head()

last month, last 3 months balance min

In [None]:
trans_before_loan['loan_date'] = pd.to_datetime(trans_before_loan['loan_date'])
trans_before_loan['last_month_before_loan'] = trans_before_loan['loan_date'] - pd.Timedelta(days=31)
trans_before_loan['last_3_months_before_loan'] = trans_before_loan['loan_date'] - pd.Timedelta(days=31*3)
trans_before_loan.head()

In [None]:
last_month_before_loan = trans_before_loan[trans_before_loan['date'] >= trans_before_loan['last_month_before_loan']]
last_month_before_loan.head()

In [None]:
df['balance_min_last_month'] = last_month_before_loan.groupby(by='account_id')['balance'].min().reset_index(drop=True)
df['balance_max_last_month'] = last_month_before_loan.groupby(by='account_id')['balance'].max().reset_index(drop=True)
df['balance_mean_last_month'] = last_month_before_loan.groupby(by='account_id')['balance'].mean().reset_index(drop=True)
# df['balance_median_last_month'] = last_month_before_loan.groupby(by='account_id')['balance'].median().reset_index(drop=True)
# df['balance_var_last_month'] = last_month_before_loan.groupby(by='account_id')['balance'].var().reset_index(drop=True)
df.head()

In [None]:
last_3months_before_loan = trans_before_loan[trans_before_loan['date'] >= trans_before_loan['last_3_months_before_loan']]
last_3months_before_loan.head()

In [None]:
df['balance_min_last_3months'] = last_3months_before_loan.groupby(by='account_id')['balance'].min().reset_index(drop=True)
df['balance_max_last_3months'] = last_3months_before_loan.groupby(by='account_id')['balance'].max().reset_index(drop=True)
df['balance_mean_last_3months'] = last_3months_before_loan.groupby(by='account_id')['balance'].mean().reset_index(drop=True)
# df['balance_median_last_3months'] = last_3months_before_loan.groupby(by='account_id')['balance'].median().reset_index(drop=True)
# df['balance_var_last_3months'] = last_3months_before_loan.groupby(by='account_id')['balance'].var().reset_index(drop=True)
df.head()

avg balance / loan payment
avg balance / loan amount

In [None]:
df['balance_mean_per_loan_payment'] = df['balance_mean'].div(df['loan_payments'], axis=0)
df['balance_mean_last_month_per_loan_payment'] = df['balance_mean_last_month'].div(df['loan_payments'], axis=0)
# df['balance_mean_last_3months_per_loan_payment'] = df['balance_mean_last_3months'].div(df['loan_payments'], axis=0)

df['balance_mean_per_loan_amount'] = df['balance_mean'].div(df['loan_amount'], axis=0)
df['balance_mean_last_month_per_loan_amount'] = df['balance_mean_last_month'].div(df['loan_amount'], axis=0)
# df['balance_mean_last_3months_per_loan_amount'] = df['balance_mean_last_3months'].div(df['loan_amount'], axis=0)

In [None]:
# df[['balance_mean', 'balance_mean_last_month', 'balance_mean_last_3months']]

average balance, average amount growth from first month, last month ,and last 3 months. 

In [None]:
last_3months_ago = trans_before_loan[(trans_before_loan['date'] > trans_before_loan['last_3_months_before_loan']) & (trans_before_loan['date'] <= trans_before_loan['last_3_months_before_loan'] + pd.Timedelta(days=31))]
last_3months_ago.head()

In [None]:
df['growth_balance'] = (df['balance_mean_last_month'] - last_3months_ago.groupby('account_id')['balance'].mean().reset_index(drop=True)) / (30*3)

In [None]:
df['growth_balance'].hist()

In [None]:
df.columns

* amount growth
* sum amount out per month
* sum amount out per month / loan payments

In [None]:
last_month_before_loan = trans_before_loan[trans_before_loan['date'] >= trans_before_loan['last_month_before_loan']]

last_3months_before_loan = trans_before_loan[trans_before_loan['date'] >= trans_before_loan['last_3_months_before_loan']]

In [None]:
df[['amount_last_month', 'amount_last_month_in', 'amount_last_month_out']] = last_month_before_loan.groupby('account_id')[['amount', 'amount_in', 'amount_out']].sum().reset_index(drop=True)
df[['amount_last_month_per_loan_payments', 'amount_last_mount_in_per_loan_payments', 'amount_last_mount_out_per_loan_payments']] = df[['amount_last_month', 'amount_last_month_in', 'amount_last_month_out']].div(df['loan_payments'], axis=0)
df[['amount_last_month_per_loan_amount', 'amount_last_mount_in_per_loan_amount', 'amount_last_mount_out_per_loan_amount']] = df[['amount_last_month', 'amount_last_month_in', 'amount_last_month_out']].div(df['loan_amount'], axis=0)

df[['amount_last_3months_avg', 'amount_last_3months_in_avg', 'amount_last_3months_out_avg']] = last_3months_before_loan.groupby('account_id')[['amount', 'amount_in', 'amount_out']].sum().reset_index(drop=True) / 3
df[['amount_last_3months_per_loan_payments', 'amount_last_3months_in_per_loan_payments', 'amount_last_3months_out_per_loan_payments']] = df[['amount_last_3months_avg', 'amount_last_3months_in_avg', 'amount_last_3months_out_avg']].div(df['loan_payments'], axis=0)
df[['amount_last_3months_per_loan_amount', 'amount_last_3months_in_per_loan_amount', 'amount_last_3months_out_per_loan_amount']] = df[['amount_last_3months_avg', 'amount_last_3months_in_avg', 'amount_last_3months_out_avg']].div(df['loan_amount'], axis=0)
df

In [None]:
# last_3months_ago = trans_before_loan[(trans_before_loan['date'] > trans_before_loan['last_3_months_before_loan']) & (trans_before_loan['date'] <= trans_before_loan['last_3_months_before_loan'] + pd.Timedelta(days=31))]
# last_3months_ago.head()

In [None]:
# df['growth_amount_in'] = (df['amount_last_month_in'] - last_3months_ago.groupby('account_id')['amount_in'].sum().reset_index(drop=True)) / (30*3)
# df['growth_amount_out'] = (df['amount_last_month_out'] - last_3months_ago.groupby('account_id')['amount_out'].sum().reset_index(drop=True)) / (30*3)

In [None]:
# df['growth_amount_in'].hist()

In [None]:
# df['growth_amount_out'].hist()

In [None]:
df.shape

In [None]:
df.isna().any()

In [None]:
df.to_csv('test.csv', index=False)

In [None]:
df.columns

In [None]:
dff = df.copy()
dff.drop(['sum_monthly_payment_amount', 'sum_monthly_payment_amount_in', 'sum_monthly_payment_amount_out',
         'day_before_loan', 'current_monthly_payment_out_payment_for_statement', 'sum_other_amount',
         'sum_other_amount_in', 'sum_other_amount_out', 'client_age', 'all_client_mean_age', 
         'num_inhabitants', 'urban_rate', 'num_enterpreneurs_per1000inhabitants', 'unemploy_rate', 'balance_max',
         'balance_mean_per_loan_payment', 'balance_mean_per_loan_amount', 'amount_last_month', 'amount_last_month_in',
         'amount_last_month_per_loan_payments', 'amount_last_month_per_loan_amount', 'amount_last_3months_avg',
         'amount_last_3months_in_avg', 'amount_last_3months_out_avg', 'amount_last_3months_per_loan_payments', 
         'amount_last_3months_per_loan_amount', 'balance_min_last_3months', 'balance_max_last_3months', 'balance_mean_last_3months',
         'amount_last_3months_per_loan_payments'], axis=1, inplace=True)

In [None]:
dff.shape

In [None]:
dff.to_csv('test.csv', index=False)

In [None]:
dff.isna().any().any()

In [None]:
import pandas as pd

def to_labels(pos_probs, threshold):
	return (pos_probs >= threshold).astype('int')

report = pd.DataFrame({'probs':probs, 'y_true':y_true})
report['y_pred'] = to_labels(report['probs'], 0.85)
report['case_TP'] = report['y_true'].astype(int) & report['y_pred'].astype(int) 
report['case_FN'] = ((report['y_true'].astype(int) == 1) & (report['y_pred'].astype(int) == 0)).astype(int)
report['case_FP'] = ((report['y_true'].astype(int) == 0) & (report['y_pred'].astype(int) == 1)).astype(int)
report['case_TN'] = ( report['y_true'].astype(int) | report['y_pred'].astype(int) == 0).astype(int)
print(report)

In [None]:
import numpy as np

In [None]:
report = pd.read_csv('report_xgb.csv')
report['y_pred'] = np.nan
report['case_TP'] = np.nan
report['case_FN'] = np.nan
report['case_FP'] = np.nan
report['case_TN'] = np.nan
report['threshold'] = np.nan
report

In [None]:
def profit_thresholding(df, th_, profit, interest_rate):
    df['y_pred'] = to_labels(df['probs'], th_)
    # TN
    df.loc[(df['y_true']==0) & (df['y_pred']==0), ['profit']] = profit
    df.loc[(df['y_true']==0) & (df['y_pred']==0), ['conf']] = 'tn'

    # TP 
    df.loc[(df['y_true']==1) & (df['y_pred']==1), ['profit']] = 0
    df.loc[(df['y_true']==1) & (df['y_pred']==1), ['conf']] = 'tp'

    # FN
    df.loc[(df['y_true']==0) & (df['y_pred']==1), ['profit']] = 0
    df.loc[(df['y_true']==0) & (df['y_pred']==1), ['conf']] = 'fn'
    
    # FP
    df.loc[(df['y_true']==1) & (df['y_pred']==0), ['profit']] = profit
    df.loc[(df['y_true']==1) & (df['y_pred']==0), ['conf']] = 'fp'
            
    df['threshold'] = th_
    df['interest_rate'] = interest_rate
    
    return df

In [None]:
def cal_revenue(total_amount, loan_duration_month, paid_month, interest_rate_peryear):
    '''
    Interest (Compound) = P(1+i)^t - P
    P = all / (1+i)^t
    '''
    principle = total_amount / (1+interest_rate_peryear)**(loan_duration_month//12)
    expected_revenue = total_amount - principle
    revenue = paid_month * expected_revenue / loan_duration_month
    return revenue

def cal_cost(loan_payment, loan_duration_month, paid_month):
    not_paid_month = loan_duration_month - paid_month
    cost = not_paid_month * loan_payment
    return cost

In [None]:
loan_df = pd.read_csv('data/loan.csv')
loan_df.head()

In [None]:
def to_labels(pos_probs, threshold):
    return (pos_probs >= threshold).astype('int')

In [None]:
url = 'https://raw.githubusercontent.com/sorayutmild/loan-default-prediction/main/after_trans_payment_for_loan.csv?raw=true'
after_trans_for_loan = pd.read_csv(url)

interest_rate_peryear = [0.01, 0.03, 0.05, 0.1, 0.2]
profits = []
report = pd.read_csv('report_xgb.csv')
reports = []

for i in interest_rate_peryear:
    revenue = cal_revenue(total_amount=loan_df['amount'], 
                    loan_duration_month=loan_df['duration'], 
                    paid_month=after_trans_for_loan['count_loan_trans'], 
                    interest_rate_peryear=i)

    cost = cal_cost(loan_payment=loan_df['payments'], 
                    loan_duration_month=loan_df['duration'],
                    paid_month=after_trans_for_loan['count_loan_trans'])

    profit = revenue - cost



    
    
    for th in np.arange(0, 1.1, 0.1):
        th_ = th.round(decimals=1)
        r = profit_thresholding(report, th_, profit, i).reset_index(drop=True)
        reports.append(r)
          
    profits.append(profit.sum())

reports = pd.concat(reports)
reports

In [None]:
reports[(reports['threshold']==0.5) & (reports['interest_rate']==0.05)][['profit']].sum()

In [None]:
reports.to_csv('report_xgb_threshold_prfit.csv', index=False)

In [None]:
ori_profit = pd.DataFrame({'interest rate per year': interest_rate_peryear, 
              'Old profit':profits})

ori_profit.to_csv('ori_profit.csv', index=False)

In [None]:
ori_profit