In [None]:
from dataset import MysqlIO
import pandas as pd
import pandas_profiling as pp
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Open a connection to the `financial` database through the project-local MysqlIO helper.
# NOTE(review): the credentials are hard-coded; presumably this is a public read-only
# guest account — confirm, and read them from the environment for anything private.
db = MysqlIO(host='relational.fit.cvut.cz', 
            database='financial', 
            user='guest', 
            password='relational')

In [None]:
db.execute('SHOW TABLES')

In [None]:
# Load each table used in this notebook into its own DataFrame.
loan_df = db.execute_to_df('SELECT * FROM loan')
# `order` is a reserved SQL keyword, hence the backticks around the table name.
order_df = db.execute_to_df('SELECT * FROM `order`')
# Only transactions of accounts that appear in the loan table are fetched.
trans_df = db.execute_to_df('''
    SELECT * FROM trans WHERE account_id IN 
    (SELECT account_id FROM loan)
   ''')
disp_df = db.execute_to_df('SELECT * FROM disp')
account_df = db.execute_to_df('SELECT * FROM account')
client_df = db.execute_to_df('SELECT * FROM client')
card_df = db.execute_to_df('SELECT * FROM card')
district_df = db.execute_to_df('SELECT * FROM district')

In [None]:
def summary_df(df, table_name=''):
    """Show a quick overview of a DataFrame.

    Prints ``table_name`` and the frame's shape, displays the first rows,
    then displays the number of null values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    table_name : str, optional
        Label printed before the summary (empty by default).
    """
    print(table_name, df.shape, sep='\n')
    display(df.head())
    print('missing value')
    null_counts = df.isnull().sum()
    display(null_counts)

In [None]:
summary_df(loan_df, "loan")
summary_df(order_df, "order")
summary_df(trans_df, "trans")
summary_df(disp_df, "disp")
summary_df(account_df, "account")
summary_df(client_df, "client")
summary_df(card_df, "card")
summary_df(district_df, "district")

One client can have multiple accounts.
Also, one account can have multiple clients (owner, disponent).

In [None]:
print(loan_df['account_id'].is_unique)
print(account_df['account_id'].is_unique)
print(disp_df['account_id'].is_unique)

# Data manipulation

In [None]:
def add_prefix_to_colnames(df, prefix, fixed_col_name='account_id'):
    """Return a copy of ``df`` with ``prefix`` added to every column except one.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    prefix : str
        Text prepended to each column name.
    fixed_col_name : str, optional
        Column that keeps its original name (typically the join key).

    Returns
    -------
    pandas.DataFrame
        Renamed frame. Note that the index labels are converted to ``str``
        as well, because ``rename`` is called with ``index=str``.
    """
    prefixed = df.add_prefix(prefix)
    # Undo the prefix on the one column that must keep its name.
    restore_key = {prefix + fixed_col_name: fixed_col_name}
    return prefixed.rename(index=str, columns=restore_key)

def percentile(n):
    """Build a callable that returns the ``n``-th percentile of its argument.

    The returned function is named ``pct_<n>``; presumably this is so the
    column produced by ``groupby().agg`` carries that label.

    Parameters
    ----------
    n : number
        Percentile to compute, passed straight to ``np.percentile``.
    """
    def nth_percentile(values):
        return np.percentile(values, n)

    nth_percentile.__name__ = 'pct_%s' % n
    return nth_percentile

def onehot(df, col_name, prefix=None, drop=True):
    """One-hot encode one column and append the indicator columns to the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_name : str
        Column to encode.
    prefix : str, optional
        Prefix forwarded to ``pd.get_dummies`` for the new column names.
    drop : bool, optional
        When true (default) the encoded source column is removed from the result.

    Returns
    -------
    pandas.DataFrame
        ``df`` joined (on the index) with the indicator columns.
    """
    indicators = pd.get_dummies(df[col_name], prefix=prefix)
    encoded = df.join(indicators)
    return encoded.drop([col_name], axis=1) if drop else encoded

def summary_group_cate_data(main_df, add_df, col_names=list, how='left', on=['account_id'], validate='one_to_many', by="account_id"):
    t = pd.merge(main_df, add_df, how=how, on=on, validate=validate)
    t = t.groupby(by=by, as_index=False)[col_names].sum()
    return t

def summary_group_num_data(main_df, add_df, col_name=str, how='left', on=['account_id'], validate='one_to_many', by="account_id", high_freq=False):
    """Merge two frames and compute per-group summary statistics of one numeric column.

    Parameters
    ----------
    main_df, add_df : pandas.DataFrame
        Left and right frames for ``pd.merge``.
    col_name : str
        Numeric column to summarise. The default (the ``str`` type itself) is only
        a placeholder; callers are expected to pass a real column name.
    how, on, validate
        Forwarded unchanged to ``pd.merge``.
    by : str or list
        Grouping key(s).
    high_freq : bool, optional
        When true, variance and the 25th/50th/75th percentiles are added to the
        basic min/max/sum statistics.

    Returns
    -------
    pandas.DataFrame
        One row per group with the key column(s) followed by
        ``min, max, [var, pct_25, pct_50, pct_75,] sum, mean, count``.
    """
    merged = pd.merge(main_df, add_df, how=how, on=on, validate=validate)

    # String aliases select pandas' own group-by reductions (sample variance for
    # 'var'), which is what the builtin/NumPy callables were mapped to; passing the
    # callables themselves is deprecated in recent pandas.
    if high_freq:
        stats = ['min', 'max', 'var', percentile(25), percentile(50), percentile(75), 'sum']
    else:
        stats = ['min', 'max', 'sum']

    # mean and count are computed in the same aggregation so they are aligned with
    # the other statistics by group key rather than by row position.
    stats = stats + ['mean', 'count']

    return merged.groupby(by=by)[col_name].agg(stats).reset_index()


In [None]:
loan_df_ = add_prefix_to_colnames(loan_df, prefix='loan_')
print(loan_df_.shape)
loan_df_.head()

In [None]:
loan_df_.to_csv('table_csv/loan.csv', index=False)

In [None]:
# Translate the Czech `frequency` codes into English labels, in place on account_df.
for czech, english in (
    ("POPLATEK MESICNE", 'monthly'),
    ("POPLATEK TYDNE", 'weekly'),
    ("POPLATEK PO OBRATU", 'after_transaction'),
):
    account_df.loc[account_df['frequency'] == czech, 'frequency'] = english
account_df.head()

In [None]:
account_df_ = add_prefix_to_colnames(account_df, prefix='acc_')
print(account_df_.shape)
account_df_.head()

In [None]:
account_df_.to_csv('table_csv/account.csv', index=False)

In [None]:
def day_to_int(day_serie):
    """Convert timedelta-like values into a float array of whole days.

    Parameters
    ----------
    day_serie : iterable
        Elements exposing a ``.days`` attribute (e.g. the result of subtracting
        two date columns).

    Returns
    -------
    numpy.ndarray
        ``float`` array holding each element's ``.days``.
    """
    whole_days = [delta.days for delta in day_serie]
    return np.array(whole_days).astype('float')

In [None]:
# Attach the (prefixed) account columns to every loan; the left join keeps all loan rows.
df = pd.merge(loan_df_, account_df_, how='left', on=['account_id'])
# Difference between loan_date and acc_date (the account table's `date` column)...
df['day_before_loan'] = df['loan_date'] - df['acc_date']
# ...converted from timedeltas to a float number of days.
df['day_before_loan'] = day_to_int(df['day_before_loan'])
# acc_date itself is dropped once the difference has been taken.
df = df.drop(['acc_date'], axis=1)
print(df.shape)
df.head()

In [None]:
# 1-D array of the account ids that have a loan.
# Selecting the column as a Series gives a flat array of whatever length the loan
# table has, instead of reshaping to a hard-coded row count.
account_loan = loan_df['account_id'].values
account_loan.shape

## order manipulation

One account can have more than one order, so the order values need to be aggregated per account,

such as 

amount -> min, max, sum, mean, count

k_symbol -> onehot -> sum

In [None]:
order_df.head()

In order_df, some k_symbol values are empty strings (""), so we replace them with NaN.

In [None]:
order_df['k_symbol'].value_counts()

In [None]:
order_df = order_df.replace("", np.nan)
order_df['k_symbol'].value_counts()

In [None]:
# Translate the Czech k_symbol codes of the orders into English labels, in place.
for czech, english in (
    ("POJISTNE", 'insurrance'),
    ("SIPO", 'household'),
    ("LEASING", 'leasing'),
    ("UVER", 'loan'),
):
    order_df.loc[order_df['k_symbol'] == czech, 'k_symbol'] = english

order_df.head()

In [None]:
add_prefix_to_colnames(order_df, prefix='order_').to_csv('table_csv/order.csv', index=False)

Drop the rows whose k_symbol is NaN, because we don't know what kind of payment they are.

In [None]:
order_df = order_df[order_df['k_symbol'].notna()]
order_df.head()

k_symbol onehot encoding

This tells us which kinds of monthly payment (k_symbol) each account has.

In [None]:
order_df_ = onehot(order_df, col_name='k_symbol', prefix='monthly_payment_for')
order_df_.head()

Multiply each k_symbol indicator by the order amount, so that the feature tells us how much the account has to pay each month for each purpose, not just which purposes it pays for.

In [None]:
# Turn each 0/1 payment indicator into the amount paid for that purpose.
for purpose in ('household', 'insurrance', 'leasing', 'loan'):
    payment_col = 'monthly_payment_for_' + purpose
    order_df_[payment_col] = order_df_[payment_col] * order_df_['amount']
order_df_.head()

Drop order_id, bank_to, account_to and amount (the amount is now carried by the monthly_payment_for_* columns).

In [None]:
order_df_ = order_df_.drop(['order_id', 'bank_to', 'account_to', 'amount'], axis=1)
order_df_.head()

In [None]:
order_df_ = add_prefix_to_colnames(order_df_, prefix='order_')
order_df_.head()

Order -> useless เพราะ k_symbol ของ transaction ละเอียดกว่า

In [None]:
# df = pd.merge(df, order_df_, how='left', on=['account_id'])
# df = pd.merge(df, order_ksymbol, how='left', on=['account_id'])
df

## Transaction

In [None]:
summary_df(trans_df)

Replace None (null values) and blank strings with NaN.

In [None]:
# check values not in list (show spacing)
trans_df.loc[~trans_df['k_symbol'].isin(["POJISTNE", "SLUZBY", "UROK", "SANKC. UROK", "SIPO", "DUCHOD", "UVER", np.nan]) , :]

In [None]:
# fill spacing with nan
trans_df = trans_df.replace(' ', np.nan)
# check
trans_df.loc[~trans_df['k_symbol'].isin(["POJISTNE", "SLUZBY", "UROK", "SANKC. UROK", "SIPO", "DUCHOD", "UVER", np.nan]) , :]

In [None]:
# fill None with nan
trans_df = trans_df.fillna(value=np.nan)
trans_df

The `type` column has the classes "PRIJEM" and "VYDAJ", and also "VYBER", which is not listed in the dataset description (https://web.archive.org/web/20180506035658/http://lisp.vse.cz/pkdd99/Challenge/berka.htm).

We treat "VYBER" the same as "VYDAJ", i.e. as a withdrawal.

In [None]:
trans_df['type'].value_counts()

In [None]:
trans_df['operation'].value_counts()

In [None]:
trans_df['k_symbol'].value_counts()

In [None]:
# Order the transactions by account and date, then work on an independent copy.
trans_df = trans_df.sort_values(by=['account_id', 'date']).copy()

# Czech code -> English label for each translated column.
# Both "VYDAJ" and "VYBER" in `type` are mapped to 'withdrawal'.
czech_to_english = {
    'type': [
        ("PRIJEM", 'credit'),
        ("VYDAJ", 'withdrawal'),
        ("VYBER", 'withdrawal'),
    ],
    'operation': [
        ("VYBER KARTOU", 'credit_card_withdrawal'),
        ("VKLAD", 'credit_in_cash'),
        ("PREVOD Z UCTU", 'collection_from_anotherbank'),
        ("VYBER", 'withdrawal_in_cash'),
        ("PREVOD NA UCET", 'remittance_to_anotherbank'),
    ],
    'k_symbol': [
        ("POJISTNE", 'insurrance'),
        ("SLUZBY", 'payment_for_statement'),
        ("UROK", 'interest_credited'),
        ("SANKC. UROK", 'sanction_interest_neg_bal'),
        ("SIPO", 'household'),
        ("DUCHOD", 'old-age_pension'),
        ("UVER", 'loan'),
    ],
}
for column, pairs in czech_to_english.items():
    for czech, english in pairs:
        trans_df.loc[trans_df[column] == czech, column] = english

trans_df

In [None]:
add_prefix_to_colnames(trans_df, prefix='trans_').to_csv('table_csv/transaction.csv', index=False)

money in, money out

In [None]:
trans_df_in = trans_df.copy()
trans_df_out = trans_df.copy()

In [None]:
def money_in_out(df, focus_colname, cond_colname='type', in_colname='credit', out_colname='withdrawal'):
    """Split one column into its incoming and outgoing parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the value column and the condition column.
    focus_colname : str
        Column whose values are returned.
    cond_colname : str, optional
        Column that labels each row as incoming or outgoing.
    in_colname, out_colname : str, optional
        Labels in ``cond_colname`` that mark incoming and outgoing rows.

    Returns
    -------
    list of pandas.Series
        ``[money_in, money_out]``; each keeps the index of its source rows.
    """
    labels = df[cond_colname]
    return [df.loc[labels == wanted, focus_colname] for wanted in (in_colname, out_colname)]

# Split `amount` into amount_in (credit rows) and amount_out (withdrawal rows).
amount_in, amount_out = money_in_out(trans_df_in, focus_colname='amount')
trans_df_out.loc[trans_df_out['type'] == 'credit', 'amount_in'] = amount_in
trans_df_out.loc[trans_df_out['type'] == 'withdrawal', 'amount_out'] = amount_out

# Zero-fill ONLY the two new columns. Replacing NaN across the whole frame would
# also turn missing k_symbol / operation values into 0, so the later
# fillna('Others') and the `!= 'Others'` filter would no longer see them.
amount_cols = ['amount_in', 'amount_out']
trans_df_out[amount_cols] = trans_df_out[amount_cols].fillna(0)
trans_df_out

Focus only transaction before loan

In [None]:
trans_df_in = trans_df_out.copy()
trans_df_in.head()

In [None]:
trans_df_in = trans_df_in.drop(['trans_id', 'bank', 'account'], axis=1)

In [None]:
# Attach each account's loan_date to its transactions.
trans_df_before_loan = pd.merge(trans_df_in, loan_df_[['account_id', 'loan_date']], how='left', on='account_id')
# NOTE: despite its name, day_after_loan is loan_date - date in days, so it is
# positive for transactions dated BEFORE the loan and <= 0 on or after it.
trans_df_before_loan['day_after_loan'] = day_to_int(trans_df_before_loan['loan_date'] - trans_df_before_loan['date'])

# after loan: transactions dated on or after the loan date
trans_df_after_loan = trans_df_before_loan[trans_df_before_loan['day_after_loan']<=0]
# before loan: transactions dated strictly before the loan date
trans_df_before_loan = trans_df_before_loan[trans_df_before_loan['day_after_loan']>0]

# trans_df_before_loan = trans_df_before_loan.drop(['day_after_loan', 'loan_date'], axis=1)
print(trans_df_before_loan.shape)
trans_df_before_loan.head()

In [None]:
# trans_df_after_loan = trans_df_after_loan.drop(['day_after_loan', 'loan_date'], axis=1)
print(trans_df_after_loan.shape)
trans_df_after_loan.head()

In [None]:
print(trans_df_before_loan['type'].value_counts(), '\n')
print(trans_df_before_loan['operation'].value_counts(), '\n')
print(trans_df_before_loan['k_symbol'].value_counts(), '\n')

In [None]:
print(trans_df_after_loan['type'].value_counts(), '\n')
print(trans_df_after_loan['operation'].value_counts(), '\n')
print(trans_df_after_loan['k_symbol'].value_counts(), '\n')

In [None]:
trans_df_out = trans_df_before_loan.copy()

In [None]:
trans_df_out['type'].value_counts()

k_symbol

In [None]:
trans_df_in = trans_df_out.copy()

In [None]:
# Label missing operation / k_symbol values as 'Others'.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# selected column is chained assignment, which is not guaranteed to update the
# parent frame (and does not under pandas copy-on-write).
trans_df_in = trans_df_in.fillna({'operation': 'Others', 'k_symbol': 'Others'})
trans_df_in.head()

In [None]:
plt.figure(figsize=(10,5))
plt.subplot(1, 2, 1)
plt.title('operation')
trans_df_in['operation'].value_counts().plot(kind='bar')
plt.subplot(1, 2, 2)
plt.title('k_symbol')
trans_df_in['k_symbol'].value_counts().plot(kind='bar')
plt.show()

In [None]:
trans_df_in[trans_df_in['type']=='credit']['operation'].value_counts()

In [None]:
import seaborn as sns

In [None]:
sns.countplot(data=trans_df_in, x='type', hue='operation')
plt.show()

In [None]:
sns.countplot(data=trans_df_in, x='type', hue='k_symbol')
plt.show()

In [None]:
plt.figure(figsize=(20, 5))
sns.countplot(data=trans_df_in, x='operation', hue='k_symbol')
plt.show()

The operation category does not look important, so we ignore it and focus only on k_symbol.

In [None]:
trans_df_out = trans_df_in.copy()

ที่ต้องจ่ายรายเดือน (k_symbol) 

แต่ละ account 

* เคยจ่ายค่าอะไรบ้าง แต่ละอันจ่ายไปแล้วเท่าไหร่ Done
* เคยผ่อนสำเร็จมั้ย เคยเบี้ยวไม่จ่ายมั้ย เหลืออีกกี่บาทที่ต้องจ่าย ? 
* ปัจจุบันต้องจ่ายค่าอะไร ต่อเดือนเท่าไหร่ Done
* เคยถูก sanction มาแล้วกี่ครั้ง Done

focus only monthly transaction.

In [None]:
trans_df_in = trans_df_out[trans_df_out['k_symbol']!='Others']
trans_df_in

In [None]:
trans_df_ = trans_df_in.copy()

In [None]:
# One-hot encode k_symbol into `out_<k_symbol>` indicator columns; drop=False keeps
# the original k_symbol column alongside them.
trans_df_ = onehot(trans_df_, col_name='k_symbol', prefix='out',drop=False)
# interest_credited gets an `in_` name instead; it is later multiplied by amount_in.
trans_df_ = trans_df_.rename(columns={'out_interest_credited':'in_interest_credited'})
trans_df_.head()

In [None]:
out_trans = ['out_household', 'out_insurrance', 'out_payment_for_statement', 'out_sanction_interest_neg_bal']
# Keep the raw 0/1 sanction indicator so the number of sanctions can be counted
# per account after the indicator itself is turned into an amount.
trans_df_['count_out_sanction_interest_neg_bal'] = trans_df_['out_sanction_interest_neg_bal']

# Turn the indicators into amounts. Whole columns are replaced (df[cols] = ...)
# rather than written through .loc[:, cols], which sets values in place and
# conflicts with the indicator columns' dtype on recent pandas.
trans_df_[out_trans] = trans_df_[out_trans].multiply(trans_df_["amount_out"], axis=0)

trans_df_['in_interest_credited'] = trans_df_['in_interest_credited'].multiply(trans_df_["amount_in"], axis=0)
trans_df_

In [None]:
trans_df_[trans_df_['out_sanction_interest_neg_bal']>0].head()

In [None]:
# Per-account totals of the amount columns and of the sanction indicator.
trans_monthly_payment = trans_df_.loc[:, ['account_id', 'amount', 'amount_in', 'amount_out', 'out_household', 'out_insurrance', 'in_interest_credited', 'out_payment_for_statement', 'out_sanction_interest_neg_bal', 'count_out_sanction_interest_neg_bal']].copy()
trans_monthly_payment = trans_monthly_payment.groupby(by='account_id').sum().reset_index()

# Number of transactions per account. The values are attached by row position;
# both frames come from a group-by on account_id over the same rows.
trans_monthly_payment['count_monthly_trans'] = trans_df_.groupby(by='account_id', as_index=False).count()['amount']

# Prefix everything with sum_monthly_payment_, then give the two count columns
# names that do not carry the misleading sum_ prefix.
trans_monthly_payment = add_prefix_to_colnames(trans_monthly_payment, prefix='sum_monthly_payment_')
trans_monthly_payment = trans_monthly_payment.rename(columns={'sum_monthly_payment_count_monthly_trans':'count_monthly_payment_trans', 'sum_monthly_payment_count_out_sanction_interest_neg_bal':'count_out_sanction_interest_neg_bal'})
trans_monthly_payment

In [None]:
trans_df_['last_month'] = trans_df_['loan_date'] - pd.Timedelta(days=31)

In [None]:
# Transactions dated within the 31 days before the loan (date >= last_month).
in_last_month = day_to_int(trans_df_['date'] - trans_df_['last_month']) >= 0
# `date` is deliberately not selected: it is only needed for the filter above,
# and summing a date column per group is not a valid reduction.
current_cols = ['account_id', 'amount_out', 'out_household', 'out_insurrance', 'out_payment_for_statement', 'in_interest_credited']
current_monthly_payment = trans_df_.loc[in_last_month, current_cols]
current_monthly_payment = current_monthly_payment.groupby(by='account_id', as_index=False).sum()
current_monthly_payment = add_prefix_to_colnames(current_monthly_payment, prefix='current_monthly_payment_')
current_monthly_payment

In [None]:
trans_monthly_payment_summary = pd.merge(trans_monthly_payment, current_monthly_payment, how='left', on=['account_id'])
trans_monthly_payment_summary

In [None]:
df.isna().any()

In [None]:
pd.merge(df, trans_monthly_payment_summary, how='left', on=['account_id']).isna().any()

In [None]:
# replace NaN with zero.
df = pd.merge(df, trans_monthly_payment_summary, how='left', on=['account_id'])
df = df.replace(np.nan, 0)
df.isna().any()

In [None]:
df.to_csv('incloud_monthly_trans.csv', index=False)

divide transaction amount sum, count with day before loan

In [None]:
t = df.copy()

In [None]:
# Columns that are divided by the number of days between account date and loan.
t_list = [
    'sum_monthly_payment_amount',
    'sum_monthly_payment_amount_in',
    'sum_monthly_payment_amount_out',
    'sum_monthly_payment_out_household',
    'sum_monthly_payment_out_insurrance',
    'sum_monthly_payment_in_interest_credited',
    'sum_monthly_payment_out_payment_for_statement',
    'sum_monthly_payment_out_sanction_interest_neg_bal',
    'count_out_sanction_interest_neg_bal',
    'count_monthly_payment_trans',
    'current_monthly_payment_amount_out',
    'current_monthly_payment_out_household',
    'current_monthly_payment_out_insurrance',
    'current_monthly_payment_out_payment_for_statement',
    'current_monthly_payment_in_interest_credited',
]
# Replace whole columns (t[cols] = ...) instead of writing through .loc[:, cols]:
# the quotients are floats, and an in-place write into an integer column (e.g. a
# count) is rejected or warned about by recent pandas.
# NOTE(review): rows with day_before_loan == 0 produce inf/NaN here — confirm that
# no such rows exist.
t[t_list] = t[t_list].div(t['day_before_loan'], axis=0)

In [None]:
t.to_csv('incloud_monthly_trans_normbyday.csv', index=False)

การใช้จ่ายอื่น ๆ (ที่ไม่ใช่จ่ายรายเดือนใน k_symbol)

In [None]:
trans_df_before_loan

In [None]:
# trans_id / bank / account were already removed from the frame that trans_df_ is
# derived from, so a strict drop raises KeyError on a fresh top-to-bottom run.
# errors='ignore' drops whichever of the columns are still present.
trans_df_drop = trans_df_.drop(columns=['trans_id', 'bank', 'account'], errors='ignore')
trans_df_drop.head()

date data -> 

find max, min date in each account_id and find duration. บอกระยะเวลาที่ใช้บัตรทำ transaction ตั้งแต่แรกยันล่าสุด

In [None]:
trans_df_drop['date'] = pd.to_datetime(trans_df_drop['date'], format='%Y-%m-%d')
# First and last transaction date per account. Only the `date` column is
# aggregated; running min/max over every column and then selecting `date` is
# wasted work and can fail on columns that hold mixed types.
trans_date = trans_df_drop.groupby(by='account_id')['date'].agg(['min', 'max']).reset_index()

# Time span between the first and the last transaction of each account.
trans_date['duration'] = trans_date['max'] - trans_date['min']
trans_date = add_prefix_to_colnames(trans_date, prefix='date_')

trans_date.head()

Numerical data 

amount, balance -> min, max, sum, mean, count

In [None]:
trans_df_ = trans_df_drop.copy()
trans_df_.head()

In [None]:
# Per-account statistics of the transaction amount (high_freq=True also adds the
# variance and the 25th/50th/75th percentiles), prefixed with `amount_`.
trans_amount_df = summary_group_num_data(df, trans_df_, 'amount', high_freq=True)
trans_amount_df = add_prefix_to_colnames(trans_amount_df, prefix='amount_')

# Same statistics for the balance column, prefixed with `balance_`.
trans_balance_df = summary_group_num_data(df, trans_df_, 'balance', high_freq=True)
trans_balance_df = add_prefix_to_colnames(trans_balance_df, prefix='balance_')

# Combine both summaries into one row per account.
trans_numm_data_df = pd.merge(trans_amount_df, trans_balance_df, on='account_id')

print(trans_numm_data_df.shape)
trans_numm_data_df.head()

Categorical data

type, operation, k_symbol -> onehot -> sum

In [None]:
trans_df_.head()

In [None]:
# Every column from position 4 onwards is treated as a column to sum per account.
# NOTE(review): this depends on the current column order of trans_df_ — confirm
# that the slice holds only the intended indicator/numeric columns.
cate_data_names = trans_df_.columns.to_list()[4:]
print(len(cate_data_names))
cate_data_names

In [None]:
trans_cate_data_df = summary_group_cate_data(df, trans_df_, col_names=cate_data_names)
print(trans_cate_data_df.shape)
trans_cate_data_df.head()

In [None]:
trans_date

In [None]:
# Combine the per-account date features, numeric summaries and summed indicator
# columns. The right join keeps every account present in trans_numm_data_df, and
# validate='one_to_many' makes the merge fail if trans_date repeats an account_id.
trans_df_final = pd.merge(trans_date, trans_numm_data_df, how='right', on=['account_id'], validate='one_to_many')
trans_df_final = pd.merge(trans_df_final, trans_cate_data_df, how='left', on=['account_id'])

trans_df_final.head()

In [None]:
trans_df_final_ = add_prefix_to_colnames(trans_df_final, prefix='trans_')
trans_df_final_.head()

In [None]:
df = pd.merge(df, trans_df_final_, how='left', on=['account_id'])
summary_df(df)

# Disp & Client & Card

In [None]:
summary_df(disp_df)
summary_df(client_df)
summary_df(card_df)

In [None]:
# Start from disp, attach the client row via client_id and the card row (if any)
# via disp_id; left joins keep every disp row.
dcc_df = pd.merge(disp_df, client_df, how='left', on='client_id')
dcc_df = pd.merge(dcc_df, card_df, how='left', on='disp_id')

# The second merge suffixes the two clashing `type` columns: type_x comes from the
# left (disp/client) side and type_y from the card table.
dcc_df = dcc_df.rename(columns={'type_x': 'client_type', 'type_y':'card_type', 'issued':'card_issued'})
dcc_df.head()

birth_date -> age

In [None]:
# lastest transaction in database
trans_date['date_max'].max()

In [None]:
dcc_df['birth_date'] = pd.to_datetime(dcc_df['birth_date'], format='%Y-%m-%d')
# Fixed reference date used as "today" when deriving ages.
# NOTE(review): presumably chosen from the latest transaction date shown in the
# previous cell — confirm.
today = pd.to_datetime("1999-01-01", format='%Y-%m-%d')
# Age as a difference of calendar years only; month and day are ignored.
dcc_df['age'] = [today.year - d.year for d in dcc_df['birth_date']]
# Drop the columns that are not carried further.
dcc_df = dcc_df.drop(['disp_id', 'birth_date', 'district_id', 'card_id'], axis=1)
dcc_df

In [None]:
dcc_df['client_type'].value_counts()

In [None]:
dcc_df['gender'].value_counts()

client_id is unique, so no client appears on more than one account.

Only the reverse happens: one account can have more than one client.

In [None]:
dcc_df[dcc_df.duplicated(['client_id'])]

only owner can issue permanent orders and ask for a loan

In [None]:
# Keep one row per account: the OWNER disposition.
dcc_df_ = dcc_df[dcc_df['client_type']=='OWNER'].reset_index(drop=True)

# Per-account statistics over ALL clients of the account (owner and others).
mean_age_by_account = dcc_df.groupby(by='account_id')['age'].mean()
client_count_by_account = dcc_df.groupby(by='account_id')['client_id'].count()

# Look the statistics up by account_id. Attaching them by row position is only
# correct if the owner rows happen to be sorted by account_id, which the
# unordered SQL query does not guarantee.
dcc_df_['all_client_mean_age'] = dcc_df_['account_id'].map(mean_age_by_account)
dcc_df_['all_client_count'] = dcc_df_['account_id'].map(client_count_by_account)
dcc_df_

In [None]:
dcc_df_ = dcc_df_.drop(['client_id', 'client_type', 'card_issued'], axis=1)
dcc_df_ = dcc_df_.rename(columns={'gender':'client_gender', 'age':'client_age'})
dcc_df_

In [None]:
dcc_df_final = onehot(dcc_df_, col_name='card_type', prefix='card_type')
# dcc_df_final = onehot(dcc_df_, col_name='client_gender', prefix='client_gender')

dcc_df_final

In [None]:
df = pd.merge(df, dcc_df_final, how='left', on='account_id')

In [None]:
df

## district

In [None]:
district_df

In [None]:
selected_district_feature = ['district_id', 'A4', 'A10', 'A11', 'A14']
# Work on an explicit copy so the new columns are written to an independent frame
# rather than to a slice of district_df (avoids SettingWithCopy problems).
district_df_ = district_df[selected_district_feature].copy()
# Row-wise means of the paired columns (mean skips missing values by default).
district_df_['unemploy_rate'] = district_df[['A12', 'A13']].mean(axis=1)
district_df_['number_crimes'] = district_df[['A15', 'A16']].mean(axis=1)
# A10 is divided by 100 before being renamed to urban_rate.
district_df_['A10'] = district_df_['A10'] / 100

district_df_ = district_df_.rename(columns={'A4':'num_inhabitants', 'A10':'urban_rate', 'A11':'avg_salary', 'A14':'num_enterpreneurs_per1000inhabitants'})
district_df_

In [None]:
df = df.rename(columns={'acc_district_id':'district_id'})

In [None]:
final_df = pd.merge(df, district_df_, how='left', on='district_id')

In [None]:
final_df.shape

In [None]:
final_df.to_csv('prepared_data.csv', index=False)