In [1]:
from datetime import timedelta
import datetime
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
pd.set_option('display.width', 1000)

In [2]:
def _translate_and_clean(account,card,client,disp,district,loan,order,trans):
    '''
    Translate the raw PKDD99 tables from Czech to English and tidy their formats.

    The eight dataframes are modified in place (attributes attached to them by
    the caller are therefore kept) and are also returned, in the same order.

    Parameters
    ----------
    account, card, client, disp, district, loan, order, trans : Pandas dataframe
        The tables as read from the original .asc files.

    Returns
    -------
    account, card, client, disp, district, loan, order, trans : Pandas dataframe
        The same objects after the following changes:

        * YYMMDD dates (numbers, or 'YYMMDD HH:MM:SS' strings) are converted
          to datetimes in the 1900s (account.date, card.issued, loan.date,
          trans.date, client.birth_number).
        * Czech codes in account.frequency, disp.type, order.k_symbol and
          trans.type / trans.operation / trans.k_symbol become English labels.
        * Renamed columns: account.frequency -> stmt_frq, card.issued -> date,
          client.birth_number -> date_birth, order/trans k_symbol -> category,
          district A1..A16 -> descriptive names.
        * client gets a 'gender' column: a month field above 50 in
          birth_number marks a female client, and 50 is subtracted from the
          month before the date is parsed.

    Notes
    -----
    'UVER' (loan payment) is translated to 'loan_paymt' in BOTH order and
    trans.  Earlier revisions wrote 'loan_payt' in the order table only, which
    made the two category columns disagree.
    '''

    def num2date(x):
        # Scalar fallback used for string columns and for columns with gaps.
        if isinstance(x, str):
            return pd.to_datetime('19'+x, format='%Y%m%d %H:%M:%S')
        if pd.isna(x):
            return pd.NaT
        # int() avoids the trailing '.0' that str(float) would add and that
        # strict '%Y%m%d' parsing rejects.
        return pd.to_datetime(str(int(x)+19000000), format='%Y%m%d')

    def to_dates(col):
        # Fully populated numeric columns are parsed in one vectorised call;
        # this is what keeps the ~1M-row trans table fast.
        if pd.api.types.is_numeric_dtype(col) and not col.isna().any():
            full = (col.astype('int64') + 19000000).astype(str)
            return pd.to_datetime(full, format='%Y%m%d')
        return col.apply(num2date)

    # Account:
    account['date'] = to_dates(account['date'])
    # Assign the result back instead of replace(..., inplace=True) on a column
    # selection, which does not update the frame under pandas Copy-on-Write.
    account['frequency'] = account['frequency'].replace({
        'POPLATEK MESICNE': 'monthly',
        'POPLATEK TYDNE': 'weekly',
        'POPLATEK PO OBRATU': 'after_tr',  # after transaction
    })
    account.rename(columns = {'frequency':'stmt_frq'}, inplace=True) # statement freq

    # Card:
    card['issued'] = to_dates(card['issued'])
    card.rename(columns = {'issued':'date'}, inplace=True) # date credit card issued

    # Client:
    month_field = client['birth_number']//100 % 100   # the MM part of YYMMDD
    is_female = month_field > 50
    client['gender'] = 'M'
    client.loc[is_female,'gender'] = 'F'
    client.loc[is_female,'birth_number'] -= 5000      # undo the +50 on the month
    client['birth_number'] = to_dates(client['birth_number'])
    client.rename(columns = {'birth_number':'date_birth'}, inplace=True) # client's birthdate

    # Disp:
    disp['type'] = disp['type'].replace({'OWNER':'owner','DISPONENT':'disponent'})

    # District:
    district.rename(columns = {
    'A1':'district_id','A2':'dname','A3':'region','A4':'pop','A5':'nmu500','A6':'nmu2k',
    'A7':'nmu10k','A8':'nmuinf','A9':'ncit','A10':'rurba','A11':'avgsal',
    'A12':'urat95','A13':'urat96','A14':'ent_ppt','A15':'ncri95','A16':'ncri96'}, inplace=True)

    # Loan:
    loan['date'] = to_dates(loan['date'])

    # Order:
    order['k_symbol'] = order['k_symbol'].replace({
        'POJISTNE': 'ins_paymt',   # insurance payment
        'SIPO': 'household',
        'LEASING': 'leasing',
        'UVER': 'loan_paymt',      # loan payment (same label as in trans)
    })
    order.rename(columns = {'k_symbol':'category'}, inplace=True)

    # Trans:
    trans['date'] = to_dates(trans['date'])
    trans['type'] = trans['type'].replace({'PRIJEM':'credit','VYDAJ':'withdrawal'})
    trans['operation'] = trans['operation'].replace({
        'VYBER KARTOU': 'creditcard_wd',    # credit card withdrawal
        'VKLAD': 'credit_in_cash',
        'PREVOD Z UCTU': 'coll_from_bank',  # collection from another bank
        'VYBER': 'cash_wd',                 # cash withdrawal
        'PREVOD NA UCET': 'remi_to_bank',   # remittance to another bank
    })
    trans['k_symbol'] = trans['k_symbol'].replace({
        'POJISTNE': 'ins_paymt',       # insurance payment
        'SLUZBY': 'paymt_for_stmt',    # payment for statement(?)
        'UROK': 'int_credited',        # interest credited
        'SANKC. UROK': 'sanc_int',     # sanction interest for neg balance
        'SIPO': 'household',
        'DUCHOD': 'pension',           # old-age pension
        'UVER': 'loan_paymt',          # loan payment
    })
    trans.rename(columns = {'k_symbol':'category'}, inplace=True)

    # Note the snafu that pandas int columns can't contain NaNs, but the
    # trans.account (destination account#) field does, so it's type float.
    # But account_id column is always filled so it's an integer.

    return account,card,client,disp,district,loan,order,trans

In [11]:
def get_bank_data(data_dir=None):
    '''
    Read, process, and provide the PKDD99 bank transactions data.
    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory holding the original .asc files.  When omitted, the
        project's original hard-coded location (see the code below) is used,
        so existing calls without arguments keep working.
    Returns
    -------
    account, card, client, disp, district, loan, order, trans : Pandas dataframe
        The translated contents of the original PKDD99 dataset, contained in
        Pandas dataframes.  Further details are available in meta-data attached
        to these dataframes in the .notes and .description attributes, e.g.
        account.notes.  Additionally the loan dataframe has a meta-data
        attribute of codes, i.e. loan.codes, returning the status definitions.
        The rest of the details are in the dataset's original data description
        document in the references.
    Raises
    ------
    FileNotFoundError
        If any of the eight expected files is missing from data_dir; the
        message lists every missing path.
    Notes
    -----
    Expects the files account.asc, card.asc, client.asc, disp.asc, district.asc,
    loan.asc, order.asc, trans.asc from the original data distribution
    (semicolon-separated) to exist in data_dir.
    Examples
    --------
    >>> import pkdd99_bank_data as pkdd99
    >>> account,card,client,disp,district,loan,order,trans = pkdd99.get_bank_data()
    References
    ----------
    Original data description document with further info about fields/format:
    http://sorry.vse.cz/~berka/challenge/pkdd1999/berka.htm
    '''
    import os  # function-scope import so this cell does not depend on the import cell

    if data_dir is None:
        # Original hard-coded location, kept as the default for backward compatibility.
        data_dir = ('C:\\Users\\renat\\Documents\\00_MBA\\PROJETO_APLICADO\\'
                    'ML-predict-loan-MBA-applied-project\\dados_originais')

    table_names = ('account','card','client','disp','district','loan','order','trans')
    paths = {name: os.path.join(data_dir, name + '.asc') for name in table_names}

    # Fail early, and report all missing files at once, before the slow reads start.
    missing = [path for path in paths.values() if not os.path.isfile(path)]
    if missing:
        raise FileNotFoundError('PKDD99 data file(s) not found: ' + ', '.join(missing))

    # sep must be given by keyword: pandas >= 2.0 accepts only the path positionally.
    account = pd.read_csv(paths['account'], sep=';')
    card = pd.read_csv(paths['card'], sep=';')
    client = pd.read_csv(paths['client'], sep=';')
    disp = pd.read_csv(paths['disp'], sep=';')
    district = pd.read_csv(paths['district'], sep=';')
    loan = pd.read_csv(paths['loan'], sep=';')
    order = pd.read_csv(paths['order'], sep=';')
    trans = pd.read_csv(paths['trans'], sep=';', low_memory=False)

    account.name = 'Account'
    card.name = 'Card'
    client.name = 'Client'
    disp.name = 'Disp'
    district.name = 'District'
    loan.name = 'Loan'
    order.name = 'Order'
    trans.name = 'Trans'

    # descriptions are cut/pasted from the Financial Data Description webpage for the data
    account.description = 'each record describes static characteristics of an account'
    card.description = 'each record describes a credit card issued to an account'
    client.description = 'each record describes characteristics of a client'
    disp.description = 'each record relates together a client with an account'
    district.description = 'each record describes demographic characteristics of a district'
    loan.description = 'each record describes a loan granted for a given account'
    order.description = 'each record describes characteristics of a payment order'
    trans.description = 'each record describes one transaction on an account'

    account.notes = '(one account can have one or more clients, e.g. married couples)'
    card.notes = '(one account can have one or more credit cards)'
    client.notes = '(one client can have one or more accounts)'
    disp.notes = '(disposition connects a client/account pair and allows to link one or more cards)'
    district.notes = '(neighborhoods for both bank/account branches and client homes.  same 16 fields as original A1-16.)'
    loan.notes = '(one account may have zero or one loan.  see loan.codes for ABCD status definitions)'
    order.notes = '(one payment order is from one account)'
    trans.notes = '(category,bank,account are NaN for some types/operations.)'
    loan.codes = 'Loan status codes:\nA = contract finished, no problems\nB = contract finished, loan not payed\nC = running contract, OK so far\nD = running contract, client in debt'

    account,card,client,disp,district,loan,order,trans = _translate_and_clean(
        account,card,client,disp,district,loan,order,trans
    )

    return account,card,client,disp,district,loan,order,trans

In [12]:
# Load and translate all eight PKDD99 tables in a single call.
(account, card, client, disp,
 district, loan, order, trans) = get_bank_data()

In [22]:
# Show the first rows followed by the (rows, columns) shape of each table.
for table in (account, card, client, disp):
    print(table.head(), table.shape)

   account_id  district_id stmt_frq       date
0         576           55  monthly 1993-01-01
1        3818           74  monthly 1993-01-01
2         704           55  monthly 1993-01-01
3        2378           16  monthly 1993-01-01
4        2632           24  monthly 1993-01-02 (4500, 4)
   card_id  disp_id     type       date
0     1005     9285  classic 1993-11-07
1      104      588  classic 1994-01-19
2      747     4915  classic 1994-02-05
3       70      439  classic 1994-02-08
4      577     3687  classic 1994-02-15 (892, 4)
   client_id date_birth  district_id gender
0          1 1970-12-13           18      F
1          2 1945-02-04            1      M
2          3 1940-10-09            1      F
3          4 1956-12-01            5      M
4          5 1960-07-03            5      F (5369, 4)
   disp_id  client_id  account_id       type
0        1          1           1      owner
1        2          2           2      owner
2        3          3           2  disponent
3    

In [23]:
# Show the first rows followed by the (rows, columns) shape of each table.
for table in (district, loan, order, trans):
    print(table.head(), table.shape)

   district_id        dname           region      pop  nmu500  nmu2k  nmu10k  nmuinf  ncit  rurba  avgsal urat95  urat96  ent_ppt ncri95  ncri96
0            1  Hl.m. Praha           Prague  1204953       0      0       0       1     1  100.0   12541   0.29    0.43      167  85677   99107
1            2      Benesov  central Bohemia    88884      80     26       6       2     5   46.7    8507   1.67    1.85      132   2159    2674
2            3       Beroun  central Bohemia    75232      55     26       4       1     5   41.7    8980   1.95    2.21      111   2824    2813
3            4       Kladno  central Bohemia   149893      63     29       6       2     6   67.4    9753   4.64    5.05      109   5244    5892
4            5        Kolin  central Bohemia    95616      65     30       4       1     6   51.4    9307   3.85    4.43      118   2616    3040 (77, 16)
   loan_id  account_id       date  amount  duration  payments status
0     5314        1787 1993-07-05   96396        12 

In [30]:
# Directory that the export cell below writes the cleaned tables into
# ("caminho_salvar" is Portuguese for "save path").
# NOTE(review): absolute, machine-specific path; the directory presumably has to exist already.
caminho_salvar = 'C:\\Users\\renat\\Documents\\00_MBA\\PROJETO_APLICADO\\ML-predict-loan-MBA-applied-project\\dados_tratados'

In [32]:
# Write every cleaned table to <caminho_salvar>\<table>.csv as a
# semicolon-separated file with '.' as decimal mark and no index column.
tables_to_save = (
    ('account', account),
    ('card', card),
    ('client', client),
    ('disp', disp),
    ('district', district),
    ('loan', loan),
    ('order', order),
    ('trans', trans),
)
for stem, table in tables_to_save:
    table.to_csv(f'{caminho_salvar}\\{stem}.csv', index=False, decimal=".", sep=";")

---