In [1]:
import gc
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

In [2]:
# --- input locations ---
# Directories holding the raw parquet parts (train_data_{num}.pq / test_data_{num}.pq).
# Kept as plain strings with a trailing slash because file names are appended with `+`.
TRAIN_DATA_PATH = 'train_data/'
TEST_DATA_PATH = 'test_data/'
# Target file; not read by any of the cells shown in this notebook section.
TRAIN_TARGET_PATH = 'train_target.csv'

# --- output locations for the encoded (feature-engineered) parts ---
TRAIN_FE_PATH = 'train_data_fe/'
TEST_FE_PATH = 'test_data_fe/'

WE USE ONE-HOT ENCODING METHOD FOR DATA ENCODING.

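FIRST WE FIND THE PART WITH THE HIGHEST NUMBER OF UNIQUE VALUES (SO THAT THE ENCODER COVERS AS MANY CATEGORIES AS POSSIBLE), FIT THE ENCODER ON IT, AND THEN TRANSFORM ALL THE PARTS WITH IT. NOTE: THE PART WITH THE MOST UNIQUE VALUES IS NOT GUARANTEED TO CONTAIN EVERY VALUE PRESENT IN THE OTHER PARTS; VALUES UNSEEN DURING FITTING ARE IGNORED BY THE ENCODER (handle_unknown='ignore').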

AT THE END, WE COUNT HOW MANY TIMES EACH ENCODED FEATURE VALUE OCCURS, GROUPED BY ID.

In [18]:
# number of train parts stored as train_data_{num}.pq
N_TRAIN_PARTS = 12

# Scan every train part and report the total number of distinct values over its
# feature columns, to pick the part the encoder will be fitted on.
for num in range(N_TRAIN_PARTS):
    # read the data
    data = pd.read_parquet(TRAIN_DATA_PATH + f'train_data_{num}.pq')
    # create custom features
    # difference between planned number of days from the loan opening date to the closing date and fact
    data['pre_term_diff'] = data['pre_pterm'] - data['pre_fterm']
    # flag whether loan is closed earlier or later than planned
    # (vectorized comparison instead of a per-row Python lambda; NaN differences yield 0 for both flags)
    data['is_closed_earlier_term'] = (data['pre_term_diff'] > 0).astype('int64')
    data['is_closed_later_term'] = (data['pre_term_diff'] < 0).astype('int64')
    # sum of per-column distinct-value counts over every column from the third one onward
    n_unique_total = data.iloc[:, 2:].nunique().sum()
    print(f"{num} df has {n_unique_total} unique values.")

    # free the part before loading the next one
    del data, n_unique_total

0 df has 433 unique values.
1 df has 437 unique values.
2 df has 436 unique values.
3 df has 439 unique values.
4 df has 438 unique values.
5 df has 442 unique values.
6 df has 447 unique values.
7 df has 441 unique values.
8 df has 446 unique values.
9 df has 449 unique values.
10 df has 447 unique values.
11 df has 448 unique values.


DF NUMBER 9 (train_data_9.pq) HAS THE HIGHEST NUMBER OF UNIQUE VALUES (449).

In [3]:
# number of the train part the encoder is fitted on
# (the part that reported the highest unique-value count in the scan above)
FIT_PART_NUM = 9

# read the data
data9 = pd.read_parquet(TRAIN_DATA_PATH + f'train_data_{FIT_PART_NUM}.pq')
# create custom features
# difference between planned number of days from the loan opening date to the closing date and fact
data9['pre_term_diff'] = data9['pre_pterm'] - data9['pre_fterm']
# flag whether loan is closed earlier or later than planned
# (vectorized comparison instead of a per-row Python lambda; NaN differences yield 0 for both flags)
data9['is_closed_earlier_term'] = (data9['pre_term_diff'] > 0).astype('int64')
data9['is_closed_later_term'] = (data9['pre_term_diff'] < 0).astype('int64')

In [7]:
%%time
# ohe
# handle_unknown='ignore': a value that was not seen while fitting is encoded as
# all zeros for that feature instead of raising an error.
# Dense output is required because the result is later wrapped in a DataFrame.
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`
    OHE_ENCODER = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # older scikit-learn only knows `sparse`
    OHE_ENCODER = OneHotEncoder(handle_unknown='ignore', sparse=False)

# fit the encoder on every column from the third one onward
fit_features = data9.iloc[:, 2:]
OHE_ENCODER.fit(fit_features)

# get the columns names
# (the original referenced the undefined name `ohe_encoder`, which raises NameError
#  on a fresh kernel; the fitted encoder is OHE_ENCODER)
if hasattr(OHE_ENCODER, 'get_feature_names_out'):
    ALL_COLUMNS = OHE_ENCODER.get_feature_names_out(fit_features.columns)
else:
    # scikit-learn < 1.0
    ALL_COLUMNS = OHE_ENCODER.get_feature_names(fit_features.columns)

del fit_features

CPU times: user 2.03 s, sys: 80.6 ms, total: 2.11 s
Wall time: 2.11 s


AFTER ONE-HOT ENCODING ALL FEATURES' DTYPES BECOME FLOAT64, SO WE WILL CHANGE THEM BACK TO INT8

In [6]:
def features_encoding(data: pd.DataFrame, fitted_encoder: "OneHotEncoder", columns: np.ndarray) -> pd.DataFrame:
    """
    Function transforms features to OHE and counts how many times each feature occurred, grouped by id.

    Side effect: the columns 'pre_term_diff', 'is_closed_earlier_term' and
    'is_closed_later_term' are added to `data` in place.

    param data: pd.DataFrame to transform; must contain the columns 'id', 'rn', 'pre_pterm'
        and 'pre_fterm'. Every column from the third one onward is passed to the encoder.
    param fitted_encoder: already fitted OneHotEncoder; its transform() is expected to return
        a dense array with one column per entry of `columns`
    param columns: names for the encoded columns, in the order the encoder outputs them
    return: pd.DataFrame indexed by id - per-id sums of the encoded columns plus the
        per-id maximum of 'rn'

    """
    ### custom features ###

    # difference between planned number of days from the loan opening date to the closing date and fact
    data['pre_term_diff'] = data['pre_pterm'] - data['pre_fterm']
    # flag whether loan is closed earlier or later than planned
    # (vectorized comparison instead of a per-row Python lambda; NaN differences yield 0 for both flags)
    data['is_closed_earlier_term'] = (data['pre_term_diff'] > 0).astype('int64')
    data['is_closed_later_term'] = (data['pre_term_diff'] < 0).astype('int64')

    ### ohe ###

    # transform the data
    encoded = fitted_encoder.transform(data.iloc[:, 2:])
    # one-hot encoder returns float64 dtype, let's transform it to int8 and create df
    data_enc = pd.DataFrame(encoded.astype('int8'), columns=columns)
    # Group positionally by the id values: row i of `data_enc` corresponds to row i of `data`.
    # Aligning the two frames on their index labels instead would mismatch rows whenever
    # `data` does not carry a default 0..n-1 index.
    ids = data['id'].to_numpy()
    # count each event(feature) of clients
    data_fe = data_enc.groupby(ids).sum().rename_axis('id')
    # highest 'rn' per client, used as the amount of credits
    # NOTE(review): assumes 'rn' numbers a client's credits starting from 1 - confirm
    count_rn = data.groupby('id')['rn'].max()
    # concat amount of credits to the rest of data (both pieces are indexed by id)
    data_fe = pd.concat([data_fe, count_rn], axis=1)

    return data_fe

ENCODE THE DATA

In [9]:
# TRAIN FE

# number of train parts stored as train_data_{num}.pq
N_TRAIN_PARTS = 12

# make sure the output directory exists before the (slow) transform runs,
# otherwise to_parquet fails only after the first part has been processed
os.makedirs(TRAIN_FE_PATH, exist_ok=True)

for num in tqdm(range(N_TRAIN_PARTS)):
    # read the data
    train = pd.read_parquet(TRAIN_DATA_PATH + f'train_data_{num}.pq')
    # transform the data with the encoder fitted earlier
    train_fe = features_encoding(train, OHE_ENCODER, ALL_COLUMNS)
    # save the data
    train_fe.to_parquet(TRAIN_FE_PATH + f"train_fe_{num}.pq")

    # free the part before loading the next one
    del train, train_fe

100%|███████████████████████████████████████████| 12/12 [09:07<00:00, 45.60s/it]


In [10]:
# TEST FE

# number of test parts stored as test_data_{num}.pq
N_TEST_PARTS = 2

# make sure the output directory exists before the (slow) transform runs,
# otherwise to_parquet fails only after the first part has been processed
os.makedirs(TEST_FE_PATH, exist_ok=True)

for num in tqdm(range(N_TEST_PARTS)):
    # read the data
    test = pd.read_parquet(TEST_DATA_PATH + f'test_data_{num}.pq')
    # transform the data with the same encoder that was used for the train parts
    test_fe = features_encoding(test, OHE_ENCODER, ALL_COLUMNS)
    # save the data
    test_fe.to_parquet(TEST_FE_PATH + f"test_fe_{num}.pq")

    # free the part before loading the next one
    del test, test_fe

100%|█████████████████████████████████████████████| 2/2 [01:39<00:00, 49.59s/it]


FEATURE ENCODING IS DONE