In [1]:
import pandas as pd
import numpy as np

import gc
from tqdm import tqdm

In [2]:
TRAIN_DATA_PATH = "train_data/"
TEST_DATA_PATH = "test_data/"

TRAIN_TARGET_PATH = "train_target.csv"

TRAIN_FE_PATH = 'train_data_fe/'
TEST_FE_PATH = 'test_data_fe/'

In [3]:
def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
    """
    Function transforms features to OHE and counts number of ones in each new column
    
    param data: pd.DataFrame to transform
    return: pd.DataFrame - transformed data

    """
    ### custom features ###
    
    # difference between planned number of days from the loan opening date to the closing date and fact
    data['pre_term_diff'] = data['pre_pterm'] - data['pre_fterm']
    # flag whether loan is closed earlier or later than planned
    data['is_closed_earlier_term'] = data['pre_term_diff'].apply(lambda x: 1 if x > 0 else 0)
    data['is_closed_later_term'] = data['pre_term_diff'].apply(lambda x: 1 if x < 0 else 0)
    
    ### ohe ###
    
    # dataframe feature names
    feauture_columns = data.columns.values[2:]
    # get dummies
    dummies = pd.get_dummies(data[feauture_columns], columns=feauture_columns)
    # concat dummies with id and rn columns
    ohe_features = pd.concat([data.iloc[:, :2], dummies], axis=1)
    # count how many times client has 1 in each column
    data_fe = ohe_features.groupby("id")[dummies.columns.values].sum()
    # count amount of credits of each client
    count_rn = data.groupby('id')['rn'].max()
    # concat amount of credits to the rest of data
    data_fe = pd.concat([data_fe, count_rn], axis=1)
    
    
    return data_fe

In [5]:
# TRAIN FE

for num in tqdm(range(12)):
    # read the data
    train = pd.read_parquet(TRAIN_DATA_PATH + f'train_data_{num}.pq')
    # transform the data
    train_fe = feature_engineering(train)
    # save the data
    train_fe.to_csv(TRAIN_FE_PATH + f"train_fe_{num}")
    
    del train, train_fe

 92%|█████████▏| 11/12 [25:25<02:18, 138.71s/it]


MemoryError: Unable to allocate 8.18 GiB for an array with shape (2450630, 448) and data type float64

WE DO NOT HAVE ENOUGH MEMORY FOR 11TH PART LET'S CHECK ITS & TEST'S SHAPE 

In [4]:
# read the data
train_11 = pd.read_parquet(TRAIN_DATA_PATH + f'train_data_11.pq')
test_0 = pd.read_parquet(TEST_DATA_PATH + f'test_data_0.pq')
test_1 = pd.read_parquet(TEST_DATA_PATH + f'test_data_1.pq')

# shape of data
train_11.shape, test_0.shape, test_1.shape

((2450630, 61), (2389773, 61), (2334828, 61))

WE SEE THAT BOTH OF 3 DATAFRAMES HAVE SIMILAR SHAPE

SPLIT THEM ALL TO 2 PARTS

In [5]:
def split_data(data: pd.DataFrame) -> tuple:
    """
    Split the data to 2 parts
    
    param: data - pd.DataFrame of data to split
    
    """
    # middle index of data
    middle_idx = np.round(len(data) / 2)
    # id of the middle index of data
    middle_idx_id = data.loc[middle_idx, 'id']
    # splt data by id
    data_1 = data[data['id'] <= middle_idx_id]
    data_2 = data[data['id'] > middle_idx_id]
    
    return data_1, data_2

In [6]:
# TRAIN

train_11_0, train_11_1 = split_data(train_11)
train_11_0.shape, train_11_1.shape

((1225324, 61), (1225306, 61))

In [7]:
# TEST

test_0_0, test_0_1 = split_data(test_0)
test_1_0, test_1_1 = split_data(test_1)

test_0_0.shape, test_0_1.shape, test_1_0.shape, test_1_1.shape

((1194889, 61), (1194884, 61), (1167428, 61), (1167400, 61))

FINISH THE FEATURE ENGINEERING

In [8]:
# transform the data
train_11_0_fe = feature_engineering(train_11_0)
train_11_1_fe = feature_engineering(train_11_1)
test_0_0_fe = feature_engineering(test_0_0)
test_0_1_fe = feature_engineering(test_0_1)
test_1_0_fe = feature_engineering(test_1_0)
test_1_1_fe = feature_engineering(test_1_1)

# save the data
train_11_0_fe.to_csv(TRAIN_FE_PATH + "train_fe_11_0")
train_11_1_fe.to_csv(TRAIN_FE_PATH + "train_fe_11_1")
test_0_0_fe.to_csv(TEST_FE_PATH + "test_fe_0_0")
test_0_1_fe.to_csv(TEST_FE_PATH + "test_fe_0_1")
test_1_0_fe.to_csv(TEST_FE_PATH + "test_fe_1_0")
test_1_1_fe.to_csv(TEST_FE_PATH + "test_fe_1_1")

del train_11_0, train_11_1
del test_0_0, test_0_1, test_1_0, test_1_1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_term_diff'] = data['pre_pterm'] - data['pre_fterm']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['is_closed_earlier_term'] = data['pre_term_diff'].apply(lambda x: 1 if x > 0 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['is_closed_later_term'] = data['pre_term_diff'].

FEATURE ENGINEERING IS DONE