In [30]:
import pandas as pd
import numpy as np
import gc

The training set is split into 12 parts and the test set into 2 parts.

Let's calculate how much memory the dataset uses.

In [13]:
# load the first part of each split
train, test = (
  pd.read_parquet('train_data/train_data_0.pq'),
  pd.read_parquet('test_data/test_data_0.pq'),
)

In [22]:
# footprint of one loaded part per split, in GB (bytes / 10**9, index included)
train_usage, test_usage = (
  round(part.memory_usage(index=True).sum() / 10**9, 2)
  for part in (train, test)
)

# extrapolate to the full splits: 12 training parts, 2 testing parts
print(
  f"One training part uses {train_usage} GB of RAM",
  f"Approximately whole trainnig set uses {train_usage * 12} GB of RAM",
  f"\nOne testing part uses {test_usage} GB of RAM",
  f"Approximately whole testing set uses {test_usage * 2} GB of RAM",
  sep="\n",
)

One training part uses 0.96 GB of RAM
Approximately whole trainnig set uses 11.52 GB of RAM

One testing part uses 1.17 GB of RAM
Approximately whole testing set uses 2.34 GB of RAM


We will downcast the column dtypes to reduce memory usage.

In [28]:
# peek at five randomly chosen rows
train.sample(n=5)

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
1165659,148981,7,13,6,3,5,13,3,5,1,...,3,3,3,4,1,3,4,1,0,0
1645399,209358,8,6,11,14,3,1,5,10,2,...,3,3,3,4,1,3,4,1,0,0
1530527,195174,4,14,4,14,7,5,7,17,2,...,0,3,3,4,1,2,3,1,0,0
93539,12128,2,14,9,3,5,9,1,5,1,...,3,3,3,4,1,3,4,1,0,0
540062,69482,10,13,5,17,16,16,12,2,2,...,0,0,0,1,1,3,4,1,0,0


In [27]:
# data dtypes: count how many columns share each dtype
train.dtypes.value_counts()

int64    61
dtype: int64

In [32]:
# drop the loaded parts and their size figures, then force a collection pass
del train_usage, test_usage
del train, test
gc.collect()

0

Target dtypes:

- `id` -> **int32**
- all other columns -> **int8**

In [34]:
# train: downcast every part and overwrite it in place
# (id -> int32, every column after the first -> int8)
for num in range(12):
  path = f'train_data/train_data_{num}.pq'
  data = pd.read_parquet(path)

  target_dtypes = {col: 'int8' for col in data.columns[1:]}
  target_dtypes['id'] = 'int32'

  # An integer-to-integer astype() does not check for overflow: values outside
  # the target range would wrap around silently, and the corrupted frame would
  # then replace the original file. Verify the ranges before casting.
  for col, dtype in target_dtypes.items():
    limits = np.iinfo(dtype)
    if data[col].min() < limits.min or data[col].max() > limits.max:
      raise ValueError(f'{path}: column {col!r} does not fit into {dtype}')

  # One astype call with a mapping instead of assigning column by column.
  data = data.astype(target_dtypes)
  data.to_parquet(path)

del data

In [31]:
# test: downcast every part and overwrite it in place
# (id -> int32, every column after the first -> int8)
for num in range(2):
  path = f'test_data/test_data_{num}.pq'
  data = pd.read_parquet(path)

  target_dtypes = {col: 'int8' for col in data.columns[1:]}
  target_dtypes['id'] = 'int32'

  # An integer-to-integer astype() does not check for overflow: values outside
  # the target range would wrap around silently, and the corrupted frame would
  # then replace the original file. Verify the ranges before casting.
  for col, dtype in target_dtypes.items():
    limits = np.iinfo(dtype)
    if data[col].min() < limits.min or data[col].max() > limits.max:
      raise ValueError(f'{path}: column {col!r} does not fit into {dtype}')

  # One astype call with a mapping instead of assigning column by column.
  data = data.astype(target_dtypes)
  data.to_parquet(path)

del data
gc.collect()

32

In [33]:
# reload the first part of each split from the rewritten files
new_train, new_test = (
  pd.read_parquet('train_data/train_data_0.pq'),
  pd.read_parquet('test_data/test_data_0.pq'),
)

# footprint of one part per split, in GB (bytes / 10**9, index included)
train_usage, test_usage = (
  round(part.memory_usage(index=True).sum() / 10**9, 2)
  for part in (new_train, new_test)
)

# extrapolate to the full splits: 12 training parts, 2 testing parts
print(
  f"One training part uses {train_usage} GB of RAM",
  f"Approximately whole trainnig set uses {train_usage * 12} GB of RAM",
  f"\nOne testing part uses {test_usage} GB of RAM",
  f"Approximately whole testing set uses {test_usage * 2} GB of RAM",
  sep="\n",
)

One training part uses 0.13 GB of RAM
Approximately whole trainnig set uses 1.56 GB of RAM

One testing part uses 0.15 GB of RAM
Approximately whole testing set uses 0.3 GB of RAM


**Changing the data types reduced memory usage by roughly 7–8 times** (0.96 GB → 0.13 GB per training part, 1.17 GB → 0.15 GB per testing part).