In [None]:
import pandas as pd
import numpy as np
import random
import time
import os
import gc

Considering **the size** of this month's dataset and **the number of times** you are going to load the data during the competition, you might consider switching to the Parquet format.

[Parquet documentation](https://parquet.apache.org/documentation/latest/)

In [None]:
%%time 

# Load the original competition CSV files, using the first column as the index.
# This cell is timed with %%time; the frames are deleted again in the next cell.
df_train = pd.read_csv(r"../input/tabular-playground-series-oct-2021/train.csv", index_col=0)
df_test = pd.read_csv(r"../input/tabular-playground-series-oct-2021/test.csv", index_col=0)

In [None]:
# Drop both CSV-backed frames in one statement, then force a collection pass
# so their memory is released before the parquet copies are loaded.
del df_train, df_test
gc.collect()

In [None]:
%%time

# Load the features (X), target (y) and test features (X_test) from the
# pre-converted parquet dataset; the cell is timed with %%time.
X = pd.read_parquet(r"../input/tps-10-21-dataset-parquet/X.parquet")
y = pd.read_parquet(r"../input/tps-10-21-dataset-parquet/y.parquet")
X_test = pd.read_parquet(r"../input/tps-10-21-dataset-parquet/X_test.parquet")

**Data compression:** 


The compression helper below is adapted from https://www.kaggle.com/dlaststark/tps-1021-la-dee-da?kernelSessionId=76105013 by DLASTSTARK.

* The float16 (f16) downcast option was removed so that the compressed frames stay compatible with the Parquet format.

In [None]:
# Representable value ranges of the NumPy dtypes considered as downcast targets.
# Signed integer widths first ...
INT8_MIN, INT8_MAX = np.iinfo(np.int8).min, np.iinfo(np.int8).max
INT16_MIN, INT16_MAX = np.iinfo(np.int16).min, np.iinfo(np.int16).max
INT32_MIN, INT32_MAX = np.iinfo(np.int32).min, np.iinfo(np.int32).max

# ... then the floating-point widths.
FLOAT16_MIN, FLOAT16_MAX = np.finfo(np.float16).min, np.finfo(np.float16).max
FLOAT32_MIN, FLOAT32_MAX = np.finfo(np.float32).min, np.finfo(np.float32).max


def memory_usage(data, detail=1):
    """Print the memory footprint of ``data`` in MB and return it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to measure with ``DataFrame.memory_usage()`` (default arguments).
    detail : int or bool, default 1
        When truthy, the per-column byte counts are displayed as well.

    Returns
    -------
    float
        Total memory in MB (bytes divided by 1024 * 1024).
    """
    # Measure once and reuse the result for both the detail view and the total.
    per_column = data.memory_usage()
    if detail:
        # `display` is the IPython/Jupyter builtin; only reached when detail is truthy.
        display(per_column)
    memory = per_column.sum() / (1024 * 1024)
    print("Memory usage : {0:.2f}MB".format(memory))
    return memory


def _compress_rate(before, after):
    """Return the fraction of memory saved; 0.0 when ``before`` is zero."""
    if not before:
        return 0.0
    return (before - after) / before


def _narrowest_int_dtype(col_min, col_max):
    """Return the smallest of int8/int16/int32 that holds [col_min, col_max], else None."""
    for candidate in (np.int8, np.int16, np.int32):
        limits = np.iinfo(candidate)
        # Inclusive bounds: a value equal to the dtype limit is still representable.
        # NaN bounds (e.g. from an empty column) compare False and yield None.
        if limits.min <= col_min and col_max <= limits.max:
            return candidate
    return None


def compress_dataset(data):
    """Downcast the ``float64`` and ``int64`` columns of ``data`` in place.

    ``float64`` columns whose min/max lie within the float32 range are cast to
    ``float32``; ``int64`` columns are cast to the narrowest of
    int8/int16/int32 that holds their min/max. float16 is deliberately not a
    target (the notebook removed it to stay compatible with the parquet
    format). Columns of any other dtype are left untouched; ``object``
    columns are skipped silently.

    A report is printed for every non-object column (name, dtype, value range,
    conversion performed, running memory usage and compress rate).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to compress. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after downcasting.
    """
    memory_before_compress = memory_usage(data, 0)
    print()
    length_interval = 50
    length_float_decimal = 4
    separator = '=' * length_interval

    def report_rate():
        # Running saving relative to the frame's size before any conversion.
        memory_after_compress = memory_usage(data, 0)
        print("Compress Rate: [{0:.2%}]".format(
            _compress_rate(memory_before_compress, memory_after_compress)))

    print(separator)
    for col in data.columns:
        col_series = data[col]
        col_dtype = col_series.dtype
        if col_dtype == 'object':
            continue

        # str(col) keeps the report working for non-string column labels.
        print("Name: {0:24s} Type: {1}".format(str(col), col_dtype))

        if col_dtype == 'float64':
            # min/max are only computed for the dtypes that are actually
            # converted, so dtypes without an ordering do not raise here.
            col_min, col_max = col_series.min(), col_series.max()
            print(" variable min: {0:15s} max: {1:15s}".format(
                str(np.round(col_min, length_float_decimal)),
                str(np.round(col_max, length_float_decimal))))
            limits = np.finfo(np.float32)
            if limits.min <= col_min and col_max <= limits.max:
                data[col] = col_series.astype(np.float32)
                print("  float32 min: {0:15s} max: {1:15s}".format(str(limits.min), str(limits.max)))
                print("compress float64 --> float32")
            report_rate()
            print(separator)

        elif col_dtype == 'int64':
            col_min, col_max = col_series.min(), col_series.max()
            print(" variable min: {0:15s} max: {1:15s}".format(str(col_min), str(col_max)))
            target = _narrowest_int_dtype(col_min, col_max)
            if target is not None:
                limits = np.iinfo(target)
                data[col] = col_series.astype(target)
                print("{0:>9s} min: {1:15s} max: {2:15s}".format(
                    np.dtype(target).name, str(limits.min), str(limits.max)))
            report_rate()
            # Report the dtype that was really applied (or that nothing changed).
            if target is not None:
                print("compress (int64) ==> ({0})".format(np.dtype(target).name))
            else:
                print("keep (int64): values do not fit a narrower integer type")
            print(separator)

    print()
    report_rate()

    return data

In [None]:
# Downcast each frame in turn. compress_dataset prints a per-column report and
# returns the frame it was given, so each name is rebound to the compressed result.
print("==== X ====")
X = compress_dataset(X)
print("==== y ====")
y = compress_dataset(y)
print("==== X_test ====")
X_test = compress_dataset(X_test)

In [None]:
# Persist the compressed frames as parquet files in the current working directory.
X.to_parquet("X.parquet")
X_test.to_parquet("X_test.parquet")
y.to_parquet("y.parquet")

In [None]:
# Drop the in-memory frames and run a collection pass before they are
# reloaded from the parquet files written above.
del X, X_test, y
gc.collect()

In [None]:
%%time

# Reload the three frames from the parquet files in the working directory;
# the cell is timed with %%time.
X = pd.read_parquet(r"./X.parquet")
y = pd.read_parquet(r"./y.parquet")
X_test = pd.read_parquet(r"./X_test.parquet")