In [None]:
import numpy as np 
import pandas as pd 
from time import time
from sklearn.preprocessing import QuantileTransformer
import os

In [None]:
def reduce_mem_usage(df, obj_to_cat=False, inplace=True):
    """Downcast the numeric columns of a dataframe to the narrowest dtype that
    can hold each column's observed min/max, to reduce memory usage.

    Handling per column dtype:
      * signed ints    -> smallest of int8/int16/int32 that fits
      * unsigned ints  -> smallest of uint8/uint16/uint32 that fits
      * floats         -> float16 or float32 when min/max fit. NOTE: only the
                          value *range* is checked, so this is lossy (float16
                          keeps roughly 3 significant decimal digits).
      * object         -> 'category' when obj_to_cat is True, else unchanged
      * everything else (bool, datetime, timedelta, category and other
        extension dtypes) is left unchanged.

    A column is never widened: columns that are empty/all-NaN, or whose
    min/max fit no narrower type (e.g. they contain +/-inf), keep their dtype.

    Parameters
    ----------
    df : pandas.DataFrame
    obj_to_cat : bool, default False
        Turn 'object' columns into 'category'.
    inplace : bool, default True
        If True, modify ``df`` itself and return None.
        If False, work on a copy, leave ``df`` untouched and return the copy.

    Returns
    -------
    pandas.DataFrame or None
        The reduced copy when ``inplace`` is False, otherwise None.
    """
    if not inplace:
        df = df.copy()

    # Candidate dtypes per NumPy kind code, ordered narrowest first.
    candidates_by_kind = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
        'f': (np.float16, np.float32, np.float64),
    }

    for col in df.columns:
        dtype = df[col].dtype

        # Extension dtypes (category, tz-aware datetime, nullable ints, ...)
        # are not NumPy dtypes and have no iinfo/finfo; leave them alone.
        if not isinstance(dtype, np.dtype):
            continue

        if dtype.kind == 'O':
            if obj_to_cat:
                df[col] = df[col].astype('category')
            continue

        candidates = candidates_by_kind.get(dtype.kind)
        if candidates is None:
            # bool, datetime64, timedelta64, complex, ...: nothing to downcast.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # min()/max() are NaN for an empty or all-NaN column: no range to fit.
        if np.isnan(c_min) or np.isnan(c_max):
            continue

        limits_of = np.iinfo if dtype.kind in 'iu' else np.finfo
        for candidate in candidates:
            target = np.dtype(candidate)
            # Candidates are ordered by size; stop before widening the column.
            if target.itemsize >= dtype.itemsize:
                break
            limits = limits_of(target)
            # Inclusive bounds: the limits themselves are representable.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    if not inplace:
        return df

In [None]:
# Read the three competition CSVs in one pass (train, test, sample
# submission) and report how long the parsing took.
start = time()
train, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-oct-2021/train.csv',
        '/kaggle/input/tabular-playground-series-oct-2021/test.csv',
        '/kaggle/input/tabular-playground-series-oct-2021/sample_submission.csv',
    )
)
print(f'Took {time() - start :.2f} seconds.')

In [None]:
# Print train's per-column dtypes and memory footprint; memory_usage='deep'
# asks pandas for an exact figure that also counts object-column contents.
train.info(memory_usage='deep')

In [None]:
# Downcast both frames in place; with the default inplace=True the function
# mutates its argument and returns None, so there is nothing to assign.
for _frame in (train, test):
    reduce_mem_usage(_frame)

In [None]:
# Same report as the earlier train.info cell; intended to be compared with it
# to see the effect of the downcasting cell on dtypes and memory.
train.info(memory_usage='deep')

In [None]:
# Write the three frames to Feather files in the current working directory
# and report how long the writes took.
start = time()
train.to_feather('train')
test.to_feather('test')
ss.to_feather('sample_submission')
# '.2f' = two decimal places. The previous ':2f' spec meant "minimum width 2,
# default precision", which printed six decimals.
print(f'Took {time() - start :.2f} seconds')

In [None]:
# Reload the frames from the Feather files and report the read time.
# The files are written with relative names ('train', 'test',
# 'sample_submission'), so read them back the same way instead of assuming
# the working directory is /kaggle/working.
start = time()
train = pd.read_feather('train')
test = pd.read_feather('test')
ss = pd.read_feather('sample_submission')
# '.2f' = two decimal places. The previous ':2f' spec meant "minimum width 2,
# default precision", which printed six decimals.
print(f'Took {time() - start :.2f} seconds')

In [None]:
##########################################
##########################################
#Rank-Gauss
##########################################
##########################################
# Columns of train whose dtype name contains 'float'; only these are transformed.
FLOATS = [feat for feat in train.columns if 'float' in train[feat].dtype.name]

#Rank Gauss
start = time()
# random_state is fixed so that the transformer's internal random subsampling
# (used when estimating quantiles on large inputs) gives the same result on
# every run; without it the transformed features change between runs.
qt = QuantileTransformer(n_quantiles=1000, output_distribution='normal', random_state=42)
# Fit the quantiles on train only, then apply the same mapping to test.
train[FLOATS] = qt.fit_transform(train[FLOATS])
test[FLOATS] = qt.transform(test[FLOATS])
print(f'Took {time() - start :.2f} seconds')

#Memory downsize
# inplace=False because train[FLOATS] is a temporary selection: the reduced
# copy that comes back is what gets assigned into the frame.
train[FLOATS] = reduce_mem_usage(train[FLOATS], inplace=False)
test[FLOATS] = reduce_mem_usage(test[FLOATS], inplace=False)

#Saving Rank Gaussed features
train.to_feather('train_rg')
test.to_feather('test_rg')

In [None]:
# Reload the rank-gauss-transformed training frame and preview its first rows.
# 'train_rg' is written with a relative name, so read it back the same way
# instead of assuming the working directory is /kaggle/working.
train = pd.read_feather('train_rg')
train.head()