#  UMP: Create Pickle Dataset
## Import Packages

In [None]:
import os
import pandas as pd
import numpy as np
import gc
import math

## Utilities

In [None]:
def reduce_memory_usage(df, features):
    """Downcast the given columns of ``df`` to ``float16`` in place.

    Args:
        df: DataFrame whose columns are overwritten with their float16
            versions.
        features: Iterable of column labels in ``df`` to convert.

    Returns:
        None. ``df`` is modified in place.

    Note:
        float16 represents finite magnitudes only up to about 65504 and keeps
        roughly three significant decimal digits, so larger values become
        ``inf`` and precision is reduced. Callers should make sure the
        selected columns tolerate that.
    """
    for feature in features:
        # Convert column by column so that, at most, one column-sized
        # temporary is alive alongside the frame at any moment.
        df[feature] = df[feature].astype(np.float16)
    # A single collection pass once all columns are replaced; running a full
    # collection per column only adds overhead, because the replaced column
    # data is released through reference counting.
    gc.collect()

## Import dataset

In [None]:
%%time
# Load the training data: the 'investment_id' and 'time_id' columns, the
# feature columns f_0 .. f_{n_features - 1}, and the "target" column.
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
feature_columns = ['investment_id', 'time_id', *features]
train = pd.read_parquet(
    '../input/ubiquant-parquet/train_low_mem.parquet',
    columns=[*feature_columns, "target"],
)
# Last expression of the cell, so the notebook displays the first rows.
train.head()

## Reducing Memory Usage
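There are 3141410 records in total, and each record has 303 columns. If every column were stored in a 2-byte type (int16 or float16), the training data would occupy (3141410 x 303 x 2) / (1024^3) GB, which is about 1.8 GB. Note that the code below converts only the feature columns and the target to float16, so the actual figure can differ slightly.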

In [None]:
# Print the column dtypes and memory usage before the float16 conversion.
train.info()

In [None]:
%%time
# Cast every feature column and the target column of `train` to float16,
# modifying the frame in place.
reduce_memory_usage(train, features + ["target"])

In [None]:
# Print the column dtypes and memory usage again, now after the conversion.
train.info()

In [None]:
# Serialize the frame to "train.pkl", a path relative to the current working
# directory.
train.to_pickle("train.pkl")

In [None]:
# Drop the notebook's reference to the frame and run a garbage-collection
# pass before the pickle is read back in.
del train
gc.collect()

## Read Pickle file

In [None]:
# Read the pickle back to check that it loads.
# NOTE(review): this absolute path matches the relative "train.pkl" written
# earlier only when the working directory is /kaggle/working - confirm when
# running outside Kaggle.
train = pd.read_pickle("/kaggle/working/train.pkl")
# Last expression of the cell, so the notebook displays the first rows.
train.head()