# Begin

In [None]:
import numpy as np
import pandas as pd
import tqdm.notebook as tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import time
import os

# Data Loading

First, declare the column dtypes:

In [None]:
# Explicit dtype for every CSV column, passed to pd.read_csv so that pandas
# does not fall back to its wider default types.
col_dtypes = {
    # Builtin `object`: the `np.object` alias was deprecated in NumPy 1.20 and
    # removed in 1.24, where it raises AttributeError.
    'row_id': object,
    'time_id': np.uint16,
    'investment_id': np.uint16,
    'target': np.float64,
}
# The 300 feature columns f_0 .. f_299 are all stored as float32.
col_dtypes.update({f"f_{i}": np.float32 for i in range(300)})

Load the full training data using the declared dtypes to reduce memory usage.

In [None]:
%%time
train = pd.read_csv("../input/ubiquant-market-prediction/train.csv", dtype=col_dtypes)

In [None]:
# Show the loaded DataFrame as this cell's output.
train

# Batching
One batch is generated per `time_id`. Within each batch, the rows of every `investment_id` that is absent at that `time_id` are filled with zeros, so all batches have the same number of rows.

In [None]:
# Directory that receives one parquet file per time_id.
pardir = '/kaggle/working/by_time_id'
# exist_ok=True keeps this cell re-runnable; os.mkdir raised FileExistsError
# when the directory was already there from a previous run.
os.makedirs(pardir, exist_ok=True)

# Columns written to every batch file.
keep_cols = ['investment_id', 'target'] + [f"f_{i}" for i in range(300)]

# Every batch is reindexed onto the same 1..max(investment_id) grid, so all
# files end up with the same number of rows in the same order.
# NOTE(review): the grid starts at 1, so rows with investment_id == 0 (if the
# data contains any) are dropped here -- confirm this is intended.
n_inv_id = train.investment_id.max()
filler = range(1, n_inv_id+1)

for time_id, group_df in tqdm.tqdm(train.groupby(by='time_id'), desc='Grouping by time_id'):
    # Index the group by a copy of investment_id (named 'sort') without
    # assigning a new column into the groupby slice, which triggered
    # SettingWithCopyWarning. The 'investment_id' column itself is kept.
    indexed = group_df.set_index(group_df['investment_id'].rename('sort'))
    # Rows for ids absent at this time_id appear as NaN after reindex and are
    # then zero-filled; this zeroes *all* their columns, including
    # 'investment_id'. The 'sort' index is discarded by the keep_cols selection.
    df = indexed.reindex(filler).fillna(0).reset_index()[keep_cols]
    df.to_parquet(pardir+f"/train_data_time_id_{time_id}.parquet")

Test the read time of each parquet batch and save the batch metadata (`time_id` and file name) to a CSV file.

In [None]:
# Read back every per-time_id parquet file (the tqdm bar shows the read speed)
# and record the time_id and file name of each batch that loads successfully.
meta_cols = {'time_id':[], 'paths':[]}
for i in tqdm.tqdm(train.time_id.unique()):
    file_name = f"train_data_time_id_{i}.parquet"
    try:
        # The loaded frame is discarded; only the fact that it loads matters.
        pd.read_parquet(pardir+"/"+file_name)
    except Exception as err:
        # Still best-effort (skip the batch and carry on), but unlike a bare
        # `except:` this lets KeyboardInterrupt/SystemExit through and says
        # which batch failed and why.
        print(f"Missing! time_id={i}: {err!r}")
        continue
    meta_cols['time_id'].append(i)
    meta_cols['paths'].append(file_name)

# Persist the index of readable batches (written to the current working
# directory) and display it.
metadata = pd.DataFrame(meta_cols)
metadata.to_csv("train_time_id_meta.csv", index=False)
metadata