In [None]:
import os
import pandas as pd
import argparse


def csv_to_pickle(data_path, output_path, n_features=300):
    """Convert every CSV file in ``data_path`` into a pickled DataFrame.

    Each ``<name>.csv`` is read with ``pandas.read_csv`` and written to
    ``<output_path>/<name>.p``. Files whose name ends in ``train.csv`` are
    read with an explicit compact schema (``uint16`` ids, ``float32`` target
    and features); every other CSV is read with pandas' inferred dtypes.

    Args:
        data_path: Directory scanned (non-recursively) for ``.csv`` files.
        output_path: Directory that receives the pickles; created if missing.
        n_features: Number of ``f_<i>`` columns (``f_0`` .. ``f_<n-1>``) that
            are declared as ``float32`` in the training schema.
    """
    # Match on the extension of regular files only. A substring test would
    # also pick up names such as "train.csv.bak" (and directories), and the
    # output name derived from them would be mangled.
    data_files = [
        name
        for name in os.listdir(data_path)
        if name.endswith(".csv") and os.path.isfile(os.path.join(data_path, name))
    ]

    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(output_path, exist_ok=True)

    dtype_train = {
        "row_id": "str",
        "time_id": "uint16",
        "investment_id": "uint16",
        "target": "float32",
    }
    dtype_train.update({f"f_{i}": "float32" for i in range(n_features)})

    for file in data_files:
        input_file_path = os.path.join(data_path, file)
        # Swap only the trailing extension; str.replace would rewrite every
        # ".csv" occurrence inside the name.
        output_file = file[: -len(".csv")] + ".p"
        output_file_path = os.path.join(output_path, output_file)

        print(f"Converting {input_file_path} to {output_file_path}")

        # Any file ending in "train.csv" gets the training schema, so
        # "supplemental_train.csv" is covered as well as "train.csv".
        dtype = dtype_train if file.endswith("train.csv") else None

        pd.to_pickle(pd.read_csv(input_file_path, dtype=dtype), output_file_path)

    print("Done")


def build_last_train(output_path, n_steps = 1000):
    """Save the rows belonging to the last ``n_steps`` distinct time_ids.

    Reads ``train.p`` and ``supplemental_train.p`` from ``output_path``,
    stacks them (the supplemental frame restricted to train's columns), keeps
    only rows whose ``time_id`` is among the ``n_steps`` largest distinct
    values, sorts by ``time_id`` then ``investment_id`` and pickles the result
    to ``<output_path>/train_last_<n_steps>_timesteps.p``.

    Args:
        output_path: Directory holding the input pickles; the result is
            written to the same directory.
        n_steps: Number of most recent distinct time_ids to keep. When fewer
            distinct time_ids exist, all rows are kept.

    Raises:
        ValueError: If ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        # Indexing with -0 would address the *first* time_id and silently
        # keep almost the whole frame, so reject it explicitly.
        raise ValueError(f"n_steps must be >= 1, got {n_steps}")

    print(f"Building dataframe with last {n_steps} time_ids.")
    # NOTE(review): pickles are only safe to load from trusted locations;
    # these are expected to be files written by this same notebook.
    train = pd.read_pickle(os.path.join(output_path, "train.p"))
    supplemental_train = pd.read_pickle(os.path.join(output_path, "supplemental_train.p"))

    last_train = pd.concat([train, supplemental_train[train.columns]])
    time_ids = sorted(last_train.time_id.unique())
    if time_ids:
        # Clamp so a request larger than the available history keeps it all
        # instead of raising IndexError.
        first_kept = time_ids[-min(n_steps, len(time_ids))]
        # ">=" is required: the cutoff id itself is the n_steps-th most recent
        # one, so a strict ">" would keep only n_steps - 1 time_ids.
        last_train = last_train[last_train.time_id >= first_kept]
    last_train = last_train.sort_values(by=["time_id", "investment_id"])

    output_filepath = os.path.join(output_path, f"train_last_{n_steps}_timesteps.p")
    print(f"Saving at {output_filepath}")
    pd.to_pickle(last_train, output_filepath)
    print("Done")

In [None]:
# Directory holding the source CSV files, and the directory the pickles go to.
data_path = "/kaggle/input/ubiquant-market-prediction/"
output_path = "/kaggle/working/"

# Order matters: build_last_train loads train.p and supplemental_train.p from
# output_path, which it expects the conversion step to have written.
csv_to_pickle(data_path=data_path, output_path=output_path)
build_last_train(output_path=output_path, n_steps=1000)