# Optimizing and parquetizing CAPG data

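Here we optimize the per-subject CAPG CSV files after they have been generated: the columns are downcast to compact dtypes, each file is saved as a gzip-compressed parquet file, and the original CSV is then deleted.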

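***NOTE:*** To use this notebook, apart from pandas, you also need a parquet engine: install `pyarrow` or `fastparquet` using pip or conda (pandas uses `pyarrow` when it is available and falls back to `fastparquet` otherwise).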

In [1]:
import os
import pandas as pd

In [2]:
# Columns that get cast to uint8 when the CSV files are optimized
uint8_cols = [
    'subject',
    'gesture',
    'trial',
]

In [3]:
# Convert the CSV file found in every subject folder of each CAPG database
# into a gzip-compressed parquet file with downcast dtypes, then delete the CSV.
#
# The cell is safe to re-run: folders whose CSV has already been converted
# (and therefore removed) are reported and skipped instead of raising.
homedir = "../data/CAPG/"
db_str = ['dba','dbb','dbc']
for s in db_str:
    db_path = os.path.join(homedir, s)
    # os.listdir returns entries in arbitrary order; sort for a deterministic log
    for subject_dir in sorted(os.listdir(db_path)):
        subject_path = os.path.join(db_path, subject_dir)
        # Ignore stray files sitting next to the subject folders
        if not os.path.isdir(subject_path):
            continue
        # Get file system data
        csvfiles = [f for f in os.listdir(subject_path) if f.endswith('.csv')]
        if not csvfiles:
            # Nothing to convert (e.g. the CSV was removed by a previous run)
            print("Skipping "+subject_path+" (no CSV file found)")
            continue
        print("Optimizing "+subject_path)
        csvfile = os.path.join(subject_path, csvfiles[0])
        csvfileWOextension = os.path.splitext(csvfile)[0]
        # Read data
        df = pd.read_csv(csvfile)
        # Optimize columns: the columns in uint8_cols become uint8,
        # every other column becomes float32
        df[uint8_cols] = df[uint8_cols].astype('uint8')
        rest_cols = [c for c in df.columns if c not in uint8_cols]
        df[rest_cols] = df[rest_cols].astype('float32')
        # Save parquet format (comment to deactivate)
        df.to_parquet(csvfileWOextension+".parquet", compression='gzip')
        # Save CSV format but lighter (comment to deactivate)
        # df.to_csv(csvfileWOextension+"_light.csv", index=False, header=True, float_format='%.6f')
        # The original CSV is deleted only after the write above has succeeded;
        # if to_parquet raises, the CSV is left untouched
        os.remove(csvfile)

Optimizing ../data/CAPG/dba/dba-preprocessed-001
Optimizing ../data/CAPG/dba/dba-preprocessed-002
Optimizing ../data/CAPG/dba/dba-preprocessed-003
Optimizing ../data/CAPG/dba/dba-preprocessed-004
Optimizing ../data/CAPG/dba/dba-preprocessed-005
Optimizing ../data/CAPG/dba/dba-preprocessed-006
Optimizing ../data/CAPG/dba/dba-preprocessed-007
Optimizing ../data/CAPG/dba/dba-preprocessed-008
Optimizing ../data/CAPG/dba/dba-preprocessed-009
Optimizing ../data/CAPG/dba/dba-preprocessed-010
Optimizing ../data/CAPG/dba/dba-preprocessed-011
Optimizing ../data/CAPG/dba/dba-preprocessed-012
Optimizing ../data/CAPG/dba/dba-preprocessed-013
Optimizing ../data/CAPG/dba/dba-preprocessed-014
Optimizing ../data/CAPG/dba/dba-preprocessed-015
Optimizing ../data/CAPG/dba/dba-preprocessed-016
Optimizing ../data/CAPG/dba/dba-preprocessed-017
Optimizing ../data/CAPG/dba/dba-preprocessed-018
Optimizing ../data/CAPG/dbb/dbb-preprocessed-001
Optimizing ../data/CAPG/dbb/dbb-preprocessed-002
Optimizing ../data/C

In [None]:
# Rewrite general_info.csv in place with every column cast to uint8
general_info_csv = "../data/CAPG/general_info.csv"
general_info = pd.read_csv(general_info_csv)
general_info_uint8 = general_info.astype('uint8')
general_info_uint8.to_csv(general_info_csv, header=True, index=False)