In [1]:
# only once
# !pip install pyreadr

# Prepare

In [2]:
# import libraries
import json
import os.path

import pandas as pd
import pyreadr

In [3]:
# check memory usage
import os
import gc
import psutil

In [4]:
# Notebook configuration: where the TEP dataset lives and which files to convert.
# DIR is a raw string so the backslashes of the Windows path are taken literally
# instead of being parsed as (invalid) escape sequences such as \D, \T and \d.
DIR = r'E:\Datasets\TEP\dataverse'  # dataset source dir
R_FILES = ('TEP_FaultFree_Training.RData',  # RData datasets, sorted by size ascending
           'TEP_FaultFree_Testing.RData',
#            'TEP_Faulty_Training.RData',
#            'TEP_Faulty_Testing.RData',
        )
DTYPES_FILE = 'dtypes.json'  # file (inside DIR) describing the dtypes of the columns

In [5]:
# psutil handle for the current Python process (looked up by its own PID);
# every memory reading in this notebook goes through it.
proc = psutil.Process(os.getpid())


def print_memusage(prefix=''):
    """Print the RSS memory of this process, preceded by `prefix`.

    The byte count reported by psutil is divided by 1024**2 and printed
    with two decimals followed by ' MB'.
    """
    rss_mb = proc.memory_info().rss / 1024**2
    print(prefix, f'{rss_mb:0.2f} MB')

In [6]:
# Baseline reading, taken before any dataset has been loaded.
print_memusage(prefix='Before loading')
print()

Before loading 96.90 MB


# Convert RData

In [7]:
def optimize_dtypes(df: pd.DataFrame, n_uint_columns: int = 3, dtypes_path=None) -> None:
    """Downcast the columns of `df` in place to reduce its memory usage.

    The first `n_uint_columns` columns are passed through
    `pd.to_numeric(..., downcast='unsigned')` and all remaining columns through
    `pd.to_numeric(..., downcast='float')`; pandas only downcasts a column when
    its values allow it.

    If the dtypes description file does not exist yet, the resulting dtypes are
    saved to it as a JSON object mapping column name to dtype name (a mapping
    that can be handed to `pandas.read_csv(dtype=...)`). An existing file is
    never overwritten.

    Args:
        df: Frame to optimize; it is modified in place.
        n_uint_columns: Number of leading columns treated as unsigned integers.
        dtypes_path: Path of the JSON dtypes description. Defaults to
            `DTYPES_FILE` inside `DIR`.

    Returns:
        None.
    """
    uint_columns = df.columns[:n_uint_columns]   # these columns can be uint
    float_columns = df.columns[n_uint_columns:]  # the others must be float

    for columns, downcast in ((uint_columns, 'unsigned'), (float_columns, 'float')):
        if len(columns):  # skip an empty group instead of assigning nothing
            df[columns] = df[columns].apply(pd.to_numeric, downcast=downcast)

    # Save the dtypes description only once; later calls leave the file as is.
    if dtypes_path is None:
        dtypes_path = os.path.join(DIR, DTYPES_FILE)
    if not os.path.isfile(dtypes_path):
        dtypes_dict = {name: dtype.name for name, dtype in df.dtypes.items()}
        with open(dtypes_path, 'w', encoding='utf-8') as f:
            json.dump(dtypes_dict, f)

In [8]:
# Convert every configured RData file: read it, downcast each contained
# dataset and write that dataset to DIR as a zip archive holding one CSV.
for f in R_FILES:
    r_file = os.path.join(DIR, f)
    r_data = pyreadr.read_r(r_file)
    print_memusage(f'After reading {r_file}')

    for k in r_data:
        print('Dataset', k, 'with shape', r_data[k].shape)
        optimize_dtypes(r_data[k])  # modifies the frame in place
        print_memusage('After optimizing')

        # Output goes to <DIR>/<k>.zip with a single member named <k>.csv.
        c_file = os.path.join(DIR, k)
        compression_opts = {'method': 'zip', 'archive_name': f'{k}.csv'}
        r_data[k].to_csv(f'{c_file}.zip', index=False, compression=compression_opts)

    del r_data  # release the loaded data: it needs a lot of RAM
    print_memusage('After deleting')
    print()

After reading E:\Datasets\TEP\dataverse\TEP_FaultFree_Training.RData 203.80 MB
Dataset fault_free_training with shape (250000, 55)
After optimizing 149.87 MB
After deleting 104.00 MB
After reading E:\Datasets\TEP\dataverse\TEP_Faulty_Training.RData 2162.84 MB
Dataset faulty_training with shape (5000000, 55)
After optimizing 1118.68 MB
After deleting 108.83 MB
