# Convert `hdf5` to `parquet`

- The FITS-table and HDF5 formats cannot be read directly from the Hadoop file system (HDFS). Hence, I need access to a *local file system* path (e.g. `file://`) to read the `hdf5` file, which I then convert to parquet files. 

- We may need a lot of memory in the driver node. So set this option, `--driver-memory 64g` (or more than 64GB).

- This input file is 39GB, so the conversion has to be done chunk by chunk. (HDF5 is painful to work with at this size.)  

In [1]:
# basic packages
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree
#import databricks.koalas as ks

# the default `sequence` option merges all partitions into a single catastrophic one : what?
#ks.set_option('compute.default_index_type', 'distributed') 

# plot settings: serif text in Times New Roman, base font size 16
plt.rc('font', family='serif', serif='Times New Roman')
plt.rcParams['font.size'] = 16

In [2]:
# PySpark packages
from pyspark import SparkContext   
#from pyspark.sql import SQLContext; SQLContext is obsolete !! using SparkSession
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session on YARN with the driver/executor sizing used for this conversion.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("spark-shell")
    .config("spark.driver.maxResultSize", "32g")
    .config("spark.driver.memory", "64g")
    .config("spark.executor.memory", "7g")
    .config("spark.executor.cores", "1")
    .config("spark.executor.instances", "50")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and store checkpoints on HDFS.
sc = spark.sparkContext
sc.setCheckpointDir("hdfs://spark00:54310/tmp/checkpoints")

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import Row
from pyspark.sql.window import Window as W

In [3]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [4]:
spark.conf.get("spark.sql.execution.arrow.pyspark.enabled")

'true'

In [5]:
spark.conf.get("spark.driver.memory")

'64g'

In [6]:
import h5py
import pyarrow as pa
import pyarrow.parquet as pq

## Read `hdf5` in local dir

In [7]:
!ls -lh /mnt/raid5/shong/ananke/

total 428G
-rw-rw-r-- 1 shong shong  53G  6월 26 10:31 ananke-m12i-lsr0.csv.gz
-rw-rw-r-- 1 shong shong  17K  7월 18 11:27 basic_m12i_stat.html
-rw-rw-r-- 1 shong shong  17K  7월 14 11:35 basic_stat.html
-rw-rw-r-- 1 shong shong  18G  7월 12 21:12 lsr-0-rslice-0.m12f-res7100-md-sliced-gcat-dr3.0.hdf5
-rw-rw-r-- 1 shong shong  17G  7월 12 20:37 lsr-0-rslice-1.m12f-res7100-md-sliced-gcat-dr3.0.hdf5
-rw-rw-r-- 1 shong shong  23G  7월 12 20:59 lsr-0-rslice-2.m12f-res7100-md-sliced-gcat-dr3.0.hdf5
-rw-rw-r-- 1 shong shong  24G  7월 12 20:52 lsr-0-rslice-3.m12f-res7100-md-sliced-gcat-dr3.0.hdf5
-rw-rw-r-- 1 shong shong  22G  7월 12 19:59 lsr-0-rslice-4.m12f-res7100-md-sliced-gcat-dr3.0.hdf5
-rw-rw-r-- 1 shong shong  26G  7월 12 17:57 lsr-0-rslice-5.m12f-res7100-md-sliced-gcat-dr3.0.hdf5
-rw-rw-r-- 1 shong shong  28G  7월 12 18:17 lsr-0-rslice-6.m12f-res7100-md-sliced-gcat-dr3.0.hdf5
-rw-rw-r-- 1 shong shong  24G  7월 12 17:33 lsr-0-rslice-7.m12f-res7100-md-sliced-gcat-dr3.0.hdf5
-rw-rw-r-- 

In [8]:
!pwd

/home/shong/work/ananke/notebook


In [9]:
h5name = '/mnt/raid5/shong/ananke/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0.hdf5'

In [10]:
#with h5py.File(h5name, "r") as f:
#    for key in f.keys():
#        dataset = f[key]
#        print(key, dataset.shape, dataset.dtype)

In [11]:
# Open the HDF5 file read-only; the handle `f` is used by the cells below.
try:
    f = h5py.File(h5name, "r")
except IOError as e:
    print("Error opening HDF5 file:", str(e))
    # Re-raise so the notebook stops here: swallowing the error would leave `f`
    # undefined and every later cell would fail with an unrelated NameError.
    raise
# Don't forget f.close() when done! 

#### Finding out what  `f.keys()` is 

In [12]:
f.keys()

<KeysViewHDF5 ['A0', 'a_g_val', 'age', 'alpha', 'b', 'b_true', 'bp_g', 'bp_g_true', 'bp_rp', 'bp_rp_true', 'calcium', 'carbon', 'dec', 'dec_error', 'dec_true', 'dmod_true', 'e_bp_min_rp_val', 'ebv', 'feh', 'flag_wd', 'g_rp', 'g_rp_true', 'helium', 'l', 'l_true', 'logg', 'lognh', 'logteff', 'lum', 'mact', 'magnesium', 'mini', 'mtip', 'neon', 'nitrogen', 'oxygen', 'parallax', 'parallax_error', 'parallax_over_error', 'parallax_true', 'parentid', 'partid', 'phot_bp_mean_mag', 'phot_bp_mean_mag_abs', 'phot_bp_mean_mag_error', 'phot_bp_mean_mag_int', 'phot_bp_mean_mag_true', 'phot_g_mean_mag', 'phot_g_mean_mag_abs', 'phot_g_mean_mag_error', 'phot_g_mean_mag_int', 'phot_g_mean_mag_true', 'phot_rp_mean_mag', 'phot_rp_mean_mag_abs', 'phot_rp_mean_mag_error', 'phot_rp_mean_mag_int', 'phot_rp_mean_mag_true', 'pmb', 'pmb_true', 'pmdec', 'pmdec_error', 'pmdec_true', 'pml', 'pml_true', 'pmra', 'pmra_error', 'pmra_true', 'px_true', 'py_true', 'pz_true', 'ra', 'ra_cosdec_error', 'ra_error', 'ra_true',

In [13]:
list(f.keys())[2]

'age'

#### Finding out how to access the `values` for each `key` field 

- `f[key]` is an h5py dataset; indexing it (e.g. `f[key][()]` or a slice) returns a numpy array, which can be converted to builtin Python types with `tolist()`

In [14]:
# Slice the dataset directly so only the first five values are read from disk;
# `f['age'][()]` would load the whole column into memory first.
f['age'][:5]

array([5.81240225, 5.81240225, 5.81240225, 5.81240225, 5.81240225])

In [15]:
numtotal = f['age'].shape[0]

In [16]:
f['age'].dtype

dtype('<f8')

In [17]:
# dtype of the numpy array returned by indexing; a one-element slice is enough
# and avoids reading the whole column just to inspect its type.
f['age'][:1].dtype

dtype('float64')

In [18]:
# `tolist()` on a small slice shows the container type without building a
# Python list of the entire column.
type(f['age'][:5].tolist())

list

In [19]:
# Element type after `tolist()`, checked on a small slice instead of the full column.
type(f['age'][:5].tolist()[0])

float

#### Extracting a `schema` from `key` and `dtype` 

In [20]:
# List every dataset name in the file together with its stored dtype.
for name, column in f.items():
    print(name, column.dtype)

A0 float64
a_g_val float64
age float64
alpha float64
b float64
b_true float32
bp_g float64
bp_g_true float64
bp_rp float64
bp_rp_true float64
calcium float64
carbon float64
dec float64
dec_error float64
dec_true float32
dmod_true float64
e_bp_min_rp_val float64
ebv float64
feh float64
flag_wd int64
g_rp float64
g_rp_true float64
helium float64
l float64
l_true float32
logg float32
lognh float64
logteff float32
lum float32
mact float32
magnesium float64
mini float64
mtip float32
neon float64
nitrogen float64
oxygen float64
parallax float64
parallax_error float64
parallax_over_error float64
parallax_true float64
parentid int64
partid int64
phot_bp_mean_mag float64
phot_bp_mean_mag_abs float32
phot_bp_mean_mag_error float64
phot_bp_mean_mag_int float64
phot_bp_mean_mag_true float64
phot_g_mean_mag float64
phot_g_mean_mag_abs float32
phot_g_mean_mag_error float64
phot_g_mean_mag_int float64
phot_g_mean_mag_true float64
phot_rp_mean_mag float64
phot_rp_mean_mag_abs float32
phot_rp_mean_mag_

In [21]:
# Print one `T.StructField(...)` source line per HDF5 dataset, ready to be pasted
# into a `T.StructType([...])` definition.
#
# Exact dtype names are looked up in a table instead of substring tests, so
# 'uint32' is no longer mistaken for 'int32', and an unmapped dtype raises
# instead of silently reusing the type chosen for the previous key.
spark_type_by_dtype = {
    'bool': 'T.BooleanType()',
    'int8': 'T.IntegerType()',
    'int16': 'T.IntegerType()',
    'int32': 'T.IntegerType()',
    'int64': 'T.LongType()',
    'uint8': 'T.IntegerType()',
    'uint16': 'T.IntegerType()',
    'uint32': 'T.LongType()',  # does not fit into a signed 32-bit IntegerType
    'float32': 'T.FloatType()',
    'float64': 'T.DoubleType()',
}
for key in f.keys():
    dtype = f[key].dtype
    if dtype.kind in 'SUO':
        # numpy bytes / unicode / object dtypes; presumably strings when they
        # come from h5py -- verify before relying on this for a new file.
        nowdtype = 'T.StringType()'
    elif dtype.name in spark_type_by_dtype:
        nowdtype = spark_type_by_dtype[dtype.name]
    else:
        raise TypeError("No Spark type mapping for dataset '" + key + "' with dtype " + str(dtype))
    print("T.StructField(\'"+key+"\', "+nowdtype+", True), \\")

T.StructField('A0', T.DoubleType(), True), \
T.StructField('a_g_val', T.DoubleType(), True), \
T.StructField('age', T.DoubleType(), True), \
T.StructField('alpha', T.DoubleType(), True), \
T.StructField('b', T.DoubleType(), True), \
T.StructField('b_true', T.FloatType(), True), \
T.StructField('bp_g', T.DoubleType(), True), \
T.StructField('bp_g_true', T.DoubleType(), True), \
T.StructField('bp_rp', T.DoubleType(), True), \
T.StructField('bp_rp_true', T.DoubleType(), True), \
T.StructField('calcium', T.DoubleType(), True), \
T.StructField('carbon', T.DoubleType(), True), \
T.StructField('dec', T.DoubleType(), True), \
T.StructField('dec_error', T.DoubleType(), True), \
T.StructField('dec_true', T.FloatType(), True), \
T.StructField('dmod_true', T.DoubleType(), True), \
T.StructField('e_bp_min_rp_val', T.DoubleType(), True), \
T.StructField('ebv', T.DoubleType(), True), \
T.StructField('feh', T.DoubleType(), True), \
T.StructField('flag_wd', T.LongType(), True), \
T.StructField('g_rp', 

In [22]:
# Spark schema for the converted table: one nullable field per HDF5 dataset,
# in the same order as the generated `T.StructField(...)` lines above.
input_schema = T.StructType([
    T.StructField('A0', T.DoubleType(), True),
    T.StructField('a_g_val', T.DoubleType(), True),
    T.StructField('age', T.DoubleType(), True),
    T.StructField('alpha', T.DoubleType(), True),
    T.StructField('b', T.DoubleType(), True),
    T.StructField('b_true', T.FloatType(), True),
    T.StructField('bp_g', T.DoubleType(), True),
    T.StructField('bp_g_true', T.DoubleType(), True),
    T.StructField('bp_rp', T.DoubleType(), True),
    T.StructField('bp_rp_true', T.DoubleType(), True),
    T.StructField('calcium', T.DoubleType(), True),
    T.StructField('carbon', T.DoubleType(), True),
    T.StructField('dec', T.DoubleType(), True),
    T.StructField('dec_error', T.DoubleType(), True),
    T.StructField('dec_true', T.FloatType(), True),
    T.StructField('dmod_true', T.DoubleType(), True),
    T.StructField('e_bp_min_rp_val', T.DoubleType(), True),
    T.StructField('ebv', T.DoubleType(), True),
    T.StructField('feh', T.DoubleType(), True),
    T.StructField('flag_wd', T.LongType(), True),
    T.StructField('g_rp', T.DoubleType(), True),
    T.StructField('g_rp_true', T.DoubleType(), True),
    T.StructField('helium', T.DoubleType(), True),
    T.StructField('l', T.DoubleType(), True),
    T.StructField('l_true', T.FloatType(), True),
    T.StructField('logg', T.FloatType(), True),
    T.StructField('lognh', T.DoubleType(), True),
    T.StructField('logteff', T.FloatType(), True),
    T.StructField('lum', T.FloatType(), True),
    T.StructField('mact', T.FloatType(), True),
    T.StructField('magnesium', T.DoubleType(), True),
    T.StructField('mini', T.DoubleType(), True),
    T.StructField('mtip', T.FloatType(), True),
    T.StructField('neon', T.DoubleType(), True),
    T.StructField('nitrogen', T.DoubleType(), True),
    T.StructField('oxygen', T.DoubleType(), True),
    T.StructField('parallax', T.DoubleType(), True),
    T.StructField('parallax_error', T.DoubleType(), True),
    T.StructField('parallax_over_error', T.DoubleType(), True),
    T.StructField('parallax_true', T.DoubleType(), True),
    T.StructField('parentid', T.LongType(), True),
    T.StructField('partid', T.LongType(), True),
    T.StructField('phot_bp_mean_mag', T.DoubleType(), True),
    T.StructField('phot_bp_mean_mag_abs', T.FloatType(), True),
    T.StructField('phot_bp_mean_mag_error', T.DoubleType(), True),
    T.StructField('phot_bp_mean_mag_int', T.DoubleType(), True),
    T.StructField('phot_bp_mean_mag_true', T.DoubleType(), True),
    T.StructField('phot_g_mean_mag', T.DoubleType(), True),
    T.StructField('phot_g_mean_mag_abs', T.FloatType(), True),
    T.StructField('phot_g_mean_mag_error', T.DoubleType(), True),
    T.StructField('phot_g_mean_mag_int', T.DoubleType(), True),
    T.StructField('phot_g_mean_mag_true', T.DoubleType(), True),
    T.StructField('phot_rp_mean_mag', T.DoubleType(), True),
    T.StructField('phot_rp_mean_mag_abs', T.FloatType(), True),
    T.StructField('phot_rp_mean_mag_error', T.DoubleType(), True),
    T.StructField('phot_rp_mean_mag_int', T.DoubleType(), True),
    T.StructField('phot_rp_mean_mag_true', T.DoubleType(), True),
    T.StructField('pmb', T.DoubleType(), True),
    T.StructField('pmb_true', T.DoubleType(), True),
    T.StructField('pmdec', T.DoubleType(), True),
    T.StructField('pmdec_error', T.DoubleType(), True),
    T.StructField('pmdec_true', T.DoubleType(), True),
    T.StructField('pml', T.DoubleType(), True),
    T.StructField('pml_true', T.DoubleType(), True),
    T.StructField('pmra', T.DoubleType(), True),
    T.StructField('pmra_error', T.DoubleType(), True),
    T.StructField('pmra_true', T.DoubleType(), True),
    T.StructField('px_true', T.DoubleType(), True),
    T.StructField('py_true', T.DoubleType(), True),
    T.StructField('pz_true', T.DoubleType(), True),
    T.StructField('ra', T.DoubleType(), True),
    T.StructField('ra_cosdec_error', T.DoubleType(), True),
    T.StructField('ra_error', T.DoubleType(), True),
    T.StructField('ra_true', T.FloatType(), True),
    T.StructField('radial_velocity', T.DoubleType(), True),
    T.StructField('radial_velocity_error', T.DoubleType(), True),
    T.StructField('radial_velocity_error_corr_factor', T.DoubleType(), True),
    T.StructField('radial_velocity_true', T.DoubleType(), True),
    T.StructField('silicon', T.DoubleType(), True),
    T.StructField('sulphur', T.DoubleType(), True),
    T.StructField('vx_true', T.DoubleType(), True),
    T.StructField('vy_true', T.DoubleType(), True),
    T.StructField('vz_true', T.DoubleType(), True),
])

#### Merging `keys` and `values` as a dictionary for Spark DataFrame

In [23]:
keylist = list(f.keys())

In [24]:
keyzero = keylist[0]

In [25]:
numtotal = f[keyzero].shape[0]

In [26]:
numtotal

97458877

In [27]:
# Let's try this size: number of rows handled per chunk
chunksize = 1000000
# Chunk counter and row-range bounds, zeroed here; the next cell recomputes them from `chunksize`.
ichunk = 0
istart=0
iend=0

In [28]:
# initial indices: chunk 0 covers rows istart (inclusive) to iend (exclusive)
ichunk = 0
istart = ichunk * chunksize
iend = (ichunk + 1) * chunksize

In [29]:
chunk_address = []

In [30]:
# Split the row range [0, numtotal) into consecutive chunks of at most `chunksize` rows.
# Each entry is [ichunk, istart, iend], with iend exclusive.
# The list and the counters are rebuilt from scratch here, so re-running this cell always
# gives the same result instead of depending on values left behind by other cells.
chunk_address = []
for ichunk, istart in enumerate(range(0, numtotal, chunksize)):
    iend = min(istart + chunksize, numtotal)  # the last chunk is clipped to the dataset length

    print("ichunk="+str(ichunk)+", istart="+str(istart)+", iend="+str(iend)+" : Total="+str(numtotal))
    chunk_address.append([ichunk,istart,iend])

ichunk=0, istart=0, iend=1000000 : Total=97458877
ichunk=1, istart=1000000, iend=2000000 : Total=97458877
ichunk=2, istart=2000000, iend=3000000 : Total=97458877
ichunk=3, istart=3000000, iend=4000000 : Total=97458877
ichunk=4, istart=4000000, iend=5000000 : Total=97458877
ichunk=5, istart=5000000, iend=6000000 : Total=97458877
ichunk=6, istart=6000000, iend=7000000 : Total=97458877
ichunk=7, istart=7000000, iend=8000000 : Total=97458877
ichunk=8, istart=8000000, iend=9000000 : Total=97458877
ichunk=9, istart=9000000, iend=10000000 : Total=97458877
ichunk=10, istart=10000000, iend=11000000 : Total=97458877
ichunk=11, istart=11000000, iend=12000000 : Total=97458877
ichunk=12, istart=12000000, iend=13000000 : Total=97458877
ichunk=13, istart=13000000, iend=14000000 : Total=97458877
ichunk=14, istart=14000000, iend=15000000 : Total=97458877
ichunk=15, istart=15000000, iend=16000000 : Total=97458877
ichunk=16, istart=16000000, iend=17000000 : Total=97458877
ichunk=17, istart=17000000, iend

In [31]:
# Pieces of each output path: HDFS target directory, base name (same as the input
# HDF5 file name without its `.hdf5` extension), and the suffix that goes after
# the per-chunk "-part<ichunk>" tag.
outnameheader = 'hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/'
outnamebody = 'lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0'
outnametail = '.parquet.snappy'

## Save parquet files chunk by chunk

In [32]:
%%time
# Convert the HDF5 file to parquet one chunk of rows at a time.
# Columns are taken in schema order so every value lands in the field it was declared for;
# a schema field missing from the file fails loudly with a KeyError.
column_names = input_schema.fieldNames()
for ichunk, istart, iend in chunk_address:
    print("ichunk="+str(ichunk)+", istart="+str(istart)+", iend="+str(iend)+" : Total="+str(numtotal))
    outname = outnameheader+outnamebody+"-part"+str(ichunk)+outnametail
    print(outname)

    # Slice the dataset itself: `f[key][istart:iend]` reads only this chunk from disk, whereas
    # `f[key][()][istart:iend]` loaded the entire column for every key of every chunk.
    chunk_data_list = [f[key][istart:iend].tolist() for key in column_names]
    # Transpose the per-column lists into per-row tuples for createDataFrame.
    zipped_chunk_data_list = list(zip(*chunk_data_list))
    # Write parquet explicitly instead of relying on the session's default data source.
    spark.createDataFrame(zipped_chunk_data_list,schema=input_schema) \
    .write.option("compression", "snappy") \
    .mode("overwrite") \
    .parquet(outname)
    

ichunk=0, istart=0, iend=1000000 : Total=97458877
hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part0.parquet.snappy
ichunk=1, istart=1000000, iend=2000000 : Total=97458877
hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part1.parquet.snappy
ichunk=2, istart=2000000, iend=3000000 : Total=97458877
hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part2.parquet.snappy
ichunk=3, istart=3000000, iend=4000000 : Total=97458877
hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part3.parquet.snappy
ichunk=4, istart=4000000, iend=5000000 : Total=97458877
hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part4.parquet.snappy
ichunk=5, istart=5000

ichunk=41, istart=41000000, iend=42000000 : Total=97458877
hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part41.parquet.snappy
ichunk=42, istart=42000000, iend=43000000 : Total=97458877
hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part42.parquet.snappy
ichunk=43, istart=43000000, iend=44000000 : Total=97458877
hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part43.parquet.snappy
ichunk=44, istart=44000000, iend=45000000 : Total=97458877
hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part44.parquet.snappy
ichunk=45, istart=45000000, iend=46000000 : Total=97458877
hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part45.parquet.sn

ichunk=82, istart=82000000, iend=83000000 : Total=97458877
hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part82.parquet.snappy
ichunk=83, istart=83000000, iend=84000000 : Total=97458877
hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part83.parquet.snappy
ichunk=84, istart=84000000, iend=85000000 : Total=97458877
hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part84.parquet.snappy
ichunk=85, istart=85000000, iend=86000000 : Total=97458877
hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part85.parquet.snappy
ichunk=86, istart=86000000, iend=87000000 : Total=97458877
hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part86.parquet.sn

In [33]:
# Don't forget f.close() when done! 
f.close()

## Read one saved parquet file back and check it

In [34]:
onepqname = 'hdfs://spark00:54310/common/data/external-catalogs/parquet/ananke/m12f/lsr-0-rslice-8.m12f-res7100-md-sliced-gcat-dr3.0-part2.parquet.snappy'

In [35]:
%%time
tmpdf = spark.read.parquet(onepqname)

CPU times: user 1.01 ms, sys: 119 µs, total: 1.12 ms
Wall time: 223 ms


In [36]:
tmpdf.printSchema()

root
 |-- A0: double (nullable = true)
 |-- a_g_val: double (nullable = true)
 |-- age: double (nullable = true)
 |-- alpha: double (nullable = true)
 |-- b: double (nullable = true)
 |-- b_true: float (nullable = true)
 |-- bp_g: double (nullable = true)
 |-- bp_g_true: double (nullable = true)
 |-- bp_rp: double (nullable = true)
 |-- bp_rp_true: double (nullable = true)
 |-- calcium: double (nullable = true)
 |-- carbon: double (nullable = true)
 |-- dec: double (nullable = true)
 |-- dec_error: double (nullable = true)
 |-- dec_true: float (nullable = true)
 |-- dmod_true: double (nullable = true)
 |-- e_bp_min_rp_val: double (nullable = true)
 |-- ebv: double (nullable = true)
 |-- feh: double (nullable = true)
 |-- flag_wd: long (nullable = true)
 |-- g_rp: double (nullable = true)
 |-- g_rp_true: double (nullable = true)
 |-- helium: double (nullable = true)
 |-- l: double (nullable = true)
 |-- l_true: float (nullable = true)
 |-- logg: float (nullable = true)
 |-- lognh: doubl

In [37]:
%%time
# Summary statistics from Spark, collected into pandas and flipped so that
# each row is one table column and each column is one statistic.
statdf = (
    tmpdf.describe()
    .toPandas()
    .set_index('summary')
    .T
)

CPU times: user 13.1 ms, sys: 3.93 ms, total: 17 ms
Wall time: 10.6 s


In [38]:
statdf

summary,count,mean,stddev,min,max
A0,1000000,1.6120024995573332,1.5257996236265934,0.03321271714868559,19.995762934572404
a_g_val,1000000,1.4055111749460893,1.222482058237415,0.02933373611911705,12.284868573509556
age,1000000,7.424779756129265,0.4309452702239612,5.280919075012207,8.17567253112793
alpha,1000000,0.23922295465497673,0.020549065409474446,0.07251594960689545,0.3348192572593689
b,1000000,1.2661702887466195,4.3001033591353135,-13.970094004238529,20.229873171020532
...,...,...,...,...,...
silicon,1000000,0.19540384847520686,0.08835976018344156,-0.3461321294307709,0.6102657914161682
sulphur,1000000,0.14600577944209567,0.08908203476895939,-0.3927712142467499,0.5671384930610657
vx_true,1000000,92.11674348067359,127.20999846666125,-390.7356199422827,445.8651903107485
vy_true,1000000,-81.89664398192757,124.3120325722958,-621.7974524430161,125.11558846504727


In [39]:
%%time
# Render the DataFrame as an HTML table
outhtml = statdf.to_html()


htmlname = '/mnt/raid5/shong/ananke/basic_m12f_part_stat.html'

# Save the HTML table to a file.
# The handle is deliberately not named `f`: that name holds the HDF5 file handle in the
# earlier cells, and rebinding it here would break them when cells are re-run.
with open(htmlname, "w") as html_file:
    html_file.write(outhtml)

CPU times: user 5.92 ms, sys: 0 ns, total: 5.92 ms
Wall time: 5.91 ms
