# Extract subsamples from HR4 Big Data catalog

In [1]:
# imports: numerics, data frames, plotting, KD-tree, garbage collection
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree
import gc

# Matplotlib defaults for every figure in this notebook:
# serif text set in Times New Roman at 16 pt, with STIX glyphs for math.
plt.rc('font', family='serif', serif='Times New Roman')
plt.rcParams.update({
    'font.size': 16,
    'mathtext.fontset': 'stix',
})

In [2]:
from pyspark import Row, SparkContext
from pyspark.sql import SQLContext, SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window as W

# Reuse the session/context the kernel already provides, or create them when
# the notebook is run on a plain Python kernel. Previously `sc` (and `spark`,
# used in the next cell) were assumed to be pre-defined, so a fresh kernel
# without the PySpark shell bootstrap failed with NameError.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
sqlsc = SQLContext(sc)

# Checkpoints are written to the cluster HDFS. For a local run, point this at
# a local directory such as "./checkpoints" instead.
sc.setCheckpointDir("hdfs://master:54310/tmp/spark/checkpoints")

In [3]:
# Switch on the session's Arrow execution setting.
# NOTE(review): this is the Spark 2.x spelling of the key; Spark 3 renamed it
# to "spark.sql.execution.arrow.pyspark.enabled" -- confirm the cluster version
# before changing it. No pandas conversion is visible in this part of the
# notebook, so the setting presumably matters for later cells.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

## Read the data 

In [4]:
# Column layout of the headerless halo CSV: one integer id followed by seven
# single-precision columns (position, velocity, halo mass).
# Every field is declared non-nullable, yet the schema printed after loading
# reports nullable = true for all of them, so the flag is not enforced on read.
# NOTE(review): IntegerType is 32-bit signed -- confirm no halo id exceeds
# 2**31 - 1.
halo_schema = T.StructType(
    [T.StructField('haloid', T.IntegerType(), False)]
    + [
        T.StructField(column, T.FloatType(), False)
        for column in ('px', 'py', 'pz', 'vx', 'vy', 'vz', 'halomass')
    ]
)

In [5]:
# Load the z=0 halo catalogue from HDFS. The file has no header row, so the
# column names and types come from halo_schema.
halodf = (
    sqlsc.read
    .schema(halo_schema)
    .option("header", False)
    .csv("hdfs://master:54310/data/cosmo/hr4/halo_z0.csv")
)

In [6]:
# Peek at the first three rows (long cell values are truncated).
halodf.show(n=3, truncate=True)

+---------+---------+---------+---------+----------+----------+---------+-------------+
|   haloid|       px|       py|       pz|        vx|        vy|       vz|     halomass|
+---------+---------+---------+---------+----------+----------+---------+-------------+
|322225520|106.23875|2820.2603|310.53067|   -593.39|  42.42728|117.49196|3.29162502E14|
|127093960|1015.0091| 3070.103|2687.5447|-361.36716| -34.88201|   980.29|  5.796312E14|
| 95586173|1150.7571| 656.3275|195.96417|  295.5281|-117.53244|203.30292| 7.4870011E14|
+---------+---------+---------+---------+----------+----------+---------+-------------+
only showing top 3 rows



In [7]:
# Print the column names, types and nullability Spark assigned to halodf.
halodf.printSchema()

root
 |-- haloid: integer (nullable = true)
 |-- px: float (nullable = true)
 |-- py: float (nullable = true)
 |-- pz: float (nullable = true)
 |-- vx: float (nullable = true)
 |-- vy: float (nullable = true)
 |-- vz: float (nullable = true)
 |-- halomass: float (nullable = true)



## Save as a Parquet file

In [8]:
import pyarrow as pa
import pyarrow.parquet as pq

In [9]:
%%time
# Persist the catalogue as snappy-compressed Parquet on HDFS, replacing any
# previous copy at the same path.
# The format is named explicitly: a bare .save() falls back to the session's
# spark.sql.sources.default, which a cluster configuration can change, while
# the target name promises Parquet.
(
    halodf.write
    .format("parquet")
    .option("compression", "snappy")
    .mode("overwrite")
    .save("hdfs://master:54310/data/cosmo/hr4/hr4-fof-halo-z0.parquet.snappy")
)

CPU times: user 6.39 ms, sys: 10.2 ms, total: 16.6 ms
Wall time: 2min 3s
