# EDA Gaia DR3 Catalogs and Save as Parquets

In [46]:
# Core imports and notebook-wide display/plot settings
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree
import gc

# Let pandas display up to 200 rows before truncating
pd.options.display.max_rows = 200

# plot settings: serif (Times New Roman) text at 16 pt, STIX fonts for math text
plt.rc('font', family='serif') 
plt.rc('font', serif='Times New Roman') 
plt.rcParams.update({'font.size': 16})
plt.rcParams['mathtext.fontset'] = 'stix'

In [2]:
from pyspark import SparkContext   
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession


# Obtain the Spark session explicitly. getOrCreate() hands back the session the
# kernel already provides when there is one, and builds a new one otherwise, so
# this cell no longer depends on a pre-injected `spark` variable.
spark = SparkSession \
        .builder \
        .appName('gaia-convert-files') \
        .getOrCreate()

# Context objects derived from the session; `sqlsc` is used by the CSV reads below.
sc = spark.sparkContext
sqlsc = SQLContext(sc)

# Checkpoint directory on the cluster's HDFS.
# For a local run, point this at a local directory instead, e.g.
#   sc.setCheckpointDir("./checkpoints")
sc.setCheckpointDir("hdfs://spark00:54310/tmp/checkpoints")

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import Row
from pyspark.sql.window import Window as W



In [3]:
# Turn on the Arrow setting for PySpark; it applies to the pandas <-> Spark
# conversions used below (createDataFrame(pandas_df) and toPandas()).
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

## Read the data 

### Extract `schema` info from the header file

In [4]:
# Print the header file from HDFS through the shell; IPython captures the
# command's output as a list with one string per line.
# As the following cells show, the last two entries are the CSV column-name row
# and a single sample data row; the entries before them are '#'-prefixed ECSV metadata.
headerinfo = !hadoop fs -cat /common/data/catalog/external-catalogs/External_Catalogs/Gaia_DR3/GaiaSource.header

In [5]:
len(headerinfo)

1002

In [6]:
headerinfo[0]

'# %ECSV 1.0'

In [7]:
headerinfo[-1]

'1636148068921376768,"Gaia DR3 4295806720",4295806720,545300884,2016.0,44.99615537864534,0.10161827,0.005615226341865997,0.10133387,0.3543305595550248,0.12266381,2.8886316,12.616485,11.93835156938502,0.13794228,-4.0806193394130865,0.13316983,0.12293493,0.13202813,-0.08891027,0.022551458,-0.3653421,-0.03690377,-0.24483804,0.06301233,0.13570854,0.3343367,184,0,183,1,2.6720488,242.20697,0.3806193,2.0765078,31,"False",1.5089388,null,null,null,null,null,null,null,22,16,0.21780181,22,9,0,0.01759732,90.23934,0,0,1.1429516,0.30795118,0.19765861,0.43010107,0.8420776,-87.75478,-30.69455,-46.20191,30.174356,"False",182,1653.39471645947,2.0757642,796.5234,17.641426,18,800.4295459066461,12.601409,63.51905,18.080235,20,1187.588003883822,15.823832,75.0506,17.061232,1.2023853,0,0,0,2,0,1.0190029,0.43880844,0.5801945,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,"NOT_AVAILABLE",176.95107618038946,-48.901520870941965,42.53372584780011,-16.3

In [8]:
headerinfo[-2]

'solution_id,designation,source_id,random_index,ref_epoch,ra,ra_error,dec,dec_error,parallax,parallax_error,parallax_over_error,pm,pmra,pmra_error,pmdec,pmdec_error,ra_dec_corr,ra_parallax_corr,ra_pmra_corr,ra_pmdec_corr,dec_parallax_corr,dec_pmra_corr,dec_pmdec_corr,parallax_pmra_corr,parallax_pmdec_corr,pmra_pmdec_corr,astrometric_n_obs_al,astrometric_n_obs_ac,astrometric_n_good_obs_al,astrometric_n_bad_obs_al,astrometric_gof_al,astrometric_chi2_al,astrometric_excess_noise,astrometric_excess_noise_sig,astrometric_params_solved,astrometric_primary_flag,nu_eff_used_in_astrometry,pseudocolour,pseudocolour_error,ra_pseudocolour_corr,dec_pseudocolour_corr,parallax_pseudocolour_corr,pmra_pseudocolour_corr,pmdec_pseudocolour_corr,astrometric_matched_transits,visibility_periods_used,astrometric_sigma5d_max,matched_transits,new_matched_transits,matched_transits_removed,ipd_gof_harmonic_amplitude,ipd_gof_harmonic_phase,ipd_frac_multi_peak,ipd_frac_odd_win,ruwe,scan_direction_strength_k1,scan_d

In [9]:
# Three independent, initially empty accumulators for the parsed header fields:
# column names, column datatypes and column descriptions.
namelist, datatypelist, descriptionlist = [], [], []

In [10]:
# Collect per-column metadata (name, datatype, description) from the header lines.
# The three lists are rebuilt from scratch here, so re-running this cell yields the
# same result instead of appending a second copy of every entry.
namelist, datatypelist, descriptionlist = [], [], []
for eachline in headerinfo:
    # Text after the first ':' on the line, with surrounding whitespace removed
    # (an empty string when the line has no ':' or nothing after it).
    value = eachline.partition(':')[2].strip()
    if "name:" in eachline:
        namelist.append(value)
    # A "datatype:" line with nothing after the colon is skipped rather than
    # recorded as an empty type.
    if "datatype:" in eachline and value:
        datatypelist.append(value)
    if "description:" in eachline:
        descriptionlist.append(value)

In [11]:
len(namelist),len(datatypelist),len(descriptionlist) 

(152, 152, 152)

In [12]:
namelist[0],datatypelist[0],descriptionlist[0]

('solution_id', 'int64', 'Solution Identifier')

In [13]:
# Preview the first five parsed entries as "name datatype description" lines.
preview = zip(namelist[:5], datatypelist[:5], descriptionlist[:5])
for name, dtype, description in preview:
    print(f"{name} {dtype} {description}")

solution_id int64 Solution Identifier
designation string Unique source designation (unique across all Data Releases)
source_id int64 Unique source identifier (unique within a particular Data Release)
random_index int64 Random index for use when selecting subsets
ref_epoch float64 Reference epoch


In [14]:
# One row per catalog column. `icol` is the 1-based position of the column and is
# derived from the number of parsed names instead of a hard-coded column count,
# so the cell keeps working if the header gains or loses columns.
headerdf = pd.DataFrame({'icol': list(range(1, len(namelist) + 1)), 'name': namelist,
                         'datatype': datatypelist, 'description': descriptionlist})

In [15]:
headerdf.head()

Unnamed: 0,icol,name,datatype,description
0,1,solution_id,int64,Solution Identifier
1,2,designation,string,Unique source designation (unique across all D...
2,3,source_id,int64,Unique source identifier (unique within a part...
3,4,random_index,int64,Random index for use when selecting subsets
4,5,ref_epoch,float64,Reference epoch


In [16]:
headerdf.dtypes

icol            int64
name           object
datatype       object
description    object
dtype: object

In [17]:
!pwd

/home/shong/work/gaia/notebook


In [18]:
# Convert the pandas header table into a Spark DataFrame so it can be written to HDFS as parquet.
sdf = spark.createDataFrame(headerdf)

  [(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]


In [19]:
sdf.show()

+----+-------------------+--------+--------------------+
|icol|               name|datatype|         description|
+----+-------------------+--------+--------------------+
|   1|        solution_id|   int64| Solution Identifier|
|   2|        designation|  string|Unique source des...|
|   3|          source_id|   int64|Unique source ide...|
|   4|       random_index|   int64|Random index for ...|
|   5|          ref_epoch| float64|     Reference epoch|
|   6|                 ra| float64|     Right ascension|
|   7|           ra_error| float32|Standard error of...|
|   8|                dec| float64|         Declination|
|   9|          dec_error| float32|Standard error of...|
|  10|           parallax| float64|            Parallax|
|  11|     parallax_error| float32|Standard error of...|
|  12|parallax_over_error| float32|Parallax divided ...|
|  13|                 pm| float32| Total proper motion|
|  14|               pmra| float64|Proper motion in ...|
|  15|         pmra_error| floa

In [20]:
sdf.printSchema()

root
 |-- icol: long (nullable = true)
 |-- name: string (nullable = true)
 |-- datatype: string (nullable = true)
 |-- description: string (nullable = true)



#### Save this as `gaia-simple-header.parquet` 

In [21]:
import pyarrow as pa
import pyarrow.parquet as pq

In [22]:
filepath = \
"hdfs://spark00:54310/common/data/catalog/external-catalogs/External_Catalogs/Gaia_DR3/gaia-simple-header.parquet"

In [23]:
%%time
sdf.write.format('parquet').mode("overwrite").save(filepath)

CPU times: user 518 µs, sys: 701 µs, total: 1.22 ms
Wall time: 2.74 s


In [24]:
sdf.show()

+----+-------------------+--------+--------------------+
|icol|               name|datatype|         description|
+----+-------------------+--------+--------------------+
|   1|        solution_id|   int64| Solution Identifier|
|   2|        designation|  string|Unique source des...|
|   3|          source_id|   int64|Unique source ide...|
|   4|       random_index|   int64|Random index for ...|
|   5|          ref_epoch| float64|     Reference epoch|
|   6|                 ra| float64|     Right ascension|
|   7|           ra_error| float32|Standard error of...|
|   8|                dec| float64|         Declination|
|   9|          dec_error| float32|Standard error of...|
|  10|           parallax| float64|            Parallax|
|  11|     parallax_error| float32|Standard error of...|
|  12|parallax_over_error| float32|Parallax divided ...|
|  13|                 pm| float32| Total proper motion|
|  14|               pmra| float64|Proper motion in ...|
|  15|         pmra_error| floa

### Read the simple header parquet, and define a new schema

In [25]:
filepath = \
"hdfs://spark00:54310/common/data/catalog/external-catalogs/External_Catalogs/Gaia_DR3/gaia-simple-header.parquet"

In [26]:
%%time
tmpdf = spark.read.parquet(filepath)

CPU times: user 1.14 ms, sys: 0 ns, total: 1.14 ms
Wall time: 194 ms


In [27]:
tmpdf.orderBy('icol').show()

+----+-------------------+--------+--------------------+
|icol|               name|datatype|         description|
+----+-------------------+--------+--------------------+
|   1|        solution_id|   int64| Solution Identifier|
|   2|        designation|  string|Unique source des...|
|   3|          source_id|   int64|Unique source ide...|
|   4|       random_index|   int64|Random index for ...|
|   5|          ref_epoch| float64|     Reference epoch|
|   6|                 ra| float64|     Right ascension|
|   7|           ra_error| float32|Standard error of...|
|   8|                dec| float64|         Declination|
|   9|          dec_error| float32|Standard error of...|
|  10|           parallax| float64|            Parallax|
|  11|     parallax_error| float32|Standard error of...|
|  12|parallax_over_error| float32|Parallax divided ...|
|  13|                 pm| float32| Total proper motion|
|  14|               pmra| float64|Proper motion in ...|
|  15|         pmra_error| floa

In [28]:
ghdf = tmpdf.orderBy('icol').toPandas()

In [29]:
ghdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   icol         152 non-null    int64 
 1   name         152 non-null    object
 2   datatype     152 non-null    object
 3   description  152 non-null    object
dtypes: int64(1), object(3)
memory usage: 4.9+ KB


In [30]:
ghdf.datatype.unique()

array(['int64', 'string', 'float64', 'float32', 'int16', 'int8', 'bool'],
      dtype=object)

In [31]:
## Generating Schema Text
# Print one `T.StructField(...)` source line per catalog column, ready to paste
# into a `T.StructType([...])` literal.
#
# Datatypes are resolved with an exact lookup. An unrecognised datatype raises
# instead of silently inheriting the Spark type chosen for the previous column.
SPARK_TYPE_TEXT = {
    'int8': 'T.IntegerType()',
    'int16': 'T.IntegerType()',
    'int32': 'T.IntegerType()',
    'int64': 'T.LongType()',
    'string': 'T.StringType()',
    'bool': 'T.BooleanType()',
    'float32': 'T.FloatType()',
    'float64': 'T.DoubleType()',
}

for name, datatype in zip(ghdf.name.values, ghdf.datatype.values):
    if datatype not in SPARK_TYPE_TEXT:
        raise ValueError("No Spark type mapping for datatype '" + str(datatype)
                         + "' (column '" + str(name) + "')")
    nowdtype = SPARK_TYPE_TEXT[datatype]
    # Every field is declared nullable; the trailing backslash is part of the emitted text.
    print("T.StructField('" + name + "', " + nowdtype + ", True), \\")

T.StructField('solution_id', T.LongType(), True), \
T.StructField('designation', T.StringType(), True), \
T.StructField('source_id', T.LongType(), True), \
T.StructField('random_index', T.LongType(), True), \
T.StructField('ref_epoch', T.DoubleType(), True), \
T.StructField('ra', T.DoubleType(), True), \
T.StructField('ra_error', T.FloatType(), True), \
T.StructField('dec', T.DoubleType(), True), \
T.StructField('dec_error', T.FloatType(), True), \
T.StructField('parallax', T.DoubleType(), True), \
T.StructField('parallax_error', T.FloatType(), True), \
T.StructField('parallax_over_error', T.FloatType(), True), \
T.StructField('pm', T.FloatType(), True), \
T.StructField('pmra', T.DoubleType(), True), \
T.StructField('pmra_error', T.FloatType(), True), \
T.StructField('pmdec', T.DoubleType(), True), \
T.StructField('pmdec_error', T.FloatType(), True), \
T.StructField('ra_dec_corr', T.FloatType(), True), \
T.StructField('ra_parallax_corr', T.FloatType(), True), \
T.StructField('ra_pmra_c

In [32]:
# Explicit Spark schema for the GaiaSource CSV: one nullable field per catalog
# column, listed in the same order as the column-name row of the file.
# Integer columns narrower than 64 bits use IntegerType, int64 uses LongType,
# float32 uses FloatType and float64 uses DoubleType.
gaia_schema = T.StructType([
    T.StructField('solution_id', T.LongType(), True),
    T.StructField('designation', T.StringType(), True),
    T.StructField('source_id', T.LongType(), True),
    T.StructField('random_index', T.LongType(), True),
    T.StructField('ref_epoch', T.DoubleType(), True),
    T.StructField('ra', T.DoubleType(), True),
    T.StructField('ra_error', T.FloatType(), True),
    T.StructField('dec', T.DoubleType(), True),
    T.StructField('dec_error', T.FloatType(), True),
    T.StructField('parallax', T.DoubleType(), True),
    T.StructField('parallax_error', T.FloatType(), True),
    T.StructField('parallax_over_error', T.FloatType(), True),
    T.StructField('pm', T.FloatType(), True),
    T.StructField('pmra', T.DoubleType(), True),
    T.StructField('pmra_error', T.FloatType(), True),
    T.StructField('pmdec', T.DoubleType(), True),
    T.StructField('pmdec_error', T.FloatType(), True),
    T.StructField('ra_dec_corr', T.FloatType(), True),
    T.StructField('ra_parallax_corr', T.FloatType(), True),
    T.StructField('ra_pmra_corr', T.FloatType(), True),
    T.StructField('ra_pmdec_corr', T.FloatType(), True),
    T.StructField('dec_parallax_corr', T.FloatType(), True),
    T.StructField('dec_pmra_corr', T.FloatType(), True),
    T.StructField('dec_pmdec_corr', T.FloatType(), True),
    T.StructField('parallax_pmra_corr', T.FloatType(), True),
    T.StructField('parallax_pmdec_corr', T.FloatType(), True),
    T.StructField('pmra_pmdec_corr', T.FloatType(), True),
    T.StructField('astrometric_n_obs_al', T.IntegerType(), True),
    T.StructField('astrometric_n_obs_ac', T.IntegerType(), True),
    T.StructField('astrometric_n_good_obs_al', T.IntegerType(), True),
    T.StructField('astrometric_n_bad_obs_al', T.IntegerType(), True),
    T.StructField('astrometric_gof_al', T.FloatType(), True),
    T.StructField('astrometric_chi2_al', T.FloatType(), True),
    T.StructField('astrometric_excess_noise', T.FloatType(), True),
    T.StructField('astrometric_excess_noise_sig', T.FloatType(), True),
    T.StructField('astrometric_params_solved', T.IntegerType(), True),
    T.StructField('astrometric_primary_flag', T.BooleanType(), True),
    T.StructField('nu_eff_used_in_astrometry', T.FloatType(), True),
    T.StructField('pseudocolour', T.FloatType(), True),
    T.StructField('pseudocolour_error', T.FloatType(), True),
    T.StructField('ra_pseudocolour_corr', T.FloatType(), True),
    T.StructField('dec_pseudocolour_corr', T.FloatType(), True),
    T.StructField('parallax_pseudocolour_corr', T.FloatType(), True),
    T.StructField('pmra_pseudocolour_corr', T.FloatType(), True),
    T.StructField('pmdec_pseudocolour_corr', T.FloatType(), True),
    T.StructField('astrometric_matched_transits', T.IntegerType(), True),
    T.StructField('visibility_periods_used', T.IntegerType(), True),
    T.StructField('astrometric_sigma5d_max', T.FloatType(), True),
    T.StructField('matched_transits', T.IntegerType(), True),
    T.StructField('new_matched_transits', T.IntegerType(), True),
    T.StructField('matched_transits_removed', T.IntegerType(), True),
    T.StructField('ipd_gof_harmonic_amplitude', T.FloatType(), True),
    T.StructField('ipd_gof_harmonic_phase', T.FloatType(), True),
    T.StructField('ipd_frac_multi_peak', T.IntegerType(), True),
    T.StructField('ipd_frac_odd_win', T.IntegerType(), True),
    T.StructField('ruwe', T.FloatType(), True),
    T.StructField('scan_direction_strength_k1', T.FloatType(), True),
    T.StructField('scan_direction_strength_k2', T.FloatType(), True),
    T.StructField('scan_direction_strength_k3', T.FloatType(), True),
    T.StructField('scan_direction_strength_k4', T.FloatType(), True),
    T.StructField('scan_direction_mean_k1', T.FloatType(), True),
    T.StructField('scan_direction_mean_k2', T.FloatType(), True),
    T.StructField('scan_direction_mean_k3', T.FloatType(), True),
    T.StructField('scan_direction_mean_k4', T.FloatType(), True),
    T.StructField('duplicated_source', T.BooleanType(), True),
    T.StructField('phot_g_n_obs', T.IntegerType(), True),
    T.StructField('phot_g_mean_flux', T.DoubleType(), True),
    T.StructField('phot_g_mean_flux_error', T.FloatType(), True),
    T.StructField('phot_g_mean_flux_over_error', T.FloatType(), True),
    T.StructField('phot_g_mean_mag', T.FloatType(), True),
    T.StructField('phot_bp_n_obs', T.IntegerType(), True),
    T.StructField('phot_bp_mean_flux', T.DoubleType(), True),
    T.StructField('phot_bp_mean_flux_error', T.FloatType(), True),
    T.StructField('phot_bp_mean_flux_over_error', T.FloatType(), True),
    T.StructField('phot_bp_mean_mag', T.FloatType(), True),
    T.StructField('phot_rp_n_obs', T.IntegerType(), True),
    T.StructField('phot_rp_mean_flux', T.DoubleType(), True),
    T.StructField('phot_rp_mean_flux_error', T.FloatType(), True),
    T.StructField('phot_rp_mean_flux_over_error', T.FloatType(), True),
    T.StructField('phot_rp_mean_mag', T.FloatType(), True),
    T.StructField('phot_bp_rp_excess_factor', T.FloatType(), True),
    T.StructField('phot_bp_n_contaminated_transits', T.IntegerType(), True),
    T.StructField('phot_bp_n_blended_transits', T.IntegerType(), True),
    T.StructField('phot_rp_n_contaminated_transits', T.IntegerType(), True),
    T.StructField('phot_rp_n_blended_transits', T.IntegerType(), True),
    T.StructField('phot_proc_mode', T.IntegerType(), True),
    T.StructField('bp_rp', T.FloatType(), True),
    T.StructField('bp_g', T.FloatType(), True),
    T.StructField('g_rp', T.FloatType(), True),
    T.StructField('radial_velocity', T.FloatType(), True),
    T.StructField('radial_velocity_error', T.FloatType(), True),
    T.StructField('rv_method_used', T.IntegerType(), True),
    T.StructField('rv_nb_transits', T.IntegerType(), True),
    T.StructField('rv_nb_deblended_transits', T.IntegerType(), True),
    T.StructField('rv_visibility_periods_used', T.IntegerType(), True),
    T.StructField('rv_expected_sig_to_noise', T.FloatType(), True),
    T.StructField('rv_renormalised_gof', T.FloatType(), True),
    T.StructField('rv_chisq_pvalue', T.FloatType(), True),
    T.StructField('rv_time_duration', T.FloatType(), True),
    T.StructField('rv_amplitude_robust', T.FloatType(), True),
    T.StructField('rv_template_teff', T.FloatType(), True),
    T.StructField('rv_template_logg', T.FloatType(), True),
    T.StructField('rv_template_fe_h', T.FloatType(), True),
    T.StructField('rv_atm_param_origin', T.IntegerType(), True),
    T.StructField('vbroad', T.FloatType(), True),
    T.StructField('vbroad_error', T.FloatType(), True),
    T.StructField('vbroad_nb_transits', T.IntegerType(), True),
    T.StructField('grvs_mag', T.FloatType(), True),
    T.StructField('grvs_mag_error', T.FloatType(), True),
    T.StructField('grvs_mag_nb_transits', T.IntegerType(), True),
    T.StructField('rvs_spec_sig_to_noise', T.FloatType(), True),
    T.StructField('phot_variable_flag', T.StringType(), True),
    T.StructField('l', T.DoubleType(), True),
    T.StructField('b', T.DoubleType(), True),
    T.StructField('ecl_lon', T.DoubleType(), True),
    T.StructField('ecl_lat', T.DoubleType(), True),
    T.StructField('in_qso_candidates', T.BooleanType(), True),
    T.StructField('in_galaxy_candidates', T.BooleanType(), True),
    T.StructField('non_single_star', T.IntegerType(), True),
    T.StructField('has_xp_continuous', T.BooleanType(), True),
    T.StructField('has_xp_sampled', T.BooleanType(), True),
    T.StructField('has_rvs', T.BooleanType(), True),
    T.StructField('has_epoch_photometry', T.BooleanType(), True),
    T.StructField('has_epoch_rv', T.BooleanType(), True),
    T.StructField('has_mcmc_gspphot', T.BooleanType(), True),
    T.StructField('has_mcmc_msc', T.BooleanType(), True),
    T.StructField('in_andromeda_survey', T.BooleanType(), True),
    T.StructField('classprob_dsc_combmod_quasar', T.FloatType(), True),
    T.StructField('classprob_dsc_combmod_galaxy', T.FloatType(), True),
    T.StructField('classprob_dsc_combmod_star', T.FloatType(), True),
    T.StructField('teff_gspphot', T.FloatType(), True),
    T.StructField('teff_gspphot_lower', T.FloatType(), True),
    T.StructField('teff_gspphot_upper', T.FloatType(), True),
    T.StructField('logg_gspphot', T.FloatType(), True),
    T.StructField('logg_gspphot_lower', T.FloatType(), True),
    T.StructField('logg_gspphot_upper', T.FloatType(), True),
    T.StructField('mh_gspphot', T.FloatType(), True),
    T.StructField('mh_gspphot_lower', T.FloatType(), True),
    T.StructField('mh_gspphot_upper', T.FloatType(), True),
    T.StructField('distance_gspphot', T.FloatType(), True),
    T.StructField('distance_gspphot_lower', T.FloatType(), True),
    T.StructField('distance_gspphot_upper', T.FloatType(), True),
    T.StructField('azero_gspphot', T.FloatType(), True),
    T.StructField('azero_gspphot_lower', T.FloatType(), True),
    T.StructField('azero_gspphot_upper', T.FloatType(), True),
    T.StructField('ag_gspphot', T.FloatType(), True),
    T.StructField('ag_gspphot_lower', T.FloatType(), True),
    T.StructField('ag_gspphot_upper', T.FloatType(), True),
    T.StructField('ebpminrp_gspphot', T.FloatType(), True),
    T.StructField('ebpminrp_gspphot_lower', T.FloatType(), True),
    T.StructField('ebpminrp_gspphot_upper', T.FloatType(), True),
    T.StructField('libname_gspphot', T.StringType(), True),
])

In [33]:
gaia_schema

StructType([StructField('solution_id', LongType(), True), StructField('designation', StringType(), True), StructField('source_id', LongType(), True), StructField('random_index', LongType(), True), StructField('ref_epoch', DoubleType(), True), StructField('ra', DoubleType(), True), StructField('ra_error', FloatType(), True), StructField('dec', DoubleType(), True), StructField('dec_error', FloatType(), True), StructField('parallax', DoubleType(), True), StructField('parallax_error', FloatType(), True), StructField('parallax_over_error', FloatType(), True), StructField('pm', FloatType(), True), StructField('pmra', DoubleType(), True), StructField('pmra_error', FloatType(), True), StructField('pmdec', DoubleType(), True), StructField('pmdec_error', FloatType(), True), StructField('ra_dec_corr', FloatType(), True), StructField('ra_parallax_corr', FloatType(), True), StructField('ra_pmra_corr', FloatType(), True), StructField('ra_pmdec_corr', FloatType(), True), StructField('dec_parallax_cor

In [34]:
filename = \
"hdfs://spark00:54310/common/data/catalog/external-catalogs/External_Catalogs/Gaia_DR3/GaiaSource.header"

In [35]:
# Try the schema on the small header file, whose non-comment content is the
# column-name row plus one data row. Lines starting with '#' are skipped.
# Missing values appear in the file as the literal token `null`; declaring it as
# the null marker makes them real NULLs, so string columns do not receive the
# text "null".
gaiadf = sqlsc.read.csv(filename, comment='#', header=True,
                        schema=gaia_schema, nullValue='null')

In [36]:
gpdf = gaiadf.toPandas()

In [37]:
gpdf.head(3).transpose()

Unnamed: 0,0
solution_id,1636148068921376768
designation,Gaia DR3 4295806720
source_id,4295806720
random_index,545300884
ref_epoch,2016.0
...,...
ag_gspphot_upper,0.0143
ebpminrp_gspphot,0.0028
ebpminrp_gspphot_lower,0.0007
ebpminrp_gspphot_upper,0.0078


In [38]:
headerinfo[-1]

'1636148068921376768,"Gaia DR3 4295806720",4295806720,545300884,2016.0,44.99615537864534,0.10161827,0.005615226341865997,0.10133387,0.3543305595550248,0.12266381,2.8886316,12.616485,11.93835156938502,0.13794228,-4.0806193394130865,0.13316983,0.12293493,0.13202813,-0.08891027,0.022551458,-0.3653421,-0.03690377,-0.24483804,0.06301233,0.13570854,0.3343367,184,0,183,1,2.6720488,242.20697,0.3806193,2.0765078,31,"False",1.5089388,null,null,null,null,null,null,null,22,16,0.21780181,22,9,0,0.01759732,90.23934,0,0,1.1429516,0.30795118,0.19765861,0.43010107,0.8420776,-87.75478,-30.69455,-46.20191,30.174356,"False",182,1653.39471645947,2.0757642,796.5234,17.641426,18,800.4295459066461,12.601409,63.51905,18.080235,20,1187.588003883822,15.823832,75.0506,17.061232,1.2023853,0,0,0,2,0,1.0190029,0.43880844,0.5801945,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,"NOT_AVAILABLE",176.95107618038946,-48.901520870941965,42.53372584780011,-16.3

In [39]:
gaiadf.show()

+-------------------+-------------------+----------+------------+---------+-----------------+----------+--------------------+----------+------------------+--------------+-------------------+---------+-----------------+----------+-------------------+-----------+-----------+----------------+------------+-------------+-----------------+-------------+--------------+------------------+-------------------+---------------+--------------------+--------------------+-------------------------+------------------------+------------------+-------------------+------------------------+----------------------------+-------------------------+------------------------+-------------------------+------------+------------------+--------------------+---------------------+--------------------------+----------------------+-----------------------+----------------------------+-----------------------+-----------------------+----------------+--------------------+------------------------+--------------------------+--

### Now read the real csv catalog

In [61]:
inpath = "hdfs://spark00:54310/common/data/catalog/external-catalogs/External_Catalogs/Gaia_DR3/"

In [62]:
infilename="GaiaSource_786097-786431.csv.gz"

In [63]:
print(inpath+infilename)

hdfs://spark00:54310/common/data/catalog/external-catalogs/External_Catalogs/Gaia_DR3/GaiaSource_786097-786431.csv.gz


In [64]:
# Read the actual catalog file selected above (inpath + infilename). The previous
# version passed `filename`, which points at the one-row GaiaSource.header sample.
# Lines starting with '#' are skipped, the first remaining row supplies the column
# names, and the literal token `null` is treated as a missing value.
gaiadf = sqlsc.read.csv(inpath + infilename, comment='#', header=True,
                        schema=gaia_schema, nullValue='null')

In [70]:
gaiadf.select(['source_id','ra','dec','parallax','pm']).show(3)

+-------------------+------------------+-------------------+-------------------+---------+
|          source_id|                ra|                dec|           parallax|       pm|
+-------------------+------------------+-------------------+-------------------+---------+
|6914582336478705664|310.95707253757945|-4.6247897820669985|0.23028795120969311| 8.419354|
|6914582336478706048| 310.9594538515811| -4.625302688650829| 0.3389303271534423|11.624171|
|6914582340775478016|310.95849933861984| -4.616324809353853| 0.7021635274505245| 9.450335|
+-------------------+------------------+-------------------+-------------------+---------+
only showing top 3 rows



## Save as a parquet

In [71]:
import pyarrow as pa
import pyarrow.parquet as pq

In [72]:
outpath = "hdfs://spark00:54310/common/data/catalog/external-catalogs/parquet/gaia-dr3/raw/"

In [73]:
outfilename = "GaiaSource_786097-786431.parquet.snappy"

In [74]:
print(outpath+outfilename)

hdfs://spark00:54310/common/data/catalog/external-catalogs/parquet/gaia-dr3/raw/GaiaSource_786097-786431.parquet.snappy


In [75]:
%%time
# Write the catalog as snappy-compressed parquet, replacing any previous output
# at the same location. The format is stated explicitly (as in the header-table
# write earlier) rather than left to the session's default data source.
gaiadf \
    .write.format("parquet") \
    .option("compression", "snappy") \
    .mode("overwrite") \
    .save(outpath+outfilename)

CPU times: user 0 ns, sys: 6.41 ms, total: 6.41 ms
Wall time: 26.6 s


### Read the saved raw parquet and check it

In [76]:
%%time
tmpdf = spark.read.parquet(outpath+outfilename)

CPU times: user 1.69 ms, sys: 0 ns, total: 1.69 ms
Wall time: 73.2 ms


In [77]:
%%time
# Convert the frame that was read back from the saved parquet (`tmpdf`), so this
# check exercises the written file rather than re-parsing the source CSV.
gpdf = tmpdf.toPandas()

CPU times: user 215 ms, sys: 181 ms, total: 397 ms
Wall time: 31.2 s


In [78]:
!pwd

/home/shong/work/gaia/notebook


In [79]:
# Export the first three rows, transposed (one catalog column per HTML table row),
# to a standalone HTML file for viewing outside the notebook.
# NOTE(review): the destination is an absolute, user-specific path; adjust before running elsewhere.
gpdf.head(3).transpose().to_html('/home/shong/work/gaia/data/show3cols.html')

In [80]:
tmpdf.select(['source_id','ra','dec','parallax','pm']).show(3)

+-------------------+------------------+-------------------+-------------------+---------+
|          source_id|                ra|                dec|           parallax|       pm|
+-------------------+------------------+-------------------+-------------------+---------+
|6914582336478705664|310.95707253757945|-4.6247897820669985|0.23028795120969311| 8.419354|
|6914582336478706048| 310.9594538515811| -4.625302688650829| 0.3389303271534423|11.624171|
|6914582340775478016|310.95849933861984| -4.616324809353853| 0.7021635274505245| 9.450335|
+-------------------+------------------+-------------------+-------------------+---------+
only showing top 3 rows



> OK! Done! Let's make a Python script to convert all the CSV files to Parquet.