# EDA of the Gaia DR3 Catalog and Saving It as Parquet

In [1]:
# imports and display/plot settings
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree
import gc

# Let pandas render up to 200 rows before it truncates a displayed frame.
pd.options.display.max_rows = 200

# Matplotlib defaults for every figure in this notebook:
# serif text set in Times New Roman at size 16, with STIX math glyphs.
plt.rc('font', family='serif', serif='Times New Roman', size=16)
plt.rc('mathtext', fontset='stix')

In [2]:
from pyspark import SparkContext   
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession


# Obtain the Spark session explicitly instead of relying on a `spark` variable
# injected by the kernel. getOrCreate() hands back the already-running session
# when there is one, so this is a no-op on a PySpark kernel and makes the
# notebook survive "Restart & Run All" on a plain Python kernel.
spark = (
    SparkSession.builder
    .appName('gaia-convert-files')
    .getOrCreate()
)

# Handles derived from the session; `sqlsc` is kept so that any code still
# using the legacy SQLContext entry point keeps working.
sc = spark.sparkContext
sqlsc = SQLContext(sc)

# Checkpoint directory on the cluster HDFS. For a local run, point this at a
# local directory (e.g. "./checkpoints") instead.
sc.setCheckpointDir("hdfs://spark00:54310/tmp/checkpoints")

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import Row
from pyspark.sql.window import Window as W



In [3]:
# Turn on Arrow-based conversion for Spark <-> pandas transfers, which the
# .toPandas() calls in this notebook go through.
spark.conf.set(
    "spark.sql.execution.arrow.pyspark.enabled",
    "true",
)

### Read the data 

In [4]:
import pyarrow as pa
import pyarrow.parquet as pq

In [5]:
# HDFS directory that holds the raw Gaia DR3 parquet files.
filepath = (
    "hdfs://spark00:54310/common/data/catalog/external-catalogs/parquet/gaia-dr3/raw/"
)

In [6]:
# Echo the input location so it is recorded in the notebook output.
filepath

'hdfs://spark00:54310/common/data/catalog/external-catalogs/parquet/gaia-dr3/raw/'

In [7]:
%%time
# Read every parquet file under `filepath`, descending into sub-directories.
# The read is lazy: only file listing and schema discovery happen here.
# The former `header` option was removed: it is a CSV reader setting and has
# no effect on the parquet source, whose files carry their own schema.
rawdf = (
    spark.read
    .option("recursiveFileLookup", "true")
    .parquet(filepath)
)

CPU times: user 1.41 ms, sys: 0 ns, total: 1.41 ms
Wall time: 2.91 s


### EDA of the full Gaia catalog

In [8]:
# List every column with its Spark type before picking the subset to keep.
rawdf.printSchema()

root
 |-- solution_id: long (nullable = true)
 |-- designation: string (nullable = true)
 |-- source_id: long (nullable = true)
 |-- random_index: long (nullable = true)
 |-- ref_epoch: double (nullable = true)
 |-- ra: double (nullable = true)
 |-- ra_error: float (nullable = true)
 |-- dec: double (nullable = true)
 |-- dec_error: float (nullable = true)
 |-- parallax: double (nullable = true)
 |-- parallax_error: float (nullable = true)
 |-- parallax_over_error: float (nullable = true)
 |-- pm: float (nullable = true)
 |-- pmra: double (nullable = true)
 |-- pmra_error: float (nullable = true)
 |-- pmdec: double (nullable = true)
 |-- pmdec_error: float (nullable = true)
 |-- ra_dec_corr: float (nullable = true)
 |-- ra_parallax_corr: float (nullable = true)
 |-- ra_pmra_corr: float (nullable = true)
 |-- ra_pmdec_corr: float (nullable = true)
 |-- dec_parallax_corr: float (nullable = true)
 |-- dec_pmra_corr: float (nullable = true)
 |-- dec_pmdec_corr: float (nullable = true)
 |--

In [9]:
# Narrow the catalog to the twelve columns examined in this EDA: identifier,
# sky position, parallax, proper motions, two astrometric quality indicators
# and the three class-probability columns.
df = rawdf.select(
    'source_id',
    'ra', 'dec',
    'parallax',
    'pm', 'pmra', 'pmdec',
    'astrometric_excess_noise', 'ruwe',
    'classprob_dsc_combmod_quasar',
    'classprob_dsc_combmod_galaxy',
    'classprob_dsc_combmod_star',
)

In [10]:
# Mark the column subset for caching so later actions can reuse it instead of
# rescanning the raw parquet files. cache() is lazy: the data is only
# materialized when the first action runs on `df`.
df.cache()

DataFrame[source_id: bigint, ra: double, dec: double, parallax: double, pm: float, pmra: double, pmdec: double, astrometric_excess_noise: float, ruwe: float, classprob_dsc_combmod_quasar: float, classprob_dsc_combmod_galaxy: float, classprob_dsc_combmod_star: float]

In [11]:
%%time
# Summary statistics (count, mean, stddev, min, max) for the selected columns,
# collected to the driver as a small pandas frame and transposed so that each
# catalog column becomes one row.
(
    df.describe()
    .toPandas()
    .T
)

CPU times: user 20.4 ms, sys: 598 µs, total: 21 ms
Wall time: 1min 53s


Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
source_id,1811709771,4.3515610592999424E18,1.64011514434601165E18,4295806720,6917528997577384320
ra,1811709771,229.12434364249717,77.78510767878048,3.4096239126626443E-7,359.999999939548
dec,1811709771,-18.374270531226497,36.53941026762339,-89.99287859590359,89.99005196682685
parallax,1467744818,0.36636113047677105,1.0261718025894802,-187.02939637423492,768.0665391873573
pm,1467744818,6.530035776718833,6.2084137149041405,1.9370936E-4,10393.349
pmra,1467744818,-2.559096059140126,5.642731590202928,-4406.469178827325,6765.995136250774
pmdec,1467744818,-3.115878202127639,5.752189036045848,-5817.8001940492695,10362.394206546573
astrometric_excess_noise,1811709771,1.4472481372195856,5.413304658447024,0.0,1131.5846
ruwe,1467744818,1.0973414513193889,0.6082553477854714,0.19239865,116.016365


In [12]:
%%time
# Keep sources whose star class probability exceeds 0.5 and whose RUWE is
# below 1.4. Rows where either column is null fail the comparison and are
# dropped as well.
# NOTE(review): 1.4 looks like the usual Gaia RUWE quality cut - confirm the
# intended threshold.
stardf = (
    df
    .filter(F.col('classprob_dsc_combmod_star') > 0.5)
    .filter(F.col('ruwe') < 1.4)
)

CPU times: user 0 ns, sys: 882 µs, total: 882 µs
Wall time: 13 ms


In [13]:
%%time
# Same summary statistics as for the full selection, now restricted to the
# filtered star sample; transposed so that each catalog column is one row.
(
    stardf.describe()
    .toPandas()
    .T
)

CPU times: user 5.33 ms, sys: 1.95 ms, total: 7.28 ms
Wall time: 4.82 s


Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
source_id,1304979431,4.3132475589375657E18,1.72159960843960781E18,4295806720,6917528997577384320
ra,1304979431,224.7829530609692,81.18003284199807,3.4096239126626443E-7,359.999999939548
dec,1304979431,-16.59583009910169,38.303902165927,-89.99287859590359,89.99005196682685
parallax,1304979431,0.3725932267127605,0.8913077253711855,-117.44857690096015,768.0665391873573
pm,1304979431,6.496989366913564,6.126189085764827,1.9370936E-4,10393.349
pmra,1304979431,-2.5457758901812366,5.585905279504888,-4406.469178827325,6765.995136250774
pmdec,1304979431,-3.070122748687959,5.712451067820202,-5817.8001940492695,10362.394206546573
astrometric_excess_noise,1304979431,0.5273923610173893,0.8399921220439873,0.0,17.405478
ruwe,1304979431,1.0294852843399465,0.0727080656098288,0.20452607,1.4
