# Classification for Stars vs. Non-Stellar Objects 

#### In this notebook, we explore the RefCat data in order to analyze it and extract training data sets for **labels of stars**.

## Import Basic Packages 

In [1]:
import numpy as np
import pandas as pd
import glob
import sys
import h5py
#from netCDF4 import Dataset
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree

import pyarrow as pa
import pyarrow.parquet as pq

from functools import reduce
import operator
import gc

# Display up to 300 rows, and up to 200 characters per column, when showing frames
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 200)

In [2]:
import os

from astropy.table import Table
from matplotlib.ticker import MultipleLocator

from astropy.utils.exceptions import AstropyWarning
import warnings
warnings.simplefilter('ignore', category=AstropyWarning)

In [3]:
# plot settings
#plt.rc('font', family='serif') 
#plt.rc('font', serif='Times New Roman') 
plt.rcParams.update({'font.size': 16})
plt.rcParams['mathtext.fontset'] = 'stix'

## PySpark Session

In [4]:
%%time
# PySpark packages
from pyspark import SparkContext   
from pyspark.sql import SparkSession

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import Row
from pyspark.sql.window import Window as W


# Build (or reuse) the YARN-backed Spark session used throughout the notebook.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("spark-shell")
    # Driver-side limits.
    .config("spark.driver.maxResultSize", "32g")
    .config("spark.driver.memory", "32g")
    # Needed when shipping some large pandas DataFrames to Spark.
    .config("spark.rpc.message.maxSize", "1024")
    # Executor sizing: 20 single-core executors with 7g each.
    .config("spark.executor.memory", "7g")
    .config("spark.executor.cores", "1")
    .config("spark.executor.instances", "20")
    .getOrCreate()
)


# Keep a handle on the SparkContext and point checkpoints at HDFS.
sc = spark.sparkContext
sc.setCheckpointDir("hdfs://spark00:54310/tmp/checkpoints")

# Runtime SQL settings: wider schema strings in debug output, Arrow-based pandas conversion.
spark.conf.set("spark.sql.debug.maxToStringFields", 500)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

CPU times: user 5.12 ms, sys: 13.7 ms, total: 18.8 ms
Wall time: 26.8 s


- this takes time to get resources from Yarn Cluster
- `.config("spark.rpc.message.maxSize", "1024")`: this option is necessary for handling some large pandas dataframe

## [1] Reading Reference Catalog Files

- **Parquet Format** of **astropy.Table**. 
- **Not readable from HDFS**. 
- Hence, it is better to keep these astropy table files on the local file system. 

### [1-1] Read the List of Files 

#### Hadoop File System

In [5]:
# URI prefix (scheme, host, port) prepended to the HDFS paths below
hdfsheader = 'hdfs://spark00:54310'
# HDFS work path (not referenced by the cells shown in this notebook)
workpath = '/user/shong/work/sedfit/spherex/data/temp/'
# HDFS directory that receives the converted RefCat outputs and is read back later
datapath = '/user/shong/data/spherex/star-classification/input-ref-cat/'

#### Local File System on Spark00

In [6]:
localrefcatpath = '/mnt/raid5/yyang/SPHEREx_RefCat/'

#### Pandas DataFrame of RefCat File List

In [7]:
pdf = pq.read_table('/home/shong/work/sed-fit/pyspark-eazypy/data/refcatlist.parquet').to_pandas()

In [8]:
pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12288 entries, 0 to 12287
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    12288 non-null  object 
 1   size    12288 non-null  float64
 2   irank   12288 non-null  int32  
dtypes: float64(1), int32(1), object(1)
memory usage: 240.1+ KB


In [9]:
pdf.head()

Unnamed: 0,name,size,irank
0,Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_000030.parquet,18.7712,1
1,Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_000028.parquet,19.4544,2
2,Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_000024.parquet,20.1404,3
3,Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_000031.parquet,20.6769,4
4,Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_000025.parquet,20.8368,5


In [10]:
pdf.tail(20)

Unnamed: 0,name,size,irank
12268,Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_007316.parquet,517.627,12269
12269,Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_007265.parquet,521.287,12270
12270,Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_007195.parquet,521.818,12271
12271,Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_007319.parquet,522.251,12272
12272,Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_007217.parquet,524.478,12273
12273,Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_007242.parquet,530.077,12274
12274,Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_007318.parquet,533.152,12275
12275,Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_007220.parquet,538.911,12276
12276,Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_007312.parquet,541.143,12277
12277,Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_007296.parquet,544.204,12278


## Explore the RefCat

In [11]:
listrefcat = pdf['name'].values

In [12]:
localrefcatpath+listrefcat[0]

'/mnt/raid5/yyang/SPHEREx_RefCat/Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_000030.parquet'

In [13]:
numrefcat = len(listrefcat)
print(numrefcat)

12288


In [14]:
phot = Table.read(localrefcatpath+pdf.name[0])

In [15]:
photdf = phot.to_pandas()

In [16]:
photdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102219 entries, 0 to 102218
Columns: 101 entries, SPHERExRefID to z_ID
dtypes: float32(21), float64(61), int16(6), int32(2), int64(5), object(6)
memory usage: 66.3+ MB


In [17]:
photdf.dtypes

SPHERExRefID                   int64
Gaia_DR3_source_id             int64
LegacySurvey_uid               int64
PS1_DR1_StackObject_objID      int64
CatWISE_source_id             object
AllWISE_designation           object
2MASS_designation             object
ra                           float64
dec                          float64
ra_error                     float64
dec_error                    float64
coord_src                      int64
pmra                         float64
pmra_error                   float32
pmdec                        float64
pmdec_error                  float32
parallax                     float64
parallax_error               float32
ref_epoch                    float64
astrometric_params_solved      int16
CatWISE_PMRA                 float32
CatWISE_PMDec                float32
CatWISE_sigPMRA              float32
CatWISE_sigPMDec             float32
Gaia_G                       float32
Gaia_BP                      float32
Gaia_RP                      float32
G

In [18]:
photdf.head(4).T

Unnamed: 0,0,1,2,3
SPHERExRefID,1333155099932884993,1333155099932884994,1333155100033548292,1333155100067102720
Gaia_DR3_source_id,-9999,-9999,-9999,-9999
LegacySurvey_uid,-9999,-9999,-9999,-9999
PS1_DR1_StackObject_objID,123890492204260534,123890492217870778,-9999,123880492201128997
CatWISE_source_id,b'N/A',b'N/A',b'0496p136_b0-061999',b'0496p136_b0-016795'
AllWISE_designation,b'N/A',b'N/A',b'N/A',b'N/A'
2MASS_designation,b'N/A',b'N/A',b'N/A',b'N/A'
ra,49.220482,49.221801,49.22002,49.22015
dec,13.241687,13.2419,13.243249,13.240385
ra_error,0.000012,0.000008,0.000164,0.000016


In [19]:
photdf['CatWISE_source_id'][:5]

0                   b'N/A'
1                   b'N/A'
2    b'0496p136_b0-061999'
3    b'0496p136_b0-016795'
4    b'0496p136_b0-006048'
Name: CatWISE_source_id, dtype: object

> Some string columns are byte-encoded and display as, e.g., `b'N/A'`. We resolve this issue as follows. 

### Handling the byte encoded columns

In [20]:
# Flag every column whose first entry is a raw ``bytes`` object.
# Only row 0 of each column is inspected.
byte_encoded_columns = [
    name for name in photdf.columns
    if isinstance(photdf[name].iloc[0], bytes)
]

print("Byte encoded columns:", byte_encoded_columns)

Byte encoded columns: ['CatWISE_source_id', 'AllWISE_designation', '2MASS_designation', 'LS_type']


In [21]:
# Replace each flagged column with its UTF-8 decoded text
for col in byte_encoded_columns:
    photdf[col] = photdf[col].str.decode('utf-8')

In [22]:
photdf.head(4).T

Unnamed: 0,0,1,2,3
SPHERExRefID,1.3331550999328847e+18,1.3331550999328847e+18,1333155100033548292,1333155100067102720
Gaia_DR3_source_id,-9999.0,-9999.0,-9999,-9999
LegacySurvey_uid,-9999.0,-9999.0,-9999,-9999
PS1_DR1_StackObject_objID,1.2389049220426051e+17,1.2389049221787075e+17,-9999,123880492201128997
CatWISE_source_id,,,0496p136_b0-061999,0496p136_b0-016795
AllWISE_designation,,,,
2MASS_designation,,,,
ra,49.220482,49.221801,49.22002,49.22015
dec,13.241687,13.2419,13.243249,13.240385
ra_error,1.2e-05,8e-06,0.000164,0.000016


In [23]:
photdf.dtypes

SPHERExRefID                   int64
Gaia_DR3_source_id             int64
LegacySurvey_uid               int64
PS1_DR1_StackObject_objID      int64
CatWISE_source_id             object
AllWISE_designation           object
2MASS_designation             object
ra                           float64
dec                          float64
ra_error                     float64
dec_error                    float64
coord_src                      int64
pmra                         float64
pmra_error                   float32
pmdec                        float64
pmdec_error                  float32
parallax                     float64
parallax_error               float32
ref_epoch                    float64
astrometric_params_solved      int16
CatWISE_PMRA                 float32
CatWISE_PMDec                float32
CatWISE_sigPMRA              float32
CatWISE_sigPMDec             float32
Gaia_G                       float32
Gaia_BP                      float32
Gaia_RP                      float32
G

### Change `object` to `string` 

In [24]:
# Select object columns
object_cols = photdf.select_dtypes(include=['object']).columns

In [25]:
print(object_cols)

Index(['CatWISE_source_id', 'AllWISE_designation', '2MASS_designation',
       'LS_type', 'z_survey', 'z_ID'],
      dtype='object')


In [26]:
# Convert object columns to string type
photdf[object_cols] = photdf[object_cols].astype("string")

In [27]:
photdf.dtypes

SPHERExRefID                   int64
Gaia_DR3_source_id             int64
LegacySurvey_uid               int64
PS1_DR1_StackObject_objID      int64
CatWISE_source_id             string
AllWISE_designation           string
2MASS_designation             string
ra                           float64
dec                          float64
ra_error                     float64
dec_error                    float64
coord_src                      int64
pmra                         float64
pmra_error                   float32
pmdec                        float64
pmdec_error                  float32
parallax                     float64
parallax_error               float32
ref_epoch                    float64
astrometric_params_solved      int16
CatWISE_PMRA                 float32
CatWISE_PMDec                float32
CatWISE_sigPMRA              float32
CatWISE_sigPMDec             float32
Gaia_G                       float32
Gaia_BP                      float32
Gaia_RP                      float32
G

### Change `float32` to `float64`

> There are some conversion bugs from pandas float to spark double. I will cast all float32 columns to float64.

In [40]:
# Select float32 columns
float_cols = photdf.select_dtypes(include=['float32']).columns

In [41]:
print(float_cols)

Index([], dtype='object')


In [42]:
# Convert float32 columns to float64
photdf[float_cols] = photdf[float_cols].astype("float64")

In [43]:
photdf.dtypes

SPHERExRefID                   int64
Gaia_DR3_source_id             int64
LegacySurvey_uid               int64
PS1_DR1_StackObject_objID      int64
CatWISE_source_id             string
AllWISE_designation           string
2MASS_designation             string
ra                           float64
dec                          float64
ra_error                     float64
dec_error                    float64
coord_src                      int64
pmra                         float64
pmra_error                   float64
pmdec                        float64
pmdec_error                  float64
parallax                     float64
parallax_error               float64
ref_epoch                    float64
astrometric_params_solved      int16
CatWISE_PMRA                 float64
CatWISE_PMDec                float64
CatWISE_sigPMRA              float64
CatWISE_sigPMDec             float64
Gaia_G                       float64
Gaia_BP                      float64
Gaia_RP                      float64
G

### Change `pandas dataframe` to `pyspark dataframe`

In [44]:
photsdf = spark.createDataFrame(photdf)

In [45]:
photsdf.printSchema()

root
 |-- SPHERExRefID: long (nullable = true)
 |-- Gaia_DR3_source_id: long (nullable = true)
 |-- LegacySurvey_uid: long (nullable = true)
 |-- PS1_DR1_StackObject_objID: long (nullable = true)
 |-- CatWISE_source_id: string (nullable = true)
 |-- AllWISE_designation: string (nullable = true)
 |-- 2MASS_designation: string (nullable = true)
 |-- ra: double (nullable = true)
 |-- dec: double (nullable = true)
 |-- ra_error: double (nullable = true)
 |-- dec_error: double (nullable = true)
 |-- coord_src: long (nullable = true)
 |-- pmra: double (nullable = true)
 |-- pmra_error: double (nullable = true)
 |-- pmdec: double (nullable = true)
 |-- pmdec_error: double (nullable = true)
 |-- parallax: double (nullable = true)
 |-- parallax_error: double (nullable = true)
 |-- ref_epoch: double (nullable = true)
 |-- astrometric_params_solved: short (nullable = true)
 |-- CatWISE_PMRA: double (nullable = true)
 |-- CatWISE_PMDec: double (nullable = true)
 |-- CatWISE_sigPMRA: double (nullab

In [46]:
photsdf.show(3,truncate=True)

+-------------------+------------------+----------------+-------------------------+------------------+-------------------+-----------------+------------------+------------------+--------------------+--------------------+---------+-------+----------+-------+-----------+--------+--------------+---------+-------------------------+-----------------+------------------+---------------+-----------------+-------+-------+-------+------------+-------------+-------------+-------+-------+-------+----------+----------+----------+-----------------+-----------------+-----------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+-------+-------+--------+-------------+-------------+--------------+------------------+------------------+-----------------+-----------------+-------+-------+-------------+-------------+---------+---------+-----------+---------+----------+------------------+------------------+-------------

## Save the spark dataframe in HDFS

In [47]:
hdfsheader+datapath

'hdfs://spark00:54310/user/shong/data/spherex/star-classification/input-ref-cat/'

In [48]:
localrefcatpath+listrefcat[0]

'/mnt/raid5/yyang/SPHEREx_RefCat/Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_000030.parquet'

In [49]:
nrefcatfile = len(listrefcat)
print(nrefcatfile)

12288


In [50]:
listrefcat[0]+'.snappy'

'Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_000030.parquet.snappy'

In [51]:
outname = hdfsheader+datapath+listrefcat[0]+'.snappy'
print(outname)

hdfs://spark00:54310/user/shong/data/spherex/star-classification/input-ref-cat/Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_000030.parquet.snappy


In [52]:
#photsdf.cache()

In [53]:
#%%time
#photsdf \
#    .write.option("compression", "snappy") \
#    .mode("overwrite") \
#    .save(outname)

# Loop the conversion job 

**from astropy.table parquet to pyspark dataframe parquet**

In [54]:
nrefcatfile = len(listrefcat)
print(nrefcatfile)

12288


In [55]:
# for testing .. 
#nrefcatfile = 5

#### Sanity Check for Paths

In [56]:
# Same values as assigned earlier in the notebook, repeated here so that the
# inputs and outputs of the conversion loop are visible in one place.
hdfsheader = 'hdfs://spark00:54310'
workpath = '/user/shong/work/sedfit/spherex/data/temp/'
# HDFS directory that receives one converted output per RefCat file
datapath = '/user/shong/data/spherex/star-classification/input-ref-cat/'
# Local directory holding the original astropy-written RefCat parquet files
localrefcatpath = '/mnt/raid5/yyang/SPHEREx_RefCat/'

In [57]:
localrefcatpath+listrefcat[12287]

'/mnt/raid5/yyang/SPHEREx_RefCat/Gaia_DR3.LS.PS1DR1.CatWISE.AllWISE.2MASS_NSIDE32_007313.parquet'

#### Loop the conversion on All RefCat Files

In [58]:
from tqdm import tqdm

In [59]:
%%time
def _decode_if_bytes(value):
    """Return ``value`` decoded as UTF-8 if it is ``bytes``; otherwise return it unchanged."""
    return value.decode('utf-8') if isinstance(value, bytes) else value


def _normalize_refcat_frame(frame):
    """Prepare one RefCat pandas frame for conversion to a Spark DataFrame.

    The frame is modified in place and also returned. Three steps are applied:

    1. Every ``bytes`` value in an object column is decoded to ``str`` (UTF-8).
       Each value is checked individually, so the step does not fail on an
       empty frame, does not miss a column whose first row happens not to be
       ``bytes``, and leaves non-bytes values in a mixed column untouched.
    2. Object columns are cast to the pandas ``string`` dtype.
    3. float32 columns are cast to float64.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame obtained from ``Table.read(...).to_pandas()``.

    Returns
    -------
    pandas.DataFrame
        The same frame object, normalized.
    """
    object_cols = frame.select_dtypes(include=['object']).columns
    for col in object_cols:
        frame[col] = frame[col].map(_decode_if_bytes)
    if len(object_cols):
        frame[object_cols] = frame[object_cols].astype("string")

    float_cols = frame.select_dtypes(include=['float32']).columns
    if len(float_cols):
        frame[float_cols] = frame[float_cols].astype("float64")

    return frame


# Index of the first file to convert. Leave at 0 for a full run; raise it to
# resume an interrupted run without redoing files that were already written.
ifile_start = 0

for ifile in tqdm(range(ifile_start, nrefcatfile), position=0, leave=True):
    refcat_name = listrefcat[ifile]

    # Read the astropy-written parquet file from the local file system.
    photdf = _normalize_refcat_frame(
        Table.read(localrefcatpath + refcat_name).to_pandas()
    )

    # Save photdf as a pyspark dataframe on HDFS, one output path per input file.
    outname = hdfsheader + datapath + refcat_name + '.snappy'

    # Name the output format explicitly so the write does not depend on the
    # session's default data source; the outputs are read back as parquet.
    (
        spark.createDataFrame(photdf)
        .write
        .format("parquet")
        .option("compression", "snappy")
        .mode("overwrite")
        .save(outname)
    )

100%|██████████| 12288/12288 [32:00:19<00:00,  9.38s/it]  

CPU times: user 7h 42min 19s, sys: 2h 3min 12s, total: 9h 45min 32s
Wall time: 1d 8h 19s





## Check out the outputs

In [60]:
# File count times 3, divided by 3600 — presumably a run-time estimate in hours
# at ~3 s per file (TODO confirm; the loop above reports 9.38 s/it).
nrefcatfile * 3 / 3600

10.24

In [61]:
#!hadoop fs -ls {hdfsheader+datapath}

In [62]:
import pyarrow as pa
import pyarrow.parquet as pq

In [63]:
%%time
# Read back every converted RefCat output under the HDFS data directory.
# recursiveFileLookup lets Spark descend into the per-file output directories.
# The former "header" option is a CSV-source option and has no effect on a
# Parquet read, so it is not passed here.
rawdf = (
    spark.read
    .option("recursiveFileLookup", "true")
    .parquet(hdfsheader + datapath)
)

CPU times: user 3 ms, sys: 0 ns, total: 3 ms
Wall time: 8.73 s


In [64]:
#%%time
#rawdf.count()

In [65]:
rawdf.printSchema()

root
 |-- SPHERExRefID: long (nullable = true)
 |-- Gaia_DR3_source_id: long (nullable = true)
 |-- LegacySurvey_uid: long (nullable = true)
 |-- PS1_DR1_StackObject_objID: long (nullable = true)
 |-- CatWISE_source_id: string (nullable = true)
 |-- AllWISE_designation: string (nullable = true)
 |-- 2MASS_designation: string (nullable = true)
 |-- ra: double (nullable = true)
 |-- dec: double (nullable = true)
 |-- ra_error: double (nullable = true)
 |-- dec_error: double (nullable = true)
 |-- coord_src: long (nullable = true)
 |-- pmra: double (nullable = true)
 |-- pmra_error: double (nullable = true)
 |-- pmdec: double (nullable = true)
 |-- pmdec_error: double (nullable = true)
 |-- parallax: double (nullable = true)
 |-- parallax_error: double (nullable = true)
 |-- ref_epoch: double (nullable = true)
 |-- astrometric_params_solved: short (nullable = true)
 |-- CatWISE_PMRA: double (nullable = true)
 |-- CatWISE_PMDec: double (nullable = true)
 |-- CatWISE_sigPMRA: double (nullab

In [66]:
#rawdf.show(3,truncate=True)