# Classification for Stars vs. Non-Stellar Objects 


### In this notebook, we estimate and treat the class imbalance of the training data.

- gaia `source_id`
- gaia `ra` and `dec`
- three gaia classification columns: <br> 
`classprob_dsc_combmod_star` `classprob_dsc_combmod_galaxy` `classprob_dsc_combmod_quasar`


## Import Basic Packages 

In [1]:
import numpy as np
import pandas as pd
import glob
import sys
import h5py
#from netCDF4 import Dataset
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree

import pyarrow as pa
import pyarrow.parquet as pq

from functools import reduce
import operator
import gc

# Pandas display limits for this notebook: let a cell value show up to
# 200 characters and let a frame print up to 300 rows before truncation.
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', 300)

In [2]:
import os

from astropy.table import Table
from matplotlib.ticker import MultipleLocator

from astropy.utils.exceptions import AstropyWarning
import warnings
# Silence every warning whose category is AstropyWarning (or a subclass)
# for the rest of the session so they do not clutter cell outputs.
warnings.simplefilter('ignore', category=AstropyWarning)

In [3]:
# Plot settings: 16 pt base font and the STIX font set for math text.
# The serif / Times New Roman overrides are intentionally left disabled:
#   plt.rc('font', family='serif')
#   plt.rc('font', serif='Times New Roman')
plt.rcParams.update({
    'font.size': 16,
    'mathtext.fontset': 'stix',
})

## PySpark Session

- High Memory Settings

In [4]:
%%time
# PySpark packages
from pyspark import SparkContext
from pyspark.sql import SparkSession

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import Row
from pyspark.sql.window import Window as W

# High-memory session on the YARN cluster: a 64 GB driver that may collect
# up to 32 GB of results, and 100 single-core executors with 14 GB each.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("spark-shell")
    .config("spark.driver.maxResultSize", "32g")
    .config("spark.driver.memory", "64g")
    .config("spark.executor.memory", "14g")
    .config("spark.executor.cores", "1")
    .config("spark.executor.instances", "100")
    .config("spark.sql.hive.filesourcePartitionFileCacheSize", "2097152000")
    .getOrCreate()
)

# Spark context handle and the HDFS directory used for RDD/DataFrame checkpoints.
sc = spark.sparkContext
sc.setCheckpointDir("hdfs://spark00:54310/tmp/checkpoints")

# Runtime SQL options: allow wide schemas to be printed in full, and enable
# Arrow for Spark <-> pandas conversion (used by the toPandas() calls below).
spark.conf.set("spark.sql.debug.maxToStringFields", 500)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

CPU times: user 10.9 ms, sys: 7.76 ms, total: 18.6 ms
Wall time: 28.1 s


> Acquiring resources from the YARN cluster takes time (about 28 s of wall time here).

## Reading Train-Labeled Sample

In [5]:
!pwd

/home/shong/work/deeplearnings/star-classification/notebook


#### Train Sample

In [6]:
# Locations used throughout the notebook:
#   hdfsheader    - URI prefix of the HDFS namenode
#   datapath      - HDFS directory holding the reduced catalogs
#   localdatapath - local directory where the pandas parquet files are written
hdfsheader = 'hdfs://spark00:54310'
datapath = '/user/shong/data/spherex/star-classification/reduced-data/'
localdatapath = '/home/shong/work/deeplearnings/star-classification/data/'

In [7]:
hdfsheader+datapath+'RefCat-Train-Label.parquet.snappy'

'hdfs://spark00:54310/user/shong/data/spherex/star-classification/reduced-data/RefCat-Train-Label.parquet.snappy'

In [8]:
%%time
# Load the labelled training catalog from HDFS as a Spark DataFrame.
# Parquet files carry their own schema, so no reader options are passed
# ("header" is a CSV reader option and has no effect on Parquet).
tdf = spark.read.parquet(
    hdfsheader + datapath + 'RefCat-Train-Label.parquet.snappy'
)

CPU times: user 1.95 ms, sys: 0 ns, total: 1.95 ms
Wall time: 1.88 s


In [9]:
tdf.printSchema()

root
 |-- Gaia_DR3_source_id: long (nullable = true)
 |-- SPHERExRefID: long (nullable = true)
 |-- LegacySurvey_uid: long (nullable = true)
 |-- PS1_DR1_StackObject_objID: long (nullable = true)
 |-- CatWISE_source_id: string (nullable = true)
 |-- AllWISE_designation: string (nullable = true)
 |-- 2MASS_designation: string (nullable = true)
 |-- ra: double (nullable = true)
 |-- dec: double (nullable = true)
 |-- ra_error: double (nullable = true)
 |-- dec_error: double (nullable = true)
 |-- coord_src: long (nullable = true)
 |-- pmra: double (nullable = true)
 |-- pmra_error: double (nullable = true)
 |-- pmdec: double (nullable = true)
 |-- pmdec_error: double (nullable = true)
 |-- parallax: double (nullable = true)
 |-- parallax_error: double (nullable = true)
 |-- ref_epoch: double (nullable = true)
 |-- astrometric_params_solved: short (nullable = true)
 |-- CatWISE_PMRA: double (nullable = true)
 |-- CatWISE_PMDec: double (nullable = true)
 |-- CatWISE_sigPMRA: double (nullab

#### The catalog still contains some null labels, so rows with nulls are dropped

In [10]:
# Keep only fully populated rows.
# NOTE(review): dropna() without `subset` removes a row when ANY column is
# null, not only the label columns -- confirm that this is the intent.
traindf = tdf.dropna()

In [11]:
traindf.cache()

DataFrame[Gaia_DR3_source_id: bigint, SPHERExRefID: bigint, LegacySurvey_uid: bigint, PS1_DR1_StackObject_objID: bigint, CatWISE_source_id: string, AllWISE_designation: string, 2MASS_designation: string, ra: double, dec: double, ra_error: double, dec_error: double, coord_src: bigint, pmra: double, pmra_error: double, pmdec: double, pmdec_error: double, parallax: double, parallax_error: double, ref_epoch: double, astrometric_params_solved: smallint, CatWISE_PMRA: double, CatWISE_PMDec: double, CatWISE_sigPMRA: double, CatWISE_sigPMDec: double, Gaia_G: double, Gaia_BP: double, Gaia_RP: double, Gaia_G_error: double, Gaia_BP_error: double, Gaia_RP_error: double, LS_g: double, LS_r: double, LS_z: double, LS_g_error: double, LS_r_error: double, LS_z_error: double, PS1_g: double, PS1_r: double, PS1_i: double, PS1_z: double, PS1_y: double, PS1_g_error: double, PS1_r_error: double, PS1_i_error: double, PS1_z_error: double, PS1_y_error: double, 2MASS_J: double, 2MASS_H: double, 2MASS_Ks: double,

In [12]:
# Column names of the cleaned training frame and how many there are;
# the cells below pick columns from this list by position.
fullcols = traindf.columns
numcols = len(fullcols)

In [13]:
# Show every column name next to its positional index in `fullcols`.
[[idx, name] for idx, name in enumerate(fullcols)]

[[0, 'Gaia_DR3_source_id'],
 [1, 'SPHERExRefID'],
 [2, 'LegacySurvey_uid'],
 [3, 'PS1_DR1_StackObject_objID'],
 [4, 'CatWISE_source_id'],
 [5, 'AllWISE_designation'],
 [6, '2MASS_designation'],
 [7, 'ra'],
 [8, 'dec'],
 [9, 'ra_error'],
 [10, 'dec_error'],
 [11, 'coord_src'],
 [12, 'pmra'],
 [13, 'pmra_error'],
 [14, 'pmdec'],
 [15, 'pmdec_error'],
 [16, 'parallax'],
 [17, 'parallax_error'],
 [18, 'ref_epoch'],
 [19, 'astrometric_params_solved'],
 [20, 'CatWISE_PMRA'],
 [21, 'CatWISE_PMDec'],
 [22, 'CatWISE_sigPMRA'],
 [23, 'CatWISE_sigPMDec'],
 [24, 'Gaia_G'],
 [25, 'Gaia_BP'],
 [26, 'Gaia_RP'],
 [27, 'Gaia_G_error'],
 [28, 'Gaia_BP_error'],
 [29, 'Gaia_RP_error'],
 [30, 'LS_g'],
 [31, 'LS_r'],
 [32, 'LS_z'],
 [33, 'LS_g_error'],
 [34, 'LS_r_error'],
 [35, 'LS_z_error'],
 [36, 'PS1_g'],
 [37, 'PS1_r'],
 [38, 'PS1_i'],
 [39, 'PS1_z'],
 [40, 'PS1_y'],
 [41, 'PS1_g_error'],
 [42, 'PS1_r_error'],
 [43, 'PS1_i_error'],
 [44, 'PS1_z_error'],
 [45, 'PS1_y_error'],
 [46, '2MASS_J'],
 [47, '2M

In [14]:
# Positions (in `fullcols`) of the identifier, coordinate and label columns.
# NOTE(review): the same list is assigned again in the next cell, so this
# cell is redundant.
iselcols = [0,1,7,8,9,10,101,102,103,104]

In [15]:
# Positional picks from `fullcols`: the Gaia and SPHEREx identifiers, the
# coordinates with their errors, the matching flag, the Gaia coordinates and
# the Gaia star-probability column that serves as the label.
iselcols = [0, 1, 7, 8, 9, 10, 101, 102, 103, 104]
tempcols = [fullcols[i] for i in iselcols]

# Positional indices silently select different columns if the catalog schema
# ever changes, so fail loudly when the label column is not among the picks.
assert 'gaia_classprob_dsc_combmod_star' in tempcols, \
    f"unexpected columns at positions {iselcols}: {tempcols}"

In [16]:
tempcols + fullcols[30:46] + fullcols[52:56]

['Gaia_DR3_source_id',
 'SPHERExRefID',
 'ra',
 'dec',
 'ra_error',
 'dec_error',
 'MatchingFlag',
 'gaia_ra',
 'gaia_dec',
 'gaia_classprob_dsc_combmod_star',
 'LS_g',
 'LS_r',
 'LS_z',
 'LS_g_error',
 'LS_r_error',
 'LS_z_error',
 'PS1_g',
 'PS1_r',
 'PS1_i',
 'PS1_z',
 'PS1_y',
 'PS1_g_error',
 'PS1_r_error',
 'PS1_i_error',
 'PS1_z_error',
 'PS1_y_error',
 'WISE_W1',
 'WISE_W2',
 'WISE_W1_error',
 'WISE_W2_error']

In [17]:
# Final column selection: the identifier/coordinate/label columns, followed by
# the Legacy Survey and PS1 magnitudes with errors (positions 30-45) and the
# WISE W1/W2 magnitudes with errors (positions 52-55).
selcols = [*tempcols, *fullcols[30:46], *fullcols[52:56]]

In [18]:
%%time
traindf.select(selcols).describe().toPandas().T

CPU times: user 16.7 ms, sys: 1.39 ms, total: 18.1 ms
Wall time: 24.2 s


Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Gaia_DR3_source_id,97499546,3.0080962549692851E18,1.78566405005012813E18,4295806720,6917528997577384320
SPHERExRefID,97499546,1.34038659062521523E18,7.2809991572540365E17,7190948769431552,2629980687406465025
ra,97499546,197.88838525600755,91.75315953189538,1.6327128351173464E-6,359.9999806544865
dec,97499546,15.051231911555833,27.71722264636104,-31.382798082634444,84.77074678823259
ra_error,97499546,3.540991511655629E-7,1.3340324897400737E-6,1.2699975671169522E-9,2.4972521379476635E-4
dec_error,97499546,2.5731072723299553E-7,9.416042850986585E-7,1.2726993898266098E-9,2.77583176891009E-5
MatchingFlag,97499546,15.0,0.0,15,15
gaia_ra,97499546,197.88838525600755,91.75315953189538,1.6327128351173464E-6,359.9999806544865
gaia_dec,97499546,15.051231911555833,27.71722264636104,-31.382798082634444,84.77074678823259


> `97,499,546` is our final training-sample size

## Balanced vs. Imbalanced SubSampling for Star Prob Label

#### Cumulative Distribution of Star_Prob

In [19]:
# Probability bin edges on [0, 1]. Twenty equal-width bins are used here;
# finer grids tried earlier were 50 and 100 bins.
nbins = 20
tempbins = np.arange(nbins + 1) / float(nbins)

print(tempbins)

[0.   0.05 0.1  0.15 0.2  0.25 0.3  0.35 0.4  0.45 0.5  0.55 0.6  0.65
 0.7  0.75 0.8  0.85 0.9  0.95 1.  ]


In [20]:
# Plain Python list of the bin edges; this is what gets passed to
# RDD.histogram() below as the bucket boundaries.
pbins = tempbins.tolist()

In [21]:
# Every bin edge shifted down by 0.01.
# NOTE(review): presumably an x-offset for plotting; no use is visible in
# this part of the notebook -- confirm it is still needed.
leftbins = [edge - 0.01 for edge in pbins]

In [22]:
%%time
# Histogram of the Gaia star probability over the `pbins` edges, computed on
# the cluster. RDD.histogram returns the bucket edges and the per-bucket counts.
star_prob = traindf.select('gaia_classprob_dsc_combmod_star')
bins, trainstar = star_prob.rdd.map(lambda row: row[0]).histogram(pbins)

CPU times: user 14.8 ms, sys: 6.66 ms, total: 21.5 ms
Wall time: 2.44 s


In [23]:
# Running total of the per-bin counts: entry k is the number of objects in
# bins 0..k, so the last entry is the size of the whole training sample.
rawcumtrain = np.cumsum(trainstar)

In [24]:
# Cumulative counts normalised by the total, i.e. the cumulative fraction
# of objects up to each bin (ends at 1.0).
cumtrain = rawcumtrain/rawcumtrain[-1]

In [25]:
rawcumtrain

array([ 2368555,  2415101,  2444376,  2466786,  2485349,  2501605,
        2516510,  2530392,  2543965,  2557522,  2571070,  2590464,
        2604838,  2620698,  2645924,  2666761,  2700812,  2756639,
        2867072, 97499546])

In [26]:
rawcumtrain[-2],rawcumtrain[-1]

(2867072, 97499546)

> Sanity Check for `total` and `95 percent cut`

#### Bias and Imbalance

- Total: the full Gaia catalog. It is biased toward stellar objects, which outnumber the other classes more strongly than in other catalogs such as Pan-STARRS and the Legacy Surveys (middle panel).
- Imbalanced train set: our train set is the intersection of all major catalogs, so its imbalance comes from the survey catalogs themselves; even Pan-STARRS and the Legacy Surveys contain more stellar objects than other kinds of objects.

### Balanced Training Subset for Classification

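- The probability threshold should be 0.95: objects with star probability $\geq 0.95$ already make up more than 95 percent of the sample (94,632,474 of 97,499,546, about 97 percent), while only 2,867,072 objects fall below 0.95.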

In [27]:
# Narrow the training frame to the selected identifier, label and photometry
# columns; all class splits below start from this frame.
subtrdf = traindf.select(selcols)

In [28]:
subtrdf.cache()

DataFrame[Gaia_DR3_source_id: bigint, SPHERExRefID: bigint, ra: double, dec: double, ra_error: double, dec_error: double, MatchingFlag: smallint, gaia_ra: double, gaia_dec: double, gaia_classprob_dsc_combmod_star: float, LS_g: double, LS_r: double, LS_z: double, LS_g_error: double, LS_r_error: double, LS_z_error: double, PS1_g: double, PS1_r: double, PS1_i: double, PS1_z: double, PS1_y: double, PS1_g_error: double, PS1_r_error: double, PS1_i_error: double, PS1_z_error: double, PS1_y_error: double, WISE_W1: double, WISE_W2: double, WISE_W1_error: double, WISE_W2_error: double]

In [29]:
subtrdf.dtypes

[('Gaia_DR3_source_id', 'bigint'),
 ('SPHERExRefID', 'bigint'),
 ('ra', 'double'),
 ('dec', 'double'),
 ('ra_error', 'double'),
 ('dec_error', 'double'),
 ('MatchingFlag', 'smallint'),
 ('gaia_ra', 'double'),
 ('gaia_dec', 'double'),
 ('gaia_classprob_dsc_combmod_star', 'float'),
 ('LS_g', 'double'),
 ('LS_r', 'double'),
 ('LS_z', 'double'),
 ('LS_g_error', 'double'),
 ('LS_r_error', 'double'),
 ('LS_z_error', 'double'),
 ('PS1_g', 'double'),
 ('PS1_r', 'double'),
 ('PS1_i', 'double'),
 ('PS1_z', 'double'),
 ('PS1_y', 'double'),
 ('PS1_g_error', 'double'),
 ('PS1_r_error', 'double'),
 ('PS1_i_error', 'double'),
 ('PS1_z_error', 'double'),
 ('PS1_y_error', 'double'),
 ('WISE_W1', 'double'),
 ('WISE_W2', 'double'),
 ('WISE_W1_error', 'double'),
 ('WISE_W2_error', 'double')]

In [30]:
rawcumtrain

array([ 2368555,  2415101,  2444376,  2466786,  2485349,  2501605,
        2516510,  2530392,  2543965,  2557522,  2571070,  2590464,
        2604838,  2620698,  2645924,  2666761,  2700812,  2756639,
        2867072, 97499546])

- class 0: `gaia_classprob_dsc_combmod_star` $< 0.95$
- class 1: `gaia_classprob_dsc_combmod_star` $\geq 0.95$

In [31]:
# Number of class-1 objects (star probability >= 0.95): the total minus the
# cumulative count below the 0.95 bin edge. Both values are read from
# `rawcumtrain` instead of being retyped by hand, so they cannot go stale.
int(rawcumtrain[-1] - rawcumtrain[-2])

94632474

#### Basic Split Sanity Check

The sanity check has been completed, so its cells are commented out.

#### Sanity check for `exclusive_subsample`

> Sanity check : [2867072, 2000000, 867072]

## Strategy to select `train` and `test`

### [1] One File for All Label==0 and Multiple Baggings for Label==1

#### `Label==0`

In [32]:
# Class 0 (non-stellar): Gaia star probability strictly below 0.95,
# tagged with a short-integer `stellar_class` of 0.
is_label0 = F.col('gaia_classprob_dsc_combmod_star') < 0.95
label0_value = F.lit(0).cast(T.ShortType())
nonstellardf = subtrdf.filter(is_label0).withColumn('stellar_class', label0_value)

In [33]:
%%time
# Collect every class-0 row from the cluster into a single pandas DataFrame
# held in the driver's memory.
nonstellarpdf = nonstellardf.toPandas()

CPU times: user 408 ms, sys: 629 ms, total: 1.04 s
Wall time: 7.51 s


In [34]:
nonstellarpdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2867072 entries, 0 to 2867071
Data columns (total 31 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   Gaia_DR3_source_id               int64  
 1   SPHERExRefID                     int64  
 2   ra                               float64
 3   dec                              float64
 4   ra_error                         float64
 5   dec_error                        float64
 6   MatchingFlag                     int16  
 7   gaia_ra                          float64
 8   gaia_dec                         float64
 9   gaia_classprob_dsc_combmod_star  float32
 10  LS_g                             float64
 11  LS_r                             float64
 12  LS_z                             float64
 13  LS_g_error                       float64
 14  LS_r_error                       float64
 15  LS_z_error                       float64
 16  PS1_g                            float64
 17  PS1_r   

In [35]:
len(nonstellarpdf.index)

2867072

In [36]:
localdatapath

'/home/shong/work/deeplearnings/star-classification/data/'

In [37]:
# Write the whole class-0 sample to one local parquet file.
label0_path = localdatapath + 'classifier-all-label-0.pandas.parquet'
nonstellarpdf.to_parquet(label0_path)

In [38]:
!ls -lh {localdatapath}

total 5.2G
-rw-rw-r-- 1 shong shong 473M  6월 26 16:36 classifier-all-label-0.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 11:58 classifier-r101-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 12:04 classifier-r113-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 12:40 classifier-r127-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 12:48 classifier-r131-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 12:58 classifier-r149-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 13:08 classifier-r157-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 13:16 classifier-r163-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 13:23 classifier-r173-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 13:55 classifier-r181-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 2.5K  5월 22 10:37 histo-flag-refcat-pdf.parquet
-rw-rw-r-- 1 shong shong  21K  5월 22 11:28 mis-null-stats.parquet
-rw-rw-r-- 1 shong

#### `Label==1`

- multiple bagging subsamples, each drawn with a different random seed

In [39]:
# Class 1 (stellar): Gaia star probability of at least 0.95,
# tagged with a short-integer `stellar_class` of 1.
is_label1 = F.col('gaia_classprob_dsc_combmod_star') >= 0.95
label1_value = F.lit(1).cast(T.ShortType())
stellardf = subtrdf.filter(is_label1).withColumn('stellar_class', label1_value)

In [40]:
# Random seed for this bagging draw.
# Seeds tried before this one: 101, 113, 127, 131, 149, 157, 163, 173, 181.
rseed = 191


In [41]:
# Attach a seeded uniform random number to every class-1 row; it is used as
# the sort key for drawing the subsample in the next step.
tempdf = stellardf.withColumn('random', F.rand(seed=rseed))

In [42]:
# Draw as many class-1 rows as there are class-0 rows so the two classes stay
# balanced. The size is read from the class-0 frame instead of being
# hard-coded, so it follows any change in the data or the threshold.
nlabel0 = len(nonstellarpdf.index)
subsampledf = tempdf.orderBy('random').limit(nlabel0)

In [43]:
%%time
# Collect the class-1 subsample from the cluster into a pandas DataFrame on
# the driver; the sort on `random` and the limit are executed here.
stellarpdf = subsampledf.toPandas()

CPU times: user 462 ms, sys: 588 ms, total: 1.05 s
Wall time: 4min 43s


In [44]:
stellarpdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2867072 entries, 0 to 2867071
Data columns (total 32 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   Gaia_DR3_source_id               int64  
 1   SPHERExRefID                     int64  
 2   ra                               float64
 3   dec                              float64
 4   ra_error                         float64
 5   dec_error                        float64
 6   MatchingFlag                     int16  
 7   gaia_ra                          float64
 8   gaia_dec                         float64
 9   gaia_classprob_dsc_combmod_star  float32
 10  LS_g                             float64
 11  LS_r                             float64
 12  LS_z                             float64
 13  LS_g_error                       float64
 14  LS_r_error                       float64
 15  LS_z_error                       float64
 16  PS1_g                            float64
 17  PS1_r   

In [45]:
# Remove the helper `random` sort key before saving. A non-inplace drop with
# errors='ignore' keeps this cell safe to re-run: once the column is gone,
# a second run is a no-op instead of raising KeyError.
stellarpdf = stellarpdf.drop(columns=['random'], errors='ignore')

In [46]:
stellarpdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2867072 entries, 0 to 2867071
Data columns (total 31 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   Gaia_DR3_source_id               int64  
 1   SPHERExRefID                     int64  
 2   ra                               float64
 3   dec                              float64
 4   ra_error                         float64
 5   dec_error                        float64
 6   MatchingFlag                     int16  
 7   gaia_ra                          float64
 8   gaia_dec                         float64
 9   gaia_classprob_dsc_combmod_star  float32
 10  LS_g                             float64
 11  LS_r                             float64
 12  LS_z                             float64
 13  LS_g_error                       float64
 14  LS_r_error                       float64
 15  LS_z_error                       float64
 16  PS1_g                            float64
 17  PS1_r   

In [47]:
stellarpdf.head().T

Unnamed: 0,0,1,2,3,4
Gaia_DR3_source_id,3.586389e+18,2.954321e+17,4.228736e+18,1.792985e+18,3.585502e+18
SPHERExRefID,2.082771e+18,9.618791e+17,1.679613e+18,1.088239e+18,2.112327e+18
ra,172.5285,22.22495,311.4352,326.1078,175.5514
dec,-11.79465,26.34891,1.653315,21.76339,-12.79101
ra_error,9.041956e-08,1.229672e-07,2.59969e-07,3.080544e-08,1.24025e-08
dec_error,6.505789e-08,7.876938e-08,1.334499e-07,2.98869e-08,9.196763e-09
MatchingFlag,15.0,15.0,15.0,15.0,15.0
gaia_ra,172.5285,22.22495,311.4352,326.1078,175.5514
gaia_dec,-11.79465,26.34891,1.653315,21.76339,-12.79101
gaia_classprob_dsc_combmod_star,0.999994,0.9999614,0.9999964,0.9999697,0.9976672


In [48]:
# Write this bagging draw to a local parquet file. The seed in the filename
# comes from `rseed`, so changing the seed can never overwrite the file of a
# different draw.
stellarpdf.to_parquet(
    localdatapath + f'classifier-r{rseed}-label-1.pandas.parquet')

In [49]:
!ls -lh {localdatapath}

total 5.6G
-rw-rw-r-- 1 shong shong 473M  6월 26 16:36 classifier-all-label-0.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 11:58 classifier-r101-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 12:04 classifier-r113-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 12:40 classifier-r127-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 12:48 classifier-r131-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 12:58 classifier-r149-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 13:08 classifier-r157-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 13:16 classifier-r163-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 13:23 classifier-r173-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 13:55 classifier-r181-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 471M  6월 26 16:41 classifier-r191-label-1.pandas.parquet
-rw-rw-r-- 1 shong shong 2.5K  5월 22 10:37 histo-flag-refcat-pdf.parquet
-r