# Classifying stars and galaxies using machine learning

Authored by Maksim Nikiforov

NCSU ST590, Project 3

Spring, 2022

In [26]:
import os
import sys
# Make PySpark use the interpreter that is running this kernel, for both the
# driver and the workers, so they cannot end up on mismatched Python versions.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

import pandas as pd
import pyspark.pandas as ps
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists; otherwise start a new one
# with default settings.
spark = SparkSession.builder.getOrCreate()

In [28]:
# Load the CSV into a Spark DataFrame: the first row supplies the column
# names and column types are inferred from the data.
sdss_data = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .option("delimiter", ",")
    .csv("MyTable_mvnikifo.csv")
)
sdss_data.printSchema()

root
 |-- objID: long (nullable = true)
 |-- ra: double (nullable = true)
 |-- dec: double (nullable = true)
 |-- specObjID: long (nullable = true)
 |-- psfMag_r: double (nullable = true)
 |-- modelMag_r: double (nullable = true)
 |-- petroMag_r: double (nullable = true)
 |-- fiberMag_r: double (nullable = true)
 |-- petroRad_r: double (nullable = true)
 |-- petroR50_r: double (nullable = true)
 |-- petroR90_r: double (nullable = true)
 |-- lnLStar_r: double (nullable = true)
 |-- lnLExp_r: double (nullable = true)
 |-- lnLDeV_r: double (nullable = true)
 |-- mE1_r: double (nullable = true)
 |-- mE2_r: double (nullable = true)
 |-- mRrCc_r: double (nullable = true)
 |-- type_r: integer (nullable = true)
 |-- type: integer (nullable = true)
 |-- specClass: integer (nullable = true)



In [29]:
# Total number of rows loaded from the CSV, before any cleaning.
sdss_data.count()

1030220

There are missing values in this data set, encoded by the sentinel values $0$ and $-9999$. Replacing these sentinels with `None` (null) marks them explicitly as missing. The number of missing values in each column can then be obtained by converting the Spark DataFrame to a pandas-on-Spark DataFrame and chaining the `.isnull().sum()` methods.

In [48]:
# Turn both missing-value sentinels (-9999 and 0) into nulls so that they
# are counted and dropped as missing data later on.
sdss_data = (
    sdss_data
    .replace(-9999, None)
    .replace(0, None)
)

In [49]:
# Convert the Spark DataFrame to a pandas-on-Spark DataFrame.
# `DataFrame.to_pandas_on_spark()` is deprecated in favour of
# `DataFrame.pandas_api()` and is removed in Spark 4.0, so prefer the new name
# and only fall back to the old one on Spark versions that lack it.
if hasattr(sdss_data, "pandas_api"):
    sdss_data_ps = sdss_data.pandas_api()
else:
    sdss_data_ps = sdss_data.to_pandas_on_spark()

In [50]:
# Per-column count of null entries (the former 0 / -9999 sentinels).
sdss_data_ps.isnull().sum()

objID            0
ra               0
dec              0
specObjID        0
psfMag_r         0
modelMag_r       0
petroMag_r       0
fiberMag_r       1
petroRad_r       0
petroR50_r      16
petroR90_r      16
lnLStar_r       55
lnLExp_r       104
lnLDeV_r        79
mE1_r          524
mE2_r          524
mRrCc_r        524
type_r           0
type             0
specClass     9989
dtype: int64

There are nearly 12,000 missing values, spread across 10,310 rows. These rows can be removed to prepare the data for machine learning algorithms, leaving a total of 1,019,910 rows.

In [52]:
# Discard every row that has a null in at least one column.
sdss_data_ps = sdss_data_ps.dropna(axis=0, how="any")

In [56]:
# Number of rows remaining after the incomplete rows were dropped.
sdss_data_ps.shape[0]

1019910

At this point, the data contains observations for 816,849 galaxies (`type` 3) and 203,061 stars (`type` 6).

In [59]:
# Tally the remaining observations by their `type` code.
sdss_data_ps["type"].value_counts()

3    816849
6    203061
Name: type, dtype: int64