# Classifying stars and galaxies using machine learning

Authored by Maksim Nikiforov

NCSU ST590, Project 3

Spring, 2022

In [1]:
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import pandas as pd
import pyspark.pandas as ps
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()



In [2]:
# Read CSV into a Spark data frame
sdss_data = spark.read.options(header="True", inferSchema='True',delimiter=',') \
  .csv("MyTable_mvnikifo.csv")
sdss_data.printSchema()

root
 |-- objID: long (nullable = true)
 |-- ra: double (nullable = true)
 |-- dec: double (nullable = true)
 |-- specObjID: long (nullable = true)
 |-- psfMag_r: double (nullable = true)
 |-- modelMag_r: double (nullable = true)
 |-- petroMag_r: double (nullable = true)
 |-- fiberMag_r: double (nullable = true)
 |-- petroRad_r: double (nullable = true)
 |-- petroR50_r: double (nullable = true)
 |-- petroR90_r: double (nullable = true)
 |-- lnLStar_r: double (nullable = true)
 |-- lnLExp_r: double (nullable = true)
 |-- lnLDeV_r: double (nullable = true)
 |-- mE1_r: double (nullable = true)
 |-- mE2_r: double (nullable = true)
 |-- mRrCc_r: double (nullable = true)
 |-- type_r: integer (nullable = true)
 |-- type: integer (nullable = true)
 |-- specClass: integer (nullable = true)



In [3]:
sdss_data.count()

1030220

There are missing values in this data, denoted by $0$ and $-9999$. These can be indicated more clearly with the designation "None". The number of missing values can be ascertained by converting the Spark DataFrame to a pandas-on-spark DataFrame and invoking the `.isnull().sum()` sequence of functions. 

In [4]:
sdss_data = sdss_data.replace(-9999, None)
sdss_data = sdss_data.replace(0, None)

There are nearly 12,000 rows with missing data. These can be removed to prepare the data for machine learning algorithms, leaving a total of 1,019,910 rows.

In [5]:
# Count total number of missing values
# Based on example from https://sparkbyexamples.com/pyspark/pyspark-find-count-of-null-none-nan-values/
from pyspark.sql.functions import col, isnan, when, count

sdss_data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in sdss_data.columns]).show()

+-----+---+---+---------+--------+----------+----------+----------+----------+----------+----------+---------+--------+--------+-----+-----+-------+------+----+---------+
|objID| ra|dec|specObjID|psfMag_r|modelMag_r|petroMag_r|fiberMag_r|petroRad_r|petroR50_r|petroR90_r|lnLStar_r|lnLExp_r|lnLDeV_r|mE1_r|mE2_r|mRrCc_r|type_r|type|specClass|
+-----+---+---+---------+--------+----------+----------+----------+----------+----------+----------+---------+--------+--------+-----+-----+-------+------+----+---------+
|    0|  0|  0|        0|       0|         0|         0|         1|         0|        16|        16|       55|     104|      79|  524|  524|    524|     0|   0|     9989|
+-----+---+---+---------+--------+----------+----------+----------+----------+----------+----------+---------+--------+--------+-----+-----+-------+------+----+---------+



In [6]:
# 
sdss_data = sdss_data.dropna()
sdss_data.count()

1019910

The data also contains a `specClass` column with values that correspond to the following classifications (https://skyserver.sdss.org/dr7/en/help/browser/enum.asp?n=SpecClass): 

| Name      | Value | Description                                                                                      |
|-----------|-------|--------------------------------------------------------------------------------------------------|
| UNKNOWN   |   0   | Spectrum not classifiable (zConf < 0.25).                                                        |
| STAR      |   1   | Spectrum of a star.                                                                              |
| GALAXY    |   2   | Spectrum of a galaxy.                                                                            |
| QSO       |   3   | Spectrum of a quasi-stellar object.                                                              |
| HIZ_QSO   |   4   | Spectrum of a high-redshift quasar (z>2.3), whose redshift is confirmed by a Ly-alpha estimator. |
| SKY       |   5   | Spectrum of blank sky.                                                                           |
| STAR_LATE |   6   | Star dominated bt molecular bands M or later.                                                    |
| GAL_EM    |   7   | Emission line galaxy (placeholder).                                                              |

The intent of this project is to classify only stars and galaxies, and all other observations should be removed.

In [7]:
sdss_data = sdss_data.filter((sdss_data.specClass == 1) | \
                              (sdss_data.specClass == 2))

At this point, the data contains observations for 81,633 stars and 802,474 galaxies.

In [8]:
sdss_data.groupBy("type").count().show()

+----+------+
|type| count|
+----+------+
|   6| 81633|
|   3|802474|
+----+------+



## Logistic regression model

In [35]:
from pyspark.ml.feature import SQLTransformer

sqlTrans = SQLTransformer(
    statement = "SELECT psfMag_r, modelMag_r, petroMag_r, fiberMag_r, petroRad_r, petroR50_r, petroR90_r, lnLStar_r, lnLExp_r, lnLDeV_r, mE1_r, mE2_r, mRrCc_r, type as label FROM __THIS__"
)

In [36]:
sqlTrans.transform(sdss_data).show(5)

+--------+----------+----------+----------+----------+----------+----------+---------+---------+---------+----------+-----------+--------+-----+
|psfMag_r|modelMag_r|petroMag_r|fiberMag_r|petroRad_r|petroR50_r|petroR90_r|lnLStar_r| lnLExp_r| lnLDeV_r|     mE1_r|      mE2_r| mRrCc_r|label|
+--------+----------+----------+----------+----------+----------+----------+---------+---------+---------+----------+-----------+--------+-----+
|18.01274|  16.43724|  16.61505|  18.07128|  5.936419|  2.581169|   7.24652|-15946.72|-1429.818|-776.5237|-0.4130035|  0.5511445| 37.7417|    3|
| 17.7713|    16.704|  16.79972|  17.86816|  3.742817|  1.750927|  5.314383|-10572.39|-1694.458|-133.5919| 0.0779022| -0.1298255|12.46036|    3|
|18.71541|  17.15762|  17.09714|  18.76385|  6.089326|  2.881765|  6.535208|-7992.962|-337.2565|-407.3528|0.03209908|  0.4520678|43.33707|    3|
|17.59361|  16.33401|  16.40417|  17.73234|  5.358847|  2.365247|  7.632291|-15323.06| -3282.95|-114.9684|  0.082524|-0.03256264|1

The data set should be split into a training and a testing test before applying transformations.

In [39]:
train_SDSS, test_SDSS = sqlTrans.transform(sdss_data).randomSplit([0.8,0.2], seed = 1)
print(train_SDSS.count(), test_SDSS.count())

707626 176481


In [42]:
# Create our list of features.
# Obtain all column names (list)
features_list = train_SDSS.columns

# Specify the column names to remove
remove_col = ["label"]

# Remove these columns using a list comprehension loop and output final list of features
features_list = [col_name for col_name in features_list if col_name not in remove_col]
features_list

['psfMag_r',
 'modelMag_r',
 'petroMag_r',
 'fiberMag_r',
 'petroRad_r',
 'petroR50_r',
 'petroR90_r',
 'lnLStar_r',
 'lnLExp_r',
 'lnLDeV_r',
 'mE1_r',
 'mE2_r',
 'mRrCc_r']

Since the data is now split, it can be transformed.

In [43]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

vecAssembler = VectorAssembler(inputCols=features_list, outputCol="unscaledFeatures")

features_df = vecAssembler.transform(train_SDSS)

stdScaler = StandardScaler(inputCol="unscaledFeatures", outputCol="scaledFeatures")

scaled_df = stdScaler.fit(features_df).transform(features_df)


scaled_df.select("scaledFeatures").show(5)

+--------------------+
|      scaledFeatures|
+--------------------+
|[15.6929391950376...|
|[15.9325044686517...|
|[15.9685972696521...|
|[15.9804840538833...|
|[16.0223871859818...|
+--------------------+
only showing top 5 rows



In [44]:
vecAssembler = VectorAssembler(inputCols=features_list, outputCol="unscaledFeatures")

features_df_test = vecAssembler.transform(test_SDSS)

stdScaler = StandardScaler(inputCol="unscaledFeatures", outputCol="scaledFeatures")

scaled_df_test = stdScaler.fit(features_df_test).transform(features_df_test)

In [46]:
from pyspark.ml.classification import RandomForestClassifier

rfc = RandomForestClassifier(labelCol="label", featuresCol="scaledFeatures")

model = rfc.fit(scaled_df)

In [56]:
pred = model.transform(scaled_df_test)
pred.show(5)

+--------+----------+----------+----------+----------+----------+----------+---------+---------+---------+-----------+------------+--------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|psfMag_r|modelMag_r|petroMag_r|fiberMag_r|petroRad_r|petroR50_r|petroR90_r|lnLStar_r| lnLExp_r| lnLDeV_r|      mE1_r|       mE2_r| mRrCc_r|label|    unscaledFeatures|      scaledFeatures|       rawPrediction|         probability|prediction|
+--------+----------+----------+----------+----------+----------+----------+---------+---------+---------+-----------+------------+--------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|14.42715|   14.4284|   14.4905|  14.76792|  1.134293| 0.5847612|  1.286333|-256.9894|-260.3972| -259.931| 0.02392568|  -0.0334613|2.521054|    6|[14.42715,14.4284...|[16.0016856109306...|[0.0,0.0,0.0,1.13...|[0.0,0.0,0.0,0.05...|       6.0|
|14.75226|  15.09743|  14.80867|

In [49]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator=MulticlassClassificationEvaluator(predictionCol="prediction")
acc = evaluator.evaluate(pred)
print("Prediction Accuracy: ", acc)

Prediction Accuracy:  0.9951260086324447


In [54]:
from sklearn.metrics import confusion_matrix

y_pred=pred.select("prediction").collect()
y_orig=pred.select("label").collect()

cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[159693    380]
 [   479  15929]]


In [60]:
faint_mag = pred.filter((pred.modelMag_r >= 20.5) & (pred.modelMag_r <= 21))

In [61]:
acc_faint = evaluator.evaluate(faint_mag)
print("Prediction Accuracy: ", acc_faint)

Prediction Accuracy:  0.9223043524303716


In [None]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages = [vecAssembler, assembler, lr])