# Classifying stars and galaxies using machine learning

Authored by Maksim Nikiforov

NCSU ST590, Project 3

Spring, 2022

## Data ingestion

In [1]:
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import pandas as pd
import pyspark.pandas as ps
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()



In [None]:
# Read CSV into a Spark data frame
sdss_data = spark.read.options(header="True", 
                               inferSchema='True',
                               delimiter=',').csv("MyTable_mvnikifo.csv")
sdss_data.printSchema()

In [None]:
sdss_data.count()

## Exploratory data analysis

There are missing values in this data, denoted by $0$ and $-9999$. These can be indicated more clearly with the designation "None". The number of missing values can be ascertained by converting the Spark DataFrame to a pandas-on-spark DataFrame and invoking the `.isnull().sum()` sequence of functions. 

In [None]:
sdss_data = sdss_data.replace(-9999, None)
sdss_data = sdss_data.replace(0, None)

There are nearly 12,000 rows with missing data. These can be removed to prepare the data for machine learning algorithms, leaving a total of 1,019,910 rows.

In [None]:
# Count total number of missing values
# Based on example from https://sparkbyexamples.com/pyspark/pyspark-find-count-of-null-none-nan-values/
from pyspark.sql.functions import col, isnan, when, count

sdss_data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in sdss_data.columns]).show()

In [None]:
# Remove rows with missing values and calculate new row count
sdss_data = sdss_data.dropna()
sdss_data.count()

The data also contains a `specClass` column with values that correspond to the following classifications (https://skyserver.sdss.org/dr7/en/help/browser/enum.asp?n=SpecClass): 

| Name      | Value | Description                                                                                      |
|-----------|-------|--------------------------------------------------------------------------------------------------|
| UNKNOWN   |   0   | Spectrum not classifiable (zConf < 0.25).                                                        |
| STAR      |   1   | Spectrum of a star.                                                                              |
| GALAXY    |   2   | Spectrum of a galaxy.                                                                            |
| QSO       |   3   | Spectrum of a quasi-stellar object.                                                              |
| HIZ_QSO   |   4   | Spectrum of a high-redshift quasar (z>2.3), whose redshift is confirmed by a Ly-alpha estimator. |
| SKY       |   5   | Spectrum of blank sky.                                                                           |
| STAR_LATE |   6   | Star dominated bt molecular bands M or later.                                                    |
| GAL_EM    |   7   | Emission line galaxy (placeholder).                                                              |

The intent of this project is to classify only stars and galaxies, and all other observations should be removed.

In [None]:
sdss_data = sdss_data.filter((sdss_data.specClass == 1) | \
                              (sdss_data.specClass == 2))

At this point, the data contains observations for 81,633 stars and 802,474 galaxies.

In [None]:
sdss_data.groupBy("type").count().show()

The data set should be split into a training and a testing test before applying transformations.

In [None]:
train_SDSS, test_SDSS = sdss_data.randomSplit([0.8,0.2], seed = 1)
print(train_SDSS.count(), test_SDSS.count())

## Machine learning models

### Random forest classifier

#### Set up transformations

In [None]:
from pyspark.ml.feature import SQLTransformer

sqlTrans = SQLTransformer(
    statement = "SELECT psfMag_r, modelMag_r, petroMag_r, fiberMag_r, \
                        petroRad_r, petroR50_r, petroR90_r, lnLStar_r, \
                        lnLExp_r, lnLDeV_r, mE1_r, mE2_r, mRrCc_r, \
                        type as label FROM __THIS__"
)

In [None]:
# Create our list of features by dropping unused columns
features_list = sqlTrans.transform(train_SDSS).drop("label").columns
features_list

We can set up transformations for our data.

In [None]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

assembler = VectorAssembler(inputCols=features_list, outputCol="unscaledFeatures")
scaler = StandardScaler(inputCol="unscaledFeatures", outputCol="features")

We can select an algorithm for our data and instantiate it.

In [None]:
from pyspark.ml.classification import RandomForestClassifier

rfc = RandomForestClassifier(labelCol="label", featuresCol="features")

Finally, we can set up a pipeline.

In [None]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages = [sqlTrans, assembler, rfc])

We can then set up cross-validation.

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator


paramGrid = ParamGridBuilder() \
    .addGrid(rfc.numTrees, [5, 10]) \
    .build()

crossVal = CrossValidator(estimator = pipeline,
                          estimatorParamMaps = paramGrid,
                          evaluator = BinaryClassificationEvaluator(),
                          numFolds = 3)

In [None]:
# Run cross-validation, and choose the best set of parameters

#cvModel = crossVal.fit(train_SDSS)

In [None]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression()
grid_lr = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
evaluator_lr = BinaryClassificationEvaluator()

In [None]:
pipeline_lr = Pipeline(stages = [sqlTrans, assembler, scaler, lr])

In [None]:
cv = CrossValidator(estimator=pipeline_lr, estimatorParamMaps=None, evaluator=evaluator_lr,
    parallelism=2)
cvModel_lr = cv.fit(test_SDSS)

In [49]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator=MulticlassClassificationEvaluator(predictionCol="prediction")
acc = evaluator.evaluate(pred)
print("Prediction Accuracy: ", acc)

Prediction Accuracy:  0.9951260086324447


In [54]:
from sklearn.metrics import confusion_matrix

y_pred=pred.select("prediction").collect()
y_orig=pred.select("label").collect()

cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[159693    380]
 [   479  15929]]


In [60]:
faint_mag = pred.filter((pred.modelMag_r >= 20.5) & (pred.modelMag_r <= 21))

In [61]:
acc_faint = evaluator.evaluate(faint_mag)
print("Prediction Accuracy: ", acc_faint)

Prediction Accuracy:  0.9223043524303716


In [9]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
 
evaluator = BinaryClassificationEvaluator(
    labelCol="label",                     
    rawPredictionCol="prediction",       
    metricName="areaUnderROC",
)
 
accuracy = evaluator.evaluate(pred)
print(f"Area under ROC = {accuracy} ")

NameError: name 'pred' is not defined

## Cross-validation

In [None]:
from pyspark.ml import Pipeline
pipeline = Pipeline().setStages([assembler, scaler, kmeans])