Dataset source (Kaggle, "Stellar Classification Dataset - SDSS17"): https://www.kaggle.com/datasets/fedesoriano/stellar-classification-dataset-sdss17

In [2]:
import numpy as np
import pandas as pd

In [136]:
# Column labels applied to the CSV: 17 descriptive columns plus the 'class' target.
names = [
    "obj_ID", "alpha", "delta",
    "u", "g", "r", "i", "z",
    "run_ID", "rerun_ID", "cam_col", "field_ID",
    "spec_obj_ID", "class", "redshift",
    "plate", "MJD", "fiber_ID",
]
# NOTE(review): with `names=` given and no `header=`, pandas treats the first
# line of the file as data. If this CSV carries its own header row, pass
# `header=0` as well -- confirm against the file on disk.
dataset = pd.read_csv("star_classification.csv", names=names)

Content
The data consists of 100,000 observations of space taken by the SDSS (Sloan Digital Sky Survey). Every observation is described by 17 feature columns and 1 class column which identifies it to be either a star, galaxy or quasar.

obj_ID = Object Identifier, the unique value that identifies the object in the image catalog used by the CAS
alpha = Right Ascension angle (at J2000 epoch)
delta = Declination angle (at J2000 epoch)
u = Ultraviolet filter in the photometric system
g = Green filter in the photometric system
r = Red filter in the photometric system
i = Near Infrared filter in the photometric system
z = Infrared filter in the photometric system
run_ID = Run Number used to identify the specific scan
rerun_ID = Rerun Number to specify how the image was processed
cam_col = Camera column to identify the scanline within the run
field_ID = Field number to identify each field
spec_obj_ID = Unique ID used for optical spectroscopic objects (this means that 2 different observations with the same spec_obj_ID must share the output class)
class = object class (galaxy, star or quasar object)
redshift = redshift value based on the increase in wavelength
plate = plate ID, identifies each plate in SDSS
MJD = Modified Julian Date, used to indicate when a given piece of SDSS data was taken
fiber_ID = fiber ID that identifies the fiber that pointed the light at the focal plane in each observation

In [137]:
# Preview the first rows of the loaded frame to sanity-check the column labels.
dataset.head()

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.1522e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842


In [144]:
# Separate the predictors (every column except the label) from the target.
# `columns=` replaces the positional axis argument of `drop('class', 1)`:
# passing the axis positionally raises a FutureWarning on pandas 1.3+ and is
# rejected on pandas 2.x.
X = dataset.drop(columns='class')
y = dataset['class']

  X = dataset.drop('class', 1)


In [128]:
# Display the feature frame (the frame left after removing 'class').
X

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.27530,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,0.779136,10445,58158,427
2,1.237661e+18,142.188790,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.152200e+18,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.25010,4192,301,3,214,1.030107e+19,0.932346,9149,58039,775
4,1.237680e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,0.116123,6121,56187,842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1.237679e+18,39.620709,-2.594074,22.16759,22.97586,21.90404,21.30548,20.73569,7778,301,2,581,1.055431e+19,0.000000,9374,57749,438
99996,1.237679e+18,29.493819,19.798874,22.69118,22.38628,20.45003,19.75759,19.41526,7917,301,1,289,8.586351e+18,0.404895,7626,56934,866
99997,1.237668e+18,224.587407,15.700707,21.16916,19.26997,18.20428,17.69034,17.35221,5314,301,4,308,3.112008e+18,0.143366,2764,54535,74
99998,1.237661e+18,212.268621,46.660365,25.35039,21.63757,19.91386,19.07254,18.62482,3650,301,4,131,7.601080e+18,0.455040,6751,56368,470


In [145]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
# NOTE(review): no `stratify=` is passed, so class proportions in the two
# parts are left to the shuffle -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [130]:
# Row and column counts of the training features right after the split.
X_train.shape

(80000, 17)

In [131]:
# Display the training features; the index keeps the original row labels.
X_train

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,redshift,plate,MJD,fiber_ID
10382,1.237666e+18,248.650225,17.219264,25.55758,21.40075,19.58257,18.87423,18.34557,4671,301,6,135,4.575691e+18,0.412042,4064,55366,122
73171,1.237662e+18,228.034911,4.918393,25.31210,23.30647,21.88567,20.24809,20.38933,3910,301,2,119,2.063819e+18,-0.000288,1833,54561,162
30938,1.237670e+18,39.840078,26.771964,20.14939,18.97693,18.70521,18.61588,18.52578,5817,301,2,135,2.751736e+18,-0.000222,2444,54082,134
99310,1.237680e+18,5.026235,28.058156,19.75068,19.64277,19.36223,19.06267,18.91270,8103,301,5,126,8.662929e+18,0.244059,7694,57359,926
58959,1.237662e+18,194.879190,42.777076,22.09026,22.08509,21.83683,21.89166,21.45124,3893,301,5,223,9.430604e+18,0.867342,8376,57786,242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21243,1.237668e+18,178.659485,22.213454,23.02713,21.79717,20.15561,19.21565,18.74604,5183,301,3,310,7.231673e+18,0.479940,6423,56313,63
45891,1.237666e+18,231.520274,18.764341,20.39821,18.49582,17.39976,16.88339,16.45156,4670,301,1,226,2.430822e+18,0.108567,2159,54328,13
42613,1.237663e+18,334.856082,0.077762,18.97330,17.87081,17.44646,17.29562,17.20019,4192,301,4,188,1.288148e+18,-0.000550,1144,53238,432
43567,1.237651e+18,200.820199,65.272106,19.70816,18.40767,17.58343,17.22796,17.01092,1302,301,6,420,6.790177e+17,0.165433,603,52056,364


**Data Preprocessing**

In [146]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both parts so the test rows never influence the fit.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [114]:
# Shape of the scaled training features.
# NOTE(review): scaling does not change the number of columns, so a
# single-column result here means X_train had already been overwritten by the
# PCA cell further down (n_components=1) in the same kernel session.
X_train.shape

(80000, 1)

In [115]:
# Shape of the scaled test features.
# NOTE(review): a single-column result means the PCA cell further down had
# already replaced X_test in this kernel session.
X_test.shape

(20000, 1)

**Apply the Random Forest, k-NN and SVC classifiers and get the models**

In [133]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest with default settings; the seed fixes its randomness.
classifier = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Labels predicted for the held-out rows. The other classifier cells reuse the
# name y_pred, so evaluate before running the next model.
y_pred = classifier.predict(X_test)

Performance:

Time: ~39sec

[[11679   138    34]
 [  263  3571     1]
 [    8     0  4306]]

Accuracy: 0.9778

In [141]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with library defaults.
reg_knn = KNeighborsClassifier().fit(X_train, y_train)

# Replaces any y_pred left behind by a previously run classifier cell.
y_pred = reg_knn.predict(X_test)

Performance:

Time: ~13sec

[[11424   122   305]
 [  670  3127    38]
 [  899     9  3406]]

Accuracy: 0.89785

In [147]:
from sklearn.svm import SVC

# Support-vector classifier with library defaults.
reg_svc = SVC().fit(X_train, y_train)

# Replaces any y_pred left behind by a previously run classifier cell.
y_pred = reg_svc.predict(X_test)

Performance:

Time: ~2min

[[11469   135   247]
 [  393  3432    10]
 [   92     0  4222]]

Accuracy: 0.95615

In [148]:
# Peek at the labels predicted by the most recently run classifier cell.
y_pred

array(['GALAXY', 'QSO', 'GALAXY', ..., 'GALAXY', 'GALAXY', 'STAR'],
      dtype=object)

**Performance Evaluation using confusion matrix and accuracy score**

In [149]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Scores whichever classifier assigned y_pred last.
# In the matrix, rows are the true classes and columns the predicted ones.
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f'\nAccuracy: {accuracy_score(y_test, y_pred)!s}')

[[11469   135   247]
 [  393  3432    10]
 [   92     0  4222]]

Accuracy: 0.95615


In [150]:
# Confirm the training features still have every column before PCA is applied.
X_train.shape

(80000, 17)

In [151]:
# NOTE(review): repeats the shape check from the cell directly above.
X_train.shape

(80000, 17)

In [152]:
# Confirm the test features have the same column count as the training features.
X_test.shape

(20000, 17)

**Apply PCA and reduce dimensionality**
Try different values of `n_components` and compare the resulting accuracy.

In [162]:
from sklearn.decomposition import PCA

# Reduce the features to a single principal component.
# NOTE(review): X_train / X_test are overwritten, so re-running this cell
# (for example to try another n_components) would apply PCA to the
# already-reduced arrays. Re-run the split and scaling cells first.
pca = PCA(n_components=1)
X_train = pca.fit_transform(X_train)  # fit the projection on the training rows only
X_test = pca.transform(X_test)  # apply the projection learned from the training rows

In [159]:
# Column count should now equal the n_components passed to PCA.
X_train.shape 

(80000, 1)

In [155]:
# The test features should have the same reduced column count.
X_test.shape

(20000, 1)

**Apply Random Forest, k-NN and SVC on the dimension-reduced data**

In [163]:
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): max_depth=2 is set here while the forest trained before PCA
# used the default depth, so the two runs differ in more than the PCA step --
# confirm this is intended.
classifier = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)

# Labels predicted for the held-out rows. The other classifier cells reuse the
# name y_pred, so evaluate before running the next model.
y_pred = classifier.predict(X_test)

In [165]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier (library defaults) on the current X_train.
reg_knn = KNeighborsClassifier().fit(X_train, y_train)

# Replaces any y_pred left behind by a previously run classifier cell.
y_pred = reg_knn.predict(X_test)

In [167]:
from sklearn.svm import SVC

# Support-vector classifier (library defaults) on the current X_train.
reg_svc = SVC().fit(X_train, y_train)

# Replaces any y_pred left behind by a previously run classifier cell.
y_pred = reg_svc.predict(X_test)

**Performance Evaluation on the PCA-reduced data**

In [166]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Scores whichever classifier assigned y_pred last, so run this cell right
# after the model you want to evaluate.
# In the matrix, rows are the true classes and columns the predicted ones.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Use the same '\nAccuracy: ' label as the evaluation cell that precedes the
# PCA step; the bare 'Accuracy' label ran straight into the number
# (e.g. "Accuracy0.5828").
print('\nAccuracy: ' + str(accuracy_score(y_test, y_pred)))

[[9499 1315 1037]
 [2381 1236  218]
 [3057  336  921]]
Accuracy0.5828
