# **Required Libraries**

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_predict,cross_val_score
from sklearn.metrics import precision_score,recall_score,f1_score,confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from tensorflow import keras
from keras.layers import Dense,Dropout,Input
from keras.models import Model
from keras.utils import to_categorical,plot_model

# Loading Data

In [None]:
import os

# Print the full path of every file below the Kaggle input directory so the
# dataset file name can be copied into the loading cell.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Read the Sloan Digital Sky Survey CSV export into the working DataFrame `data`.
data = pd.read_csv("/kaggle/input/sloan-digital-sky-survey/Skyserver_SQL2_27_2018 6_51_39 PM.csv")

In [None]:
# Preview the first rows to check how the columns were parsed.
data.head()

**Understanding the columns:**
* objid = Object Identifier
* ra = J2000 Right Ascension (r-band)
* dec = J2000 Declination (r-band)
* Right ascension (abbreviated RA) is the angular distance measured eastward along the celestial equator from the Sun at the March equinox to the hour circle of the point above the earth in question. When paired with declination (abbreviated dec), these astronomical coordinates specify the direction of a point on the celestial sphere (traditionally called in English the skies or the sky) in the equatorial coordinate system.

* Source: https://en.wikipedia.org/wiki/Right_ascension

* u = better of DeV/Exp magnitude fit
* g = better of DeV/Exp magnitude fit
* r = better of DeV/Exp magnitude fit
* i = better of DeV/Exp magnitude fit
* z = better of DeV/Exp magnitude fit
* **These magnitudes follow the Thuan-Gunn astronomical magnitude system: u, g, r, i and z are the responses of the telescope's 5 bands.**

* Further education: https://www.astro.umd.edu/~ssm/ASTR620/mags.html

* run = Run Number
* rerun = Rerun Number
* camcol = Camera column
* field = Field number
* **Run, rerun, camcol and field are features which describe a field within an image taken by the SDSS. A field is basically a part of the entire image corresponding to 2048 by 1489 pixels. A field can be identified by:**

* **run number, which identifies the specific scan,**
* **the camera column, or "camcol," a number from 1 to 6, identifying the scanline within the run, and**
* **the field number. The field number typically starts at 11 (after an initial rampup time), and can be as large as 800 for particularly long runs.**
* **An additional number, rerun, specifies how the image was processed.**
* View "SpecObj"
* specobjid = Object Identifier
* class = object class (galaxy, star or quasar object)
* The class identifies an object to be either a galaxy, star or quasar. This will be the response variable which we will be trying to predict.

* redshift = Final Redshift
* plate = plate number
* mjd = MJD of observation
* fiberid = fiber ID

* **In physics, redshift happens when light or other electromagnetic radiation from an object is increased in wavelength, or shifted to the red end of the spectrum.**

* **Each spectroscopic exposure employs a large, thin, circular metal plate that positions optical fibers via holes drilled at the locations of the images in the telescope focal plane. These fibers then feed into the spectrographs. Each plate has a unique serial number, which is called plate in views such as SpecObj in the CAS.**

* **Modified Julian Date, used to indicate the date that a given piece of SDSS data (image or spectrum) was taken.**

* **The SDSS spectrograph uses optical fibers to direct the light at the focal plane from individual objects to the slithead. Each object is assigned a corresponding fiberID.**

* Further information on SDSS images and their attributes:

* http://www.sdss3.org/dr9/imaging/imaging_basics.php

* http://www.sdss3.org/dr8/glossary.php


# **Looking for and correcting missing values**

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Encode the target labels as integers. LabelEncoder assigns codes in sorted
# label order, giving Galaxy: 0, QSO: 1, Star: 2.
enc = LabelEncoder()
data['class'] = enc.fit_transform(data['class'])

# Remove columns that carry no information for the classifier:
#   objid - it is basically a row identifier
#   rerun - it has the same value (301) across the entire dataset
# `columns=` replaces the positional axis argument (no longer accepted by
# pandas 2.0), and errors='ignore' lets the cell be re-run after the columns
# have already been removed.
data = data.drop(columns=['objid', 'rerun'], errors='ignore')

In [None]:
# Display the working frame to inspect it after the preprocessing cell above.
data

# **Basic EDA**

In [None]:
# Descriptive statistics for the columns of the working frame.
data.describe()

**Correlation Plot**

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(25, 10))
sb.heatmap(data.corr(), annot=True, ax=ax)

* u, g, r, i and z are very strongly correlated with each other


**Ra and Class**

In [None]:
# ra plotted against the row index of the frame.
fig, ax = plt.subplots(figsize=(25, 5))
ax.plot(data['ra'], color='red');
ax.set_title('ra distribution');


In [None]:
# One ra histogram per encoded class (0 = galaxy, 1 = QSO, 2 = star).
fig, ax = plt.subplots(1, 3, figsize=(30, 6))
panel_specs = [
    (0, 'green', 'ra  distribution for galaxy'),
    (1, 'pink', 'ra  distribution for QSO'),
    (2, 'orange', 'ra  distribution for Star'),
]
for panel, (label, shade, heading) in zip(ax, panel_specs):
    panel.hist([data.loc[data['class'] == label, 'ra']], color=shade, edgecolor='black')
    panel.set_title(heading)


In [None]:
# Compare the mean ra of each class. Passing a single scalar mean to plt.hist
# only draws one unit-height bin at that value, so the means are shown as
# horizontal bars instead (the value stays on the x axis).
class_names = {0: 'GALAXY', 1: 'QSO', 2: 'STAR'}  # encoded class -> label
mean_by_class = data.groupby('class')['ra'].mean().reindex(list(class_names))

plt.figure(figsize=(20, 5))
plt.barh(list(class_names.values()), mean_by_class.to_numpy(),
         color=['tab:blue', 'tab:orange', 'tab:green'])
plt.xlabel('mean ra')
plt.title('Mean distribution of ra for galaxy,qso and star');

DEC and class

In [None]:
# dec plotted against the row index of the frame.
fig, ax = plt.subplots(figsize=(25, 5))
ax.plot(data['dec'], color='violet')

In [None]:
# One dec histogram per encoded class (0 = galaxy, 1 = QSO, 2 = star).
fig, ax = plt.subplots(1, 3, figsize=(30, 6))
panel_specs = [
    (0, 'green', 'dec  distribution for galaxy'),
    (1, 'pink', 'dec  distribution for QSO'),
    (2, 'orange', 'dec  distribution for Star'),
]
for panel, (label, shade, heading) in zip(ax, panel_specs):
    panel.hist([data.loc[data['class'] == label, 'dec']], color=shade, edgecolor='black')
    panel.set_title(heading)

In [None]:
# Compare the mean dec of each class. Passing a single scalar mean to plt.hist
# only draws one unit-height bin at that value, so the means are shown as
# horizontal bars instead (the value stays on the x axis, keeping the
# original tick positions meaningful).
class_names = {0: 'GALAXY', 1: 'QSO', 2: 'STAR'}  # encoded class -> label
mean_by_class = data.groupby('class')['dec'].mean().reindex(list(class_names))

plt.figure(figsize=(20, 5))
plt.barh(list(class_names.values()), mean_by_class.to_numpy(),
         color=['tab:blue', 'tab:orange', 'tab:green'])
plt.xticks([10,11,12,13,14,15,16,17,18,19,20,21])
plt.xlabel('mean dec')
plt.title('Mean distribution of dec for galaxy,qso and star');

**U,G,R,I,Z and Class**

In [None]:
# Each photometric band plotted against the row index, one panel per band.
fig, ax = plt.subplots(1, 5, figsize=(25, 5))
band_specs = [
    ('u', 'orange', 'U distribution'),
    ('g', 'purple', 'g distribution'),
    ('r', 'red', 'R distribution'),
    ('i', 'green', 'I distribution'),
    ('z', 'yellow', 'Z distribution'),
]
for panel, (band, shade, heading) in zip(ax, band_specs):
    panel.plot(data[band], color=shade)
    panel.set_title(heading)

In [None]:
# Grid of histograms: one row per photometric band, one column per encoded
# class (0 = galaxy, 1 = QSO, 2 = star).
fig, ax = plt.subplots(5, 3, figsize=(30, 30))
class_shades = ('green', 'pink', 'orange')
band_headings = {
    'u': ('u  distribution for galaxy', 'u  distribution for QSO', 'u  distribution for Star'),
    'g': ('g distribution for galaxy', 'g  distribution for QSO', 'g  distribution for Star'),
    'r': ('r  distribution for galaxy', 'r  distribution for QSO', 'r  distribution for Star'),
    'i': ('i  distribution for galaxy', 'i  distribution for QSO', 'i  distribution for Star'),
    'z': ('z  distribution for galaxy', 'z  distribution for QSO', 'z  distribution for Star'),
}
for axes_row, (band, headings) in zip(ax, band_headings.items()):
    for label, (panel, shade, heading) in enumerate(zip(axes_row, class_shades, headings)):
        panel.hist([data.loc[data['class'] == label, band]], color=shade, edgecolor='black')
        panel.set_title(heading)

**Run and Class**

In [None]:
# run plotted against the row index of the frame.
fig, ax = plt.subplots(figsize=(25, 5))
ax.plot(data['run'], color='green');

In [None]:
# One run histogram per encoded class (0 = galaxy, 1 = QSO, 2 = star).
fig, ax = plt.subplots(1, 3, figsize=(30, 6))
panel_specs = [
    (0, 'green', 'run  distribution for galaxy'),
    (1, 'pink', 'run distribution for QSO'),
    (2, 'orange', 'run  distribution for Star'),
]
for panel, (label, shade, heading) in zip(ax, panel_specs):
    panel.hist([data.loc[data['class'] == label, 'run']], color=shade, edgecolor='black')
    panel.set_title(heading)

In [None]:
# Compare the mean run of each class. Passing a single scalar mean to plt.hist
# only draws one unit-height bin at that value, so the means are shown as
# horizontal bars instead (the value stays on the x axis).
class_names = {0: 'GALAXY', 1: 'QSO', 2: 'STAR'}  # encoded class -> label
mean_by_class = data.groupby('class')['run'].mean().reindex(list(class_names))

plt.figure(figsize=(20, 5))
plt.barh(list(class_names.values()), mean_by_class.to_numpy(),
         color=['tab:blue', 'tab:orange', 'tab:green'])
plt.xlabel('mean run')
plt.title('run mean distribution  for galaxy,qso and star');

**camcol and class**

In [None]:
# camcol plotted against the row index of the frame.
fig, ax = plt.subplots(figsize=(25, 5))
ax.plot(data['camcol'], color='yellow')

In [None]:
# Box plot of camcol. The series is passed as `x=` explicitly: seaborn
# deprecated positional variables, and newer releases treat the first
# positional argument as `data`, which changes the orientation of the plot.
plt.figure(figsize=(25, 5))
sb.boxplot(x=data['camcol'])

In [None]:
# One camcol histogram per encoded class (0 = galaxy, 1 = QSO, 2 = star).
fig, ax = plt.subplots(1, 3, figsize=(30, 6))
panel_specs = [
    (0, 'green', 'camcol  distribution for galaxy'),
    (1, 'pink', 'camcol distribution for QSO'),
    (2, 'orange', 'camcol  distribution for Star'),
]
for panel, (label, shade, heading) in zip(ax, panel_specs):
    panel.hist([data.loc[data['class'] == label, 'camcol']], color=shade, edgecolor='black')
    panel.set_title(heading)

In [None]:
# Compare the mean camcol of each class. Passing a single scalar mean to
# plt.hist only draws one unit-height bin at that value, so the means are
# shown as horizontal bars instead (the value stays on the x axis).
class_names = {0: 'GALAXY', 1: 'QSO', 2: 'STAR'}  # encoded class -> label
mean_by_class = data.groupby('class')['camcol'].mean().reindex(list(class_names))

plt.figure(figsize=(20, 5))
plt.barh(list(class_names.values()), mean_by_class.to_numpy(),
         color=['tab:blue', 'tab:orange', 'tab:green'])
plt.xlabel('mean camcol')
plt.title('mean distribution of camcol for galaxy,qso and star');

**field and class**

In [None]:
# field plotted against the row index of the frame.
fig, ax = plt.subplots(figsize=(25, 5))
ax.plot(data['field'], color='pink')

In [None]:
# One field histogram per encoded class (0 = galaxy, 1 = QSO, 2 = star).
fig, ax = plt.subplots(1, 3, figsize=(30, 6))
panel_specs = [
    (0, 'green', 'field  distribution for galaxy'),
    (1, 'pink', 'field distribution for QSO'),
    (2, 'orange', 'field  distribution for Star'),
]
for panel, (label, shade, heading) in zip(ax, panel_specs):
    panel.hist([data.loc[data['class'] == label, 'field']], color=shade, edgecolor='black')
    panel.set_title(heading)

In [None]:
# Compare the mean field of each class. Passing a single scalar mean to
# plt.hist only draws one unit-height bin at that value, so the means are
# shown as horizontal bars instead (the value stays on the x axis).
class_names = {0: 'GALAXY', 1: 'QSO', 2: 'STAR'}  # encoded class -> label
mean_by_class = data.groupby('class')['field'].mean().reindex(list(class_names))

plt.figure(figsize=(20, 5))
plt.barh(list(class_names.values()), mean_by_class.to_numpy(),
         color=['tab:blue', 'tab:orange', 'tab:green'])
plt.xlabel('mean field')
plt.title('mean distribution of field for galaxy,qso and star');

specobjid and class

In [None]:
# specobjid plotted against the row index of the frame.
fig, ax = plt.subplots(figsize=(25, 5))
ax.plot(data['specobjid'], color='black')


In [None]:
# One specobjid histogram per encoded class (0 = galaxy, 1 = QSO, 2 = star).
fig, ax = plt.subplots(1, 3, figsize=(30, 6))
panel_specs = [
    (0, 'green', 'specobjid  distribution for galaxy'),
    (1, 'pink', 'specobjid distribution for QSO'),
    (2, 'orange', 'specobjid  distribution for Star'),
]
for panel, (label, shade, heading) in zip(ax, panel_specs):
    panel.hist([data.loc[data['class'] == label, 'specobjid']], color=shade, edgecolor='black')
    panel.set_title(heading)

**RedShift and Class**

In [None]:
# redshift plotted against the row index of the frame.
fig, ax = plt.subplots(figsize=(25, 5))
ax.plot(data['redshift'], color='crimson')

In [None]:
# One redshift histogram per encoded class (0 = galaxy, 1 = QSO, 2 = star).
fig, ax = plt.subplots(1, 3, figsize=(30, 6))
panel_specs = [
    (0, 'green', 'redshift  distribution for galaxy'),
    (1, 'pink', 'red shift distribution for QSO'),
    (2, 'orange', 'redshift  distribution for Star'),
]
for panel, (label, shade, heading) in zip(ax, panel_specs):
    panel.hist([data.loc[data['class'] == label, 'redshift']], color=shade, edgecolor='black')
    panel.set_title(heading)

In [None]:
# Compare the mean redshift of each class. Passing a single scalar mean to
# plt.hist only draws one unit-height bin at that value, so the means are
# shown as horizontal bars instead (the value stays on the x axis).
class_names = {0: 'GALAXY', 1: 'QSO', 2: 'STAR'}  # encoded class -> label
mean_by_class = data.groupby('class')['redshift'].mean().reindex(list(class_names))

plt.figure(figsize=(20, 5))
plt.barh(list(class_names.values()), mean_by_class.to_numpy(),
         color=['tab:blue', 'tab:orange', 'tab:green'])
plt.xlabel('mean redshift')
plt.title('mean distribution of redshift for galaxy,qso and star');

**plate and class**

In [None]:
# plate plotted against the row index of the frame.
fig, ax = plt.subplots(figsize=(25, 5))
ax.plot(data['plate'], color='slateblue');

In [None]:
# One plate histogram per encoded class (0 = galaxy, 1 = QSO, 2 = star).
fig, ax = plt.subplots(1, 3, figsize=(30, 6))
panel_specs = [
    (0, 'green', 'plate  distribution for galaxy'),
    (1, 'pink', 'plate  distribution for QSO'),
    (2, 'orange', 'plate  distribution for Star'),
]
for panel, (label, shade, heading) in zip(ax, panel_specs):
    panel.hist([data.loc[data['class'] == label, 'plate']], color=shade, edgecolor='black')
    panel.set_title(heading)

In [None]:
# Compare the mean plate of each class. Passing a single scalar mean to
# ax.hist only draws one unit-height bin at that value, and the title was
# applied to the last of the three panels only. A single horizontal bar chart
# shows all three means on one shared axis under one title.
class_names = {0: 'GALAXY', 1: 'QSO', 2: 'STAR'}  # encoded class -> label
mean_by_class = data.groupby('class')['plate'].mean().reindex(list(class_names))

fig, ax = plt.subplots(figsize=(30, 6))
ax.barh(list(class_names.values()), mean_by_class.to_numpy(),
        color=['green', 'pink', 'orange'])
ax.set_xlabel('mean plate')
ax.set_title('mean distribution of plate for galaxy,qso and star');


**mjd and class**

In [None]:
# mjd plotted against the row index of the frame.
fig, ax = plt.subplots(figsize=(25, 5))
ax.plot(data['mjd'], color='mediumspringgreen');

In [None]:
# One mjd histogram per encoded class (0 = galaxy, 1 = QSO, 2 = star).
fig, ax = plt.subplots(1, 3, figsize=(30, 6))
panel_specs = [
    (0, 'green', 'mjd  distribution for galaxy'),
    (1, 'pink', 'mjd distribution for QSO'),
    (2, 'orange', 'mjd  distribution for Star'),
]
for panel, (label, shade, heading) in zip(ax, panel_specs):
    panel.hist([data.loc[data['class'] == label, 'mjd']], color=shade, edgecolor='black')
    panel.set_title(heading)

In [None]:
# Compare the mean mjd of each class. Passing a single scalar mean to ax.hist
# only draws one unit-height bin at that value, and the title was applied to
# the last of the three panels only. A single horizontal bar chart shows all
# three means on one shared axis under one title.
class_names = {0: 'GALAXY', 1: 'QSO', 2: 'STAR'}  # encoded class -> label
mean_by_class = data.groupby('class')['mjd'].mean().reindex(list(class_names))

fig, ax = plt.subplots(figsize=(30, 6))
ax.barh(list(class_names.values()), mean_by_class.to_numpy(),
        color=['green', 'pink', 'orange'])
ax.set_xlabel('mean mjd')
# The title previously said "plate" (copied from the plate cell); this plot shows mjd.
ax.set_title('mean distribution of mjd for galaxy,qso and star');

fiberid and class

In [None]:
# fiberid plotted against the row index of the frame.
fig, ax = plt.subplots(figsize=(25, 5))
ax.plot(data['fiberid'], color='lawngreen');

In [None]:
# One fiberid histogram per encoded class (0 = galaxy, 1 = QSO, 2 = star).
fig, ax = plt.subplots(1, 3, figsize=(30, 6))
panel_specs = [
    (0, 'green', 'fiberid  distribution for galaxy'),
    (1, 'pink', 'fibreid distribution for QSO'),
    (2, 'orange', 'fiberid  distribution for Star'),
]
for panel, (label, shade, heading) in zip(ax, panel_specs):
    panel.hist([data.loc[data['class'] == label, 'fiberid']], color=shade, edgecolor='black')
    panel.set_title(heading)

In [None]:
# Compare the mean fiberid of each class. Passing a single scalar mean to
# plt.hist only draws one unit-height bin at that value, so the means are
# shown as horizontal bars instead (the value stays on the x axis).
class_names = {0: 'GALAXY', 1: 'QSO', 2: 'STAR'}  # encoded class -> label
mean_by_class = data.groupby('class')['fiberid'].mean().reindex(list(class_names))

plt.figure(figsize=(20, 5))
plt.barh(list(class_names.values()), mean_by_class.to_numpy(),
         color=['tab:blue', 'tab:orange', 'tab:green'])
plt.xlabel('mean fiberid')
plt.title('mean distribution of fiberid for galaxy,qso and star');

# Model Selection

In [None]:
# Separate the target from the features.
# The features are built as a new frame rather than dropping 'class' from
# `data` in place: the in-place version made this cell (and every EDA cell
# that filters on data['class']) fail when re-run, and `x = data` was only an
# alias of the mutated frame. `columns=` also replaces the positional axis
# argument, which pandas 2.0 no longer accepts.
y = data['class']
x = data.drop(columns='class')

In [None]:
# Hold out a quarter of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

1. **KNeighborsClassifier**

In [None]:
# Search n_neighbors in 1..9 with cross-validation.
# The search is fitted on the training split only: fitting it on the full
# x, y lets the held-out test rows influence the chosen hyper-parameter.
model_params = {'n_neighbors': np.arange(1, 10)}
grid = GridSearchCV(KNeighborsClassifier(), model_params)
grid.fit(x_train, y_train)

In [None]:
# Show the parameter combination that scored best in the grid search above.
grid.best_params_

In [None]:
# Fit a k-nearest-neighbours classifier on the training split.
# NOTE(review): n_neighbors=9 is hard-coded, presumably copied from an earlier
# grid.best_params_ output - confirm it still matches the search result.
KNNmodel = KNeighborsClassifier(n_neighbors = 9)
KNNmodel.fit(x_train,y_train)

In [None]:
# Out-of-fold predictions for the training split (3-fold cross-validation).
y_train_pred = cross_val_predict(KNNmodel, x_train, y_train, cv=3)
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("")
# scikit-learn metrics expect (y_true, y_pred). The arguments were swapped
# before, which reported recall under the precision label and vice versa.
precision_knn = precision_score(y_train, y_train_pred, average=None)
print("Precison_Score:", precision_knn.mean())
recall_knn = recall_score(y_train, y_train_pred, average=None)
print("Recall_score:", recall_knn.mean())

print("Cross Val Score in Sample", cross_val_score(KNNmodel, x_train, y_train, cv=3, scoring='accuracy').mean())
# NOTE(review): this cross-validates fresh clones on the test split; it does
# not score the model that was fitted on the training split.
print("Cross Val Score out Sample", cross_val_score(KNNmodel, x_test, y_test, cv=3, scoring='accuracy').mean())


2. NaiveBayes using GaussianNB

In [None]:
# Fit a Gaussian naive Bayes classifier on the training split.
GausianModel = GaussianNB()
GausianModel.fit(x_train,y_train)

In [None]:
# Out-of-fold predictions for the training split (3-fold cross-validation).
y_train_pred = cross_val_predict(GausianModel, x_train, y_train, cv=3)
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("")
# scikit-learn metrics expect (y_true, y_pred). The arguments were swapped
# before, which reported recall under the precision label and vice versa.
precision_NB = precision_score(y_train, y_train_pred, average=None)
print("Precison_Score:", precision_NB.mean())
recall_NB = recall_score(y_train, y_train_pred, average=None)
print("Recall_score:", recall_NB.mean())

print("Cross Val Score in Sample", cross_val_score(GausianModel, x_train, y_train, cv=3, scoring='accuracy').mean())
# NOTE(review): this cross-validates fresh clones on the test split; it does
# not score the model that was fitted on the training split.
print("Cross Val Score out Sample", cross_val_score(GausianModel, x_test, y_test, cv=3, scoring='accuracy').mean())


3. **Decision Tree Classifier**

In [None]:
# Search min_samples_leaf in 2..9 and max_depth in 1..14 with cross-validation.
# The search is fitted on the training split only: fitting it on the full
# x, y lets the held-out test rows influence the chosen hyper-parameters.
# A fixed random_state makes the tree construction repeatable between runs.
params = {'min_samples_leaf': np.arange(2, 10), 'max_depth': np.arange(1, 15)}
grid = GridSearchCV(DecisionTreeClassifier(random_state=42), params)
grid.fit(x_train, y_train)

In [None]:
# Show the parameter combination that scored best in the grid search above.
grid.best_params_

In [None]:
# Fit a decision tree on the training split.
# random_state is fixed so the fitted tree (and the scores derived from it)
# is the same on every run.
# NOTE(review): max_depth and min_samples_leaf are hard-coded, presumably
# copied from an earlier grid.best_params_ output - confirm they still match.
DecisionTree = DecisionTreeClassifier(max_depth=9, min_samples_leaf=6, random_state=42)
DecisionTree.fit(x_train, y_train)

In [None]:
# Out-of-fold predictions for the training split (3-fold cross-validation).
y_train_pred = cross_val_predict(DecisionTree, x_train, y_train, cv=3)
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("")
# scikit-learn metrics expect (y_true, y_pred). The arguments were swapped
# before, which reported recall under the precision label and vice versa.
precision_DecisionTree = precision_score(y_train, y_train_pred, average=None)
print("Precison_Score:", precision_DecisionTree.mean())
recall_DecisionTree = recall_score(y_train, y_train_pred, average=None)
print("Recall_score:", recall_DecisionTree.mean())

print("Cross Val Score in Sample", cross_val_score(DecisionTree, x_train, y_train, cv=3, scoring='accuracy').mean())
# NOTE(review): this cross-validates fresh clones on the test split; it does
# not score the model that was fitted on the training split.
print("Cross Val Score out Sample", cross_val_score(DecisionTree, x_test, y_test, cv=3, scoring='accuracy').mean())


4.RandomForestClassifier

In [None]:
# Fit a 1000-tree random forest on the training split.
# random_state is fixed so the bootstrap samples and feature subsets (and
# therefore the scores) are the same on every run; n_jobs=-1 builds the trees
# on all available cores without changing the result.
Forest = RandomForestClassifier(
    max_depth=9,
    min_samples_leaf=6,
    n_estimators=1000,
    random_state=42,
    n_jobs=-1,
)
Forest.fit(x_train, y_train)

In [None]:
# Out-of-fold predictions for the training split (3-fold cross-validation).
y_train_pred = cross_val_predict(Forest, x_train, y_train, cv=3)
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("")
# scikit-learn metrics expect (y_true, y_pred). The arguments were swapped
# before, which reported recall under the precision label and vice versa.
precision_Forest = precision_score(y_train, y_train_pred, average=None)
print("Precison_Score:", precision_Forest.mean())
recall_Forest = recall_score(y_train, y_train_pred, average=None)
print("Recall_score:", recall_Forest.mean())

print("Cross Val Score in Sample", cross_val_score(Forest, x_train, y_train, cv=3, scoring='accuracy').mean())
# NOTE(review): this cross-validates fresh clones on the test split; it does
# not score the model that was fitted on the training split.
print("Cross Val Score out Sample", cross_val_score(Forest, x_test, y_test, cv=3, scoring='accuracy').mean())


5. AdaBoostClassifier on DecisionTreeClassifier

In [None]:
# Fit AdaBoost with a depth-limited decision tree as the base learner.
# Both the booster and the base tree get a fixed random_state so the fitted
# ensemble (and the scores derived from it) is the same on every run.
adb = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=9, min_samples_leaf=6, random_state=42),
    random_state=42,
)
adb.fit(x_train, y_train)

In [None]:
# Out-of-fold predictions for the training split (3-fold cross-validation).
y_train_pred = cross_val_predict(adb, x_train, y_train, cv=3)
print("Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("")
# scikit-learn metrics expect (y_true, y_pred). The arguments were swapped
# before, which reported recall under the precision label and vice versa.
precision_adb = precision_score(y_train, y_train_pred, average=None)
print("Precison_Score:", precision_adb.mean())
recall_adb = recall_score(y_train, y_train_pred, average=None)
print("Recall_score:", recall_adb.mean())

print("Cross Val Score in Sample", cross_val_score(adb, x_train, y_train, cv=3, scoring='accuracy').mean())
# NOTE(review): this cross-validates fresh clones on the test split; it does
# not score the model that was fitted on the training split.
print("Cross Val Score out Sample", cross_val_score(adb, x_test, y_test, cv=3, scoring='accuracy').mean())

# Model Deployment

In [None]:
# Train the final AdaBoost model on all available rows (x, y).
# Both the booster and the base tree get a fixed random_state so retraining
# yields the same model on every run.
model = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=9, min_samples_leaf=6, random_state=42),
    random_state=42,
)
model.fit(x, y)

In [None]:
# Display the feature frame to see the columns (and their order) the model was trained on.
x

In [None]:
# Classify one hand-entered observation.
# The sample is wrapped in a DataFrame carrying the training column names:
# the model was fitted on a DataFrame, so predicting from a bare list makes
# scikit-learn warn about missing feature names and skips its check that the
# values line up with the right columns. The values are listed in the column
# order of `x`.
to_predict = pd.DataFrame(
    [[183.531326, 0.089693, 19.47406, 17.04240, 15.94699, 15.50342, 15.22531,
      752, 4, 267, 3.722360e+18, -0.000009, 3306, 54922, 491]],
    columns=x.columns,
)
model.predict(to_predict)

In [None]:
# Look up the encoded class stored under index label 1 of the target series.
# NOTE(review): presumably a sanity check for the prediction above - confirm
# that the hand-entered sample really corresponds to this row.
y[1]