In [35]:
import numpy as np
import pandas as pd

import tensorflow as tf
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import time

In [23]:
df = pd.read_csv("Skyserver_SQL2_27_2018 6_51_39 PM.csv")
df.head()

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1.23765e+18,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,752,301,4,267,3.72236e+18,STAR,-9e-06,3306,54922,491
1,1.23765e+18,183.598371,0.135285,18.6628,17.21449,16.67637,16.48922,16.3915,752,301,4,267,3.63814e+17,STAR,-5.5e-05,323,51615,541
2,1.23765e+18,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,752,301,4,268,3.23274e+17,GALAXY,0.123111,287,52023,513
3,1.23765e+18,183.870529,0.049911,17.76536,16.60272,16.16116,15.98233,15.90438,752,301,4,269,3.72237e+18,STAR,-0.000111,3306,54922,510
4,1.23765e+18,183.883288,0.102557,17.55025,16.26342,16.43869,16.55492,16.61326,752,301,4,269,3.72237e+18,STAR,0.00059,3306,54922,512


In [24]:
#There are 17 different columns whose details have been learned from kaggle
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   objid      10000 non-null  float64
 1   ra         10000 non-null  float64
 2   dec        10000 non-null  float64
 3   u          10000 non-null  float64
 4   g          10000 non-null  float64
 5   r          10000 non-null  float64
 6   i          10000 non-null  float64
 7   z          10000 non-null  float64
 8   run        10000 non-null  int64  
 9   rerun      10000 non-null  int64  
 10  camcol     10000 non-null  int64  
 11  field      10000 non-null  int64  
 12  specobjid  10000 non-null  float64
 13  class      10000 non-null  object 
 14  redshift   10000 non-null  float64
 15  plate      10000 non-null  int64  
 16  mjd        10000 non-null  int64  
 17  fiberid    10000 non-null  int64  
dtypes: float64(10), int64(7), object(1)
memory usage: 1.4+ MB


In [25]:
df.describe()
#This shows some main features of the dataset

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,redshift,plate,mjd,fiberid
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1.23765e+18,175.529987,14.836148,18.619355,17.371931,16.840963,16.583579,16.422833,981.0348,301.0,3.6487,302.3801,1.645022e+18,0.143726,1460.9864,52943.5333,353.0694
std,0.0,47.783439,25.212207,0.828656,0.945457,1.067764,1.141805,1.203188,273.305024,0.0,1.666183,162.577763,2.013998e+18,0.388774,1788.778371,1511.150651,206.298149
min,1.23765e+18,8.2351,-5.382632,12.98897,12.79955,12.4316,11.94721,11.61041,308.0,301.0,1.0,11.0,2.99578e+17,-0.004136,266.0,51578.0,1.0
25%,1.23765e+18,157.370946,-0.539035,18.178035,16.8151,16.173333,15.853705,15.618285,752.0,301.0,2.0,184.0,3.389248e+17,8.1e-05,301.0,51900.0,186.75
50%,1.23765e+18,180.394514,0.404166,18.853095,17.495135,16.85877,16.554985,16.389945,756.0,301.0,4.0,299.0,4.96658e+17,0.042591,441.0,51997.0,351.0
75%,1.23765e+18,201.547279,35.649397,19.259232,18.010145,17.512675,17.25855,17.141447,1331.0,301.0,5.0,414.0,2.8813e+18,0.092579,2559.0,54468.0,510.0
max,1.23765e+18,260.884382,68.542265,19.5999,19.91897,24.80204,28.17963,22.83306,1412.0,301.0,6.0,768.0,9.46883e+18,5.353854,8410.0,57481.0,1000.0


In [26]:
#the column class is the only one with alphabets others are only numeric
df['class'].unique()

array(['STAR', 'GALAXY', 'QSO'], dtype=object)

In [27]:
#Only a few columns are important, lets drop columns such as objid, run, spaceobjid etc

df.drop(['objid', 'run', 'rerun', 'camcol', 'field', 'specobjid'], axis=1, inplace=True)
df.head()

Unnamed: 0,ra,dec,u,g,r,i,z,class,redshift,plate,mjd,fiberid
0,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,STAR,-9e-06,3306,54922,491
1,183.598371,0.135285,18.6628,17.21449,16.67637,16.48922,16.3915,STAR,-5.5e-05,323,51615,541
2,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,GALAXY,0.123111,287,52023,513
3,183.870529,0.049911,17.76536,16.60272,16.16116,15.98233,15.90438,STAR,-0.000111,3306,54922,510
4,183.883288,0.102557,17.55025,16.26342,16.43869,16.55492,16.61326,STAR,0.00059,3306,54922,512


In [28]:
# Now, we have a few things left, such as class must be changed from alphabetic to numeric values

class_dict = {"GALAXY": 0, "QSO": 1, "STAR": 2}
df['class'] = df['class'].map(class_dict)
df.head()

Unnamed: 0,ra,dec,u,g,r,i,z,class,redshift,plate,mjd,fiberid
0,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,2,-9e-06,3306,54922,491
1,183.598371,0.135285,18.6628,17.21449,16.67637,16.48922,16.3915,2,-5.5e-05,323,51615,541
2,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,0,0.123111,287,52023,513
3,183.870529,0.049911,17.76536,16.60272,16.16116,15.98233,15.90438,2,-0.000111,3306,54922,510
4,183.883288,0.102557,17.55025,16.26342,16.43869,16.55492,16.61326,2,0.00059,3306,54922,512


In [29]:
#Also decrease the number of features(u,g,r,i,z) from 5 to 3
# we can utilize PCA for that purpose
df_n = df
pca = PCA(n_components=3)
new = pca.fit_transform(df_n[['u', 'g', 'r', 'i', 'z']])

df_n = pd.concat((df_n, pd.DataFrame(new)), axis = 1)
df_n.rename({0:'N1', 1:'N2', 2:'N3'}, axis = 1, inplace = True)
df_n.drop(['u', 'g', 'r', 'i', 'z'], axis=1, inplace=True)
df_n.head()

Unnamed: 0,ra,dec,class,redshift,plate,mjd,fiberid,N1,N2,N3
0,183.531326,0.089693,2,-9e-06,3306,54922,491,-1.507202,-1.377293,-0.265119
1,183.598371,0.135285,2,-5.5e-05,323,51615,541,-0.195758,-0.02841,-0.155695
2,183.680207,0.126185,0,0.123111,287,52023,513,1.297604,-0.590023,0.140338
3,183.870529,0.049911,2,-0.000111,3306,54922,510,-1.446117,0.566685,-0.009272
4,183.883288,0.102557,2,0.00059,3306,54922,512,-0.849271,1.287505,-0.397689


In [30]:
#We will take class column as y and the remaining data will be taken as x
# before ML we have to scale all the data between 0 and 1 except class column
scaler = MinMaxScaler()
df_m = scaler.fit_transform(df_n.drop('class', axis=1))

  return self.partial_fit(X, y)


In [31]:
#Lets define x and y
X_train, X_test, y_train, y_test = train_test_split(df_m, df_n['class'], test_size=0.31)

In [42]:
#Lets start with XGBOOST Classifier
xgb = XGBClassifier(n_estimators=100)
t_s = time.perf_counter()
xgb.fit(X_train, y_train)
t_e = time.perf_counter()
p_s = time.perf_counter()
preds = xgb.predict(X_test)
p_e = time.perf_counter()
acc = (preds == y_test).sum().astype(float) / len(preds)*100
train_time = t_e-t_s
Pred_time = p_e-p_s

print("XGBoost's prediction accuracy is: %4f" % (acc))
print("Time consumed for training: %4f seconds" % (train_time))
print("Time consumed for prediction: %4f seconds" % (Pred_time))



XGBoost's prediction accuracy is: 99.387097
Time consumed for training: 0.435748 seconds
Time consumed for prediction: 0.004037 seconds


In [47]:
#Support Vector Classifier
svc = SVC()
t_s = time.perf_counter()
svc.fit(X_train, y_train)
t_e = time.perf_counter()
p_s = time.perf_counter()

preds = svc.predict(X_test)
p_e = time.perf_counter()

acc = (preds == y_test).sum().astype(float) / len(preds)*100

train_time = t_e-t_s
Pred_time = p_e-p_s
print("Scikit-Learn's Support Vector Machine Classifier's prediction accuracy is: %4f" % (acc))
print("Time consumed for training: %4f seconds" % (train_time))
print("Time consumed for prediction: %4f seconds" % (Pred_time))



Scikit-Learn's Support Vector Machine Classifier's prediction accuracy is: 89.354839
Time consumed for training: 0.457525 seconds
Time consumed for prediction: 0.116381 seconds


In [48]:
#Random Forest
rfc = RandomForestClassifier(n_estimators=10)
t_s = time.perf_counter()
rfc.fit(X_train, y_train)
t_e = time.perf_counter()
p_s = time.perf_counter()
preds = rfc.predict(X_test)
p_e = time.perf_counter()
acc = (preds == y_test).sum().astype(float) / len(preds)*100

train_time = t_e-t_s
Pred_time = p_e-p_s
print("Scikit-Learn's Random Forest Classifier's prediction accuracy is: %4f" % (acc))
print("Time consumed for training: %4f seconds" % (train_time))
print("Time consumed for prediction: %4f seconds" % (Pred_time))

Scikit-Learn's Random Forest Classifier's prediction accuracy is: 99.064516
Time consumed for training: 0.089186 seconds
Time consumed for prediction: 0.002605 seconds
