**The data consists of 10,000 observations of space taken by the Sloan Digital Sky Survey (SDSS). Each observation is described by 17 feature columns and 1 class column, which identifies it as a star, galaxy, or quasar.
The task is to classify each observation as a star, galaxy, or quasar.**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

Importing Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

#from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score

In [None]:
# Read the SDSS export into a DataFrame and preview its first ten rows.
DATA_PATH = '../input/sloan-digital-sky-survey/Skyserver_SQL2_27_2018 6_51_39 PM.csv'
df = pd.read_csv(DATA_PATH)
df.head(10)

Understanding Dataset

In [None]:
# Number of rows and columns in the loaded frame.
df.shape

In [None]:
# Column labels of the frame (DataFrame.keys() returns the same Index).
df.columns

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Summary statistics for the frame's columns.
df.describe()

Plotting the class column

In [None]:
# Show the distinct class labels, then plot how many observations fall in each.
print(df['class'].unique())
# Name the column via x= and the frame via data=: current seaborn treats the
# first positional argument as `data`, so a bare Series is not read as `x`.
sns.countplot(x='class', data=df);

In [None]:
# Distinct values present in the 'run' column.
df['run'].unique()

In [None]:
# Distinct values present in the 'objid' column.
df['objid'].unique()

Inspecting the unique values in each column helps identify columns that carry little or no information, which can later be dropped to reduce dimensionality.

In [None]:
# For every column, print its name followed by the distinct values it holds.
for col in df.columns:
    print("Colname:=", col)
    print(df[col].unique())

Finding the correlation between attributes

In [None]:
# Pairwise correlation of the numeric columns only. 'class' still holds string
# labels at this point, and pandas >= 2.0 raises if non-numeric columns are
# passed to corr(), so select the numeric columns explicitly first.
df.select_dtypes(include='number').corr()

Apparently we don't need the 'rerun' and 'objid' columns, so they will be dropped below.

Here we create a dictionary that maps each class label to a number and then replace the labels with those numbers.
STAR becomes 1, GALAXY becomes 2 and QSO becomes 3.

In [None]:
# Numeric code assigned to each object class.
dictionary = {'STAR': 1, 'GALAXY': 2, 'QSO': 3}
# Swap the string labels in the 'class' column for their numeric codes.
df['class'] = df['class'].replace(dictionary)

# you can use LabelEncoder here also

In [None]:
# Preview the first rows to check the 'class' column after the replacement.
df.head()

In [None]:
# Target vector: the 'class' column of the frame.
y = df['class']

Dropping the columns which we do not need.

In [None]:
# Remove the target column and 'rerun' from the feature frame.
df = df.drop(columns=['class', 'rerun'])

Also dropping 'objid'

In [None]:
# DataFrame.drop returns a new frame rather than modifying in place, so the
# result must be assigned back; otherwise 'objid' stays in the features.
df = df.drop(columns='objid')

In [None]:
# Preview the first five rows of the frame after the column drops.
df.head(5)

Using min-max scaling, we rescale every feature to the range between 0 and 1. This keeps features with very large numeric ranges from dominating the others and removes high-range anomalies, which helps scale-sensitive models perform better. It is generally advisable whenever the features span widely different ranges.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min/max, then rescale the whole feature frame with them.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set ranges influence the transform; consider fitting on the training
# split only.
scaler = MinMaxScaler()
scaler.fit(df)
sdss = scaler.transform(df)


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# shuffle identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    sdss,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Print the shape of each split, in the order train/test features then targets.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
# Logistic Regression
# fit() returns the estimator itself, so construction and training are chained.
lr = LogisticRegression(C=2, max_iter=1500).fit(X_train, y_train)
y_pred = lr.predict(X_test)

lr_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Logistic Regression= ", lr_accuracy)

In [None]:
# KNeighborsClassifier
# Train a 3-nearest-neighbours model and score it on the held-out rows.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of KNeighborsClassifier = ", knn_accuracy)

In [None]:
# RandomForestClassifier
# A random forest bootstraps rows and samples features at random, so without a
# fixed seed the reported accuracy changes on every run. The seed below matches
# the one used for the train/test split.
rf = RandomForestClassifier(max_depth=18, n_estimators=120, random_state=2)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Accuracy of RandomForestClassifier = ", accuracy_score(y_test,y_pred))

In [None]:
# XGBClassifier
# Recent XGBoost releases require class labels to be 0..n_classes-1, but the
# target here is coded 1..3. Fit on the index of each label within the sorted
# unique labels, then map the predicted indices back to the original codes so
# they are comparable with y_test.
classes, y_train_idx = np.unique(y_train, return_inverse=True)
xgb = XGBClassifier()
xgb.fit(X_train, y_train_idx)
y_pred = classes[xgb.predict(X_test)]
print("Accuracy of XGBClassifier = ", accuracy_score(y_test,y_pred))

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier with default hyper-parameters; `lgb` is the estimator.
lgb = LGBMClassifier()
lgb.fit(X_train, y_train)
y_pred = lgb.predict(X_test)
lgb_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of lightgbm = ", lgb_accuracy)

In [None]:
# GradientBoostingClassifier
# Train with default hyper-parameters and score on the held-out rows.
gb = GradientBoostingClassifier().fit(X_train, y_train)
y_pred = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of GradientBoostingClassifier = ", gb_accuracy)