## Variety of models for SDSS data

In [None]:
import os                       # accessing directory structure
import numpy as np              # linear algebra
import pandas as pd             # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting
import seaborn as sns           # plotting

In [None]:
import warnings

# Silence the notice whose message starts with this text.
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
# Ignore these two warning categories for the rest of the session.
for _category in (FutureWarning, RuntimeWarning):
    warnings.simplefilter(action='ignore', category=_category)

In [None]:
# Print the full path of every file found under the input directory
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

In [None]:
from IPython.display import Image
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Load the SkyServer export into a DataFrame and preview its first ten rows
csv_path = '/kaggle/input/Skyserver_12_30_2019 4_49_58 PM.csv'
df = pd.read_csv(csv_path)
df.head(10)

In [None]:
# Report the table dimensions (rows first, then columns)
nRow = df.shape[0]
nCol = df.shape[1]
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
# Display the column labels of the loaded table
df.columns

In [None]:
# Keep the column names in a plain list; it may be helpful later
cols = df.columns.tolist()
cols

In [None]:
# Our labels: the distinct values found in the 'class' column
df['class'].unique()

In [None]:
# Let's see how the classes are distributed (number of rows per class)
df['class'].value_counts()

In [None]:
# Visualization with Matplotlib: bar chart of the per-class row counts
class_counts = df['class'].value_counts()
class_counts.plot(kind='bar')

In [None]:
# Visualization with Seaborn: one bar per value of 'class', coloured with the "brg" palette
sns.countplot(x='class', data=df, palette="brg")
plt.show()

In [None]:
# Histogram of the 'mjd' column
df['mjd'].hist()

In [None]:
# Histogram of the 'redshift' column
df['redshift'].hist()

In [None]:
# Count the missing values in every column
df.isna().sum()

## Encoding class labels
For some models, we cannot simply provide categorical values (plain strings); instead, we convert them to numerical values.
For example, since we have 3 classes, we are able to assign a number to each class, so that:

* 0 is for GALAXY
* 1 is for QSO
* 2 is for STAR.

In [None]:
# Mapping classes to 0,1,2 values, numbered in the order np.unique returns the labels
unique_labels = np.unique(df['class'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
class_mapping

In [None]:
# Replace every class name with its integer code from class_mapping
encoded_class = df['class'].map(class_mapping)
df['class'] = encoded_class
df

# now the class column holds numerical (0,1,2) values

In [None]:
# Invert the mapping and restore the original class names
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
df['class'] = df['class'].map(inv_class_mapping)
df

In [None]:
# Label encoding with scikit-learn's LabelEncoder
from sklearn.preprocessing import LabelEncoder

class_names = df['class'].values
class_le = LabelEncoder()
# y holds one integer code per row of the 'class' column
y = class_le.fit_transform(class_names)
print(y)
print('We have {} values'.format(len(y)))

In [None]:
# Our target
print(y)

# Features used for training and testing: every column except the labels
df = df.drop('class', axis=1)
df

In [None]:
# We need to normalize the data so that features with huge values do not dominate

from sklearn import preprocessing

x = df.values  # feature matrix as a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Rebuild the frame with the original labels: a bare pd.DataFrame(x_scaled)
# would replace the column names with 0..n-1 and make later output hard to read.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the scaling -- consider fitting on the training part only.
df = pd.DataFrame(x_scaled, columns=df.columns, index=df.index)

## Split

In [None]:
# Feature matrix: df as it stands here (label column dropped, values min-max scaled)
X = df

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Inspect the training features
X_train

# Models

## Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Test accuracy (in percent) of each model, keyed by model name
accuracies = {}

# fit() returns the estimator itself, so construction and training are chained
model_dtc = DecisionTreeClassifier(random_state=49).fit(X_train, y_train)

acc = 100 * model_dtc.score(X_test, y_test)
accuracies['Decision Tree'] = acc
print("Decision Tree Test Accuracy {:.2f}%".format(acc))

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Default parameters; fit() returns the fitted estimator
model_lr = LogisticRegression().fit(X_train, y_train)

# Test-set predictions (kept in y_pred) and accuracy in percent
y_pred = model_lr.predict(X_test)
acc = 100 * model_lr.score(X_test, y_test)

accuracies['Logistic Regression'] = acc
print("Test Accuracy {:.2f}%".format(acc))

In [None]:
# Print the predicted labels followed by the true test labels for a visual comparison
print(y_pred)
print(y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class metrics and confusion matrix for the predictions currently in y_pred
lr_report = classification_report(y_test, y_pred)
lr_confusion = confusion_matrix(y_test, y_pred)
print('Classification Report: \n', lr_report)
print('Confusion Matrix: \n', lr_confusion)

# Accuracy of the logistic regression model on the training and the test split
lr_train_acc = model_lr.score(X_train, y_train)
lr_test_acc = model_lr.score(X_test, y_test)
print('Training Score: ', lr_train_acc)
print('Testing Score: ', lr_test_acc)

## Support Vector Machines (SVM)

In [None]:
from sklearn.svm import SVC

# Attention: slow model
model_svm = SVC(random_state=1).fit(X_train, y_train)

# Test accuracy in percent, recorded with the other models' results
acc = 100 * model_svm.score(X_test, y_test)
accuracies['SVM'] = acc
print("Test Accuracy of SVM Algorithm: {:.2f}%".format(acc))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# fit() returns the fitted estimator, so construction and training are chained
model_nb = GaussianNB().fit(X_train, y_train)

# Test accuracy in percent, recorded with the other models' results
acc = 100 * model_nb.score(X_test, y_test)
accuracies['Naive Bayes'] = acc
print("Accuracy of Naive Bayes: {:.2f}%".format(acc))

In [None]:
# Overwrite y_pred with the Naive Bayes predictions for the test set
y_pred = model_nb.predict(X_test)

In [None]:
# Per-class metrics and confusion matrix for the predictions currently in y_pred
print('Classification Report: \n', classification_report(y_test, y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))
# Score model_nb (not model_lr) so the training/testing numbers belong to the
# Naive Bayes model; separate nb_* names leave the logistic regression scores intact.
nb_train_acc = model_nb.score(X_train, y_train)
print('Training Score: ', nb_train_acc)
nb_test_acc = model_nb.score(X_test, y_test)
print('Testing Score: ', nb_test_acc)

## K-Nearest Neighbours (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# n_neighbors means k; fit() returns the fitted estimator
model_knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
prediction = model_knn.predict(X_test)

knn_score = 100 * model_knn.score(X_test, y_test)
print("{} NN Score: {:.2f}%".format(3, knn_score))

In [None]:
# Try to find the best k value
# The candidate range is defined once so the loop and the plot axis cannot drift apart.
k_values = range(1, 20)
scoreList = []
for k in k_values:
    knn2 = KNeighborsClassifier(n_neighbors=k)  # n_neighbors means k
    knn2.fit(X_train, y_train)
    scoreList.append(knn2.score(X_test, y_test))

plt.plot(k_values, scoreList)
plt.xticks(list(k_values))
plt.xlabel("K value")
plt.ylabel("Score")
plt.show()

# NOTE(review): k is chosen by its score on the test split, so the reported
# maximum is an optimistic estimate -- consider a validation split or cross-validation.
best_idx = int(np.argmax(scoreList))  # first position of the highest score
acc = scoreList[best_idx]*100
accuracies['KNN'] = acc
print("Maximum KNN Score is {:.2f}%".format(acc))
print("Best K value is {}".format(k_values[best_idx]))

## Tasks to do:
* Class weights for class imbalance
* ANN models