In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

Datasets:
Stars: https://www.kaggle.com/deepu1109/star-dataset
Skyserver: https://www.kaggle.com/lucidlenn/sloan-digital-sky-survey

### Classifying Star Type

This part of the project will be using the Stars dataset.

Objectives:
- Use features of stars to identify star type
- Identify best model for the dataset
- Train a model that predicts star type

In [25]:
stars = pd.read_csv("stars_data.csv")
stars

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3068,0.002400,0.1700,16.12,0,Red,M
1,3042,0.000500,0.1542,16.60,0,Red,M
2,2600,0.000300,0.1020,18.70,0,Red,M
3,2800,0.000200,0.1600,16.65,0,Red,M
4,1939,0.000138,0.1030,20.06,0,Red,M
...,...,...,...,...,...,...,...
235,38940,374830.000000,1356.0000,-9.93,5,Blue,O
236,30839,834042.000000,1194.0000,-10.63,5,Blue,O
237,8829,537493.000000,1423.0000,-10.73,5,White,A
238,9235,404940.000000,1112.0000,-11.23,5,White,A


In [26]:
stars["Star type"].unique()

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [27]:
stars["Spectral Class"].unique()

array(['M', 'B', 'A', 'F', 'O', 'K', 'G'], dtype=object)

In [28]:
stars["Star color"].unique()

array(['Red', 'Blue White', 'White', 'Yellowish White', 'Blue white',
       'Pale yellow orange', 'Blue', 'Blue-white', 'Whitish',
       'yellow-white', 'Orange', 'White-Yellow', 'white', 'Blue ',
       'yellowish', 'Yellowish', 'Orange-Red', 'Blue white ',
       'Blue-White'], dtype=object)

Since we have a categorical column with text data, we should encode it for training models. A simple way to do this is to get dummy variables based on the categorical features. Below, we take the features "Spectral Class" and "Star color" and transform them into multiple variables based on the values of the features. For example, Spectral Class contains classifications of the stars by their spectrum and luminosity and are classified as below:

Source: https://lweb.cfa.harvard.edu/~pberlind/atlas/htmls/note.html

In [29]:
pd.read_html("https://lweb.cfa.harvard.edu/~pberlind/atlas/htmls/note.html")[0].rename(columns={"Spectral Type": "Spectral Class"})

Unnamed: 0,Spectral Class,Surface Temperature,Distinguishing Features
0,O,"> 25,000K",H; HeI; HeII
1,B,"10,000-25,000K",H; HeI; HeII absent
2,A,"7,500-10,000K",H; CaII; HeI and HeII absent
3,F,"6,000-7,500K","H; metals (CaII, Fe, etc)"
4,G,"5,000-6,000K",H; metals; some molecular species
5,K,"3,500-5,000K",metals; some molecular species
6,M,"< 3,500K",metals; molecular species (TiO!)
7,C,"< 3,500K",metals; molecular species (C2!)


Instead of having one feature called "Spectral Class", we separate the values of the feature into 8 different binary features that indicate whether an observation is the respective feature or not (1 or 0). In this case for example, "Spectral Class" is separated into "Spectral Class_A", "Spectral Class_B", "Spectral Class_F", etc. If an observation's "Spectral Class" was F, then "Spectral Class_F" would be 1, and the rest would be 0. The same transformation is applied to "Star color" as well.

In [30]:
stars_dummy_df = pd.get_dummies(stars, columns = ['Spectral Class', 'Star color'] )
stars_dummy_df

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Spectral Class_A,Spectral Class_B,Spectral Class_F,Spectral Class_G,Spectral Class_K,...,Star color_Pale yellow orange,Star color_Red,Star color_White,Star color_White-Yellow,Star color_Whitish,Star color_Yellowish,Star color_Yellowish White,Star color_white,Star color_yellow-white,Star color_yellowish
0,3068,0.002400,0.1700,16.12,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,3042,0.000500,0.1542,16.60,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,2600,0.000300,0.1020,18.70,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,2800,0.000200,0.1600,16.65,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,1939,0.000138,0.1030,20.06,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,38940,374830.000000,1356.0000,-9.93,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
236,30839,834042.000000,1194.0000,-10.63,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
237,8829,537493.000000,1423.0000,-10.73,5,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
238,9235,404940.000000,1112.0000,-11.23,5,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [31]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

X = stars_dummy_df.drop(columns=["Star type"])
y = stars_dummy_df[["Star type"]]

### Train/Test Split

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Picking best model

In [33]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier

In [34]:
models = [
    RandomForestClassifier(n_estimators=50,random_state=42, criterion='entropy',max_depth=None, min_samples_split=2),
    svm.SVC(gamma="scale",kernel="rbf"),
    GaussianNB(),
    DecisionTreeClassifier(),
    LogisticRegression(),
    SGDClassifier(),
    KNeighborsClassifier(),
    KMeans()
    
    
]

model_names = ['rf','svm','dt','nb', 'lr', 'sgd', 'kn', "km"]

In [35]:
accuracy = []
for model in models:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    accuracy.append(model.score(X_test, y_test))
        
for i in range(8):
    
    print("Accuracy for " + model_names[i] + ": %.2f%%" % (accuracy[i] * 100.0))

Accuracy for rf: 100.00%
Accuracy for svm: 28.75%
Accuracy for dt: 83.75%
Accuracy for nb: 100.00%
Accuracy for lr: 57.50%
Accuracy for sgd: 28.75%
Accuracy for kn: 60.00%
Accuracy for km: -3478661368484.82%


In [53]:
# using grid search to find best var_smoothing
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

nb_classifier = GaussianNB()

params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
gs_NB = GridSearchCV(estimator=nb_classifier, 
                 param_grid=params_NB, 
                 verbose=1, 
                 scoring='accuracy') 
gs_NB.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


GridSearchCV(estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.00000000e+00, 8.11130831e-01, 6.57933225e-01, 5.33669923e-01,
       4.32876128e-01, 3.51119173e-01, 2.84803587e-01, 2.31012970e-01,
       1.87381742e-01, 1.51991108e-01, 1.23284674e-01, 1.00000000e-01,
       8.11130831e-02, 6.57933225e-02, 5.33669923e-02, 4.32876128e-02,
       3.51119173e-02, 2.84803587e-02, 2.3101297...
       1.23284674e-07, 1.00000000e-07, 8.11130831e-08, 6.57933225e-08,
       5.33669923e-08, 4.32876128e-08, 3.51119173e-08, 2.84803587e-08,
       2.31012970e-08, 1.87381742e-08, 1.51991108e-08, 1.23284674e-08,
       1.00000000e-08, 8.11130831e-09, 6.57933225e-09, 5.33669923e-09,
       4.32876128e-09, 3.51119173e-09, 2.84803587e-09, 2.31012970e-09,
       1.87381742e-09, 1.51991108e-09, 1.23284674e-09, 1.00000000e-09])},
             scoring='accuracy', verbose=1)

In [54]:
gs_NB.best_params_["var_smoothing"]

1.519911082952933e-09

In [56]:
#training and testing naive bayes
gnb = GaussianNB(var_smoothing=gs_NB.best_params_["var_smoothing"])
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


Accuracy: 0.8125


In [59]:
from sklearn.metrics import confusion_matrix,accuracy_score
confusion_matrix(y_test, y_pred)


array([[11,  5,  0,  0,  0,  0],
       [ 2, 11,  0,  0,  0,  0],
       [ 0,  0, 12,  0,  0,  0],
       [ 0,  0,  4,  4,  3,  0],
       [ 0,  0,  0,  1, 11,  0],
       [ 0,  0,  0,  0,  0, 16]], dtype=int64)