# NASA Star Classification

The goal of this notebook is to complete a basic end-to-end project: exploration, cleaning, visualization, and classification. The dataset used here is the [Star Type Classification / NASA](https://www.kaggle.com/brsdincer/star-type-classification) dataset on Kaggle. The target is the "Type" column, and the 6 other columns are provided as features for classification.

## Import Data

In [None]:
import pandas as pd

In [None]:
# Read the star catalogue CSV into the working DataFrame.
data = pd.read_csv(filepath_or_buffer='../input/star-type-classification/Stars.csv')

In [None]:
# Human-readable star-type names.
# NOTE(review): presumably the list position matches the integer code in the
# `Type` column -- confirm against the dataset card.
types = [
    'Red Dwarf',
    'Brown Dwarf',
    'White Dwarf',
    'Main Sequence',
    'Super Giants',
    'Hyper Giants',
]

## Data Exploration

In [None]:
# Preview the first five rows.
data.head(n=5)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the columns.
data.describe()

In [None]:
# Number of missing values in each column.
data.isna().sum()

## Cleaning and Feature Engineering

In [None]:
import numpy as np

In [None]:
# Lower-case every column label; later cells refer to the columns by
# their lower-case names (e.g. 'a_m', 'spectral_class').
data.columns = data.columns.str.lower()

In [None]:
# Columns that get a log10-transformed companion column.
log_cols = [
    'temperature',
    'l',
    'r',
]

In [None]:
# Add a '<name>_log' column holding log10 of each column listed in log_cols.
for col in log_cols:
    data[f'{col}_log'] = np.log10(data[col])

In [None]:
# Collapse the free-text colour labels to one canonical spelling per colour.
#
# * regex=False treats every pattern as a literal substring on all pandas
#   versions (the default for `regex` changed in pandas 2.0).
# * .str.strip() removes stray leading/trailing blanks, so a label such as
#   'Blue ' cannot slip past the exact-match colour lookup done later.
# * The final replace fixes whole-value leftovers: 'Whitish' has become
#   'Whit' once 'ish' is removed, and 'White Yellow' is the same colour as
#   'Yellow White' written the other way round.
data['color'] = (
    data['color']
    .str.replace('-', ' ', regex=False)
    .str.replace('ish', '', regex=False)
    .str.replace('Pale ', '', regex=False)
    .str.strip()
    .str.title()
    .replace({'Whit': 'White', 'White Yellow': 'Yellow White'})
)

In [None]:
# Inspect the distinct colour labels that remain after cleaning.
data['color'].unique()

In [None]:
# Colour labels in the order used for the ordinal encoding; a label's list
# position becomes its numeric value.
color_order = [
    'Red',
    'Orange Red',
    'Orange',
    'Yellow Orange',
    'Yellow',
    'Yellow White',
    'White',
    'Blue White',
    'Blue',
]

In [None]:
# Ordinal-encode the colour: each label maps to its position in color_order.
# An unknown label raises ValueError from list.index.
data['color_value'] = data['color'].map(color_order.index)

In [None]:
# Integer-encode the spectral class. pd.factorize assigns codes in order of
# first appearance, i.e. the same numbering as looking each value up in
# `unique()`, but computed once instead of rebuilding the unique list for
# every row. The codes are therefore arbitrary labels, not a physical
# ordering of the classes. A missing value would be encoded as -1.
data['spectral_class_value'] = pd.factorize(data['spectral_class'])[0]

## Basic Visualizations

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from IPython.display import Image

In [None]:
# Pair-wise scatter matrix coloured by star type. The log-scaled copies and
# the integer-encoded categorical columns are excluded from the plot.
drop_cols = [*log_cols, 'color_value', 'spectral_class_value']
sns.pairplot(data.drop(columns=drop_cols), hue='type', palette='bright')

In [None]:
# Absolute magnitude against log-luminosity: ideally these are linearly
# correlated. Compute the Pearson coefficient, then draw the scatter with an
# OLS trend line.
corr = np.corrcoef(data['a_m'], data['l_log'])[0, 1]

fig = px.scatter(
    data,
    x='a_m',
    y='l_log',
    trendline='ols',
    title='Linear Regression',
    width=900,
    height=600,
)

fig.show()
print(f'Correlation: {corr}')

## Advanced Visualizations

### H-R Classification Chart

In [None]:
# Square the ordinal colour value; this helps fix the colour distribution
# in the chart below.
data['color_value_adj'] = data['color_value'] ** 2

In [None]:
# Re-create the Hertzsprung-Russell (H-R) classification chart: surface
# temperature on a log x-axis against absolute magnitude, with both axes
# reversed. Marker colour follows the adjusted colour value and marker size
# follows the log-radius.
fig = px.scatter(
    data,
    x='temperature',
    y='a_m',
    log_x=True,
    color='color_value_adj',
    # Shift the log-radius so the smallest star still gets a positive size.
    size=data['r_log'] - data['r_log'].min() + .15,
    color_continuous_scale='RdYlBu',
    template='plotly_dark',
    labels={
        'l': 'Relative Luminosity (Sun=1)',
        'temperature': 'Surface Temperature (K)',
        'a_m': 'Absolute Magnitude'
    }
)

fig.update_layout(
    title='H-R Star Classification Chart',
    coloraxis_showscale=False,
    width=1200,
    height=800
)

# Label the Sun. On a log axis Plotly expects annotation coordinates in
# log10 units, so x = 3.763428 corresponds to roughly 5800 K.
# NOTE(review): the Sun's absolute magnitude is usually quoted as ~4.83;
# confirm that y = 5.05 is intentional.
fig.add_annotation(
    x=3.763428,
    y=5.05,
    xshift=-30,
    yshift=-15,
    text='The Sun',
    showarrow=False
)

# Reverse both axes. The last call returns the figure, which the notebook
# then renders.
fig.update_xaxes(autorange='reversed')
fig.update_yaxes(autorange='reversed')

## KNN Classification

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


In [None]:
# Model inputs: the three log-scaled measurements plus absolute magnitude.
X = data.loc[:, ['temperature_log', 'l_log', 'r_log', 'a_m']]
# Target: the star type label.
y = data.loc[:, 'type']

In [None]:
# Hold out 20% of the rows for evaluation. A fixed seed makes the split, and
# therefore the reported accuracy, reproducible across runs. Stratifying on
# the label keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Standardise the features. The scaler is fitted on the training partition
# only and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Fit a 5-nearest-neighbours classifier on the training partition.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=X_train, y=y_train)

In [None]:
# Predict the star type for every held-out row.
predictions = knn.predict(X=X_test)

## Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [None]:
# Fraction of held-out rows whose predicted type matches the true type.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

print(f'Accuracy: {accuracy}')

In [None]:
# 5-fold cross-validation on the full data set.
#
# KNN is distance-based, so it must see the same standardised features as in
# the hold-out evaluation. Wrapping the scaler and the classifier in one
# pipeline lets cross_val_score fit the scaler on each training fold only,
# which avoids both unscaled inputs and leakage from the validation fold.
from sklearn.pipeline import make_pipeline

cv_model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=knn.n_neighbors),
)
scores = cross_val_score(cv_model, X, y, cv=5)

print(f'Cross Validation Average Score: {scores.mean()}')

A seemingly perfect model!