In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file beneath the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the star-type dataset inside the read-only Kaggle input directory.
STARS_CSV = "/kaggle/input/star-type-classification/Stars.csv"

# Load the CSV into a DataFrame; the rest of the notebook refers to it as `data`.
data = pd.read_csv(STARS_CSV)

In [None]:
# Preview the first rows of the loaded DataFrame to check its columns and values.
data.head()

First, let's make sure there are no null values.

In [None]:
# Count the missing entries in each column (isna is the canonical name of isnull).
data.isna().sum()

Since there are no null values, we can now turn the categorical 'Color' column into numerical variables. First, let's see how often each color occurs.

In [None]:
# Show how many rows carry each distinct 'Color' value, most frequent first.
data['Color'].value_counts()

Let's now one-hot-encode the categorical columns, keeping only the 9 most frequent Colors in order to remove some of the 'noise'.

In [None]:
# Keep only the most frequent colours. NOTE: despite the name `top_10`, this
# selects the first 9 entries of the frequency-sorted counts.
color_counts = data['Color'].value_counts()
top_10 = color_counts.head(9)

In [None]:
# Categories that receive their own 0/1 indicator column.
labels_10 = top_10.index
spectral = data['Spectral_Class'].value_counts().index

# One-hot encode: for every selected category add a column named after it that
# holds 1 where the source column equals that category and 0 elsewhere.
# Colour indicators are added first, then the spectral-class indicators.
for source_column, categories in (('Color', labels_10), ('Spectral_Class', spectral)):
    for category in categories:
        data[category] = np.where(data[source_column] == category, 1, 0)

Now that the categorical variables have been one-hot-encoded, we can drop the original 'Color' and 'Spectral_Class' columns.

In [None]:
# The indicator columns now carry this information, so remove both original
# text columns in a single call.
data = data.drop(columns=['Color', 'Spectral_Class'])

In [None]:
# Show only the first rows: enough to inspect the encoded columns without
# rendering the whole frame.
data.head()

Great! Now we only have numerical variables! With this, let's look at the relationship between each variable and 'Type' to determine which variables are actually useful!

In [None]:
import matplotlib.pyplot as plt

# Every column of the encoded frame (including the target) gets its own panel.
columns = data.columns

# Open the large figure that the subplots created by plot() are drawn into.
plt.figure(figsize=(15, 15))

def plot(columns, target, n_cols=3):
    """Scatter each column in ``columns`` against ``target`` on one subplot grid.

    Draws into the current matplotlib figure and reads the values from the
    module-level ``data`` frame, then shows the figure.

    Parameters
    ----------
    columns : sequence of column labels
        Columns of ``data`` to put on the x axis, one subplot each.
    target : column label
        Column of ``data`` to put on the y axis of every subplot.
    n_cols : int, default 3
        Number of subplots per row.
    """
    # Round the row count up: with floor division a column count that is not a
    # multiple of n_cols leaves the grid too small for the last panels.
    n_rows = (len(columns) + n_cols - 1) // n_cols

    for position, column in enumerate(columns, start=1):
        plt.subplot(n_rows, n_cols, position)
        plt.scatter(data[column], data[target])
        plt.xlabel(column)
        plt.ylabel(target)
        plt.title(f"{column} vs {target}")
    plt.show()
    
# Adjust the top/bottom bounds of the subplot area before drawing the grid.
plt.subplots_adjust(top=1.3, bottom=0.01)

# One scatter panel per column, each against the 'Type' column.
plot(columns, target='Type')
    
    

Wonderful! We can now see the relationships between the different variables and their respective star types!

In [None]:
import tensorflow as tf
from tensorflow import keras

OK, now let's split this data into training and test sets.

In [None]:
# Feature columns: every column of the frame except the target column.
all_columns = data.columns.drop("Type")
print(all_columns)

# Feature matrix, and the target both as a Series and as a NumPy array.
new_data = data[all_columns]
labels = data['Type']
labels_array = np.array(labels)

# Initial alias for the full feature matrix.
train_data = new_data

In [None]:
import sklearn

from sklearn.model_selection import train_test_split

In [None]:
# Hold out part of the data for validation. A fixed random_state makes the
# split (and therefore the reported accuracy) reproducible across runs, and
# stratify keeps the class proportions the same in both subsets.
train_data, test_data, train_labels, test_labels = train_test_split(
    new_data,
    labels_array,
    random_state=42,
    stratify=labels_array,
)

# Preview the training features rather than rendering the whole frame.
train_data.head()

Don't forget to normalize the data!

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Learn the per-feature mean and standard deviation from the training set only.
norm_train_data = scaler.fit_transform(train_data)

# Reuse those same statistics for the test set. Refitting on the test data
# would scale the two sets differently and leak test-set statistics into the
# evaluation.
norm_test_data = scaler.transform(test_data)

norm_train_data.shape

In [None]:
# Size the input from the data rather than hard-coding it: each sample is a
# flat vector with one entry per feature column, i.e. shape (n_features,).
# The previous input_shape of (1, 20) declared a (batch, 1, 20) input, which
# does not match the (batch, n_features) arrays passed to fit().
n_features = norm_train_data.shape[1]

model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(n_features,)),
    tf.keras.layers.Dense(20, activation='relu'),
    # Softmax over 6 outputs (assumes 'Type' takes the integer values 0-5 — confirm).
    tf.keras.layers.Dense(6, activation='softmax')
])

In [None]:
# Print the layers, output shapes and parameter counts of the model.
model.summary()

In [None]:
# Sparse categorical cross-entropy expects integer class labels rather than
# one-hot vectors; accuracy is tracked under the key 'acc'.
model.compile(
    optimizer=tf.optimizers.Adam(),
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
# Train for 90 epochs, evaluating on the held-out split after every epoch.
history = model.fit(
    x=norm_train_data,
    y=train_labels,
    epochs=90,
    validation_data=(norm_test_data, test_labels),
)

In [None]:
def plot_history(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Parameters
    ----------
    history : object with a ``history`` mapping
        Must provide ``history.history[string]`` and
        ``history.history['val_' + string]`` as sequences of per-epoch values.
    string : str
        Name of the metric to plot, e.g. ``'acc'`` or ``'loss'``.
    """
    plt.plot(history.history[string])
    plt.plot(history.history['val_' + string])
    plt.xlabel("Epochs")
    # Label the axis with the metric name itself, not the literal text 'string'.
    plt.ylabel(string)
    plt.legend([string, 'val_' + string])
    plt.show()
    
# Draw the accuracy curves first, then the loss curves.
for metric_name in ('acc', 'loss'):
    plot_history(history, metric_name)

Lovely! After training for 90 epochs, our validation accuracy is 95%! That's pretty good!