In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import scipy
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the star data set from the Kaggle input directory into a DataFrame.
data = pd.read_csv('../input/star-type-classification/Stars.csv')
# Last expression of the cell: display the column labels of the loaded table.
data.columns

In [None]:
# Numeric columns: plotted in the pair grids and used as model inputs.
continuous_features = ['Temperature', 'L', 'R', 'A_M']
# Categorical columns: both are plotted, only 'Spectral_Class' is later one-hot encoded.
discrete_features = ['Color', 'Spectral_Class']
# Name of the column holding the class label to predict.
star_type = 'Type'

In [None]:
# Normalise the colour labels: hyphens become spaces and everything is lower-cased,
# so that spelling variants of the same colour collapse into a single label.
data['Color'] = data['Color'].map(lambda label: label.replace('-', ' ').lower())

In [None]:
# Give every distinct colour label an integer code, in order of first appearance.
color_dict = {color: code for code, color in enumerate(data.Color.unique())}
color_dict

# EDA

## Let's have a look at the continuous features

In [None]:
# Pairwise view of the continuous features: scatter plots above the diagonal,
# density contours below it and histograms on the diagonal.
g = sns.PairGrid(data[continuous_features])
g.map_upper(sns.scatterplot).map_lower(sns.kdeplot).map_diag(sns.histplot, legend=False)

It is better to take the (base-10) logarithm of Temperature, L and R.

In [None]:
# Replace Temperature, L and R by their base-10 logarithm, in place.
# NOTE: this cell is not idempotent - running it a second time takes the
# logarithm of the already transformed columns.
for column in ('Temperature', 'L', 'R'):
    data[column] = np.log10(data[column])

In [None]:
# Same pairwise view as before, now on the log-transformed columns:
# scatter plots above the diagonal, density contours below, histograms on it.
g = sns.PairGrid(data[continuous_features])
g.map_upper(sns.scatterplot).map_lower(sns.kdeplot).map_diag(sns.histplot, legend=False)

## .. and the discrete features

In [None]:
# Distribution of the first discrete feature (the colour), tick labels rotated.
color_column = discrete_features[0]
color_grid = sns.displot(data[color_column])
color_grid.set_xticklabels(rotation=90)
color_grid.set(title=color_column)

# Distribution of the second discrete feature (the spectral class).
spectral_column = discrete_features[1]
spectral_grid = sns.displot(data[spectral_column])
spectral_grid.set(title=spectral_column)
plt.show()

The colors are not used in this analysis (they may be added in a future update).

In [None]:
# Fixed colour -> integer code table; the code is the position in the tuple below.
color_dict = {
    color: code
    for code, color in enumerate((
        'red',
        'blue white',
        'white',
        'yellowish white',
        'pale yellow orange',
        'blue',
        'whitish',
        'yellow white',
        'orange',
        'white yellow',
        'yellowish',
        'orange red',
    ))
}

In [None]:
# Give every distinct spectral class an integer code, in order of first appearance.
spectral_class_dict = {
    spectral_class: code
    for code, spectral_class in enumerate(data[discrete_features[1]].unique())
}
spectral_class_dict

## Star Types

In [None]:
# Readable name of each integer star type; the key is the position in the tuple.
star_type_dict = dict(enumerate((
    'Red Dwarf',
    'Brown Dwarf',
    'White Dwarf',
    'Main Sequence',
    'Super Giants',
    'Hyper Giants',
)))

In [None]:
# Distribution of the star types, labelled with their readable names.
type_names = data[star_type].map(star_type_dict)
type_grid = sns.displot(type_names)
type_grid.set_xticklabels(rotation=90)
type_grid.set(title='Star Types')
plt.show()

In [None]:
# Number of repeated random train/test experiments run for every model;
# also used as the number of repeats of the permutation importance.
n_of_randomization = 30

# Preprocessing

In [None]:
# One-hot encode the spectral class: row k of the identity matrix is the one-hot
# vector of code k, so indexing the identity with the per-sample codes builds the
# whole (n_samples, n_classes) float array at once. This replaces repeated
# np.append calls, which copy the growing array on every sample.
# The column order follows spectral_class_dict, which feature_names relies on.
spectral_codes = data[discrete_features[1]].map(spectral_class_dict).to_numpy(dtype=int)
spectral_array = np.eye(len(spectral_class_dict))[spectral_codes]

In [None]:
# Model input: the continuous features followed by the one-hot spectral class columns.
input_array = np.hstack((data[continuous_features].to_numpy(),spectral_array)) #no colors for now

In [None]:
# Column labels of input_array: the continuous features first, then one name per
# spectral class in the order of spectral_class_dict.
feature_names = [*continuous_features, *spectral_class_dict]

In [None]:
# One-hot encode the target: row k of the identity matrix is the one-hot vector of
# star type k, so indexing the identity with the label column builds the whole
# (n_samples, n_types) float array at once. This replaces repeated np.append
# calls, which copy the growing array on every sample.
output_array = np.eye(len(star_type_dict))[data[star_type].to_numpy(dtype=int)]
output_array.shape

In [None]:
# Integer class labels as a single column (not one-hot); this is the target
# passed to the scikit-learn models.
output_type_1d = data[star_type]

# 1. Neural network

In [None]:
import tensorflow as tf

In [None]:
#FUNCTION TO PLOT THE TRAINING
def plot_training(fit, evaluation):
    """Plot the loss and accuracy curves of a training run and print a summary.

    Two stacked panels are drawn (loss on top, accuracy below), each with the
    validation and the training curve. The epoch with the lowest validation
    loss is shaded, and the test result is marked at that epoch.

    Parameters
    ----------
    fit : object with an ``epoch`` list and a ``history`` dict containing the
        keys 'loss', 'val_loss', 'accuracy' and 'val_accuracy'.
    evaluation : sequence
        ``evaluation[0]`` is reported as the test loss and ``evaluation[1]``
        as the test accuracy.
    """
    history = fit.history
    epochs = fit.epoch
    lowest_val_loss = min(history['val_loss'])
    # First epoch at which the validation loss reaches its minimum.
    best_epoch = epochs[history['val_loss'].index(lowest_val_loss)]

    fig, ax = plt.subplots(2,1,figsize=(3,5))

    panels = (
        ('val_loss', 'loss', 'Loss'),
        ('val_accuracy', 'accuracy', 'Accuracy'),
    )
    for row, (val_key, train_key, ylabel) in enumerate(panels):
        panel = ax[row]
        panel.plot(epochs, history[val_key], '.-', color='red', label='validation')
        panel.plot(epochs, history[train_key], '.-', color='orange', label='train')
        panel.set(ylabel=ylabel, ylim=[0,1])
        # Shade the best epoch and mark the test result on top of it.
        panel.axvspan(best_epoch-0.5, best_epoch+0.5, alpha=0.5, color='red')
        panel.scatter(best_epoch, evaluation[row], s=2, zorder=1, color='green')
        panel.legend()
    plt.show()

    print("[Best epoch]:", best_epoch)
    print("[Loss]:", lowest_val_loss, " test:", evaluation[0])
    print("[Accuracy]:", max(history['val_accuracy']), " test:", evaluation[1])
    

In [None]:
# Z-score the continuous features. scipy.stats.zscore works along axis 0 by
# default, so each feature column is standardised separately.
# NOTE(review): the statistics are computed on the full data set, i.e. before
# any train/validation/test split is made.
norm_continuous_array = scipy.stats.zscore(data[continuous_features].to_numpy())

# Rebuild the model input with the standardised features (replaces the earlier
# input_array); the one-hot spectral class columns are appended unchanged.
input_array = np.hstack((norm_continuous_array,spectral_array))

In [None]:
# Hyper-parameters of the neural network experiment.
BATCH_SIZE = 1  # samples per batch of the tf.data pipeline
DATASET_SIZE = input_array.shape[0]  # total number of samples (rows of input_array)
base_depth = 96  # units in each hidden Dense layer
dropout_prob = 0.4  # rate of the Dropout layer placed after each hidden layer
activation_func = tf.nn.leaky_relu  # activation of the hidden layers

In [None]:
# Number of batches in each split: 70% / 15% / 15% of the samples, divided by
# the batch size (integer division).
train_size = int(0.7 * DATASET_SIZE)//BATCH_SIZE
val_size = int(0.15 * DATASET_SIZE)//BATCH_SIZE
test_size = int(0.15 * DATASET_SIZE)//BATCH_SIZE

print("\n[Train size]:",train_size,"\n[Valid size]:", val_size,"\n[Test size]:", test_size )

In [None]:
# Train and evaluate the neural network on n_of_randomization independent random
# splits; NN_accuracy collects the test accuracy of every repetition.
NN_accuracy = []
for i in range(n_of_randomization):

    # Shuffle the samples ONCE per repetition. By default `shuffle` re-shuffles on
    # every pass over the dataset, so the take/skip splits below would hold
    # different samples in every epoch and again at evaluation time, letting
    # validation and test samples leak into training. The explicit seed makes the
    # split of each repetition reproducible, like `random_state=i` does for the
    # scikit-learn models in this notebook.
    dataset = (
        tf.data.Dataset.from_tensor_slices( (input_array,output_array) )
        .shuffle(DATASET_SIZE, seed=i, reshuffle_each_iteration=False)
        .batch(BATCH_SIZE)
    )
    train_data = dataset.take(train_size)
    test_data = dataset.skip(train_size)
    valid_data = test_data.skip(test_size)   # everything left after train and test
    test_data = test_data.take(test_size)
    # Keep a fresh order of the training batches in every epoch, but only inside
    # the training split.
    train_data = train_data.shuffle(train_size)

    # Three hidden Dense layers, each followed by Dropout, and a softmax output
    # with one unit per star type.
    StarType_Classifier = tf.keras.Sequential([
        tf.keras.Input(shape=(input_array.shape[1],)),
        tf.keras.layers.Dense(base_depth,activation=activation_func),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(base_depth,activation=activation_func),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(base_depth,activation=activation_func),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(len(star_type_dict),activation=tf.nn.softmax)
    ], name="star_type_classifier")

    StarType_Classifier.compile(optimizer=tf.optimizers.Adam(learning_rate=0.0001), loss='categorical_crossentropy',metrics=['accuracy'])

    # `batch_size` is deliberately not passed: the datasets are already batched,
    # and Keras asks not to specify it for dataset inputs.
    fit = StarType_Classifier.fit(train_data, epochs=400, validation_data=valid_data,
                        verbose=False,
                        callbacks=[tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.000001),
                                   tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=0.0, patience=100, verbose=1, mode='auto', restore_best_weights=True)])

    # evaluation is [loss, accuracy] on the held-out test split.
    evaluation = StarType_Classifier.evaluate(test_data)
    #plot_training(fit, evaluation)
    NN_accuracy.append( evaluation[1] )

In [None]:
# Histogram of the neural network's test accuracy over all repetitions.
sns.histplot(NN_accuracy)

# 2. Random forest

In [None]:
# Empty table that receives one column of feature importances per model.
feature_importance_df = pd.DataFrame()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance

In [None]:
# Repeat the random forest experiment on n_of_randomization random 80/20 splits.
# RF_accuracy holds the test accuracy of every repetition and RF_cv_accuracy the
# mean 5-fold cross-validation accuracy on the training part (this score used to
# be computed and then thrown away).
RF_accuracy = list()
RF_cv_accuracy = list()
for i in range(n_of_randomization):
    X_train, X_test, y_train, y_test = train_test_split(input_array, output_type_1d, test_size=0.2, random_state=i)
    random_forest_clf = RandomForestClassifier(max_depth=4, random_state=i)
    random_forest_clf.fit(X_train,y_train)
    # cross_val_score fits clones of the estimator, so the model fitted above is untouched.
    RF_cv_accuracy.append( cross_val_score(random_forest_clf, X_train, y_train, cv=5).mean() )
    RF_accuracy.append( random_forest_clf.score(X_test, y_test) )

In [None]:
# Histogram of the random forest's test accuracy over all repetitions.
sns.histplot(RF_accuracy)

In [None]:
# Permutation importance of the random forest left over from the last loop
# iteration, measured on the complete data set.
result = permutation_importance(
    random_forest_clf, input_array, output_type_1d,
    n_repeats=n_of_randomization, random_state=0,
)
importances, std = result.importances_mean, result.importances_std
indices = np.argsort(importances)[::-1]  # feature positions, most important first

# Print the feature ranking
print("Feature ranking:")
for rank, feature_idx in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, feature_names[feature_idx], importances[feature_idx]))

# Bar chart of the same ranking; the error bars show the spread over the repeats.
n_features = input_array.shape[1]
bar_positions = range(n_features)
ranked_names = [feature_names[i] for i in indices]

plt.figure()
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, ranked_names, rotation=90)
plt.xlim([-1, n_features])
plt.show()

In [None]:
# Store the mean permutation importances of the random forest, indexed by feature name.
feature_importance_df['Random forest']  = pd.Series( importances, index=feature_names )

# 3. Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Repeat the decision tree experiment on n_of_randomization random 80/20 splits.
# DT_accuracy holds the test accuracy of every repetition and DT_cv_accuracy the
# mean 5-fold cross-validation accuracy on the training part (this score used to
# be computed and then thrown away).
DT_accuracy = list()
DT_cv_accuracy = list()
for i in range(n_of_randomization):
    X_train, X_test, y_train, y_test = train_test_split(input_array, output_type_1d, test_size=0.2, random_state=i)
    decision_tree_clf = DecisionTreeClassifier(random_state=i)
    decision_tree_clf.fit(X_train,y_train)
    # cross_val_score fits clones of the estimator, so the model fitted above is untouched.
    DT_cv_accuracy.append( cross_val_score(decision_tree_clf, X_train, y_train, cv=5).mean() )
    DT_accuracy.append( decision_tree_clf.score(X_test, y_test) )

In [None]:
# Histogram of the decision tree's test accuracy over all repetitions.
sns.histplot(DT_accuracy)

In [None]:
# Permutation importance of the decision tree left over from the last loop
# iteration, measured on the complete data set.
result = permutation_importance(
    decision_tree_clf, input_array, output_type_1d,
    n_repeats=n_of_randomization, random_state=0,
)
importances, std = result.importances_mean, result.importances_std
indices = np.argsort(importances)[::-1]  # feature positions, most important first

# Print the feature ranking
print("Feature ranking:")
for rank, feature_idx in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, feature_names[feature_idx], importances[feature_idx]))

# Bar chart of the same ranking; the error bars show the spread over the repeats.
n_features = input_array.shape[1]
bar_positions = range(n_features)
ranked_names = [feature_names[i] for i in indices]

plt.figure()
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, ranked_names, rotation=90)
plt.xlim([-1, n_features])
plt.show()

In [None]:
# Store the mean permutation importances of the decision tree, indexed by feature name.
feature_importance_df['Decision tree'] = pd.Series( importances, index=feature_names )

# 4. Gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Repeat the gradient boosting experiment on n_of_randomization random 80/20 splits.
# GB_accuracy holds the test accuracy of every repetition and GB_cv_accuracy the
# mean 5-fold cross-validation accuracy on the training part (this score used to
# be computed and then thrown away).
GB_accuracy = list()
GB_cv_accuracy = list()
for i in range(n_of_randomization):
    X_train, X_test, y_train, y_test = train_test_split(input_array, output_type_1d, test_size=0.2, random_state=i)
    gradient_boosting_clf = GradientBoostingClassifier(max_depth=4, random_state=i)
    gradient_boosting_clf.fit(X_train,y_train)
    # cross_val_score fits clones of the estimator, so the model fitted above is untouched.
    GB_cv_accuracy.append( cross_val_score(gradient_boosting_clf, X_train, y_train, cv=5).mean() )
    GB_accuracy.append( gradient_boosting_clf.score(X_test, y_test) )

In [None]:
# Histogram of the gradient boosting model's test accuracy over all repetitions.
sns.histplot(GB_accuracy)

In [None]:
# Permutation importance of the gradient boosting model left over from the last
# loop iteration, measured on the complete data set.
result = permutation_importance(
    gradient_boosting_clf, input_array, output_type_1d,
    n_repeats=n_of_randomization, random_state=0,
)
importances, std = result.importances_mean, result.importances_std
indices = np.argsort(importances)[::-1]  # feature positions, most important first

# Print the feature ranking
print("Feature ranking:")
for rank, feature_idx in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, feature_names[feature_idx], importances[feature_idx]))

# Bar chart of the same ranking; the error bars show the spread over the repeats.
n_features = input_array.shape[1]
bar_positions = range(n_features)
ranked_names = [feature_names[i] for i in indices]

plt.figure()
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, ranked_names, rotation=90)
plt.xlim([-1, n_features])
plt.show()

In [None]:
# Store the mean permutation importances of the gradient boosting model, indexed by feature name.
feature_importance_df['Gradient boosting'] = pd.Series( importances, index=feature_names )

# 5. Support Vector Machine

In [None]:
from sklearn import svm

In [None]:
# Repeat the support vector machine experiment on n_of_randomization random 80/20
# splits. SVM_accuracy holds the test accuracy of every repetition and
# SVM_cv_accuracy the mean 5-fold cross-validation accuracy on the training part
# (this score used to be computed and then thrown away).
SVM_accuracy = list()
SVM_cv_accuracy = list()
for i in range(n_of_randomization):
    X_train, X_test, y_train, y_test = train_test_split(input_array, output_type_1d, test_size=0.2, random_state=i)
    svm_clf = svm.SVC(gamma='auto')
    svm_clf.fit(X_train,y_train)
    # cross_val_score fits clones of the estimator, so the model fitted above is untouched.
    SVM_cv_accuracy.append( cross_val_score(svm_clf, X_train, y_train, cv=5).mean() )
    SVM_accuracy.append( svm_clf.score(X_test, y_test) )

In [None]:
# Histogram of the support vector machine's test accuracy over all repetitions.
sns.histplot(SVM_accuracy)

In [None]:
# Permutation importance of the support vector machine left over from the last
# loop iteration, measured on the complete data set.
result = permutation_importance(
    svm_clf, input_array, output_type_1d,
    n_repeats=n_of_randomization, random_state=0,
)
importances, std = result.importances_mean, result.importances_std
indices = np.argsort(importances)[::-1]  # feature positions, most important first

# Print the feature ranking
print("Feature ranking:")
for rank, feature_idx in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, feature_names[feature_idx], importances[feature_idx]))

# Bar chart of the same ranking; the error bars show the spread over the repeats.
n_features = input_array.shape[1]
bar_positions = range(n_features)
ranked_names = [feature_names[i] for i in indices]

plt.figure()
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, ranked_names, rotation=90)
plt.xlim([-1, n_features])
plt.show()

In [None]:
# Store the mean permutation importances of the support vector machine, indexed by feature name.
feature_importance_df['Support vector machine'] = pd.Series( importances, index=feature_names )

# 6. AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Repeat the AdaBoost experiment on n_of_randomization random 80/20 splits.
# AB_accuracy holds the test accuracy of every repetition and AB_cv_accuracy the
# mean 5-fold cross-validation accuracy on the training part (this score used to
# be computed and then thrown away).
AB_accuracy = list()
AB_cv_accuracy = list()
for i in range(n_of_randomization):
    X_train, X_test, y_train, y_test = train_test_split(input_array, output_type_1d, test_size=0.2, random_state=i)
    # Seed the classifier like the other tree-based models in this notebook, so
    # that a repetition gives the same result when the notebook is re-run.
    adaboost_clf = AdaBoostClassifier(n_estimators=100, random_state=i)
    adaboost_clf.fit(X_train,y_train)
    # cross_val_score fits clones of the estimator, so the model fitted above is untouched.
    AB_cv_accuracy.append( cross_val_score(adaboost_clf, X_train, y_train, cv=5).mean() )
    AB_accuracy.append( adaboost_clf.score(X_test, y_test) )

In [None]:
# Histogram of the AdaBoost model's test accuracy over all repetitions.
sns.histplot(AB_accuracy)

In [None]:
# Permutation importance of the AdaBoost model left over from the last loop
# iteration, measured on the complete data set.
result = permutation_importance(
    adaboost_clf, input_array, output_type_1d,
    n_repeats=n_of_randomization, random_state=0,
)
importances, std = result.importances_mean, result.importances_std
indices = np.argsort(importances)[::-1]  # feature positions, most important first

# Print the feature ranking
print("Feature ranking:")
for rank, feature_idx in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, feature_names[feature_idx], importances[feature_idx]))

# Bar chart of the same ranking; the error bars show the spread over the repeats.
n_features = input_array.shape[1]
bar_positions = range(n_features)
ranked_names = [feature_names[i] for i in indices]

plt.figure()
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, ranked_names, rotation=90)
plt.xlim([-1, n_features])
plt.show()

In [None]:
# Store the mean permutation importances of the AdaBoost model, indexed by feature name.
feature_importance_df['AdaBoost'] = pd.Series( importances, index=feature_names )

# Performance Summary and Best Features

In [None]:
# One column of test accuracies per model, one row per repetition.
accuracy_by_model = {
    'AdaBoost': AB_accuracy,
    'Neural network': NN_accuracy,
    'Support vector machine': SVM_accuracy,
    'Gradient boosting': GB_accuracy,
    'Random forest': RF_accuracy,
    'Decision tree': DT_accuracy,
}
accuracy_data = pd.DataFrame(accuracy_by_model)

In [None]:
# Long format: one row per (model, accuracy) pair, as the seaborn plotters expect.
accuracy_long = accuracy_data.melt(var_name='Model')

fig, ax = plt.subplots()
shared_args = dict(x='Model', y='value', data=accuracy_long, ax=ax)
# Box plot without outlier markers, with every single run drawn on top as a black dot.
sns.boxplot(showfliers=False, **shared_args)
sns.stripplot(color='black', **shared_args)
ax.set(xlabel='Model',ylabel='Accuracy',title='Perfomance')
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)

The average accuracy is greater than 95% for the majority of the models.
The top-performing model is the Decision Tree.

In [None]:
# Column order for the importance heat map: the list below, reversed.
sorted_models = ['AdaBoost', 'Support vector machine','Gradient boosting','Random forest','Decision tree'][::-1]

In [None]:
# Heat map of the feature importances: one row per feature, one column per model
# (in the order given by sorted_models), with the values written in the cells.
sns.heatmap(feature_importance_df[sorted_models], annot=True)

The best features for prediction are R, A_M and Temperature. The analysis did not consider the Colors.