In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt # for graphics
import seaborn as sns # too for graphics

# Reading the dataset

In [None]:
# Print the full path of every file found under the Kaggle input directory
import os
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Location of the CSV file, relative to the notebook
path = '../input/star-type-classification/Stars.csv'

# Short column codes mapped to descriptive names
new_names = {
    'L': 'Relative_Luminosity',
    'R': 'Relative_Radius',
    'A_M': 'Absolute_Magnitude',
}

# Read the file and apply the descriptive column names in one chain
dataset = pd.read_csv(path).rename(columns=new_names)

# Preview the first rows
dataset.head()

# Classifying Stars - Context
As far as I know, stars can be classified in many ways, depending on which of their properties are considered. This dataset asks us to classify stars into categories that appear to be based on size. I found a [video](https://www.youtube.com/watch?v=Y5VU3Mp6abI) that explains those categories. `The target is the type`.

The video talks about the ***H-R Diagram***, which shows the 6 clusters of stars that we want to classify. This plot is `composed of the temperature on X and the luminosity on Y`.

In [None]:
# Hertzsprung-Russell style scatter plot: log10(temperature) on X against
# log10(relative luminosity) on Y, with one colour per star type.
Y = np.log10(dataset['Relative_Luminosity'].values)
X = np.log10(dataset['Temperature'].values)
clusters = dataset['Type'].values

# one fixed colour per Type code
cdict = {0: 'skyblue', 1: 'indigo', 2: 'orange', 3: 'yellow', 4: 'limegreen', 5: 'lightcoral'}

fig, ax = plt.subplots()
for g in np.unique(clusters):
    # boolean mask selecting the stars of the current type
    mask = clusters == g
    ax.scatter(X[mask], Y[mask], color=cdict[g], label=g)

# label the figure so it can be read on its own
ax.set_title('H-R Diagram')
ax.set_xlabel('log10(Temperature)')
ax.set_ylabel('log10(Relative Luminosity)')
# H-R diagrams are conventionally drawn with temperature decreasing to the right
ax.invert_xaxis()
ax.legend(title='Type')
plt.show()

# Data processing

## Processing Color - One Hot Encoding

In [None]:
# Count how many stars carry each raw Color label
color_counts = dataset['Color'].value_counts()
color_counts

In [None]:
# Several colours appear under more than one spelling or capitalisation, so
# every raw label is mapped to one canonical name. This keeps the later
# one-hot encoding from creating a separate column per spelling.
fixed_names = {
    'Red' : 'Red',
    'Blue' : 'Blue',
    'Blue-white' : 'Blue-white',
    'Blue White' : 'Blue-white',
    'yellow-white' : 'White-Yellow',
    'White' : 'White',
    'Blue white' : 'Blue-white',
    'white' : 'White',
    'Yellowish White' : 'White-Yellow',
    'yellowish' : 'Yellow',
    'Orange' : 'Orange',
    'Whitish' : 'White',
    'Yellowish' : 'Yellow',
    'Blue-White' : 'Blue-white',
    'Pale yellow orange' : 'Orange',
    'Orange-Red' : 'Orange',
    'White-Yellow' : 'White-Yellow',
    # 'Yellow' is produced by the entries above but was not itself a key;
    # mapping it to itself keeps this cell safe to run on already-cleaned data
    'Yellow' : 'Yellow',
}

# .map() turns every label that is missing from the dict into NaN, and NaN
# later becomes an all-zero one-hot row. Stop with a clear message instead of
# losing the colour silently. Rows whose Color was already missing are ignored.
mapped_colors = dataset['Color'].map(fixed_names)
unmapped = dataset.loc[mapped_colors.isna() & dataset['Color'].notna(), 'Color'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Color labels missing from fixed_names: {list(unmapped)}")
dataset['Color'] = mapped_colors.astype('category')

# bar chart of Type against the cleaned Color values
figure = plt.figure(figsize=(20,8))
sns.barplot(x="Color", y="Type", data=dataset)
plt.show()

In [None]:
# make sure Color is categorical so that integer codes are available
dataset['Color'] = dataset['Color'].astype('category')
# store the integer code of each row's colour in a helper column
dataset['Color_cats'] = dataset['Color'].cat.codes

# colour names ordered by frequency, most common first
colors = list(dataset['Color'].value_counts().index)
# Look each id up from its colour name instead of running a second,
# independent value_counts() on the codes: two separate frequency sorts are
# not guaranteed to order tied counts the same way, which would pair an id
# with the wrong colour.
ids = [dataset['Color'].cat.categories.get_loc(color) for color in colors]
dataset.head()

In [None]:
# lookup table pairing each category id with its colour name
df = pd.DataFrame([*zip(ids, colors)], columns=['Ids', 'Color'])
# one indicator column per colour, named Color_<colour>
encoding = pd.get_dummies(dataset['Color'], prefix='Color')
# attach the indicator columns, then remove the two source columns
dataset = (
    pd.concat([dataset, encoding], axis=1, join="inner")
    .drop(columns=['Color_cats', 'Color'])
)
print(dataset.shape)
dataset.head()

## Processing Spectral Class - One Hot Encoding

In [None]:
# bar chart of Type against Spectral_Class
figure = plt.figure(figsize=(20, 8))
sns.barplot(data=dataset, x="Spectral_Class", y="Type")
plt.show()

In [None]:
# Spectral_Class goes through the same steps as Color: category codes,
# an id/label lookup table, then one-hot indicator columns.
col = 'Spectral_Class'

# convert to categorical so that integer codes are available
dataset[col] = dataset[col].astype('category')
# store the integer code of each row's class in a helper column
dataset[f'{col}_cats'] = dataset[col].cat.codes

# class labels ordered by frequency, most common first
# (the variable keeps the name `colors` used by the Color cells)
colors = list(dataset[col].value_counts().index)
# Look each id up from its label instead of running a second, independent
# value_counts() on the codes: two separate frequency sorts are not
# guaranteed to order tied counts the same way.
ids = [dataset[col].cat.categories.get_loc(label) for label in colors]

# lookup table of ids and labels (column header kept as 'Color')
df = pd.DataFrame(list(zip(ids, colors)),columns=['Ids', 'Color'])
# one indicator column per spectral class, named Spectral_Class_<label>
encoding = pd.get_dummies(dataset[col], prefix=col)
# attach the indicator columns to the dataset
dataset = pd.concat([dataset, encoding], axis=1, join="inner")
# drop the helper code column and the original Spectral_Class column
dataset.drop([f'{col}_cats', col], axis=1, inplace=True)
dataset.head()

## Processing Temperature, Luminosity, Radius and Magnitude - Standard Scaler

In [None]:
from sklearn.preprocessing import StandardScaler # the scaler

# numeric columns to standardise; the order fixes the column order of scaled_vals
cols = ['Temperature','Absolute_Magnitude','Relative_Luminosity','Relative_Radius' ]

# fit the scaler and transform the same columns in one call
# NOTE(review): the scaler is fitted on every row, before the train/test
# split that happens further down the notebook
temp_scaler = StandardScaler()
scaled_vals = temp_scaler.fit_transform(dataset[cols])

# write each scaled column back over the original one
for position, column in enumerate(cols):
    dataset[column] = scaled_vals[:, position]
dataset.head()

## The target: Type

In [None]:
# pull the Type column out of the feature table as an array;
# pop() also removes the column from dataset in place
target = dataset.pop('Type').values
target

In [None]:
# preview the feature table after encoding and scaling, with Type removed
dataset.head()

In [None]:
# (rows, columns) of the feature table
dataset.shape

## Split the dataset on train and test

In [None]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    dataset,
    target,
    test_size=0.2,
    random_state=2021,
)

# The Models
There are many models that can be used for this exercise. Since there is not much data, a neural network may not be the best option. The models included here come from sklearn and are used for classification. These are the models:

* [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
* [KNeighborsClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)
* [DecisionTreeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)
* [RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)
* [GradientBoostingClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)
* [LGBMClassifier](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html)
* [KMeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html?highlight=kmeans#sklearn.cluster.KMeans)

In [None]:
# many models to prove how is them accuracy
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# and import the cross validation score to eval the models
from sklearn.model_selection import cross_val_score

In [None]:
# Fit every candidate classifier on the training split.
# Each estimator that uses random numbers is given a fixed seed so that a
# fresh run of the notebook reproduces the same fitted models.

# solver="liblinear" suits small datasets; per the scikit-learn docs it uses
# random_state to shuffle the data, so it is seeded as well
lj = LogisticRegression(solver="liblinear", random_state=0).fit(X_train, Y_train)
knn = KNeighborsClassifier().fit(X_train, Y_train)
dtc = DecisionTreeClassifier(random_state=0).fit(X_train, Y_train)
rfc = RandomForestClassifier(random_state=0,verbose=False).fit(X_train, Y_train)
# seeded so the fitted trees are the same on every run
gbmc = GradientBoostingClassifier(random_state=0, verbose=False).fit(X_train, Y_train)

# keep the fitted models in a list so they can be evaluated in a loop
models = [lj, knn, dtc, rfc, gbmc]

In [None]:
# Score every fitted model on the held-out test split.
# cross_val_score() clones the estimator and refits it on folds of whatever
# data it is given, so calling it with X_test would ignore the models that
# were fitted on X_train and would train on the test rows instead.
for model in models:
    # class name of the estimator, used as its label
    name = model.__class__.__name__
    # predictions of the already fitted model for the unseen rows
    predictions = model.predict(X_test)
    # accuracy: fraction of test stars whose type is predicted correctly
    score = np.mean(predictions == Y_test)
    # root mean squared difference between predicted and true type codes;
    # note that this treats the class codes as if they were ordered numbers
    error = np.sqrt(np.mean((predictions - Y_test) ** 2))
    # show the results
    print("-> " + name + ": ")
    print('Accuracy:', score)
    print('Error:', error)
    print("*" * 20)
    

# Visualize the evaluations

In [None]:
# Bar chart of the held-out accuracy of every fitted model.

# Collect one record per model and build the frame once. DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is slow anyway.
records = []
for model in models:
    # class name of the estimator, used as its label
    name = model.__class__.__name__
    # accuracy of the model fitted on X_train, measured on the test rows
    # (cross_val_score on X_test would refit a clone on the test data instead)
    score = np.mean(model.predict(X_test) == Y_test)
    records.append({"model": name, "score": score})
results = pd.DataFrame(records, columns=["model","score"])

# then plot the results
figure = plt.figure(figsize=(20,8))
sns.barplot(x="score",y="model",data=results)
plt.xlabel("Accuracy")
plt.ylabel("Model")
# accuracy is a fraction, so the axis runs from 0 to 1
plt.xlim(0,1)
plt.title("Model Accuracy Score")
plt.show()

In [None]:
# Bar chart of the held-out error of every fitted model.

# Collect one record per model and build the frame once. DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is slow anyway.
records = []
for model in models:
    # class name of the estimator, used as its label
    name = model.__class__.__name__
    # root mean squared difference between predicted and true type codes for
    # the model fitted on X_train, measured on the test rows; note that this
    # treats the class codes as if they were ordered numbers
    predictions = model.predict(X_test)
    error = np.sqrt(np.mean((predictions - Y_test) ** 2))
    records.append({"model": name, "error": error})
results = pd.DataFrame(records, columns=["model","error"])

# then plot the results
figure = plt.figure(figsize=(20,8))
sns.barplot(x="error",y="model",data=results)
plt.xlabel("Error")
plt.ylabel("Model")
# the error can exceed 1 (type codes differ by up to 5), so only the lower
# bound of the axis is fixed
plt.xlim(left=0)
plt.title("Model Error Score")
plt.show()

# Results

As we can see, the **DecisionTreeClassifier** looks like the best option, as it has 100% accuracy and 0 error.
However, **RandomForestClassifier** and **GradientBoostingClassifier** also perform well, with more than 90% accuracy and a fairly low error value.

# Answer to a comment:
So from here, how might one go about using this model if given some measurements on an unclassified star?

I will put the measurements in a **DataFrame**, then apply the same **one-hot encoding and the fitted Standard Scaler** to normalize them, and finally use the normalized data to predict with a model. The next cells walk through the process.

## 1 - Set the new measurements in a DataFrame
For example the first star from the dataset.

In [None]:
# Read the raw file again under a different name (data) so the processed
# `dataset` is left untouched, and keep only the first star as the
# "new measurement".

# same column renaming as for the training data
new_names = {'L': 'Relative_Luminosity', 'R': 'Relative_Radius', 'A_M': 'Absolute_Magnitude'}

# iloc[[0]] (a list index) returns a one-row DataFrame and keeps each
# column's dtype; going through `.loc[0].values` would turn every column
# into object dtype.
data = (
    pd.read_csv(path)
    .rename(columns=new_names)
    # the target is what we want to predict, so it is not part of the input
    .drop(columns=['Type'])
    .iloc[[0]]
    .reset_index(drop=True)
)

## 2 - Apply the Scaler and Encoder to the data
These must be the same, or have the same parameters.

In [None]:
# Step one: scale the numeric columns with the already fitted temp_scaler.
# The column order must match the order used when the scaler was fitted.
num_cols = ['Temperature','Absolute_Magnitude','Relative_Luminosity','Relative_Radius']

# transform only; the scaler is not refitted here
scal_vals = temp_scaler.transform(data[num_cols])
# write each scaled column back over the original one
for position, column in enumerate(num_cols):
    data[column] = scal_vals[:, position]
data
# expected: -0.779382  -0.598624  -0.459210  1.11674

In [None]:
# Step two: one-hot encoding.
# A small helper builds the indicator columns for a single categorical value;
# it is applied to both Color and Spectral_Class.
def set_encoding(value, target_col, cols):
    """Build a one-row DataFrame of 0/1 indicator columns.

    Parameters
    ----------
    value : str
        Category value of the new measurement.
    target_col : str
        Prefix of the indicator columns, e.g. 'Color'.
    cols : iterable of str
        Names of the indicator columns to produce, in order.

    Returns
    -------
    pandas.DataFrame
        A single row in which the column named ``target_col + '_' + value``
        holds 1 and every other column holds 0. If no column has that name,
        every column holds 0.
    """
    # name of the one column that must be switched on
    hot_column = target_col + '_' + value
    flags = {name: [1 if name == hot_column else 0] for name in cols}
    return pd.DataFrame(flags)


# indicator columns created for the training data, read from the processed dataset
color_cols = [name for name in dataset.columns if name.startswith('Color_')]
spectral_cols = [name for name in dataset.columns if name.startswith('Spectral_Class_')]

# The raw file spells some colours in several ways, and the training data was
# collapsed to canonical names through fixed_names. The new measurement must
# go through the same mapping; otherwise a variant such as 'Blue White' finds
# no matching indicator column and every Color_* column stays at 0.
raw_color = data['Color'].values[0]
color_val = fixed_names.get(raw_color, raw_color)

# Spectral_Class was encoded without renaming, so the raw value is used as is
spectral_val = data['Spectral_Class'].values[0]

# build the indicator rows for Color and Spectral_Class
color_encoding = set_encoding(color_val, 'Color', color_cols)
spectral_encoding = set_encoding(spectral_val, 'Spectral_Class', spectral_cols)
color_encoding

In [None]:
# replace the two categorical columns with their indicator columns in one step:
# numeric columns first, then the Color_* columns, then the Spectral_Class_* columns
data = pd.concat(
    [data.drop(columns=['Color', 'Spectral_Class']), color_encoding, spectral_encoding],
    axis=1,
)
data

In [None]:
# element-wise check of the rebuilt sample against row 0 of the processed dataset
first_row = dataset.loc[0]
data.to_numpy() == first_row.to_numpy()

## 3 - Finally use a model for predict the measurements

In [None]:
# predict with the fitted DecisionTreeClassifier and show the prediction
# next to the known type of the first star
pred = dtc.predict(data)
result = target[0]
for label, value in (('Prediction:', pred), ('Answer:', result)):
    print(label, value)

# Conclusion to comment
That is how I would answer the question. Also, if the prediction process needs to run in a single .py script, the best option is probably to wrap this final process in a single function. The information needed to do that is:

* The model and scaler parameters. The Scikit-Learn documentation explains how to extract the parameters, which you can then save as numpy files.
* The column names from the original dataset, which are needed in this case because I did not use the OneHotEncoding module that Scikit-Learn [provides](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html).

Those parameters are needed if you want to run the model and use it to predict in a single script. Here is some documentation about how to save parameters:

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
https://scikit-learn.org/stable/modules/model_persistence.html