In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import cross_val_score

In [None]:
# Load the star dataset from the Kaggle input directory and preview the first rows.
stars = pd.read_csv(filepath_or_buffer="../input/star-dataset/6 class csv.csv")
stars.head()

The data is mostly numerical, but star color is categorical. Let's take a closer look at it.

In [None]:
# Count how many stars carry each raw color label.
stars.loc[:, "Star color"].value_counts()

These color labels are not consistent. For example, what is the difference between Blue-white and Blue White? I can clean these up and reduce the number of distinct categories.

In [None]:
# Normalize the color labels: lowercase them and strip spaces and hyphens so
# that variants such as "Blue-white" and "Blue White" collapse into one label.
# regex=False treats each pattern as a literal string, so the result does not
# depend on the pandas version's default for `str.replace`.
stars["Star color"] = (
    stars["Star color"]
    .str.lower()
    .str.replace(" ", "", regex=False)
    .str.replace("-", "", regex=False)
)
stars["Star color"].value_counts()

This has reduced the number of inconsistent categories, but there are still a few odd ones. This might be a bit of a leap, but I am of the opinion that yellowwhite, yellowishwhite, and whiteyellow are the same color, and that whitish can be grouped into white. I also fold orangered and paleyelloworange into orange.

In [None]:
# Merge labels that are treated as synonyms into a single canonical label.
# regex=False keeps every pattern a literal substring match, independent of
# the pandas version's default for `str.replace`.
stars["Star color"] = (
    stars["Star color"]
    .str.replace("yellowishwhite", "yellowwhite", regex=False)
    .str.replace("whiteyellow", "yellowwhite", regex=False)
    .str.replace("whitish", "white", regex=False)
    .str.replace("orangered", "orange", regex=False)
    .str.replace("paleyelloworange", "orange", regex=False)
)

In [None]:
# Re-count the color labels after the clean-up steps.
stars.loc[:, "Star color"].value_counts()

I am personally OK with this setup. I am willing to believe that someone legitimately classified the remaining categories as separate entities. Anyway, let's continue with the data exploration.

In [None]:
# Draw a 20-bin histogram per column; the trailing semicolon hides the axes repr.
stars.hist(figsize=(20, 15), bins=20);

It looks like most of the stars are actually not too different from our own sun, at least in terms of temperature, luminosity, and radius. The sun's absolute magnitude is ~4.83, so it actually sits in a region of the Absolute Magnitude plot that is less populated by the stars in this dataset.

How do some of the features correlate with each other?

In [None]:
# Pairwise correlations between the numeric columns only. The text-valued
# columns ("Star color", "Spectral Class") are dropped explicitly because
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr_mat = stars.select_dtypes(include="number").corr()
# annot=True writes each coefficient into its cell; the semicolon hides the axes repr.
sns.heatmap(corr_mat, annot=True);

It looks like there is a fairly strong negative correlation between Absolute Magnitude and Star Type, with most of the other feature pairs falling between $\pm 0.5$.

# Now let's get into some preprocessing.

First, split the data into training and test data

In [None]:
# Separate the target ("Star type") from the features, then hold out 20% of
# the rows as a test set; the fixed seed keeps the split repeatable.
y = stars["Star type"]
X = stars.drop(columns="Star type")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.head()

Now, we need to encode the object (categorical) columns so that our algorithm can handle them properly.

In [None]:
# Column groups: text-valued features are one-hot encoded, numeric features
# are standardized.
cat_columns = ["Star color", "Spectral Class"]
num_columns = ["Temperature (K)", "Luminosity(L/Lo)", "Radius(R/Ro)", "Absolute magnitude(Mv)"]

# handle_unknown="ignore" encodes a category not seen during fit as all zeros
# instead of raising when the test set is transformed.
enc = OneHotEncoder(handle_unknown="ignore")
scaler = StandardScaler()

# The encoder is applied directly to the categorical columns; wrapping it in a
# second, nested ColumnTransformer (as before) added nothing.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", enc, cat_columns),
        ("scale", scaler, num_columns),
    ]
)

# Preprocessing and classifier live in one pipeline so the encoder and scaler
# are fit on the training data only. The fixed random_state makes the forest,
# and therefore the reported score, reproducible across re-runs.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestClassifier(random_state=42)),
    ]
)


I am going to use a Random Forest for this problem. If that does not prove terribly useful, I will reassess and try another model.

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted pipeline on the held-out test split.
model.score(X=X_test, y=y_test)

So our accuracy is 1.0, which means that the model correctly classified every star in the test set. This is good, and I think for a simpler data set like this one, it is OK to leave it at that. I think this was a good exercise in getting used to typing out ML code.