In [None]:
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Data Exploration

**First, I am going to identify and separate the categorical features.**

In [None]:
# Load the star catalogue and give every column a readable name.
column_names = [
    'Temperature',
    'Relative Luminosity',
    'Relative Radius',
    'Absolute Magnitude',
    'Color',
    'Spectral Class',
    'Star Type',
]
data = pd.read_csv('/kaggle/input/star-type-classification/Stars.csv')
data.columns = column_names

# Columns stored with object dtype are treated as the categorical features.
categorical = [name for name, dtype in data.dtypes.items() if dtype == 'O']

# Show the dtype of each column as the cell output.
data.dtypes

**Let's take a look at the distribution of the continuous features.**

In [None]:
# Display summary statistics of the frame as the cell output.
data.describe()

**Fixing the format of the color column**

In [None]:
# Distinct spectral-class labels in sorted order (missing values excluded).
spectral_classes = sorted(data['Spectral Class'].dropna().unique())

# Number of rows carrying each raw color label.
data.groupby('Color')['Color'].count()

We need to unify labels that are written in different formats, such as 'Blue White' and 'Blue-White'. I am also going to rename similar colors to a common parent color: the sample set is small, so some colors would otherwise have only 1 or 2 samples.

In [None]:
# Rare or differently worded colors are folded into a parent color so that
# no category is left with only one or two samples.
color_aliases = {
    'orange red': 'orange',
    'pale yellow orange': 'orange',
    'white yellow': 'white',
    'whitish': 'white',
    'yellowish white': 'yellow white',
    'yellowish': 'yellow white',
}

data['Color'] = (
    data['Color']
    # Treat the hyphen as a literal character regardless of the pandas
    # version's default for `regex`.
    .str.replace('-', ' ', regex=False)
    # Collapse repeated whitespace and drop leading/trailing blanks so that
    # labels differing only by stray spaces end up identical.
    .str.split()
    .str.join(' ')
    .str.lower()
    # Exact-match replacement of whole labels; unlisted labels are untouched.
    .replace(color_aliases)
)

In [None]:
# Number of rows carrying each label of the Color column.
data.groupby('Color')['Color'].count()


In [None]:
# Background color applied to the axes and legends of the bar charts.
background = '#f5f3f0'

# Hex code used to draw each star color; the key order is the stacking
# order of the bars.
colors = {
    'blue': '#0000FF',
    'blue white': '#b1e6f6',
    'orange': '#FFA500',
    'red': '#FF0000',
    'white': '#FFFFFF',
    'yellow white': '#FFFFE0',
}

**Exploring the relations between different features**

In [None]:
# Pairwise view of the features, colored by star type: histograms on the
# diagonal and scatter plots in the lower triangle.
g = sns.PairGrid(
    data=data,
    hue='Star Type',
    palette='colorblind',
    diag_sharey=False,
)
g.map_diag(sns.histplot)
g.map_lower(sns.scatterplot)
g.add_legend()
plt.show()

**Absolute Magnitude seems to be our most important feature. Furthermore, in the Absolute Magnitude / Temperature chart it is almost possible to draw the clusters by hand.**

In [None]:
# Temperature against absolute magnitude, one marker color per star type.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    data=data,
    x='Temperature',
    y='Absolute Magnitude',
    hue='Star Type',
    palette='colorblind',
    ax=ax,
)
plt.show()

**I think these two features are enough to separate our classes; however, let's take a look at the categorical features.**

In [None]:
# Stacked bar chart: for every star type, the share of each star color.

# Number of samples per star type; used as the percentage denominator
# instead of assuming a fixed class size.
type_totals = data.groupby('Star Type').size()
star_types = list(type_totals.index)

# Wide table of counts: one row per star type, one column per palette color.
# Pairs that never occur are filled with 0 so every bar segment has a height.
# Colors missing from the `colors` palette are not plotted, so a type's bar
# only reaches 100% when all of its colors are in the palette.
color_counts = (
    pd.crosstab(data['Star Type'], data['Color'])
    .reindex(index=star_types, columns=list(colors), fill_value=0)
)
color_percentages = color_counts.div(type_totals, axis=0) * 100

# Long-format summary: one row per (star type, color) pair.
color_totals = (
    color_counts
    .reset_index()
    .melt(id_vars='Star Type', var_name='Color', value_name='Count')
)
color_totals['Percentage'] = (
    color_totals['Count'] / color_totals['Star Type'].map(type_totals) * 100
)
color_totals = color_totals.sort_values(by=['Star Type']).reset_index(drop=True)

fig, ax = plt.subplots()
ax.set_facecolor(background)
ax.set_ylabel('Percentage')
ax.set_xlabel('Star Types')
labels = star_types
width = 0.5

# `bottom` tracks the running top of each stack so the next color is drawn
# above the ones already plotted.
bottom = np.zeros(len(star_types))
for color, hex_code in colors.items():
    heights = color_percentages[color].to_numpy()
    ax.bar(labels, heights, width, label=color, color=hex_code, bottom=bottom)
    bottom = bottom + heights

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., facecolor=background)
plt.title('Star Type Distributions By Star Color')
plt.show()

In [None]:
# Stacked bar chart: for every star type, the share of each spectral class.

# Number of samples per star type; used as the percentage denominator
# instead of assuming a fixed class size.
type_totals = data.groupby('Star Type').size()
star_types = list(type_totals.index)

# Wide table of counts: one row per star type, one column per spectral class.
# Pairs that never occur are filled with 0 so every bar segment has a height.
sclass_counts = (
    pd.crosstab(data['Star Type'], data['Spectral Class'])
    .reindex(index=star_types, columns=list(spectral_classes), fill_value=0)
)
sclass_percentages = sclass_counts.div(type_totals, axis=0) * 100

# Long-format summary: one row per (star type, spectral class) pair.
sclass_totals = (
    sclass_counts
    .reset_index()
    .melt(id_vars='Star Type', var_name='Spectral Class', value_name='Count')
)
sclass_totals['Percentage'] = (
    sclass_totals['Count'] / sclass_totals['Star Type'].map(type_totals) * 100
)
sclass_totals = sclass_totals.sort_values(by=['Star Type']).reset_index(drop=True)

fig, ax = plt.subplots()
ax.set_facecolor(background)
ax.set_ylabel('Percentage')
ax.set_xlabel('Star Types')
labels = star_types
width = 0.5

# `bottom` tracks the running top of each stack so the next class is drawn
# above the ones already plotted.
bottom = np.zeros(len(star_types))
for sclass in spectral_classes:
    heights = sclass_percentages[sclass].to_numpy()
    ax.bar(labels, heights, width, label=sclass, bottom=bottom)
    bottom = bottom + heights

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., facecolor=background)
plt.title('Star Type Distributions By Star sclass')
plt.show()

**The categorical features seem to be helpful only for discerning star types 0 and 1, so I am not going to include them in the final model.**

# Classification

In [None]:
# Fixed seed so the split, the model and the reported accuracy are
# reproducible on a fresh kernel.
RANDOM_STATE = 42

# Integer-encode the categorical columns in place. They are not part of the
# feature matrix X below, so they do not influence the model.
encoder = LabelEncoder()
data['Color'] = encoder.fit_transform(data['Color'])
data['Spectral Class'] = encoder.fit_transform(data['Spectral Class'])

# Target and the two selected features.
y = data['Star Type']
X = data[['Absolute Magnitude', 'Temperature']]

# Split the feature matrix X, not the full frame: `data` still contains the
# 'Star Type' target, and training on it would leak the label to the model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# The number of classes is left for the classifier to derive from y_train;
# 'num_classes' is not a parameter XGBoost recognises.
clf = XGBClassifier(
    max_depth=5,
    objective='multi:softmax',
    n_estimators=100,
    random_state=RANDOM_STATE,
)
clf.fit(X_train, y_train)

# Accuracy on the held-out 30% of the samples.
pred = clf.predict(X_test)
acc = accuracy_score(y_test, pred)

In [None]:
# Report the held-out accuracy as a percentage.
print(f'Accuracy : {acc*100}%')