In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
%matplotlib inline
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the star-type CSV from the Kaggle input directory and preview the first rows.
stars_csv = '../input/star-type-classification/Stars.csv'
data = pd.read_csv(stars_csv)
data.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,classification_report

# Data Exploration

In [None]:
# Preview the first rows of the raw frame before exploring it.
data.head()

In [None]:
# Summary statistics of the frame's columns.
data.describe()

In [None]:
# Report the frame's dimensions and the number of missing values per column.
# The bare '\n' arguments reproduce the blank-line spacing after each section.
missing_per_column = data.isnull().sum()
print(
    f"shape: {data.shape}",
    '\n',
    f"missing value:\n{missing_per_column}",
    '\n',
    sep='\n',
)

* The dataset contains both numeric and categorical features
* 240 records, 6 features and 1 label
* No missing values

In [None]:
# Horizontal box plots of the feature columns (the Type label is excluded).
fig, ax = plt.subplots(figsize=(15, 5))
data.drop(columns='Type').boxplot(vert=False, ax=ax)
ax.set_title('Data Distribution')
ax.set_xlabel('value')
plt.show()

In [None]:
# Bar chart of how many records belong to each Type (class-balance check).
fig, ax = plt.subplots(figsize=(5, 3))
type_counts = data['Type'].value_counts()
type_counts.plot.bar(ax=ax)
ax.set_title('Record count of each Type')
plt.show()

* The range of values is very wide
* But the classes are perfectly balanced (40 records per Type)
* So let's check whether any feature is unrelated to Type and could be dropped

In [None]:
# Numeric feature columns; the exploration plots below iterate over this list.
col_num = ['Temperature', 'L', 'R', 'A_M']
# Categorical feature columns.
col_cat = ['Color','Spectral_Class']

In [None]:
# Mean of each numeric feature per Type, drawn as one bar chart per feature.
# The numeric columns are selected *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the string columns, which raises a
# TypeError on pandas >= 2.0. Computing the table once also avoids repeating
# the same group-by inside the loop.
type_means = data.groupby('Type')[col_num].mean()

# squeeze=False keeps `axes` two-dimensional even if col_num has one entry.
fig, axes = plt.subplots(1, len(col_num), figsize=(20, 5), squeeze=False)
for ax, c in zip(axes[0], col_num):
    type_means[c].plot.bar(ax=ax)
    ax.set_title(f'Mean of {c}')
plt.show()

* We can clearly see the difference between the Types for each numeric feature

In [None]:
plt.figure(figsize=(5,5))
# Pairwise correlation of the numeric features (Type label excluded).
# Only numeric columns are kept before calling .corr(): on pandas >= 2.0 the
# string columns are no longer dropped silently and .corr() raises on them.
num_corr = data.drop('Type', axis=1).select_dtypes(include='number').corr()
# Fix the colour scale to [-1, 1] so that zero correlation sits at the centre
# of the diverging palette.
sns.heatmap(num_corr, vmin=-1, vmax=1, annot=True, cmap='RdBu')
plt.title('Numeric Feature Correlation')
plt.show()

> Linear fit: $y = mx + c$

In [None]:
# Scatter-plot matrix of the numeric features with a straight line
# (y = m*x + c) fitted to every pair. Row j selects the y feature and
# column k the x feature. The grid size follows len(col_num) instead of a
# hard-coded 4x4, so the plot keeps working if the feature list changes.
n_num = len(col_num)
fig, axes = plt.subplots(n_num, n_num, figsize=(22, 22), squeeze=False)
for j, y_name in enumerate(col_num):
    for k, x_name in enumerate(col_num):
        ax = axes[j, k]
        x_values, y_values = data[x_name], data[y_name]
        ax.scatter(x_values, y_values, color='gray')
        # A degree-1 polyfit returns the slope first, then the intercept.
        slope, intercept = np.polyfit(x_values, y_values, 1)
        ax.plot(x_values, slope * x_values + intercept, 'blue')
        ax.set_xlabel(x_name)
        ax.set_ylabel(y_name)
plt.show()

* There are relationships between pairs of numeric features
* So all numeric features might be related to Type
* What about the categorical features?

In [None]:
# Count how often each Color label occurs, to spot inconsistent spellings.
data.Color.value_counts()

* There are duplicate colors that differ only in spelling, such as 'Blue-white' and 'Blue White'
* Convert the labels to lowercase and remove non-word characters

In [None]:
# Normalise the Color labels: lowercase them and strip every non-word
# character (spaces, hyphens, ...), so spelling variants collapse into one label.
# The result is assigned back to the column explicitly. The previous
# `data.Color.replace(..., inplace=True)` was a chained in-place update, which
# no longer modifies `data` once pandas copy-on-write is active, and its
# non-raw '\W' pattern is an invalid string escape on recent Python versions.
data['Color'] = data['Color'].str.lower().str.replace(r'\W', '', regex=True)
data.Color.value_counts()

In [None]:
# Two stacked heatmaps of co-occurrence counts:
#   top    - Spectral_Class (rows) against Type (columns)
#   bottom - Color (rows) against Spectral_Class (columns)
# Combinations that never occur come out of the pivot as NaN and are masked.
fig, (ax_top, ax_bottom) = plt.subplots(2, 1, figsize=(7, 10))

panels = [
    ('Spectral_Class', 'Type', 'Blues', ax_top),
    ('Color', 'Spectral_Class', 'Greens', ax_bottom),
]
for row_key, col_key, palette, ax in panels:
    counts = data.pivot_table(index=row_key, columns=col_key, aggfunc='size')
    sns.heatmap(counts, annot=True, fmt='g', cmap=palette,
                mask=counts.isnull(), ax=ax)

plt.show()

* We can see that the data form clear groups
* So Color and Spectral_Class might be related to Type
* No need to drop any features!

# Data Preparation

* Categorical Encoding

In [None]:
# One-hot encode the categorical feature columns (the Type label is left out).
# The prefixes are applied to the encoded columns in frame order - presumably
# 'c' for Color and 'S' for Spectral_Class; confirm against the CSV column order.
feature_frame = data.drop(columns='Type')
data_enc = pd.get_dummies(feature_frame, prefix=['c', 'S'])
data_enc.head()

* Normalization
> $X = \dfrac{X_0 - X_{min}}{X_{max} - X_{min}}$

In [None]:
# Min-max scale every column to the [0, 1] range:
#   X = (X0 - Xmin) / (Xmax - Xmin)
# Cast to float first: pandas >= 2.0 emits boolean dummy columns, and boolean
# arrays do not support the `-` operator.
# The bounds are stored as col_min / col_max so that the Python builtins
# min() and max() are not shadowed for the rest of the notebook.
# NOTE(review): the bounds are computed on the full dataset, i.e. before the
# train/test split, so test rows influence the scaling.
data_float = data_enc.astype(float)
col_min = data_float.min()
col_max = data_float.max()
data_norm = (data_float - col_min) / (col_max - col_min)
data_norm.head()

In [None]:
# Summary statistics of the scaled frame (a quick check of the value ranges).
data_norm.describe()

* Train/Test set split

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle.
# train_test_split returns its outputs in the order
# (X_train, X_test, y_train, y_test), so they are unpacked under those names.
X_train, X_test, y_train, y_test = train_test_split(
    data_norm, data.Type, test_size=0.3, random_state=42
)
# Legacy names, bound exactly as before because the later cells read them.
# Beware that they are misleading: train_y holds the TEST features and
# test_x holds the TRAINING labels.
train_x, train_y, test_x, test_y = X_train, X_test, y_train, y_test

In [None]:
# Convert the split outputs to NumPy arrays.
# NOTE: train_test_split returns (X_train, X_test, y_train, y_test), and the
# split cell unpacks that as (train_x, train_y, test_x, test_y). The names
# therefore do not match their contents:
#   train_xa -> training features     test_xa -> training labels
#   train_ya -> test features         test_ya -> test labels
train_xa = np.array(train_x)
test_xa = np.array(test_x)
train_ya = np.array(train_y)
test_ya = np.array(test_y)
# Print the shape of each of the four arrays. The last print used to repeat
# test_xa, so test_ya was never shown.
for split_array in (train_xa, test_xa, train_ya, test_ya):
    print(split_array.shape)

# Model Training

* Using Gaussian Naive Bayes

In [None]:
# Baseline classifier: Gaussian Naive Bayes with default settings. It is stored
# together with the text label that modelEvaluate shows in the plot title.
baseline_nb = GaussianNB()
model_g = [baseline_nb, 'GaussianNB()']
# Despite its name, test_xa holds the training labels (see the unpacking order
# of the train/test split), so this fits on training features and labels.
baseline_nb.fit(train_xa, test_xa)

# Model Evaluation

* Using a confusion matrix to evaluate the model

In [None]:
def modelEvaluate(model,y_train,y_test):
    """Plot a confusion matrix and print a classification report for a fitted model.

    Parameters
    ----------
    model : sequence
        ``model[0]`` is the fitted estimator (its ``predict`` method is
        called); ``model[1]`` is the text shown in the plot title.
    y_train : array-like
        Feature matrix to predict on. Despite its name it is passed straight
        to ``predict``, so it must contain features, not labels.
    y_test : array-like
        True labels for the rows of ``y_train``.

    Returns
    -------
    None
        The figure is shown and the report is printed as side effects.
    """
    y_pred = model[0].predict(y_train)
    conf = confusion_matrix(y_test, y_pred)

    plt.figure(figsize=(5,5))
    # fmt='d' prints the integer counts in full instead of the default
    # two-significant-digit format.
    sns.heatmap(conf, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix\n-- {model[1]} --')
    # confusion_matrix(y_true, y_pred) puts the true labels on the rows and the
    # predictions on the columns, so the y axis is "Actual" and the x axis is
    # "Prediction" (the labels were swapped before).
    plt.ylabel('Actual')
    plt.xlabel('Prediction')
    plt.show()
    print(classification_report(y_test, y_pred))

In [None]:
# Evaluate the baseline on the hold-out split. Despite its name, train_ya holds
# the test features (see the unpacking order of the split); test_ya holds the
# test labels.
modelEvaluate(model_g,train_ya,test_ya)

# Model Tuning

* Adjust var_smoothing

In [None]:
# Tuned variant: the same classifier with var_smoothing raised to 0.01, stored
# together with the text label that modelEvaluate shows in the plot title.
tuned_nb = GaussianNB(var_smoothing=0.01)
model_tune = [tuned_nb, 'GaussianNB(var_smoothing=0.01)']
# Despite its name, test_xa holds the training labels (see the unpacking order
# of the train/test split), so this fits on training features and labels.
tuned_nb.fit(train_xa, test_xa)

In [None]:
# Evaluate the tuned model on the same hold-out split. Despite its name,
# train_ya holds the test features; test_ya holds the test labels.
modelEvaluate(model_tune,train_ya,test_ya)

# Conclusion

* GaussianNB(var_smoothing=0.01)
* Accuracy = 99%