# Classification of Stars using Machine Learning

## Table of Contents

* [Target Distribution](#1)
* [Numerical Features](#2)
* [Categorical Features](#3)
* [Target vs Features](#4)
* [Visualization using PCA](#5)
* [Fit Model](#6)
* [Evaluate Performance](#7)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

# PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# start H2O
import h2o
from h2o.estimators import H2ORandomForestEstimator

In [None]:
# load the star data set from the Kaggle input directory
df = pd.read_csv('../input/star-type-classification/Stars.csv')
# store the target column as a pandas categorical
df['Type'] = df['Type'].astype('category')
# show the first rows
df.head()

### Data Description:
* Temperature in Kelvin
* L : Luminosity (relative to our sun) 
* R : Radius (relative to our sun)
* A_M : Absolute Magnitude
* Target Levels:
     * Red Dwarf - 0
     * Brown Dwarf - 1
     * White Dwarf - 2
     * Main Sequence - 3
     * Super Giants - 4
     * Hyper Giants - 5


In [None]:
# structure of data frame: print the summary produced by DataFrame.info()
df.info()

<a id='1'></a>
# Target Distribution

In [None]:
# frequency of each target class, printed and drawn as a bar chart
type_counts = df['Type'].value_counts()
print(type_counts)

plt.figure(figsize=(8, 6))
type_counts.plot(kind='bar')
plt.title('Target (Type)')
plt.grid()
plt.show()

#### Nicely balanced!

<a id='2'></a>
# Numerical Features

In [None]:
# names of the numerical feature columns
features_num = ['Temperature', 'L', 'R', 'A_M']

# summary statistics for the numerical features at the listed percentiles
summary_percentiles = [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
df[features_num].describe(percentiles=summary_percentiles)

In [None]:
# per numerical feature: histogram on top, horizontal boxplot below,
# both panels sharing the same x axis
for col in features_num:
    values = df[col]
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(11, 7), sharex=True)
    ax_hist, ax_box = axes

    ax_hist.hist(values, bins=20)
    ax_hist.set_title(col)
    ax_hist.grid()

    ax_box.boxplot(values, vert=False)
    ax_box.set_title(col + ' - boxplot')
    ax_box.grid()

    plt.show()

In [None]:
# pairwise scatter plots of the numerical features with a regression fit
# (magenta line, semi-transparent points)
reg_style = {
    'line_kws': {'color': 'magenta'},
    'scatter_kws': {'alpha': 0.5},
}
sns.pairplot(df[features_num], kind='reg', plot_kws=reg_style)
plt.show()

### Correlations

In [None]:
# Pearson and Spearman correlation matrices of the numerical features
num_data = df[features_num]
corr_pearson = num_data.corr(method='pearson')
corr_spearman = num_data.corr(method='spearman')

# identical colour scale for both heatmaps so they can be compared directly
heatmap_opts = dict(annot=True, cmap='RdYlGn', vmin=-1, vmax=+1)

# left panel: Pearson, right panel: Spearman
plt.figure(figsize=(15, 5))
ax1 = plt.subplot(1, 2, 1)
sns.heatmap(corr_pearson, **heatmap_opts)
plt.title('Pearson Correlation')

ax2 = plt.subplot(1, 2, 2, sharex=ax1)
sns.heatmap(corr_spearman, **heatmap_opts)
plt.title('Spearman Correlation')
plt.show()

<a id='3'></a>
# Categorical Features

In [None]:
# names of the categorical feature columns
features_cat = [
    'Color',
    'Spectral_Class',
]

#### The feature "Color" needs some cleaning first: the same color appears under several spellings (different capitalization and separators).

In [None]:
# frequency of each raw level of Color
df['Color'].value_counts()

In [None]:
# Replace levels: map every spelling variant of Color to one canonical label.
# The result is assigned back through df['Color'] rather than via the chained
# form `df.Color.loc[mask] = value`, which pandas reports as chained
# assignment and which leaves df unchanged when Copy-on-Write is enabled.
color_fixes = {
    'Blue-white': 'Blue-White',
    'Blue White': 'Blue-White',
    'Blue white': 'Blue-White',
    'yellow-white': 'White-Yellow',
    'Yellowish White': 'White-Yellow',
    'white': 'White',
    'yellowish': 'Yellowish',
}
# no canonical label is itself a key, so one pass equals the sequential edits
df['Color'] = df['Color'].replace(color_fixes)

In [None]:
# verify the cleaning: level frequencies of Color after the replacement
df['Color'].value_counts()

In [None]:
# one bar chart of level frequencies per categorical feature
for col in features_cat:
    level_counts = df[col].value_counts()
    plt.figure(figsize=(10, 4))
    level_counts.plot(kind='bar')
    plt.title(col)
    plt.grid()
    plt.show()

### Dependencies between the two categorical features:

In [None]:
# contingency table Color x Spectral_Class, drawn as an annotated heatmap
color_by_class = pd.crosstab(df['Color'], df['Spectral_Class'])
sns.heatmap(color_by_class, annot=True, fmt='.0f', cmap='RdYlGn')
plt.show()

<a id='4'></a>
# Target vs Features

### Numerical Features

In [None]:
# one violin plot per numerical feature, with one violin per star type
for f in features_num:
    plt.figure(figsize=(10, 5))
    sns.violinplot(x=f, y='Type', data=df)
    plt.title('Distribution by Type for ' + f)
    plt.grid()
    # render each figure inside the loop, as the other plotting loops in
    # this notebook do, instead of leaving all figures open until cell end
    plt.show()

### Categorical Features

In [None]:
# per categorical feature: heatmap of the contingency table Type x feature
for col in features_cat:
    type_by_level = pd.crosstab(df['Type'], df[col])
    sns.heatmap(type_by_level, cmap='RdYlGn', annot=True)
    plt.show()

<a id='5'></a>
# Visualization using PCA

In [None]:
# reduce the numerical features to three principal components
df4pca = df[features_num]
# PCA is fitted on standardized columns
df4pca_std = StandardScaler().fit_transform(df4pca)
pc_model = PCA(n_components=3)
pc = pc_model.fit_transform(df4pca_std)
# append the component scores to the original data frame
for idx, pc_col in enumerate(['pc_1', 'pc_2', 'pc_3']):
    df[pc_col] = pc[:, idx]
# preview the extended data frame
df.head()

In [None]:
# interactive 3D scatter of the principal components, coloured by Type
# (click on a legend entry to filter for individual classes)
df['size'] = 1  # constant column referenced by the `size` argument below
scatter_opts = dict(
    x='pc_1', y='pc_2', z='pc_3',
    color='Type',
    size='size',
    size_max=10,
    opacity=0.5,
)
fig = px.scatter_3d(df, **scatter_opts)
fig.update_layout(title='PCA 3D Interactive')
fig.show()

#### The 3D plot suggests that a model should be able to separate the classes quite well.

<a id='6'></a>
# Fit Model

In [None]:
# init H2O, requesting at most 12G of memory and 4 threads
h2o.init(max_mem_size='12G', nthreads=4)

In [None]:
# convert the pandas data frame into an H2OFrame and report how long it took
t_start = time.time()
df_hex = h2o.H2OFrame(df)
t_end = time.time()
print('Elapsed time [s]: ', np.round(t_end - t_start, 2))

In [None]:
# name of the column to predict
target = 'Type'
# model inputs: the numerical features followed by the categorical ones
features = [*features_num, *features_cat]
print('Features used:', features)
# make the target a factor so the task is treated as multiclass classification
df_hex[target] = df_hex[target].asfactor()

In [None]:
# split the H2O frame into a training and a test part (fixed seed)
train_perc = 0.5  # use only 50% otherwise test set will be very small
split_parts = df_hex.split_frame(ratios=[train_perc], seed=999)
train_hex, test_hex = split_parts

#### Check target distribution in train/test set:

In [None]:
# class frequencies of the target in the training frame
train_target_df = train_hex[target].as_data_frame()
train_target_df.value_counts()

In [None]:
# class frequencies of the target in the test frame
test_target_df = test_hex[target].as_data_frame()
test_target_df.value_counts()

In [None]:
# configure the (distributed) random forest with n_cv-fold cross validation
n_cv = 5
drf_params = {
    'ntrees': 5,
    'max_depth': 20,
    'min_rows': 5,
    'nfolds': n_cv,
    'seed': 999,
}
fit_DRF = H2ORandomForestEstimator(**drf_params)

# fit on the training frame and report the elapsed wall-clock time
t_start = time.time()
fit_DRF.train(x=features, y=target, training_frame=train_hex)
t_end = time.time()
print('Elapsed time [s]: ', np.round(t_end - t_start, 2))

In [None]:
# variable importance plot of the fitted random forest
fit_DRF.varimp_plot()

In [None]:
# cross validation metrics summary (the model was trained with nfolds=n_cv)
fit_DRF.cross_validation_metrics_summary()

<a id='7'></a>
# Evaluate Performance

### Training Performance

In [None]:
# score the training frame
pred_train_hex = fit_DRF.predict(train_hex)
# put the actual label next to the predictions
pred_train_hex['target'] = train_hex[target]
# convert the H2O result to a pandas data frame and preview it
pred_train = pred_train_hex.as_data_frame()
pred_train.head()

In [None]:
# training confusion matrix: rows are actual classes, columns are predictions
conf_train = pd.crosstab(pred_train['target'], pred_train['predict'])
# annotated heatmap with integer cell labels and thin black cell borders
train_heatmap_style = dict(
    cmap='Blues',
    annot=True,
    cbar=False,
    fmt='d',
    linecolor='black',
    linewidths=0.1,
)
sns.heatmap(conf_train, **train_heatmap_style)
plt.show()

### Test Set Performance

In [None]:
# score the held-out test frame
pred_test_hex = fit_DRF.predict(test_hex)
# put the actual label next to the predictions
pred_test_hex['target'] = test_hex[target]
# convert the H2O result to a pandas data frame and preview it
pred_test = pred_test_hex.as_data_frame()
pred_test.head()

In [None]:
# test confusion matrix: rows are actual classes, columns are predictions
conf_test = pd.crosstab(pred_test['target'], pred_test['predict'])
# annotated heatmap with integer cell labels and thin black cell borders
test_heatmap_style = dict(
    cmap='Blues',
    annot=True,
    cbar=False,
    fmt='d',
    linecolor='black',
    linewidths=0.1,
)
sns.heatmap(conf_test, **test_heatmap_style)
plt.show()

### That's nice, all predictions are correct!