<div width="100%">
    <img width="100%" src="https://storage.googleapis.com/kaggle-datasets-images/1247161/2080258/f00c5de7e83ea7287bc77aeb969b79ea/dataset-cover.jpg" />
</div>

In [None]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt

from math import sin
from math import pi
from numpy import arange
from numpy import vstack
from numpy import argmax
from numpy import asarray
from numpy.random import normal
from numpy.random import random
from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor
from warnings import catch_warnings
from warnings import simplefilter
from matplotlib import pyplot

<h1 id="dataset" style="color:#463833; background:white; border:0.5px dotted #3490b8;"> 
    <center>Dataset
        <a class="anchor-link" href="#dataset" target="_self">¶</a>
    </center>
</h1>

## Load Dataset

In [None]:
# Relative location of the Stars CSV file read below.
path = "../input/star-type-classification/Stars.csv"
df = pd.read_csv(filepath_or_buffer=path)

# Show the first rows as a quick check of the loaded frame.
df.head()

## Categorical encoding

In [None]:
# Index -> label tables, numbered in the order the labels are returned by unique().
idx_to_colors = dict(enumerate(df['Color'].unique()))
idx_to_spectral = dict(enumerate(df['Spectral_Class'].unique()))

# Label -> index tables, obtained by inverting the tables above.
colors_to_idx = {label: idx for idx, label in idx_to_colors.items()}
spectral_to_idx = {label: idx for idx, label in idx_to_spectral.items()}

In [None]:
# Encode both categorical columns as their integer codes.
# The mapped column is assigned back to the frame instead of calling
# `replace(..., inplace=True)` on the `df[...]` selection: that form is a
# chained in-place update, which pandas warns about and which leaves `df`
# unchanged when Copy-on-Write is active.
# Every label is a key of its lookup table (both were built from unique()),
# so the mapping covers all rows of the freshly loaded frame.
df['Color'] = df['Color'].map(colors_to_idx)
df['Spectral_Class'] = df['Spectral_Class'].map(spectral_to_idx)

## Describe columns

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().transpose()

## Features and labels

In [None]:
# Target column: the star type.
labels = df['Type']
# Model inputs: every remaining column.
features = df.drop(columns='Type')

## Min-Max Scaling

In [None]:
# Learn the per-column minimum and maximum, then rescale the features with them.
mms = MinMaxScaler()
features = mms.fit(features).transform(features)

## Dimension reduction

In [None]:
# Project the scaled features onto their first two principal components.
# The 'arpack' solver starts from a random vector, so the seed is pinned to
# make the projection reproducible across kernel restarts.
pca = PCA(n_components=2, svd_solver='arpack', random_state=0)
X_tr = pca.fit_transform(features)

In [None]:
# Summary statistics of the scaled feature matrix.
pd.DataFrame(data=features).describe()

<h1 id="categorize" style="color:#463833; background:white; border:0.5px dotted #3490b8;"> 
    <center>Categorize Stars using PCA
        <a class="anchor-link" href="#categorize" target="_self">¶</a>
    </center>
</h1>

In [None]:
plt.figure(figsize=(14,8))

colors = ['blue', 'red', 'green', 'brown', 'cyan', 'purple']

# One scatter series per star type code: colors[i] is used for rows with Type == i.
for i, color in enumerate(colors):
    # A boolean mask selects rows by position, so it is computed once per type
    # and does not rely on df having a default 0..n-1 index.
    is_type = (df['Type'] == i).to_numpy()
    plt.scatter(X_tr[is_type, 0], X_tr[is_type, 1],
                color=color, marker='s', label=f'Type {i}')

plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.legend()
plt.show()

<h1 id="blue" style="color:#463833; background:white; border:0.5px dotted #3490b8;"> 
    <center>Blue White Star
        <a class="anchor-link" href="#blue" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Select the blue white star
# Select the blue white star
# NOTE(review): this integer code comes from the *Color* encoding
# (colors_to_idx) but is compared against the 'Type' column below, so the rows
# selected are those whose Type equals the code of 'Blue white'. Confirm
# whether filtering on df['Color'] was intended; the selection itself is left
# unchanged here so downstream results stay the same.
blue_white_idx = colors_to_idx['Blue white']  # renamed from `id`, which shadowed the builtin

# Build the row mask once and reuse it for both principal components.
selected = (df['Type'] == blue_white_idx).to_numpy()
X_orig, y_orig = X_tr[selected, 0], X_tr[selected, 1]

plt.figure(figsize=(14,8))
plt.plot(X_orig, y_orig, 'bo')

In [None]:
# Turn the two 1-D coordinate arrays into single-column 2-D arrays.
X = X_orig.reshape(-1, 1)
y = y_orig.reshape(-1, 1)

<h1 id="target" style="color:#463833; background:white; border:0.5px dotted #3490b8;"> 
    <center>Polynomial Regression
        <a class="anchor-link" href="#target" target="_self">¶</a>
    </center>
</h1>

<h3>Goal: Use polynomial regression to set up the target function for Gaussian optimization.</h3>

In [None]:
# Polynomial expansion of X followed by an ordinary least-squares fit to y.
degree = 16
polyreg = make_pipeline(
    PolynomialFeatures(degree=degree),
    LinearRegression(),
)
polyreg.fit(X, y)

In [None]:
fig, ax = plt.subplots(figsize=(14, 8))
# Blue: the data points; red: the polynomial model evaluated at the same X.
ax.plot(X, y, 'bo')
ax.plot(X, polyreg.predict(X), 'ro')

<h1 id="gaussian" style="color:#463833; background:white; border:0.5px dotted #3490b8;"> 
    <center>Gaussian Base training
        <a class="anchor-link" href="#gaussian" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Fit a Gaussian process regressor (default settings) on the current points.
model = GaussianProcessRegressor().fit(X, y)

fig, ax = plt.subplots(figsize=(14, 8))
# Blue: the data points; red: the Gaussian process prediction at the same X.
ax.plot(X, y, 'bo')
ax.plot(X, model.predict(X), 'ro')

<h1 id="extra" style="color:#463833; background:white; border:0.5px dotted #3490b8;"> 
    <center>Gaussian Optimization functions
        <a class="anchor-link" href="#extra" target="_self">¶</a>
    </center>
</h1>

In [None]:
def surrogate(model, X):
    '''
        Surrogate (approximation) of the objective function.

        Calls model.predict(X, return_std=True) with all warnings
        silenced for the duration of the call and returns its result
        unchanged.
    '''
    with catch_warnings():
        # silence any warning raised while predicting
        simplefilter("ignore")
        prediction = model.predict(X, return_std=True)
    return prediction
 
def acquisition(X, Xsamples, model):
    '''
        Probability of improvement acquisition function.

        X        : points already evaluated; the best surrogate mean over
                   them is the score to beat.
        Xsamples : candidate points to score.
        model    : fitted model, queried through surrogate().

        Returns one probability per candidate: the normal CDF of
        (mean - best) / std at that candidate.
    '''
    # calculate the best surrogate score found so far
    yhat, _ = surrogate(model, X)
    # np.max reduces over every axis, so it works for 1-D and 2-D predictions
    best = np.max(yhat)
    # calculate mean and stdev via surrogate function
    mu, std = surrogate(model, Xsamples)
    # The predicted mean may come back as (n, 1) or as (n,) depending on the
    # scikit-learn version; take the first column only when one exists.
    mu = asarray(mu)
    if mu.ndim > 1:
        mu = mu[:, 0]
    # calculate the probability of improvement (1E-9 avoids division by zero)
    probs = norm.cdf((mu - best) / (std+1E-9))
    return probs
 
def opt_acquisition(X, y, model, n_samples=100, bounds=(0.0, 1.0)):
    '''
        Optimize the acquisition function by random search.

        X         : points already evaluated (passed on to acquisition()).
        y         : not used by the search; kept so existing calls still work.
        model     : fitted model passed on to acquisition().
        n_samples : number of random candidates to draw (default 100).
        bounds    : (low, high) interval the candidates are drawn from; the
                    default (0.0, 1.0) reproduces the original behaviour.

        Returns the candidate with the highest acquisition score.
    '''
    low, high = bounds
    # random search, generate random samples inside the search interval
    Xsamples = low + (high - low) * random(n_samples)
    Xsamples = Xsamples.reshape(len(Xsamples), 1)
    # calculate the acquisition function for each sample
    scores = acquisition(X, Xsamples, model)
    # locate the index of the largest scores
    ix = argmax(scores)
    return Xsamples[ix, 0]

<h1 id="training" style="color:#463833; background:white; border:0.5px dotted #3490b8;"> 
    <center>Gaussian Optimization training
        <a class="anchor-link" href="#training" target="_self">¶</a>
    </center>
</h1>

In [None]:
# Number of optimisation rounds; each round adds one sampled point.
N_ITERATIONS = 10

for i in range(N_ITERATIONS):
    # select the next point to sample
    x = opt_acquisition(X, y, model)
    # sample the point: the fitted polynomial plays the role of the objective
    actual = polyreg.predict(np.array(x).reshape(1, -1))
    # surrogate estimate at the same point
    est, _ = surrogate(model, [[x]])
    # summarize the finding; the one-element arrays are reduced to scalars
    # first, because formatting an array with '%f' relies on an implicit
    # array-to-scalar conversion that NumPy has deprecated
    print('>x=%.3f, f()=%.3f, actual=%.3f' % (x, np.ravel(est)[0], np.ravel(actual)[0]))
    # add the data to the dataset
    X = vstack((X, [[x]]))
    y = vstack((y, actual))
    # update the model
    model.fit(X, y)

<h1 id="results" style="color:#463833; background:white; border:0.5px dotted #3490b8;"> 
    <center>Gaussian Optimization results
        <a class="anchor-link" href="#results" target="_self">¶</a>
    </center>
</h1>

In [None]:
fig, ax = plt.subplots(figsize=(14, 8))
# Blue: the originally selected points; red: model predictions at every
# point now in X (the original points plus the ones added by the loop).
ax.plot(X_orig, y_orig, 'o', color='blue', label='original')
ax.plot(X, model.predict(X), 'o', color='red', label='gaussian optimized')
ax.set_xlim([-0.5, 1.0])
ax.set_ylim([-0.4, 0.4])
ax.legend()
plt.show()

<h1 id="reference" style="color:#463833; background:white; border:0.5px dotted #3490b8;"> 
    <center>Reference
        <a class="anchor-link" href="#reference" target="_self">¶</a>
    </center>
</h1>

[Gaussian Optimization - Machine Learning Mastery](https://machinelearningmastery.com/what-is-bayesian-optimization/)