# **1. Preparation**

# 1.1 Import Library & Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings("ignore")
import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
# Read the star catalogue CSV from the Kaggle input directory into a DataFrame
csv_path = "/kaggle/input/star-categorization-giants-and-dwarfs/Star3642_balanced.csv"
data = pd.read_csv(csv_path)
data

# 1.2 Exploratory Analysis

Check whether the data set contains missing values or outliers, in order to decide which preprocessing steps are needed.

In [None]:
# Count missing values per column.
# DataFrame.isnull() is an alias of DataFrame.isna(), so one call covers both checks.
print(data.isna().sum())

In [None]:
# Summary statistics of the columns, used below to look for outliers
data.describe()

As the results show, there are no null or NA values, so no rows need to be removed for that reason. However, it is necessary to remove some outliers in Plx, because the standard deviation of Plx is too high.


# 1.3 Data Preprocessing

In [None]:
# Delete the data in the top 5% of e_Plx:
# find the 95th-percentile value, collect the row labels above it, then drop them
e_plx_cutoff = data['e_Plx'].quantile(q=0.95)
rows_above_cutoff = data.index[data['e_Plx'] > e_plx_cutoff]
data = data.drop(rows_above_cutoff)
data

In [None]:
# Delete irrelevant variables (removed from the DataFrame in place, one column at a time)
for unused_column in ('Vmag', 'e_Plx', 'SpType'):
    del data[unused_column]
data

In [None]:
# Draw the pairwise scatter-plot matrix of the columns in `data`, coloured by TargetClass
sns.pairplot(data=data, hue='TargetClass')
plt.show()

# **2. Build algorithm**

# 2.1 foundation function

In [None]:
# Classification function: sigmoid function
#    which is denoted: sigmoid(x)
# Learning rate update algorithm: inverse-time decay, alpha / (1 + decay_rate * epoch)
#    which is denoted: e_decay(alpha,decay_rate,epoch)
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)).

    `x` is passed straight to np.exp, so it may be a scalar or a NumPy array;
    for an array the function is applied element-wise.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def e_decay(alpha, decay_rate, epoch):
    """Return the decayed learning rate alpha / (1 + decay_rate * epoch).

    alpha:      learning rate to be decayed
    decay_rate: decay rate, chosen by the caller
    epoch:      current round number of the training loop

    Note: despite the name, this formula is an inverse-time decay,
    not an exponential one.
    """
    shrink_factor = 1 + decay_rate * epoch
    return alpha / shrink_factor

# 2.2 logistic regression algorithm

In [None]:
# Classification function: sigmoid function
# Cost function: log-likelihood (maximum likelihood estimation)
# Weight optimization algorithm: Gradient descent method
# Learning rate update algorithm: e_decay, re-applied to the current rate after every step
class LogRegression():
    """Binary logistic regression on the raw features (no intercept term).

    Attributes:
        weights: 1-D float array holding one coefficient per feature,
                 initialised to ones.
    """

    def __init__(self, numFeatures):
        # Start as float64 (matching NeuralNetwork) so the weights never begin
        # life as an integer array.
        self.weights = np.ones(numFeatures, dtype='float64')

    def train(self, x, y, alpha, steps):
        """Fit the weights with `steps` full-batch gradient updates.

        x:     2-D array, one row per sample and one column per feature
        y:     1-D array of 0/1 labels, one per row of x
        alpha: initial learning rate
        steps: number of gradient updates to perform
        """
        for step in range(steps):
            # Predicted probability for every sample under the current weights
            output = sigmoid(x.dot(self.weights))
            err = y - output
            # Gradient of the log-likelihood w.r.t. the weights is err . x
            # (summed over samples), so the update is weights += alpha * err . x
            self.weights = self.weights + alpha * err.dot(x)
            # Update learning rate.
            # NOTE(review): the already-decayed alpha is fed back into e_decay,
            # so the decay compounds from step to step rather than being
            # computed from the initial rate. Kept as-is because changing the
            # schedule would change the fitted weights.
            alpha = e_decay(alpha, decay_rate=1/(10*steps), epoch=step)

    def predict(self, x, y):
        """Classify every row of x and score the result against y.

        Returns:
            (y_pred, accuracy): y_pred is a float array of 1.0 / 0.0 labels
            (1.0 where the predicted probability is >= 0.5) and accuracy is
            the fraction of rows whose label agrees with bool(y).
        """
        numSamples = np.shape(x)[0]
        # One matrix-vector product replaces the per-row loop with np.append,
        # which re-allocated the result array on every iteration.
        is_positive = sigmoid(x.dot(self.weights)) >= 0.5
        y_pred = is_positive.astype('float64')
        correct_num = int(np.count_nonzero(is_positive == np.asarray(y).astype(bool)))
        accuracy = float(correct_num) / numSamples
        # Return prediction results and accuracy
        return y_pred, accuracy

# 2.3 deep neural network

In [None]:
class Neuron():
    """Plain container for one neuron's parameters.

    weights: the neuron's weight values, stored as given
    bias:    the neuron's bias value, stored as given
    """

    def __init__(self, weights, bias):
        self.weights, self.bias = weights, bias

In [None]:
# Used for calculating partial derivatives
def deriv_sigmoid(x):
    """Derivative of the sigmoid at x, computed as sigmoid(x) * (1 - sigmoid(x))."""
    activation = sigmoid(x)
    complement = 1 - activation
    return activation * complement

In [None]:
# Classification function: sigmoid function
# Cost function: squared error (y - y_pred)^2 --> used for calculating the partial derivatives
# Weight optimization algorithm: Gradient descent method, one update per training sample
# Learning rate update algorithm: e_decay, re-applied to the current rate after every epoch
class NeuralNetwork():
    """Feed-forward network with one hidden layer and a single sigmoid output.

    Attributes:
        h_layers: list of `numFeatures` hidden Neuron objects, each with
                  `numFeatures` weights and one bias.
        o_node:   output Neuron with `numFeatures` weights (one per hidden
                  neuron) and one bias.

    NOTE(review): every hidden neuron starts with identical weights and bias
    and is updated with the same formula, so the hidden units appear to stay
    identical to each other during training. Random initialisation would break
    that symmetry but would also change the results, so it is not done here.
    """

    def __init__(self, numFeatures):
        # Hidden layer: "numFeatures" neurons, each with "numFeatures" weights and 1 bias.
        # Every neuron gets its own weight array so in-place updates stay independent.
        self.h_layers = [Neuron(weights=np.ones(numFeatures, dtype='float64'), bias=0)
                         for _ in range(numFeatures)]
        # Output neuron: "numFeatures" weights and 1 bias
        self.o_node = Neuron(weights=np.ones(numFeatures, dtype='float64'), bias=0)

    def _forward(self, x):
        """Run one sample through the network.

        Returns (sum_h, h, sum_o, y_pred): the hidden pre-activations, hidden
        activations, output pre-activation and output activation.
        """
        sum_h = np.array([node.weights.dot(x) + node.bias for node in self.h_layers])
        h = sigmoid(sum_h)
        sum_o = self.o_node.weights.dot(h) + self.o_node.bias
        return sum_h, h, sum_o, sigmoid(sum_o)

    def feedforward(self, x):  # feed forward network
        """Return the network output (a value in (0, 1)) for one sample x."""
        return self._forward(x)[-1]

    def train(self, train_x, train_y, alpha, steps):
        """Train with per-sample gradient descent for `steps` epochs.

        train_x: iterable of feature vectors (one per sample)
        train_y: iterable of 0/1 labels, paired with train_x by position
        alpha:   initial learning rate
        steps:   number of passes over the training data
        """
        for epoch in range(steps):
            for x, y in zip(train_x, train_y):
                x = np.asarray(x, dtype='float64')
                # Calculate prediction for this training sample
                sum_h, h, sum_o, y_pred = self._forward(x)
                # Calculate partial derivatives
                d_L_d_ypred = -2 * (y - y_pred)
                d_sig_o = deriv_sigmoid(sum_o)
                # Output node
                d_ypred_d_w = h * d_sig_o
                d_ypred_d_b = d_sig_o
                # New array built from the output weights *before* they are
                # updated below, so the hidden-layer gradients use the old values.
                d_ypred_d_h = self.o_node.weights * d_sig_o
                # Hidden layer: derivative of each hidden activation w.r.t. its pre-activation
                d_h_d_sum = deriv_sigmoid(sum_h)
                # Update weights and bias: using the result of partial derivative
                scaled_err = alpha * d_L_d_ypred
                # Output node
                self.o_node.weights -= scaled_err * d_ypred_d_w
                self.o_node.bias -= scaled_err * d_ypred_d_b
                # Hidden layer: each neuron is updated through its own weight
                # array (the original sized this loop with a variable left
                # over from the forward-pass loop).
                for h_index, node in enumerate(self.h_layers):
                    node_scale = scaled_err * d_ypred_d_h[h_index]
                    node.weights -= node_scale * (x * d_h_d_sum[h_index])
                    node.bias -= node_scale * d_h_d_sum[h_index]
            # Update learning rate.
            # NOTE(review): the already-decayed alpha is fed back into e_decay,
            # so the decay compounds from epoch to epoch. Kept as-is because
            # changing the schedule would change the trained network.
            alpha = e_decay(alpha, decay_rate=1/(10*steps), epoch=epoch)

    def predict(self, test_x, test_y):
        """Classify every sample in test_x and score the result against test_y.

        Returns:
            (y_pred, accuracy): y_pred is a float array of 1.0 / 0.0 labels
            (1.0 where the network output is >= 0.5) and accuracy is the
            number of agreeing samples divided by the number of rows in test_x.
        """
        numSamples = np.shape(test_x)[0]
        correct_num = 0
        labels = []  # collected in a list; np.append would copy the array every iteration
        for x, y in zip(test_x, test_y):
            is_positive = self.feedforward(x) >= 0.5
            labels.append(1.0 if is_positive else 0.0)
            if is_positive == bool(y):
                correct_num += 1
        accuracy = float(correct_num) / numSamples
        return np.array(labels, dtype='float64'), accuracy

# **3. Train & Prediction**

# 3.1 Create Training Set & Test Set

In [None]:
# Columns fed to the models as input features
feature_columns = ['Plx', 'B-V', 'Amag']

# Train data set: a random half of the rows, drawn with a fixed seed
train_data = data.sample(frac=0.5, random_state=1999)
train_x = np.array(train_data.loc[:, feature_columns])
train_y = np.array(train_data['TargetClass'])

# Test data set: every row whose index label was not drawn for training
in_train = data.index.isin(train_data.index)
test_data = data[~in_train]
test_x = np.array(test_data.loc[:, feature_columns])
test_y = np.array(test_data['TargetClass'])

train_x, train_y

In [None]:
numFeatures = np.shape(train_x)[1]
# Min-max scaling: map every feature to [0, 1] using the minimum and range of
# the TRAINING set only, and apply the same transform to the test set.
# Work on float copies so the scaled values are never truncated.
train_x = train_x.astype('float64')
test_x = test_x.astype('float64')
feature_min = train_x.min(axis=0)
feature_range = train_x.max(axis=0) - feature_min
# A constant feature has zero range; divide by 1 instead so it scales to 0, not NaN
feature_range[feature_range == 0] = 1.0
train_x = (train_x - feature_min) / feature_range
test_x = (test_x - feature_min) / feature_range

train_x, train_y

# 3.2 Logistic Regression Prediction

In [None]:
# Fit the logistic regression on the training set and score it on the test set
log_reg = LogRegression(numFeatures)
log_reg.train(train_x, train_y, alpha=0.1, steps=10000)
y_pred, accuracy = log_reg.predict(test_x, test_y)
print('accuracy=', accuracy)

# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap
sns.set()
c_m = confusion_matrix(y_true=test_y, y_pred=y_pred, labels=[0, 1])
sns.heatmap(data=c_m, annot=True, fmt='.20g', cmap='Blues', square=True)
plt.show()

# 3.3 Deep Neural Network Prediction

In [None]:
# Train the neural network on the training set and score it on the test set
network = NeuralNetwork(numFeatures)
network.train(train_x, train_y, alpha=0.1, steps=100)
y_pred, accuracy = network.predict(test_x, test_y)
print('accuracy=', accuracy)

# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap
sns.set()
c_m = confusion_matrix(y_true=test_y, y_pred=y_pred, labels=[0, 1])
sns.heatmap(data=c_m, annot=True, fmt='.20g', cmap='Blues', square=True)
plt.show()