In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = (os.path.join(dirname, filename) for filename in filenames)
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 0 - Importing Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, jaccard_score
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix

# For further versions, it would be interesting to implement a Neural Network.
# The imports below are a reminder only and are intentionally not executed.
# (They are written as comments rather than as a bare string literal, because a
# string that is the last expression of a cell is echoed as the cell's output.)
#
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torch.nn.functional as F
# from torch.utils.data import DataLoader, TensorDataset
# import torchvision.transforms as transforms

# 1 - Importing & analyzing the dataset

First, let's load the dataset and inspect it: we will check the data types, look for missing values, and take a look at the categorical variables.

In [None]:
# Read the star catalogue CSV from the Kaggle input directory and display it.
dataset_path = '../input/star-categorization-giants-and-dwarfs/Star3642_balanced.csv'
stars_df = pd.read_csv(dataset_path)
stars_df

In [None]:
# Summary statistics of stars_df, as produced by DataFrame.describe().
stars_df.describe()

In [None]:
# Data type of each column of stars_df.
stars_df.dtypes

In [None]:
# Number of missing values per column.
# DataFrame.isnull is an alias of DataFrame.isna, so one count covers both;
# `null` and `nan` are both kept bound to that single result.
null = nan = stars_df.isna().sum()
null

In [None]:
# Distinct values of the SpType column. The call parentheses are required:
# without them the cell only shows the bound method object, not the values.
stars_df['SpType'].unique()

# 2 - Removing rows that have too much error.

In [None]:
# Keep only the rows whose e_Plx is strictly below (mean e_Plx + 0.5).
threshold = 0.5 + stars_df['e_Plx'].mean()

low_error_mask = stars_df['e_Plx'].lt(threshold)
stars_dff = stars_df.loc[low_error_mask]

In [None]:
# Display the filtered frame (rows with e_Plx below the threshold).
stars_dff

# 3 - Visualization

Let's plot some features so we can get more insights about the dataset.

In [None]:
# 2x2 grid of scatter plots: 'B-V' (top row) and 'Plx' (bottom row) against
# 'Vmag' (left column) and 'Amag' (right column), coloured by TargetClass.
fig, axes = plt.subplots(2, 2, figsize=(20, 12))

# Grid position -> (x column, y column, x-axis label). Driving the plots from
# this table keeps the four panels consistent instead of four pasted calls.
scatter_panels = {
    (0, 0): ('Vmag', 'B-V', 'Apparent Magnitude'),
    (0, 1): ('Amag', 'B-V', 'Absolute Magnitude'),
    (1, 0): ('Vmag', 'Plx', 'Apparent Magnitude'),
    (1, 1): ('Amag', 'Plx', 'Absolute Magnitude'),
}

for position, (x_col, y_col, x_label) in scatter_panels.items():
    panel = axes[position]
    sns.scatterplot(data=stars_dff, x=x_col, y=y_col, hue='TargetClass', ax=panel)
    panel.set_xlabel(x_label)

# Render the figure explicitly so the cell does not end on a Text(...) repr.
plt.show()



In [None]:
# Density of 'Amag' (left panel) and 'Vmag' (right panel), one curve per class.
# Class 0 is labelled 'Dwarf' and class 1 is labelled 'Giant'.
fig, axes = plt.subplots(1, 2, figsize=(20, 6))

class_labels = {0: 'Dwarf', 1: 'Giant'}
for panel, column in zip(axes, ('Amag', 'Vmag')):
    for class_id, class_label in class_labels.items():
        class_values = stars_dff.loc[stars_dff['TargetClass'] == class_id, column]
        sns.kdeplot(data=class_values, shade=True, ax=panel, label=class_label)
    panel.legend()

axes[0].set_xlabel('Absolute Magnitude', size=14)
axes[1].set_xlabel('Apparent Magnitude', size=14)

# 4 - Feature Engineering

## 4.1 - Why feature engineering?

If we examine the dataset, we can see that the scatter plots show an interesting division of the space. Only a small number of stars would be misclassified if we drew a simple curve to separate the two kinds of stars. The idea is to expand our feature set by, for example, squaring some columns and combining pairs of columns to obtain new ones.

## 4.2 - Working with features

In [None]:
# Add squared versions of the apparent magnitude, absolute magnitude and B-V
# columns. `assign` returns a new frame, so stars_dff itself is left unchanged.
stars_df_eng = stars_dff.assign(**{
    'Amag_SQ': stars_dff['Amag'].pow(2),
    'Vmag_SQ': stars_dff['Vmag'].pow(2),
    'B-V_SQ': stars_dff['B-V'].pow(2),
})

In [None]:
# Combine the two magnitude columns into two new features: their sum and
# their difference.
stars_df_eng['Sum_AV'] = stars_df_eng['Amag'] + stars_df_eng['Vmag']
# Sub_AV is the difference (Amag - Vmag); using `+` here would simply
# duplicate Sum_AV and add no information.
stars_df_eng['Sub_AV'] = stars_df_eng['Amag'] - stars_df_eng['Vmag']

In [None]:
# Drop the SpType column; the remaining columns form stars_numeric.
stars_numeric = stars_df_eng.drop(columns=['SpType'])
stars_numeric.head()

In [None]:
# Max-scaling (not z-score standardisation): every feature column is divided
# by its own maximum. The label column is skipped so that it stays an integer
# class id instead of being scaled and then cast back.
# NOTE(review): the maxima are computed on the full dataset, before the
# train/test split, so test rows influence the scaling of the training data.
for column in stars_numeric.columns.drop('TargetClass'):
    stars_numeric[column] = stars_numeric[column] / stars_numeric[column].max()
stars_numeric['TargetClass'] = stars_numeric['TargetClass'].astype('int64')

# Preview the scaled frame; as the last expression it is actually displayed.
stars_numeric.head()

# 5 - Splitting the dataset

In [None]:
# Hold out 20% of the rows as the test set; random_state=1 fixes the shuffle.
X = stars_numeric.drop(columns='TargetClass')
Y = stars_numeric['TargetClass']
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=1,
)

# 6 - Models Development

## 6.1 - KNN Model

In [None]:
# Try k = 1 .. n_neigh - 1 and keep the classifier with the highest accuracy
# on the test split.
# NOTE(review): k is selected on the same split that is used to report the
# accuracy, so the reported figure is optimistic; a validation split or
# cross-validation would give an unbiased estimate.
n_neigh = 15
K_best = 0
Score_best = 0
best_knn = None

for k in range(1, n_neigh):
    knn_candidate = KNeighborsClassifier(n_neighbors=k)
    knn_candidate.fit(X_train, Y_train)
    candidate_score = knn_candidate.score(X_test, Y_test)  # scored once per k
    if best_knn is None or candidate_score > Score_best:
        Score_best, K_best, best_knn = candidate_score, k, knn_candidate

# Expose the *best* model and its predictions under the names later cells
# read (KNN, Y_pred), rather than whatever the final loop iteration produced.
KNN = best_knn
Y_pred = KNN.predict(X_test)

print("The best number of neighbors is {} with a test accuracy of {}%".format(K_best, (Score_best*100)))


In [None]:
# Confusion matrix on the test split for the `KNN` estimator and `Y_pred`
# predictions left in the namespace by the tuning cell: once as a raw array
# (`matrix`) and once as a plot.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
matrix = confusion_matrix(
    Y_test,
    Y_pred,
    labels=[0, 1],
)
plot_confusion_matrix(
    KNN,
    X_test,
    Y_test,
    labels=[0, 1],
    cmap='Blues',
)

## 6.2 - Logistic Regression

In [None]:
# Fit one logistic regression per solver and keep the one with the highest
# accuracy on the test split.
# NOTE(review): the solver is selected on the same split that is used to
# report the accuracy, so the reported figure is optimistic.
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
best_solver = ''
best_score_LR = 0
best_logreg = None

for solver_name in solvers:
    logreg_candidate = LogisticRegression(solver=solver_name)
    logreg_candidate.fit(X_train, Y_train)
    score = logreg_candidate.score(X_test, Y_test)

    if best_logreg is None or score > best_score_LR:
        best_score_LR, best_solver, best_logreg = score, solver_name, logreg_candidate

# Expose the *best* model and its predictions under the names later cells
# read (LogReg, Y_pred_LR), rather than those of the last solver tried.
LogReg = best_logreg
Y_pred_LR = LogReg.predict(X_test)

print('The best solver for the Logistic Regression is {}, with a {}% of accuracy in the test set.'.format(best_solver, best_score_LR*100))


In [None]:
# Confusion matrix on the test split for the `LogReg` estimator and
# `Y_pred_LR` predictions left in the namespace by the tuning cell: once as a
# raw array (`matrix_LR`) and once as a plot.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
matrix_LR = confusion_matrix(
    Y_test,
    Y_pred_LR,
    labels=[0, 1],
)
plot_confusion_matrix(
    LogReg,
    X_test,
    Y_test,
    labels=[0, 1],
    cmap='Blues',
)

## 6.3 - Support Vector Classification

In [None]:
# Fit one SVC per kernel and keep the one with the highest accuracy on the
# test split.
# NOTE(review): the kernel is selected on the same split that is used to
# report the accuracy, so the reported figure is optimistic.
kernels = ['rbf', 'poly', 'sigmoid','linear']
best_score_SVC = 0
kernel_best = ''
best_svc = None

for kernel_name in kernels:
    svc_candidate = SVC(kernel=kernel_name, gamma='auto', random_state=1)
    svc_candidate.fit(X_train, Y_train)
    kernel_score = svc_candidate.score(X_test, Y_test)  # scored once per kernel
    if best_svc is None or kernel_score > best_score_SVC:
        best_score_SVC, kernel_best, best_svc = kernel_score, kernel_name, svc_candidate

# Expose the *best* model and its predictions under the names later cells
# read (SupVec, Y_pred_SV), rather than those of the last kernel tried.
SupVec = best_svc
Y_pred_SV = SupVec.predict(X_test)

print('The SVC performs better using a {} kernel, obtaining a {}% of accuracy'.format(kernel_best, best_score_SVC*100))

In [None]:
# Confusion matrix on the test split for the `SupVec` estimator and
# `Y_pred_SV` predictions left in the namespace by the tuning cell: once as a
# raw array (`matrix_SV`) and once as a plot.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
matrix_SV = confusion_matrix(
    Y_test,
    Y_pred_SV,
    labels=[0, 1],
)
plot_confusion_matrix(
    SupVec,
    X_test,
    Y_test,
    labels=[0, 1],
    cmap='Blues',
)