In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This is a dataset consisting of several features of stars.

Some of them are:

    Absolute Temperature (in K)
    Relative Luminosity (L/Lo)
    Relative Radius (R/Ro)
    Absolute Magnitude (Mv)
    Star Color (White, Red, Blue, Yellow, Yellow-Orange, etc.)
    Spectral Class (O, B, A, F, G, K, M)
    Star Type **(Red Dwarf, Brown Dwarf, White Dwarf, Main Sequence , SuperGiants, HyperGiants)**

Lo = 3.828 x 10^26 Watts (Avg Luminosity of Sun)
Ro = 6.9551 x 10^8 m (Avg Radius of Sun)

1. Brown Dwarf -> Star Type = 0
2. Red Dwarf -> Star Type = 1
3. White Dwarf-> Star Type = 2
4. Main Sequence -> Star Type = 3
5. Supergiant -> Star Type = 4
6. Hypergiant -> Star Type = 5


In [None]:
#importing visualization, modelling and utility libraries
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Load the star dataset from the Kaggle input directory into a DataFrame,
# then preview its first rows.
df=pd.read_csv('/kaggle/input/star-dataset/6 class csv.csv')
df.head()

In [None]:
df.shape # dimensions of the dataset as (rows, columns)

In [None]:
df.info() # per-column dtype and non-null count

In [None]:
# Fraction (0-1, not a percentage) of missing values in each column:
# the mean of a boolean mask is the share of True entries.
df.isna().mean()

Below is the Hertzsprung-Russell diagram.

The images below show that stars follow a characteristic pattern,
called the Hertzsprung-Russell diagram, or simply the HR diagram.

![](https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F3791628%2Ffe9436bf4e2d23b5b18fb3db1f1fcbcb%2FHRDiagram.png?generation=1597348809674507&alt=media)
![](https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F3791628%2F9fc57334a9b9fafbc71aacdd6e5cd69c%2F310px-Hertzsprung-Russel_StarData.png?generation=1597349661801284&alt=media)
![](https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F3791628%2F14338bbebf77d18e1faef582bccdbdd6%2Fhr.jpg?generation=1597349509841965&alt=media)

Let's visualize HR-diagram with our data

In [None]:
# Hertzsprung-Russell style scatter plots: absolute magnitude against temperature,
# coloured by star type (left panel) and by spectral class (right panel).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(13, 9))
for axis, hue_col in zip(ax, ['Star type', 'Spectral Class']):
    sns.scatterplot(x='Temperature (K)', y='Absolute magnitude(Mv)', data=df,
                    hue=hue_col, ax=axis, palette='dark')
    # HR-diagram convention: hotter stars on the left and brighter stars
    # (numerically smaller magnitudes) at the top, so both axes are reversed.
    axis.invert_xaxis()
    axis.invert_yaxis()
    axis.set_title(f'Absolute magnitude vs temperature by {hue_col}')
plt.tight_layout()
plt.show()

The plots above resemble the HR diagram.

In [None]:
# Luminosity against temperature, coloured by star type (left panel) and by
# spectral class (right panel).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(13, 9))
for axis, hue_col in zip(ax, ['Star type', 'Spectral Class']):
    sns.scatterplot(x='Temperature (K)', y='Luminosity(L/Lo)', data=df,
                    hue=hue_col, ax=axis, palette='bright')
    # A logarithmic luminosity axis keeps faint and very bright stars readable on
    # the same panel (assumes all luminosities are strictly positive - TODO confirm).
    axis.set_yscale('log')
    # HR-diagram convention: temperature decreases from left to right.
    axis.invert_xaxis()
    axis.set_title(f'Luminosity vs temperature by {hue_col}')
plt.tight_layout()
plt.show()

In [None]:
#splitting dataframe into categorical and numeric features to perform preprocessing
# Separate the columns by dtype so that each group can be preprocessed on its own:
# df_num holds the numeric columns, df_cat everything that is not numeric.
df_num = df.select_dtypes(include=np.number)
df_cat = df.select_dtypes(exclude=np.number)

In [None]:
# Preview the first two rows of the non-numeric columns.
df_cat.head(2)

In [None]:
# Preview the first two rows of the numeric columns.
df_num.head(2)

In [None]:
#visualizing outliers
fig,ax=plt.subplots(len(df_num.columns),figsize=(16,10))
for i in range(len(df_num.columns)):
    sns.boxplot(x=df_num.iloc[:,i],ax=ax[i],hue=df_num['Star type'])
    plt.tight_layout()
plt.show()

Since these outliers are natural variations, it is not necessary to remove them.

In [None]:
# checking for class imbalance: number of rows per star type
df['Star type'].value_counts()

The classes are equally represented, so we have a balanced dataset.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

From the above table, we can see that the value ranges of the features vary drastically, hence scaling is required.

In [None]:
# StandardScaler standardizes each column to zero mean and unit variance
ss=StandardScaler()

In [None]:
# Standardize the numeric feature columns. The target 'Star type' is a class id,
# not a measurement, so it is carried over unscaled instead of being fed to the scaler.
feature_cols = df_num.columns.drop('Star type')
scaled_num = pd.DataFrame(
    ss.fit_transform(df_num[feature_cols]),
    columns=feature_cols,
    index=df_num.index,  # keep the original row labels so later concatenation aligns
)
scaled_num['Star type'] = df_num['Star type']
# NOTE(review): the scaler is fitted on every row before the train/test split is made,
# so test-set statistics influence the scaling; consider fitting on the training rows only.
scaled_num.head()

In [None]:
# One-hot encode the categorical columns; drop_first=True drops the first level of
# each column so the remaining dummy columns are not redundant.
encoded_cat=pd.get_dummies(df_cat,drop_first=True)
encoded_cat.head()

In [None]:
# Combine the encoded categorical columns and the scaled numeric columns side by side
# (axis=1 concatenates column-wise, aligning rows on the index).
df_mod=pd.concat([encoded_cat,scaled_num],axis=1)
df_mod.head()

In [None]:
# Feature matrix: every modelling column except the target.
X = df_mod.drop(columns='Star type')
# Target vector: read from the original df so the labels are guaranteed to be
# the raw, unscaled class ids.
y = df['Star type']

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=6,
)

In [None]:
# k-nearest-neighbours classifier that votes among the 3 closest training samples
kn3=KNeighborsClassifier(n_neighbors=3)

In [None]:
# Fit on the training split; fit() returns the estimator itself, so kn3_model
# refers to the same (now fitted) object as kn3.
kn3_model=kn3.fit(X_train,y_train)

In [None]:
# Mean accuracy of the fitted model on the training split and on the held-out test split
train_score = kn3_model.score(X_train, y_train)
test_score = kn3_model.score(X_test, y_test)

print('Train_score:', train_score)
print('Test_score:', test_score)

The training and test scores are very close to each other, which implies that the model also performs well on unseen data.

## Lets check with classification metrics

In [None]:
# Per-class precision, recall and F1 on the training split ...
train_pred = kn3_model.predict(X_train)
print('Training set:')
print(classification_report(y_train, train_pred))

# ... and on the held-out test split: only this report measures how well the
# model generalises to data it has not seen during fitting.
test_pred = kn3_model.predict(X_test)
print('Test set:')
print(classification_report(y_test, test_pred))

The weighted-average F1 score on the training data is 0.98, which indicates that our model classifies about 98% of the training samples correctly.

## Cross Validating the model

In [None]:
# Five shuffled folds with a fixed seed so the fold assignment is reproducible
k = KFold(
    n_splits=5,
    shuffle=True,
    random_state=48,
)
# Weighted F1 of the 3-neighbour classifier on each of the five folds of the training split
scores = cross_val_score(
    kn3,
    X_train,
    y_train,
    cv=k,
    scoring='f1_weighted',
)

In [None]:
# One weighted-F1 value per cross-validation fold
scores

In [None]:
# Summarise the fold scores: the "bias error" is the shortfall of the mean score
# from 1, the "variance error" is the spread of the scores relative to their mean.
cv_mean = np.mean(scores)
cv_std = np.std(scores)
print('Bias error:', 1 - cv_mean)
print('Variance error:', cv_std / cv_mean)

The bias and variance errors are both very low, therefore there is no overfitting or underfitting in the model.

## GridSearchCV

Let's find the best value of n_neighbors using GRIDSEARCHCV

In [None]:
params={'n_neighbors':range(2,8)}# candidate n_neighbors values 2..7 (the upper bound 8 is exclusive)

In [None]:
# Unconfigured estimator; the grid search supplies n_neighbors from `params`
knn = KNeighborsClassifier()
# Exhaustive search over the candidates, scored by weighted F1 with 5-fold cross-validation
grd = GridSearchCV(
    knn,
    param_grid=params,
    cv=5,
    scoring='f1_weighted',
)

In [None]:
# fitting the grid search on the training set; fit() returns the search object itself
grid_model=grd.fit(X_train,y_train)

In [None]:
# best n_neighbors found by the search and its mean cross-validated score
print('Best n_neighbors:',grid_model.best_params_)
print('Best Score:',grid_model.best_score_)

The best value of 'n_neighbors' is found to be 2, with an 'f1-weighted' score of 0.97, which is more or less equal to that of our kn3_model. Hence I'll stick with n_neighbors=3.

## Conclusion

The KNN model developed has 98% accuracy in classifying a star. One can also try Random Forest or XGBoost to get better accuracy for the model.