In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing the Required Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import warnings
# NOTE(review): this suppresses every warning for the rest of the notebook,
# so any convergence, data-conversion or deprecation warning raised by the
# libraries used below will not be shown.
warnings.filterwarnings("ignore")

# Read the star catalogue into a DataFrame and preview its first rows.
STARS_CSV_PATH = '/kaggle/input/star-type-classification/Stars.csv'
data = pd.read_csv(STARS_CSV_PATH)
data.head(n=5)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# Summary of the frame: column names, dtypes and non-null counts.
data.info()

## Missing Value Analysis

In [None]:
# Number of missing entries in each column.
data.isnull().sum()

#### No Missing Values in the Data

## Duplicates Analysis

In [None]:
# Rows that repeat an earlier row exactly; an empty frame means no duplicates.
data[data.duplicated()]

#### No Duplicate Values present

## Feature Analysis

In [None]:
# Box plots of the four numeric features, one panel per feature.
# (column in `data`, panel title, box fill colour)
box_specs = [
    ('Temperature', 'Temperature', 'tab:blue'),
    ('R', 'Radius', 'tab:orange'),
    ('A_M', 'Absolute Magnitude', 'tab:green'),
    ('L', 'Luminosity', 'tab:red'),
]

fig, axs = plt.subplots(2, 2, figsize=(15, 15))
for ax, (column, title, color) in zip(axs.flat, box_specs):
    # The second positional argument of boxplot() is `notch`, not a colour,
    # so the colour is applied to the box patch explicitly instead.
    parts = ax.boxplot(data[column], patch_artist=True)
    for box in parts['boxes']:
        box.set_facecolor(color)
    ax.set_title(title)
    ax.set_ylabel(title)
    # A single box per panel: the default x tick ("1") carries no information.
    ax.set_xticks([])

# The panels do not share axes and each feature has its own scale, so every
# panel keeps its own y tick labels (label_outer() would hide them on the
# right-hand panels).

## Star Color Analysis

In [None]:
# Number of stars per colour label, most frequent first.
color_counts = data['Color'].value_counts()

plt.figure(figsize=(8, 6))
# x/y are passed by keyword because seaborn >= 0.12 rejects them positionally.
# The Series' values/index are used directly so the plot does not depend on
# the column name value_counts() produces ('Color' before pandas 2.0,
# 'count' from 2.0 on).
sns.barplot(x=color_counts.values, y=color_counts.index, palette='Spectral')
plt.xlabel('Number of stars')
plt.ylabel('Color')
plt.title("Star Color Analysis")

#### 112 stars have Red color & 56 stars have Blue color

## Star Spectral Class Analysis

In [None]:
# Number of stars per spectral class, most frequent first.
spectral_counts = data['Spectral_Class'].value_counts()

plt.figure(figsize=(8, 6))
# x/y are passed by keyword because seaborn >= 0.12 rejects them positionally.
# The Series' values/index are used directly so the plot does not depend on
# the column name value_counts() produces ('Spectral_Class' before pandas 2.0,
# 'count' from 2.0 on).
sns.barplot(x=spectral_counts.values, y=spectral_counts.index, palette='rainbow')
plt.xlabel('Number of stars')
plt.ylabel('Spectral class')
plt.title("Star Spectral Class Analysis")

#### 111 Stars belong to "M" Spectral Class, making it the most dominant class in the sample

## Star Type Analysis

In [None]:
# Number of stars per target class.
type_counts = data['Type'].value_counts()

plt.figure(figsize=(10, 8))
# Use the Series directly: wrapping it in a DataFrame and indexing by 'Type'
# fails on pandas >= 2.0, where the counts column is named 'count'.
plt.pie(type_counts.values, labels=type_counts.index, autopct='%1.1f%%')
plt.title("Percentage Distribution of Star Type")

#### It is seen that the data comprises an equal distribution of Star Types

## Correlation Analysis

In [None]:
# Pairwise correlation of the numeric columns only. The non-numeric columns
# are excluded explicitly because DataFrame.corr() raises on them from
# pandas 2.0 on (older versions dropped them silently).
matrix = data.select_dtypes(include='number').corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias was removed in
# NumPy 1.24.
mask = np.triu(np.ones(matrix.shape, dtype=bool))

plt.figure(figsize=(11, 6))
sns.heatmap(matrix, annot=True, cmap='viridis', annot_kws={'size': 10}, mask=mask)
plt.title("Correlation Analysis")
plt.show()

- There is a Moderate Positive Correlation between Luminosity-Temperature and Star Type-Temperature
- Moderately High Positive Correlation is seen between Luminosity-Radius, Luminosity-Star Type & Radius-Star Type
- Moderately High Negative Correlation is seen between Luminosity-Magnitude & Radius-Magnitude
- Strong Negative Correlation is seen between Magnitude & Star Type

## Star Classification Analysis

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the two categorical columns with integer codes. Each column gets
# its own fitted encoder so the code -> label mapping of both columns stays
# available (via `encoders[column].classes_` / `inverse_transform`).
encoders = {}
for column in ('Spectral_Class', 'Color'):
    encoders[column] = LabelEncoder()
    data[column] = encoders[column].fit_transform(data[column])

# Kept for backward compatibility: `x1` used to be the single shared encoder,
# which was last fitted on 'Color'.
x1 = encoders['Color']

# Target as a 1-D Series (not a one-column DataFrame) so the estimators
# receive the 1-D y they expect.
Y = data['Type']
X = data.drop(columns=['Type'])

# 80/20 split, stratified on the target so every star type keeps the same
# share in both partitions; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0, stratify=Y
)

### Logistic Regression

In [None]:
# Logistic-regression baseline. The model has its own name so that `LogReg`
# only ever refers to the ROC AUC score assigned in the next cell.
logreg_model = LogisticRegression()
# np.ravel yields a 1-D target whether y_train is a one-column DataFrame or a
# Series.
logreg_model.fit(x_train, np.ravel(y_train))
# NOTE(review): the features are not scaled and warnings are silenced at
# import time, so a convergence warning from this fit would go unnoticed;
# worth confirming the solver converges.

y_pred = logreg_model.predict(x_test)
# accuracy_score takes (y_true, y_pred); the value is the same either way, but
# this order matches the other model cells.
print("Accuracy Score: ", metrics.accuracy_score(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: ", cm, sep='\n')

In [None]:
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    """Return a one-vs-rest ROC AUC for a multiclass problem.

    The true labels are one-hot encoded with a ``LabelBinarizer`` fitted on
    ``y_test`` and scored against ``y_pred`` with ``roc_auc_score``.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Either predicted class labels (one per sample), or, for more than two
        classes, a 2-D array of per-class scores/probabilities with one column
        per class in sorted label order (the order of ``lb.classes_``).
    average : str, default "macro"
        Averaging mode forwarded to ``roc_auc_score``.

    Returns
    -------
    float
        The averaged ROC AUC.

    Notes
    -----
    When ``y_pred`` holds hard labels, every sample's score is 0 or 1, so the
    ROC curve of each class has a single operating point. Passing class
    probabilities (e.g. from ``predict_proba``) gives a threshold-independent
    AUC.
    """
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_true_bin = lb.transform(y_test)

    scores = np.asarray(y_pred)
    n_classes = len(lb.classes_)
    if scores.ndim == 2 and n_classes > 2 and scores.shape[1] == n_classes:
        # Already one score column per class: use it as-is.
        y_score = scores
    else:
        # Hard labels: one-hot encode them with the same binarizer.
        y_score = lb.transform(y_pred)
    return roc_auc_score(y_true_bin, y_score, average=average)


# ROC AUC of the logistic-regression predictions. This uses whatever `y_pred`
# currently holds, so it must run right after the logistic-regression cell.
# From here on `LogReg` is the score; the model-comparison cell reads it.
LogReg= multiclass_roc_auc_score(y_test,y_pred)
LogReg

### KNN 

In [None]:
# k-nearest-neighbours classifier with k = 5.
knn = KNeighborsClassifier(n_neighbors=5)
# np.ravel yields a 1-D target whether y_train is a one-column DataFrame or a
# Series.
knn.fit(x_train, np.ravel(y_train))

y_pred = knn.predict(x_test)
# accuracy_score takes (y_true, y_pred); the value is the same either way, but
# this order matches the other model cells.
print("Accuracy Score: ", metrics.accuracy_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: ", cm, sep='\n')

In [None]:
# ROC AUC of the KNN predictions. This uses whatever `y_pred` currently holds,
# so it must run right after the KNN cell. `KNN` is read by the
# model-comparison cell.
KNN=multiclass_roc_auc_score(y_test,y_pred)
KNN

### Decision Tree

In [None]:
# Decision tree split on information gain (entropy). The seed is fixed, as for
# the other seeded models in this notebook, so a re-run reproduces the same
# tree and score.
dtc = DecisionTreeClassifier(criterion="entropy", random_state=0)
# np.ravel yields a 1-D target whether y_train is a one-column DataFrame or a
# Series.
dtc.fit(x_train, np.ravel(y_train))

y_pred = dtc.predict(x_test)
# accuracy_score takes (y_true, y_pred); the value is the same either way, but
# this order matches the other model cells.
print("Accuracy Score: ", metrics.accuracy_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: ", cm, sep='\n')

In [None]:
# ROC AUC of the decision-tree predictions. This uses whatever `y_pred`
# currently holds, so it must run right after the decision-tree cell. `DT` is
# read by the model-comparison cell.
DT=multiclass_roc_auc_score(y_test,y_pred)
DT

### Random Forest

In [None]:
# Random forest of 100 trees with a fixed seed; fit() returns the estimator.
rf = RandomForestClassifier(random_state=0, n_estimators=100).fit(x_train, y_train)
y_pred = rf.predict(x_test)

rf_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy Score RandomForest: ", rf_accuracy)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: ", cm, sep='\n')

In [None]:
# ROC AUC of the random-forest predictions. This uses whatever `y_pred`
# currently holds, so it must run right after the random-forest cell. `RF` is
# read by the model-comparison cell.
RF= multiclass_roc_auc_score(y_test,y_pred)
RF

### Ada Boost Classifier

In [None]:
# AdaBoost ensemble of 200 estimators with a fixed seed.
ada = AdaBoostClassifier(random_state=0, n_estimators=200)
ada.fit(x_train, y_train)
y_pred = ada.predict(x_test)

ada_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy Score of AdaBoost Classifier: ", ada_accuracy)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: ", cm, sep='\n')

In [None]:
# ROC AUC of the AdaBoost predictions. This uses whatever `y_pred` currently
# holds, so it must run right after the AdaBoost cell. `AB` is read by the
# model-comparison cell.
AB= multiclass_roc_auc_score(y_test,y_pred)
AB

### Gradient Boost Classifier

In [None]:
# Gradient boosting: 200 depth-2 trees with a fixed seed.
gradient = GradientBoostingClassifier(max_depth=2, n_estimators=200, random_state=0)
gradient.fit(x_train, y_train)
y_pred = gradient.predict(x_test)

gradient_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy Score of GradientBoost Classifier: ", gradient_accuracy)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: ", cm, sep='\n')

In [None]:
# ROC AUC of the gradient-boosting predictions. This uses whatever `y_pred`
# currently holds, so it must run right after the gradient-boosting cell. `GB`
# is read by the model-comparison cell.
GB= multiclass_roc_auc_score(y_test,y_pred)
GB

### Model Comparison

In [None]:
# Collect the ROC AUC of every model into one table, indexed by model name.
model_names = [
    'Logistic Regression',
    'KNN Classifier',
    'Decision Tree',
    'Random Forest',
    'Ada-Boost',
    'Gradient Boost',
]
mc = pd.DataFrame({'ROC_AUC': [LogReg, KNN, DT, RF, AB, GB]}, index=model_names)
# A bare expression is only rendered when it is the last line of a cell, so
# the table is shown explicitly (display() is provided by the notebook kernel).
display(mc)

plt.figure(figsize=(11, 6))
# x/y are passed by keyword because seaborn >= 0.12 rejects them positionally.
sns.barplot(x=mc.index, y=mc['ROC_AUC'], palette='rainbow')
plt.xlabel('Model')
plt.ylabel('ROC_AUC')
plt.title('ML Model Comparison')

### It is seen that Random Forest, Decision Tree & Gradient Boost Classifiers give the best Classification Performance