# Download the data
We followed the steps described at the following link:
https://www.kaggle.com/general/74235. 
You need to upload your `kaggle.json` file.

# Read the data
We will also import the usual libraries we need.

In [None]:
# Data-handling and plotting libraries used throughout the notebook.
import pandas as pd
import numpy as np
import seaborn as sns
#%matplotlib inline for static images in notebooks
from IPython.core.pylabtools import figsize
import matplotlib.pyplot as plt
# IPython helper: sets the default matplotlib figure size (width, height).
figsize(10, 10)

# Load the raw star data; this frame is the starting point for every cell below.
df_orginal= pd.read_csv('../input/star-dataset/6 class csv.csv')

# Basic Insight of Dataset

In [None]:
# Preview the first rows to see which columns exist and what their values look like.
df_orginal.head()

We notice that there are two string columns that we will have to convert to numerical values.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_orginal.describe()

We have only 240 rows, and it seems that the target column 'Star type' is uniformly distributed across the rows. We will check.

In [None]:
# Count how many rows belong to each 'Star type' class.
df_orginal['Star type'].value_counts()

Yes, 40 rows for each type. Next, we will check for missing data.

In [None]:
# Number of missing values in each column.
df_orginal.isna().sum()

Good news: there is no missing data.  
We will now look at the range of values in the string columns.

In [None]:
# Distinct values that occur in the 'Spectral Class' column.
set(df_orginal['Spectral Class']) 

We have five spectral classes. 

In [None]:
# Distinct raw spellings that occur in the 'Star color' column.
set(df_orginal['Star color']) 

We have five main colors, and we have some modifications to make regarding the color column.

# Data Preprocessing  
We will start by converting the string columns.  
According to the dataset page, "https://www.kaggle.com/deepu1109/star-dataset", the Spectral Class values (O, B, A, F, G, K, M) are ordered, and this order carries information about the spectrum; for example, spectral class O is close to spectral class B.  
So we will represent this column as ordinal data, as shown below:

In [None]:
# Work on an independent copy so the raw data in df_orginal stays untouched.
# A plain `df = df_orginal` only creates a second name for the same DataFrame:
# every later edit would also change the raw data, and re-running this cell
# would map the already-encoded integers to NaN.
df = df_orginal.copy()

# The spectral classes are ordered (O, B, A, F, G, K, M), so they are encoded
# as ordinal integers 0..6. Any value missing from the mapping becomes NaN.
spectral_order = {'O': 0, 'B': 1, 'A': 2, 'F': 3, 'G': 4, 'K': 5, 'M': 6}
df['Spectral Class'] = df['Spectral Class'].map(spectral_order)
df.head()

Regarding the colors, we noticed that every row consists of one of these colors or a combination of two of them: ['Blue','White','Orange','Red','Yellow']
So we will create an attribute for each of them and put 1 where the color is present.

In [None]:
# One 0/1 indicator column per base colour. The value is 1 when the
# case-insensitive pattern occurs in 'Star color'. The pattern 'Whit' matches
# both "White" and "Whitish".
color_patterns = {
    'Blue': 'Blue',
    'White': 'Whit',
    'Orange': 'Orange',
    'Red': 'Red',
    'Yellow': 'Yellow',
}
for flag_column, pattern in color_patterns.items():
    has_color = df['Star color'].str.contains(pattern, case=False)
    df[flag_column] = 0
    df.loc[has_color, flag_column] = 1

We will check by making some queries.

In [None]:
# Sanity check: rows whose raw colour is exactly 'Yellowish White'.
df.loc[df['Star color'].eq('Yellowish White')]

In [None]:
# Sanity check: rows whose raw colour is exactly 'Whitish'.
df.loc[df['Star color'].eq('Whitish')]

Now we will drop the original 'Star color' attribute.

In [None]:
# The indicator columns now carry the colour information, so the free-text
# column is removed in place.
df.drop(columns=['Star color'], inplace=True)
df.head()

We will move the target column to the last position.

In [None]:
# Rebuild the column order: every feature first, the target 'Star type' last.
cols = [name for name in df.columns if name != 'Star type']
df = df[cols + ['Star type']]
df.head()

Standardisation.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise on a separate copy. `df_n = df` would only alias `df`, so the
# unscaled frame would be overwritten together with df_n.
df_n = df.copy()

# Every column except the last one is scaled; the last column is left as is.
feature_columns = list(df_n.columns[:-1])

# Replace the feature columns wholesale instead of writing floats into the
# existing integer columns through .iloc, which pandas deprecates as an
# incompatible-dtype assignment.
df_n[feature_columns] = StandardScaler().fit_transform(df_n[feature_columns])
# NOTE(review): the scaler is fitted on all rows before any train/test split,
# so statistics of the held-out rows leak into the features. Consider fitting
# the scaler on the training rows only.
df_n.head()

Now our data is ready.

# Model Development

Let's see how the columns affect the Star type.


In [None]:
# Scatter of temperature against absolute magnitude, coloured by star type.
sns.scatterplot(data=df_n, x='Temperature (K)', y='Absolute magnitude(Mv)', hue='Star type') 

So these columns have a high impact.

In [None]:
# Scatter of spectral class against absolute magnitude, coloured by star type.
sns.scatterplot(data=df_n, x='Spectral Class', y='Absolute magnitude(Mv)', hue='Star type')

Spectral Class also has a considerable impact.  
We will split the data into train, test and validation sets (0.6, 0.2, 0.2).

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the last one; the target is 'Star type'.
X = df_n.iloc[:,:-1]
y = df_n['Star type']

# Stratify both splits on the class label. With only 240 rows and 6 classes,
# a purely random split can leave some star types under-represented in a
# subset, while stratification keeps the class proportions equal everywhere.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y)
# 0.25 x 0.8 = 0.2, so the final proportions are 0.6 / 0.2 / 0.2.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1, stratify=y_train)
len(X_train), len(X_test), len(X_val)

In [None]:
# Class balance of the training split: number of rows per star type.
y_train.value_counts()

Good, the types are distributed uniformly in the training data.

## DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Fixed seed: tree construction breaks ties between equally good splits at
# random. Without it the refit below could differ from the tree that produced
# the best score, and the results would change from run to run.
TREE_SEED = 1

accuracy_scores = []
max_depths = []

# Fit one tree per depth and score it on the X_test / y_test split.
for max_depth in range(1, 16):
    model = DecisionTreeClassifier(max_depth=max_depth, random_state=TREE_SEED)
    model.fit(X_train, y_train)

    test_prediction = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_prediction)

    max_depths.append(max_depth)
    accuracy_scores.append(test_accuracy)

ax = sns.lineplot(x = max_depths, y = accuracy_scores)
ax.set(xlabel='Decision Tree Max Depth', ylabel='Accuracy Score')

# np.argmax returns the first maximum, i.e. the shallowest of the best trees.
best_index = int(np.argmax(accuracy_scores))
best_depth = max_depths[best_index]
print(f'Best accuracy score |{max(accuracy_scores)}| achieved at max depth |{best_depth}|')

# Refit with the same depth and seed, which reproduces the best-scoring tree.
model1 = DecisionTreeClassifier(max_depth=best_depth, random_state=TREE_SEED)
model1.fit(X_train, y_train)

We got 100% accuracy at a max depth of 4.  
We will also look at the performance of other models.

## LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline with default hyperparameters.
model2 = LogisticRegression()
model2.fit(X_train, y_train)

# Score on the held-out X_test / y_test split.
test_prediction = model2.predict(X_test)
model2_acc = accuracy_score(y_test, test_prediction)

# The score comes from the test split, so the message says "test"
# (it previously said "train", which mislabelled the number).
print(f'Logistic Regression Accuracy on test: |{model2_acc}|')

## SVC

In [None]:
from sklearn.svm import SVC

# Grid search over the regularisation strength C and the polynomial degree.
# Every combination is scored on the X_test / y_test split.
params = []
accuracy_scores = []

for C in np.linspace(0.01, 5, 20):
    for degree in range(2, 10):
        model = SVC(kernel='poly', C=C, degree=degree).fit(X_train, y_train)
        test_accuracy = accuracy_score(y_test, model.predict(X_test))

        params.append({'C': C, 'degree': degree})
        accuracy_scores.append(test_accuracy)

# First combination that reaches the highest accuracy.
best_params = params[np.argmax(accuracy_scores)]
print(f'Best accuracy score |{max(accuracy_scores)}| achieved with params|{best_params}|')

# Refit a fresh classifier with the winning combination.
model3 = SVC(kernel='poly', **best_params)
model3.fit(X_train, y_train)

In [None]:
from tqdm import tqdm

# Grid search over C and gamma for the RBF kernel. Every combination is
# scored on the X_test / y_test split; tqdm shows progress over the C values.
params = []
accuracy_scores = []

for C in tqdm(np.linspace(0.01, 5, 20)):
    for gamma in np.linspace(0.001, 2, 40):
        model = SVC(kernel='rbf', C=C, gamma=gamma).fit(X_train, y_train)
        test_accuracy = accuracy_score(y_test, model.predict(X_test))

        params.append({'C': C, 'gamma': gamma})
        accuracy_scores.append(test_accuracy)

# First combination that reaches the highest accuracy.
best_params = params[np.argmax(accuracy_scores)]
print(f'Best accuracy score |{max(accuracy_scores)}| achieved with params|{best_params}|')

# Refit a fresh classifier with the winning combination.
model4 = SVC(kernel='rbf', **best_params)
model4.fit(X_train, y_train)


With the radial basis function (RBF) kernel, SVC achieved an accuracy of 0.96.

## KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Evaluate k-nearest-neighbours for K = 1..9 on the X_test / y_test split.
K_values = list(range(1, 10))
accuracy_scores = [
    accuracy_score(
        y_test,
        KNeighborsClassifier(n_neighbors=K).fit(X_train, y_train).predict(X_test),
    )
    for K in K_values
]

ax = sns.lineplot(x=K_values, y=accuracy_scores)
ax.set(xlabel='K Neigbours considered', ylabel='Accuracy Score')

# First (smallest) K that reaches the highest accuracy.
best_K = K_values[np.argmax(accuracy_scores)]
print(f'Best accuracy score |{max(accuracy_scores)}| achieved with K |{best_K}|')

# Refit a fresh classifier with the winning K.
model5 = KNeighborsClassifier(n_neighbors=best_K)
model5.fit(X_train, y_train)

# Model Evaluation 

In [None]:
# Score the five tuned models on the validation split (X_val / y_val), which
# none of the tuning loops above used.
fitted_models = (model1, model2, model3, model4, model5)

# Predictions first, in model order.
(model1_predictions, model2_predictions, model3_predictions,
 model4_predictions, model5_predictions) = [
    fitted.predict(X_val) for fitted in fitted_models
]
all_predictions = (model1_predictions, model2_predictions, model3_predictions,
                   model4_predictions, model5_predictions)

# Then one accuracy per model.
(model1_accuracy, model2_accuracy, model3_accuracy,
 model4_accuracy, model5_accuracy) = [
    accuracy_score(y_val, predicted) for predicted in all_predictions
]
all_accuracies = (model1_accuracy, model2_accuracy, model3_accuracy,
                  model4_accuracy, model5_accuracy)

# Finally report each score next to the model's repr.
for accuracy, fitted in zip(all_accuracies, fitted_models):
    print(f'Accuracy Score is |{accuracy}| for model |{fitted}|')

# Conclusion  
After good preprocessing of the data, most of the classifiers give 100% accuracy. We identified the best hyperparameters for each classifier.