### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

# Default figure size and seaborn theme used by every plot below.
plt.rcParams.update({'figure.figsize': [12, 6]})
sns.set_theme(style="darkgrid")

### Load data from dataset

In [None]:
# Read the star dataset into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='../input/star-dataset/6 class csv.csv')
df

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics of the dataset.
df.describe()

In [None]:
# Number of missing values per column.
df.isna().sum()

Okay, we don't have any missing values, which is rare.

In [None]:
# How many rows belong to each target class.
sns.countplot(data=df, x='Star type')

We have a balanced dataset, good.

In [None]:
# Pairwise relationships between the columns of the dataset.
sns.pairplot(df)

In [None]:
# Names of the int64/float64 columns, as a plain list.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
num_cols

In [None]:
# Box plots of the numeric columns, to spot outliers and scale differences.
sns.boxplot(data=df.loc[:, num_cols])

We may have outliers, but I don't want to remove them because the dataset is small and I want to see how the models perform on the data as it is.

In [None]:
# Correlation matrix of the numeric features (target excluded).
# The frame still holds the string columns 'Star color' and 'Spectral Class';
# recent pandas versions raise when corr() meets non-numeric data instead of
# silently skipping it, so restrict the frame to numeric columns first.
feature_corr = (
    df.drop(columns='Star type')
      .select_dtypes(include='number')
      .corr()
)
sns.heatmap(feature_corr, annot=True)

There is no strong correlation among the columns.

### We determine both the feature and the target variables.

In [None]:
# Feature variables: every column except the target.
X = df.drop(columns=['Star type'])
# Target variable, stored as a column vector of shape (n_samples, 1).
y = df['Star type'].to_numpy().reshape(-1, 1)

In [None]:
# Display the feature frame (the 'Star type' target column has been dropped).
X

### Scaling

In [None]:
# Standardise the numeric feature columns (zero mean, unit variance).
# Columns are selected by dtype rather than by position so the cell does not
# depend on the categorical columns being the last two in the frame.
num_feature_cols = X.select_dtypes(include='number').columns

sc = StandardScaler()
# Assign by column label: this replaces the columns outright, so the float
# result is never written into an existing integer column (an upcast that
# newer pandas versions deprecate for `.iloc[...] = ...` assignments).
X[num_feature_cols] = sc.fit_transform(X[num_feature_cols])

# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into training. Consider fitting it on the
# training split only (e.g. inside a Pipeline).

Let's check it.

In [None]:
# Box plots of the features after scaling.
sns.boxplot(data=X)

Now the numeric columns have similar ranges.

Models can't work with strings, so we have to encode these values. I will use `OneHotEncoder` for this.

In [None]:
# One-hot encode the two categorical columns and join them to the numeric ones.
cat_cols = ['Star color', 'Spectral Class']

# The dense-output switch is called `sparse_output` in current scikit-learn;
# older releases only know it as `sparse`. Try the new name first and fall
# back, so the notebook runs on both.
try:
    oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

encoded = oh_encoder.fit_transform(X[cat_cols])

# Give every encoded column a descriptive *string* name. Without this the new
# columns are labelled 0..n-1 (integers) and the combined frame mixes str and
# int column names, which scikit-learn estimators reject or warn about.
oh_names = [
    '{}_{}'.format(col, cat)
    for col, cats in zip(cat_cols, oh_encoder.categories_)
    for cat in cats
]
oh_cols = pd.DataFrame(encoded, columns=oh_names, index=X.index)

num_X = X.drop(columns=cat_cols)

oh_X = pd.concat([num_X, oh_cols], axis=1)
oh_X

In [None]:
# Number of distinct star colours.
X['Star color'].unique().size

### Splitting

Let's split the dataframe into a training set and a test set.

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    oh_X,
    y,
    test_size=0.25,
    random_state=5,
)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# (rows, columns) of the test features.
X_test.shape

### Modeling

We will use `GridSearchCV` to find the best hyperparameters for our models.

We define two helper functions to keep the code clean.

In [None]:
def find_best_params(params, model):
    """Grid-search ``model`` over ``params`` and print the best result.

    Runs a 10-fold cross-validated ``GridSearchCV`` scored by accuracy on the
    notebook-level ``X_train`` / ``y_train`` and prints the best mean accuracy
    together with the parameter combination that achieved it.

    Parameters
    ----------
    params : dict
        Parameter grid, passed to ``GridSearchCV`` as ``param_grid``.
    model : estimator
        Estimator to tune.

    Returns
    -------
    None
        The results are only printed.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring='accuracy',
        cv=10,
        return_train_score=True,
        n_jobs=-1,
    )
    # y_train is flattened to the 1-D label vector before fitting.
    search.fit(X_train, y_train.ravel())

    print()
    print("Best accuracy: {:.2f}".format(search.best_score_))
    print("Best Parameters:", search.best_params_)
    print()

In [None]:
def evaluate(model):
    """Fit ``model`` on the training split and print its accuracy scores.

    Three numbers are printed: the mean 10-fold cross-validated accuracy on
    the training split, the accuracy on the training split itself, and the
    accuracy on the held-out test split. Uses the notebook-level ``X_train``,
    ``X_test``, ``y_train`` and ``y_test``.

    Parameters
    ----------
    model : estimator
        Classifier to evaluate; it is fitted in place.

    Returns
    -------
    estimator
        The same object, fitted on the training split.
    """
    # Flatten the targets once and reuse them everywhere. Previously only
    # fit() received the flattened labels while cross_val_score() and score()
    # got the column vector, which makes scikit-learn emit
    # DataConversionWarning.
    train_labels = np.ravel(y_train)
    test_labels = np.ravel(y_test)

    classifier = model
    classifier.fit(X_train, train_labels)
    results = cross_val_score(estimator=classifier, X=X_train, y=train_labels, cv=10, n_jobs=-1)
    print('model accuracy score: {:.3f}'.format(results.mean()))
    print('model train accuracy score: {:.3f}'.format(classifier.score(X_train, train_labels)))
    print('model test accuracy score: {:.3f}'.format(classifier.score(X_test, test_labels)))

    return classifier

### Decision Tree

In [None]:
# Hyper-parameter search for the decision tree.
find_best_params(
    params={
        'max_depth': [None, 10, 15, 20, 30, 50, 80, 100],
        'max_features': [None, 5, 10, 20, 30],
        'min_samples_leaf': [1, 2, 3, 4, 5],
        'min_samples_split': [2, 8, 10, 12],
    },
    model=DecisionTreeClassifier(),
)

In [None]:
# Fit and score a decision tree with hand-picked hyper-parameters.
model_tree = evaluate(
    model=DecisionTreeClassifier(min_samples_split=12, min_samples_leaf=5)
)

Alright, let's do this with other models as well.

### KNN

In [None]:
# Hyper-parameter search for k-nearest neighbours.
find_best_params(
    params={
        'n_neighbors': range(1, 20),
        'p': range(1, 10),
    },
    model=KNeighborsClassifier(),
)

In [None]:
# Fit and score a k-nearest-neighbours classifier.
model_knn = evaluate(model=KNeighborsClassifier(p=1, n_neighbors=1))

### Random Forest

In [None]:
# Running GridSearchCV takes a long time for the random forest. I tried it and
# it works, but a decision tree is enough for this project.
# The search below is disabled by wrapping it in a string literal.
'''
find_best_params({
    'max_depth': [None, 10, 15, 20, 30, 50],
    'max_features': [None, 5, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 8, 10, 12],
    'n_estimators': [100, 200, 500, 1000]
    },
    RandomForestClassifier()
)
'''

In [None]:
# model_random_forest = evaluate(RandomForestClassifier(min_samples_leaf=1, min_samples_split=2, n_estimators=20))

### Logistic Regression

In [None]:
# Hyper-parameter search for logistic regression.
find_best_params(
    params={
        'penalty': ['l2'],
        'C': range(2, 10),
    },
    model=LogisticRegression(),
)

In [None]:
# Fit and score a logistic-regression classifier.
model_log_reg = evaluate(model=LogisticRegression(C=3, penalty='l2'))

### SVM 

In [None]:
# Hyper-parameter search for the support vector classifier.
find_best_params(
    params={
        'C': range(1, 20),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': range(2, 10),
    },
    model=SVC(),
)

In [None]:
# Fit and score a support vector classifier.
model_svm = evaluate(model=SVC(kernel='linear', degree=2, C=1.0))


We've seen different classification models so far and they had similar results. Consequently, there is no significant difference as to which model will be used for prediction.

### Prediction

In [None]:
# Predict the star type of a single, hand-entered observation.
color = 'White'
sclass = 'A'
temp = 8829
lum = 537493.000000
rad = 1423.0000
magnitude = -10.73

# Wrap the raw values in DataFrames that carry the column names the scaler
# and the encoder were fitted with; passing bare lists makes scikit-learn
# warn about missing feature names.
# Assumes the numeric features are every column of X except the last two.
num_input = pd.DataFrame([[temp, lum, rad, magnitude]], columns=X.columns[:-2])
cat_input = pd.DataFrame([[color, sclass]], columns=['Star color', 'Spectral Class'])

num_values = sc.transform(num_input)
cat_values = oh_encoder.transform(cat_input)

# Join the scaled numeric values and the one-hot values into one row laid out
# like the training frame (numeric columns first, encoded columns after).
features = pd.DataFrame(np.hstack([num_values, cat_values]), columns=oh_X.columns)

result = model_tree.predict(features)
result[0]