In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Read data

In [None]:
# Load the star dataset from the Kaggle input directory into a DataFrame.
# NOTE(review): the path is Kaggle-specific; adjust it when running elsewhere.
STARS_CSV_PATH = '/kaggle/input/star-type-classification/Stars.csv'
row_data = pd.read_csv(STARS_CSV_PATH)

# Preview the first rows to confirm the file was parsed as expected.
row_data.head()

# Exploring data

In [None]:
# Duplicate-row check: flag every row that repeats an earlier one and show
# the distinct flag values (only False appears when nothing is duplicated).

duplicate_flags = row_data.duplicated()
duplicate_flags.unique()

In [None]:
# Missing-value check: number of null cells in each column

row_data.isna().sum()

In [None]:
# Class balance check: number of rows for each value of the target column

plt.figure(figsize=(20, 5))
sns.countplot(data=row_data, x='Type')
plt.show()

In [None]:
# Numerical features exploration: for every numeric column draw its
# distribution (histogram with a KDE overlay) followed by a boxplot.

numerical_features = ['Temperature', 'L', 'R', 'A_M']

for column_name in numerical_features:
    # sns.distplot is deprecated and scheduled for removal from seaborn;
    # histplot with kde=True and stat='density' is its documented replacement
    # and keeps the y axis on the same density scale.
    plt.figure(figsize=(8,6))
    sns.histplot(x=row_data[column_name], kde=True, stat='density')
    plt.xlabel(column_name)
    plt.show()

    plt.figure(figsize=(9,3))
    sns.boxplot(x=row_data[column_name])
    plt.show()

In [None]:
# The boxplots show some outliers in Temperature, relative luminosity (L) and relative radius (R).
# Let's explore them in more detail.

In [None]:
# Rows whose Temperature exceeds 33000, and the star types they belong to.
temperature_mask = row_data['Temperature'] > 33000
outliers_Temperature = row_data[temperature_mask]

hot_star_types = ', '.join(map(str, outliers_Temperature['Type'].unique()))
print('Star Type with Temperature > 33000:', hot_star_types)
print()
print(outliers_Temperature)

In [None]:
# Rows whose relative luminosity (L) exceeds 500000, and the star types they belong to.
luminosity_mask = row_data['L'] > 500000
outliers_L = row_data[luminosity_mask]

bright_star_types = ', '.join(map(str, outliers_L['Type'].unique()))
print('Star Type with L > 500000:', bright_star_types)
print()
print(outliers_L)

In [None]:
# Rows whose relative radius (R) exceeds 500, and the star types they belong to.
radius_mask = row_data['R'] > 500
outliers_R = row_data[radius_mask]

large_star_types = ', '.join(map(str, outliers_R['Type'].unique()))
print('Star Type with R > 500:', large_star_types)
print()
print(outliers_R)

In [None]:
# After researching the data and reading about stars and space,
# I concluded that these values are genuine observations rather than outliers, so they should not be removed.

In [None]:
# Categorical features exploration: one count plot per categorical column

categorical_features = ['Color', 'Spectral_Class']

for feature in categorical_features:
    plt.figure(figsize=(20, 5))
    sns.countplot(data=row_data, x=feature)
    plt.show()

In [None]:
# Encoding categorical features
#
# Each column gets its own LabelEncoder, stored in `label_encoders`, so the
# integer codes can be mapped back to the original labels later with
# label_encoders[column_name].inverse_transform(...). Re-fitting one shared
# encoder for every column would leave only the last column's mapping.
#
# NOTE: this overwrites the columns of `row_data` in place; reload the data
# before re-running the cell, otherwise the encoders are fitted on codes.

label_encoders = {}

for column_name in categorical_features:
    le = LabelEncoder()
    row_data[column_name] = le.fit_transform(row_data[column_name])
    label_encoders[column_name] = le

In [None]:
# Features correlation exploration

# Pearson correlation. Only the lower triangle is drawn: the upper triangle
# and the diagonal are hidden by the mask because they add no information.
corr = row_data.corr(method='pearson')
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(10, 8))
sns.heatmap(corr, mask=mask, annot=True, fmt='.2f', cmap='RdBu')
plt.show()

In [None]:
# Spearman (rank) correlation, drawn the same way: the mask hides the upper
# triangle and the diagonal so only the lower triangle is shown.
corr = row_data.corr(method='spearman')
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(10, 8))
sns.heatmap(corr, mask=mask, annot=True, fmt='.2f', cmap='RdBu')
plt.show()

In [None]:
# The data show a strong correlation between L and R; L, R, Temperature and A_M; Temperature and Color.

# The correlation between Temperature and Color is explained by physics - one depends on the other.

# The correlation between L and R (and partly Temperature) is explained by the fact that L is calculated from R and Temperature.

# The correlation between L, R, Temperature and A_M is explained by the fact that A_M is calculated from L,
# which, as I wrote above, is calculated from R and Temperature.

# It follows from all of the above that the features L, R and Temperature can be left out of the model.

# Preparing data

In [None]:
# Split data into train and test sets

# Select the features by name instead of by column position, so the selection
# cannot silently change if the column order of the CSV does.
# These are the columns left after excluding Temperature, L, R and the target.
feature_columns = ['A_M', 'Color', 'Spectral_Class']

data_X = row_data[feature_columns]
data_y = row_data['Type']  # target variable

# Stratify on the target so both sets keep the same class proportions;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data_X, data_y, test_size=0.33, random_state=42, stratify=data_y
)

In [None]:
# Scaling data: the scaler is fitted on the training set only, and that same
# fitted transformation is then applied to both the train and the test set.

scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Trying out different models using kFold cross-validation

In [None]:
# Candidate classifiers as (short name, estimator) pairs. Every estimator that
# accepts a seed gets random_state=42 so repeated runs give the same scores.
models = [
    ('KNN', KNeighborsClassifier(n_jobs=-1)),
    ('LR', LogisticRegression(random_state=42, n_jobs=-1)),
    ('DT', DecisionTreeClassifier(random_state=42)),
    ('Bag_DT', BaggingClassifier(DecisionTreeClassifier(random_state=42), random_state=42, n_jobs=-1)),
    ('RF', RandomForestClassifier(random_state=42, n_jobs=-1)),
    ('GBC', GradientBoostingClassifier(random_state=42)),
]

# Shuffled, seeded 5-fold split shared by all models.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models:
    fold_scores = cross_val_score(model, X_train, y_train, scoring='f1_weighted', cv=kf, n_jobs=-1)
    # The scorer is weighted F1, so the aggregates are named after it.
    mean_f1 = fold_scores.mean()
    std_f1 = fold_scores.std()
    print(f"{name} : Mean F1 {round(mean_f1, 3)} STD:({round(std_f1, 3)})")

In [None]:
# The best results were shown by DecisionTreeClassifier and BaggingClassifier
# Let's check the DecisionTreeClassifier on the test set

In [None]:
# Fit a DecisionTreeClassifier on the training set and evaluate it on the
# held-out test set.

dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predicted values for the test set
y_predicted = dt.predict(X_test)

# Report the classifier's score on the test set as a percentage.
test_accuracy_percent = dt.score(X_test, y_test)*100
print('Accuracy of DecisionTreeClassifier is', test_accuracy_percent, '%')

In [None]:
# Creating a confusion matrix

# Cross-tabulate actual vs. predicted labels. Reindexing over every label that
# occurs in either vector keeps the matrix square even when some class is
# never predicted (crosstab alone would simply omit that column).
labels = np.union1d(y_test, y_predicted)
conf_matrix = (
    pd.crosstab(y_test, y_predicted)
    .reindex(index=labels, columns=labels, fill_value=0)
)

# `square` expects a boolean; the string 'True' only worked because any
# non-empty string is truthy (so 'False' would have enabled it as well).
sns.heatmap(conf_matrix, cmap='Greys', annot=True,
            linecolor='black', square=True,
            linewidths=0.2)
plt.ylabel("Real type of stars")
plt.xlabel("Predicted type of stars")
plt.show()

# Too good to be true