In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
url = 'https://raw.githubusercontent.com/stepan1518/golubov3/main/classes.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3068,0.0024,0.17,16.12,0,Red,M
1,3042,0.0005,0.1542,16.6,0,Red,M
2,2600,0.0003,0.102,18.7,0,Red,M
3,2800,0.0002,0.16,16.65,0,Red,M
4,1939,0.000138,0.103,20.06,0,Red,M


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Temperature (K)         240 non-null    int64  
 1   Luminosity(L/Lo)        240 non-null    float64
 2   Radius(R/Ro)            240 non-null    float64
 3   Absolute magnitude(Mv)  240 non-null    float64
 4   Star type               240 non-null    int64  
 5   Star color              240 non-null    object 
 6   Spectral Class          240 non-null    object 
dtypes: float64(3), int64(2), object(2)
memory usage: 13.3+ KB


In [4]:
df['Spectral Class'].value_counts()

Spectral Class
M    111
B     46
O     40
A     19
F     17
K      6
G      1
Name: count, dtype: int64

In [5]:
df['Star type'].value_counts()

Star type
0    40
1    40
2    40
3    40
4    40
5    40
Name: count, dtype: int64

In [6]:
df['Star color'].value_counts()

Star color
Red                   112
Blue                   55
Blue-white             26
Blue White             10
yellow-white            8
White                   7
Blue white              3
Yellowish White         3
white                   3
Whitish                 2
Orange                  2
yellowish               2
Pale yellow orange      1
White-Yellow            1
Blue                    1
Yellowish               1
Orange-Red              1
Blue white              1
Blue-White              1
Name: count, dtype: int64

In [7]:
df['Star color'] = df['Star color'].str.lower().str.strip()
df['Star color'] = df['Star color'].replace('blue-white', 'blue white')
# df['Star color'] = df['Star color'].replace(['White'], 'white')
# df['Star color'] = df['Star color'].replace(['Blue '], 'Blue')


In [8]:
df['Star color'].value_counts()

Star color
red                   112
blue                   56
blue white             41
white                  10
yellow-white            8
yellowish white         3
yellowish               3
whitish                 2
orange                  2
pale yellow orange      1
white-yellow            1
orange-red              1
Name: count, dtype: int64

In [9]:
df_encoded = pd.get_dummies(df, columns=['Star color', 'Spectral Class']).astype(int)
df_encoded

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color_blue,Star color_blue white,Star color_orange,Star color_orange-red,Star color_pale yellow orange,...,Star color_yellow-white,Star color_yellowish,Star color_yellowish white,Spectral Class_A,Spectral Class_B,Spectral Class_F,Spectral Class_G,Spectral Class_K,Spectral Class_M,Spectral Class_O
0,3068,0,0,16,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,3042,0,0,16,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2600,0,0,18,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2800,0,0,16,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1939,0,0,20,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,38940,374830,1356,-9,5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
236,30839,834042,1194,-10,5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
237,8829,537493,1423,-10,5,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
238,9235,404940,1112,-11,5,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [10]:
df_encoded.corr()['Star type'].sort_values()

Absolute magnitude(Mv)          -0.956781
Spectral Class_M                -0.447735
Star color_red                  -0.435244
Star color_yellowish white      -0.032939
Star color_white-yellow         -0.018938
Star color_pale yellow orange   -0.018938
Spectral Class_F                -0.004755
Star color_white                 0.012209
Star color_orange-red            0.018938
Star color_whitish               0.026838
Star color_yellowish             0.032939
Star color_yellow-white          0.054366
Spectral Class_A                 0.058735
Spectral Class_B                 0.092975
Spectral Class_G                 0.094689
Spectral Class_K                 0.109388
Star color_blue white            0.119925
Star color_orange                0.134191
Star color_blue                  0.346103
Spectral Class_O                 0.399339
Temperature (K)                  0.411129
Radius(R/Ro)                     0.660966
Luminosity(L/Lo)                 0.676845
Star type                        1

In [11]:
y = df['Star type']
df_encoded.drop('Star type', axis=1, inplace=True)

In [12]:
df_encoded.columns

Index(['Temperature (K)', 'Luminosity(L/Lo)', 'Radius(R/Ro)',
       'Absolute magnitude(Mv)', 'Star color_blue', 'Star color_blue white',
       'Star color_orange', 'Star color_orange-red',
       'Star color_pale yellow orange', 'Star color_red', 'Star color_white',
       'Star color_white-yellow', 'Star color_whitish',
       'Star color_yellow-white', 'Star color_yellowish',
       'Star color_yellowish white', 'Spectral Class_A', 'Spectral Class_B',
       'Spectral Class_F', 'Spectral Class_G', 'Spectral Class_K',
       'Spectral Class_M', 'Spectral Class_O'],
      dtype='object')

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df_encoded, y, test_size=0.3, random_state=42)
X_train.shape

(168, 23)

In [14]:
X_test.shape

(72, 23)

In [15]:
df_encoded.head()

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star color_blue,Star color_blue white,Star color_orange,Star color_orange-red,Star color_pale yellow orange,Star color_red,...,Star color_yellow-white,Star color_yellowish,Star color_yellowish white,Spectral Class_A,Spectral Class_B,Spectral Class_F,Spectral Class_G,Spectral Class_K,Spectral Class_M,Spectral Class_O
0,3068,0,0,16,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,3042,0,0,16,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,2600,0,0,18,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,2800,0,0,16,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,1939,0,0,20,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [16]:
model = DecisionTreeClassifier()

param_grid = {
    'max_depth': [3, 5, 7, 9, 11],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(model, param_grid, cv=5)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Best Parameters: {'max_depth': 5, 'min_samples_split': 10}
Best Score: 1.0
Test Accuracy: 1.0
