In [13]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.base import BaseEstimator

In [2]:
url = 'https://raw.githubusercontent.com/stepan1518/golubov3/main/classes.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3068,0.0024,0.17,16.12,0,Red,M
1,3042,0.0005,0.1542,16.6,0,Red,M
2,2600,0.0003,0.102,18.7,0,Red,M
3,2800,0.0002,0.16,16.65,0,Red,M
4,1939,0.000138,0.103,20.06,0,Red,M


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Temperature (K)         240 non-null    int64  
 1   Luminosity(L/Lo)        240 non-null    float64
 2   Radius(R/Ro)            240 non-null    float64
 3   Absolute magnitude(Mv)  240 non-null    float64
 4   Star type               240 non-null    int64  
 5   Star color              240 non-null    object 
 6   Spectral Class          240 non-null    object 
dtypes: float64(3), int64(2), object(2)
memory usage: 13.3+ KB


In [4]:
df['Star type'].value_counts()

Star type
0    40
1    40
2    40
3    40
4    40
5    40
Name: count, dtype: int64

In [5]:
df['Star color'].value_counts()

Star color
Red                   112
Blue                   55
Blue-white             26
Blue White             10
yellow-white            8
White                   7
Blue white              3
Yellowish White         3
white                   3
Whitish                 2
Orange                  2
yellowish               2
Pale yellow orange      1
White-Yellow            1
Blue                    1
Yellowish               1
Orange-Red              1
Blue white              1
Blue-White              1
Name: count, dtype: int64

In [6]:
df['Star color'] = df['Star color'].str.lower().str.strip()
df['Star color'] = df['Star color'].replace('blue-white', 'blue white')

In [7]:
df['Star color'].value_counts()

Star color
red                   112
blue                   56
blue white             41
white                  10
yellow-white            8
yellowish white         3
yellowish               3
whitish                 2
orange                  2
pale yellow orange      1
white-yellow            1
orange-red              1
Name: count, dtype: int64

In [8]:
df_encoded = pd.get_dummies(df, columns=['Star color', 'Spectral Class']).astype(int)
df_encoded

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color_blue,Star color_blue white,Star color_orange,Star color_orange-red,Star color_pale yellow orange,...,Star color_yellow-white,Star color_yellowish,Star color_yellowish white,Spectral Class_A,Spectral Class_B,Spectral Class_F,Spectral Class_G,Spectral Class_K,Spectral Class_M,Spectral Class_O
0,3068,0,0,16,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,3042,0,0,16,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2600,0,0,18,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2800,0,0,16,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1939,0,0,20,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,38940,374830,1356,-9,5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
236,30839,834042,1194,-10,5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
237,8829,537493,1423,-10,5,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
238,9235,404940,1112,-11,5,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [9]:
df_encoded.corr()['Star type'].sort_values()

Absolute magnitude(Mv)          -0.956781
Spectral Class_M                -0.447735
Star color_red                  -0.435244
Star color_yellowish white      -0.032939
Star color_white-yellow         -0.018938
Star color_pale yellow orange   -0.018938
Spectral Class_F                -0.004755
Star color_white                 0.012209
Star color_orange-red            0.018938
Star color_whitish               0.026838
Star color_yellowish             0.032939
Star color_yellow-white          0.054366
Spectral Class_A                 0.058735
Spectral Class_B                 0.092975
Spectral Class_G                 0.094689
Spectral Class_K                 0.109388
Star color_blue white            0.119925
Star color_orange                0.134191
Star color_blue                  0.346103
Spectral Class_O                 0.399339
Temperature (K)                  0.411129
Radius(R/Ro)                     0.660966
Luminosity(L/Lo)                 0.676845
Star type                        1

In [10]:
y = df_encoded['Star type']
df_encoded.drop('Star type', axis=1, inplace=True)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df_encoded, y, test_size=0.3, random_state=42)
X_train.shape

(168, 23)

In [15]:
X_test.shape

(72, 23)

In [16]:
df_encoded.head()

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star color_blue,Star color_blue white,Star color_orange,Star color_orange-red,Star color_pale yellow orange,Star color_red,...,Star color_yellow-white,Star color_yellowish,Star color_yellowish white,Spectral Class_A,Spectral Class_B,Spectral Class_F,Spectral Class_G,Spectral Class_K,Spectral Class_M,Spectral Class_O
0,3068,0,0,16,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,3042,0,0,16,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,2600,0,0,18,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,2800,0,0,16,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,1939,0,0,20,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [36]:
# class NeuralNetwork(nn.Module):
#     def __init__(self, input_size, hidden_size, num_classes):
#         super(NeuralNetwork, self).__init__()
#         self.fc1 = nn.Linear(input_size, hidden_size)
#         self.relu = nn.ReLU()
#         self.fc2 = nn.Linear(hidden_size, num_classes)
    
#     def forward(self, x):
#         out = self.fc1(x)
#         out = self.relu(out)
#         out = self.fc2(out)
#         return out

# class PyTorchClassifier(BaseEstimator):
#     def __init__(self, input_size, hidden_size, num_classes, epochs=10, batch_size=32, lr=0.01):
#         self.input_size = input_size
#         self.hidden_size = hidden_size
#         self.num_classes = num_classes
#         self.epochs = epochs
#         self.batch_size = batch_size
#         self.lr = lr
#         self.model = None
        
#     def fit(self, X, y):
#         self.model = NeuralNetwork(self.input_size, self.hidden_size, self.num_classes)
#         criterion = nn.CrossEntropyLoss()
#         optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        
#         X = torch.FloatTensor(X)
#         y = torch.LongTensor(y)
#         dataset = TensorDataset(X, y)
#         dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
        
#         for epoch in range(self.epochs):
#             for inputs, labels in dataloader:
#                 optimizer.zero_grad()
#                 outputs = self.model(inputs)
#                 loss = criterion(outputs, labels)
#                 loss.backward()
#                 optimizer.step()
    
#     def predict(self, X):
#         with torch.no_grad():
#             X = torch.FloatTensor(X)
#             outputs = self.model(X)
#             _, predicted = torch.max(outputs, 1)
#         return predicted.numpy()

# # Преобразование данных в массивы NumPy перед использованием
# # Convert pandas DataFrames to NumPy arrays
# X_train_np = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
# y_train_np = y_train.values if isinstance(y_train, pd.Series) else y_train
# X_test_np = X_test.values if isinstance(X_test, pd.DataFrame) else X_test
# y_test_np = y_test.values if isinstance(y_test, pd.Series) else y_test

# # Определение параметров сетки для GridSearchCV
# param_grid = {
#     'hidden_size': [64, 128, 256],
#     'epochs': [600, 700, 800],
#     'batch_size': [32, 64, 128],
#     'lr': [0.01]
# }

# # Создание экземпляра класса PyTorchClassifier
# classifier = PyTorchClassifier(input_size=X_train.shape[1], hidden_size=64, num_classes=6)

# # Использование GridSearchCV для настройки гиперпараметров
# grid_search = GridSearchCV(classifier, param_grid, cv=3, scoring='accuracy')
# grid_search.fit(X_train_np, y_train_np)

# # Получение лучших параметров и лучшегоЫ результата
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# print("Best Parameters:", best_params)
# print("Best Score:", best_score)

# # Predict on the test set using the best model
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test_np)
# print(type(y_pred))
# # Calculate accuracy on the test set
# accuracy = accuracy_score(y_test, y_pred)
# print("Test Accuracy:", accuracy)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(df_encoded, y, test_size=0.15, random_state=42)

class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)


    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out


class PyTorchClassifier(BaseEstimator):
    def __init__(self, input_size, hidden_size, num_classes, epochs=5000, batch_size=32, lr=0.001):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.epochs = epochs
        self.batch_size = batch_size
        self.lr = lr
        self.model = None

    def fit(self, X, y):
        self.model = NeuralNetwork(self.input_size, self.hidden_size, self.num_classes)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        X = torch.FloatTensor(X)
        y = torch.LongTensor(y)
        dataset = TensorDataset(X, y)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        for epoch in range(self.epochs):
            for inputs, labels in dataloader:
                optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

    def predict(self, X):
        with torch.no_grad():
            X = torch.FloatTensor(X)
            outputs = self.model(X)
            _, predicted = torch.max(outputs, 1)
        return predicted.numpy()


# Преобразование данных в массивы NumPy перед использованием
# Convert pandas DataFrames to NumPy arrays
X_train_np = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
y_train_np = y_train.values if isinstance(y_train, pd.Series) else y_train
X_test_np = X_test.values if isinstance(X_test, pd.DataFrame) else X_test
y_test_np = y_test.values if isinstance(y_test, pd.Series) else y_test

# Определение параметров сетки для GridSearchCV
param_grid = {
    'hidden_size': [64, 128, 256],
    'epochs': [600, 700, 800],
    'batch_size': [32, 64, 128],
    'lr': [0.01]
}

# Создание экземпляра класса PyTorchClassifier
classifier = PyTorchClassifier(input_size=X_train.shape[1], hidden_size=64, num_classes=6)

# Использование GridSearchCV для настройки гиперпараметров
# grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')
# grid_search.fit(X_train_np, y_train_np)
#
# # Получение лучших параметров и лучшегоЫ результата
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_
#
# print("Best Parameters:", best_params)
# print("Best Score:", best_score)
#
# # Predict on the test set using the best model
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test_np)
# print(type(y_pred))
# # Calculate accuracy on the test set
# accuracy = accuracy_score(y_test, y_pred)
# print("Test Accuracy:", accuracy)

classifier.fit(X_train_np, y_train_np)
y_pred = classifier.predict(X_test_np)
print(f'Test accuracy : {accuracy_score(y_pred, y_test_np)}\n')


Test accuracy : 0.9722222222222222

