In [1]:
import pyreadstat
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
def read_dataset(path):
    df, meta = pyreadstat.read_sav(path)

    required_columns = ['yj1.1.1', 'yj13.2', 'yj72.18a', 'yj6.2', 'yj10.2', 'yj21b']

    filtered_df = df[required_columns]
    filtered_df = filtered_df.rename(columns={
        'yj1.1.1': 'label',
        'yj13.2': 'salary',
        'yj72.18a': 'education',
        'yj6.2': 'working_hours',
        'yj10.2': 'bonus',
        'yj21b': 'vacation_days'
    })
    return filtered_df

In [3]:
dataset = read_dataset('dataset.sav')
print(dataset.head())

   label   salary  education  working_hours       bonus  vacation_days
0    NaN      NaN        4.0            NaN         NaN            NaN
1    NaN      NaN        1.0            NaN         NaN            NaN
2    NaN      NaN        6.0            NaN         NaN            NaN
3    NaN      NaN        2.0            NaN         NaN            NaN
4    3.0  25000.0        5.0           40.0  99999996.0           42.0


In [4]:
dataset = dataset.dropna()
dataset = dataset[(dataset <= 1000000).all(axis=1)]
dataset = dataset.astype(float)
label = dataset['label']
dataset = dataset.drop(columns=['label'])

### Здесь можно универсально подать свои 
dataset[x,5]

label[x] (где возможные ответы имеют значения от 1 до 5 дискретно)

и запускать модель

In [106]:
X_train, X_test, y_train, y_test = train_test_split(dataset, label, test_size = 0.2, shuffle=False)

In [78]:
class AdamOptimizer:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m_w, self.v_w = 0, 0  # Moments for weights
        self.m_b, self.v_b = 0, 0  # Moments for biases
        self.t = 0  # Time step
    
    def update(self, weights, biases, dW, db):
        """
        Update weights and biases using Adam optimization.
        
        Parameters:
        weights (ndarray): Current weights.
        biases (ndarray): Current biases.
        dW (ndarray): Gradients of the weights.
        db (ndarray): Gradients of the biases.
        
        Returns:
        updated_weights, updated_biases: Updated weights and biases.
        """
        # Increment time step
        self.t += 1
        
        # Update biased first moment estimate
        self.m_w = self.beta1 * self.m_w + (1 - self.beta1) * dW
        self.m_b = self.beta1 * self.m_b + (1 - self.beta1) * db
        
        # Update biased second moment estimate
        self.v_w = self.beta2 * self.v_w + (1 - self.beta2) * (dW**2)
        self.v_b = self.beta2 * self.v_b + (1 - self.beta2) * (db**2)
        
        # Compute bias-corrected first moment estimate
        m_w_hat = self.m_w / (1 - self.beta1**self.t)
        m_b_hat = self.m_b / (1 - self.beta1**self.t)
        
        # Compute bias-corrected second moment estimate
        v_w_hat = self.v_w / (1 - self.beta2**self.t)
        v_b_hat = self.v_b / (1 - self.beta2**self.t)
        
        # Compute the updates
        weight_update = self.learning_rate * m_w_hat / (np.sqrt(v_w_hat) + self.epsilon)
        bias_update = self.learning_rate * m_b_hat / (np.sqrt(v_b_hat) + self.epsilon)
        
        # Update parameters
        weights -= weight_update
        biases -= bias_update
        
        return weights, biases

In [109]:
class MulticlassLogisticRegression:
    def __init__(self, num_features, num_classes, learning_rate=0.01, epochs=1000):
        self.num_classes = num_classes
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = np.random.randn(num_features, num_classes) * 0.0001
        self.weights = np.random.uniform(0, 0.01, (num_features, num_classes))
        self.biases = np.zeros((1, num_classes))
        self.optimizer = AdamOptimizer(learning_rate=learning_rate)

    def fit(self, X, y, optimizer = 'Naive'):
        """
        Train the model using gradient descent.
        
        optimizer = ['Naive','Adam']
        """
        n_samples, n_features = X.shape

        for epoch in range(self.epochs):
            # Compute linear scores
            logits = np.dot(X, self.weights) + self.biases
            
            # Apply softmax
            y_pred = self.softmax(logits)
            
            # Compute loss
            loss = self.cross_entropy_loss(y, y_pred)

            # Gradients
            y_one_hot = self.one_hot_encode(y, self.num_classes)
            dW = np.dot(X.T, (y_pred - y_one_hot)) / n_samples
            db = np.sum(y_pred - y_one_hot, axis=0, keepdims=True) / n_samples

            if optimizer == 'Adam':
                self.weights, self.biases = self.optimizer.update(self.weights, self.biases, dW, db)
            else:
                self.weights -= self.learning_rate * dW
                self.biases -= self.learning_rate * db

            # Print loss
            if epoch % 50 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.4f}")

    def softmax(self, x):
        """Compute softmax values for each sets of scores in x."""
        x -= x.max(axis=-1, keepdims=True)
        e_x = np.exp(x)
        result = e_x / e_x.sum(axis=-1, keepdims=True)
        result = np.clip(result, 0.0001, 1.0)
        return result

    def cross_entropy_loss(self, y_true, y_pred):    
        y_true = (y_true - 1).astype(int)
        loss = 0

        for i in range(len(y_pred)):
            ans = y_pred[i][y_true.iloc[i]]
            loss = loss + (-np.log(ans))
        return loss

    def one_hot_encode(self, y, num_classes):
        """One-hot encode target labels."""
        y = y.astype(int)
        one_hot = np.zeros((y.size, num_classes))
        for i in range(len(one_hot)):
            one_hot[i][y.iloc[i]] = 1
        return one_hot

    def accuracy(self, y_true, y_pred):
        correct_predictions = np.sum(y_true == y_pred)
        return correct_predictions / len(y_true)

    def predict(self, X, y=[]):
        """Predict class labels."""
        logits = np.dot(X, self.weights) + self.biases
        probabilities = self.softmax(logits)
        labels = np.argmax(probabilities, axis=1)

        if y is not None:
            accuracy = self.accuracy(y, labels)
        else:
            accuracy = []
        return np.argmax(probabilities, axis=1), accuracy


### TRAIN VARIABLES

In [121]:
num_features=5
num_classes=5
learning_rate=1e-5
epochs=1000
optimizer='Adam'

In [122]:
np.random.seed(89)
# Initialize and train model
model = MulticlassLogisticRegression(num_features=num_features, num_classes=num_classes, learning_rate=learning_rate, epochs=epochs)
model.fit(X_train, y_train, optimizer=optimizer)

predictions, accuracy = model.predict(X_test, y_test)
print("Accuracy:", accuracy)
print("Predictions:", predictions)

Epoch 0, Loss: 1971.0128
Epoch 50, Loss: 1953.3019
Epoch 100, Loss: 1913.3491
Epoch 150, Loss: 1683.8528
Epoch 200, Loss: 1602.3631
Epoch 250, Loss: 1471.9838
Epoch 300, Loss: 1036.5992
Epoch 350, Loss: 889.3305
Epoch 400, Loss: 843.0330
Epoch 450, Loss: 813.3640
Epoch 500, Loss: 789.5537
Epoch 550, Loss: 753.1008
Epoch 600, Loss: 738.1937
Epoch 650, Loss: 734.1018
Epoch 700, Loss: 730.7521
Epoch 750, Loss: 729.0688
Epoch 800, Loss: 728.4838
Epoch 850, Loss: 728.2331
Epoch 900, Loss: 728.2709
Epoch 950, Loss: 728.6434
Accuracy: 0.43283582089552236
Predictions: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
