In [1]:
#!pip install numpy scipy matplotlib scikit-learn pandas

# Подключение библиотек

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, recall_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from category_encoders import CountEncoder

# Собственная реализация линейной регресии

In [None]:
class MyLinearRegression:
    def __init__(self, lr=0.01, n_iter=1000, fit_intercept=True):
        self.lr = lr
        self.n_iter = n_iter
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        if X.ndim == 1:
            X = X.reshape(-1, 1)

        if self.fit_intercept:
            X_aug = np.hstack([np.ones((X.shape[0], 1)), X])
        else:
            X_aug = X

        self.theta_ = np.linalg.pinv(X_aug) @ y

        if self.fit_intercept:
            self.intercept_ = self.theta_[0]
            self.coef_ = self.theta_[1:].copy()
        else:
            self.intercept_ = 0.0
            self.coef_ = self.theta_.copy()

        return self

    def predict(self, X):

        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        if self.fit_intercept:
            X_aug = np.hstack([np.ones((X.shape[0], 1)), X])
        else:
            X_aug = X

        return X_aug @ self.theta_

# Реализация логистической регерессии

In [4]:
class MyLogisticRegression:
    def __init__(self, learning_rate=0.05, max_iter=5000):
        self.learning_rate = learning_rate
        self.max_iter = max_iter

    def softmax(self, z):
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)
    
    def make_encoder_and_decoder(self, y):
        classes = np.unique(y)
        n = classes.shape[0]
        
        class_to_idx = {cls: i for i, cls in enumerate(classes)}
        idx_to_class = {i: cls for i, cls in enumerate(classes)}
        
        def encoder(y_single):
            if isinstance(y_single, (list, np.ndarray)):
                result = np.zeros((len(y_single), n))
                for i, val in enumerate(y_single):
                    result[i, class_to_idx[val]] = 1
                return result
            else:
                vec = np.zeros(n)
                vec[class_to_idx[y_single]] = 1
                return vec
        
        def decoder(vector):
            if vector.ndim == 1:
                return idx_to_class[np.argmax(vector)]
            else:
                return [idx_to_class[i] for i in np.argmax(vector, axis=1)]
        
        return encoder, decoder

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        
        x0 = np.ones((X.shape[0], 1))
        X_dop = np.hstack((x0, X))
        
        self.encoder, self.decoder = self.make_encoder_and_decoder(y)
        y_encoded = self.encoder(y)
        
        n_classes = y_encoded.shape[1] if y_encoded.ndim > 1 else 1
        self.theta = np.zeros((X_dop.shape[1], n_classes))
        
        for _ in range(self.max_iter):
            y_pred = self.softmax(X_dop @ self.theta)
            
            self.theta -= self.learning_rate * (1 / X_dop.shape[0]) * X_dop.T @ (y_pred - y_encoded)
            
        return self
    
    def score(self, X, y):
        y_pred = self.predict(X)
        return np.mean(y == y_pred)
    
    def predict(self, X):
        X = np.array(X)
        x0 = np.ones((X.shape[0], 1))
        X_dop = np.hstack((x0, X))

        return self.decoder(self.softmax(X_dop @ self.theta))

# Проверка классификации

In [5]:
df = pd.read_csv('classification.csv')

classification_X = df.drop('Bankrupt?', axis=1)
classification_y = df['Bankrupt?']

classification_X_train, classification_X_test, classification_y_train, classification_y_test = train_test_split(
    classification_X, classification_y, test_size=0.2, random_state=42, stratify=classification_y
)

sk_clf = LogisticRegression()
sk_clf.fit(classification_X_train, classification_y_train)
classification_y_pred_sk = sk_clf.predict(classification_X_test)

my_clf = MyLogisticRegression()
my_clf.fit(classification_X_train, classification_y_train)
classification_y_pred_my = my_clf.predict(classification_X_test)

print(f"Sklearn accuracy={accuracy_score(classification_y_test, classification_y_pred_sk):.4f}, f1={f1_score(classification_y_test, classification_y_pred_sk, average='weighted'):.4f}, recall={recall_score(classification_y_test, classification_y_pred_sk, pos_label=1)}")
print(f"Custom  accuracy={accuracy_score(classification_y_test, classification_y_pred_my):.4f}, f1={f1_score(classification_y_test, classification_y_pred_my, average='weighted'):.4f}, recall={recall_score(classification_y_test, classification_y_pred_my, pos_label=1)}")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Sklearn accuracy=0.9589, f1=0.9475, recall=0.0
Custom  accuracy=0.9399, f1=0.9392, recall=0.045454545454545456


# Провека регрессии

In [6]:
df = pd.read_csv('regression.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

regression_X = df.drop(columns=['salary_in_usd', 'salary', 'salary_currency'], axis=1)
regression_y = df['salary_in_usd'].to_numpy()

regression_X = pd.get_dummies(regression_X, drop_first=True).to_numpy(dtype=np.float32)

regression_X_train, regression_X_test, regression_y_train, regression_y_test = train_test_split(
    regression_X, regression_y, test_size=0.2, random_state=42
)


sk_reg = LinearRegression()
sk_reg.fit(regression_X_train, regression_y_train)
regression_y_pred_sk = sk_reg.predict(regression_X_test)

my_reg = MyLinearRegression()
my_reg.fit(regression_X_train, regression_y_train)
regression_y_pred_my = my_reg.predict(regression_X_test)
print(f"SkLearn RMSE={np.sqrt(mean_squared_error(regression_y_test, regression_y_pred_sk)):.4f}, R2={r2_score(regression_y_test, regression_y_pred_sk):.4f}")
print(f"Custom  RMSE={np.sqrt(mean_squared_error(regression_y_test, regression_y_pred_my)):.4f}, R2={r2_score(regression_y_test, regression_y_pred_my):.4f}")

SkLearn RMSE=58872.7322, R2=0.0956
Custom  RMSE=66762.6873, R2=-0.1630


### Улучшение

### Классификация

### Подготовка данных

In [7]:
df = pd.read_csv('classification.csv')
classification_X = df.drop('Bankrupt?', axis=1)
classification_y = df['Bankrupt?']

classification_X_train, classification_X_test, classification_y_train, classification_y_test = train_test_split(
    classification_X, classification_y,
    test_size=0.2,
    random_state=42,
    stratify=classification_y
)

scaler = StandardScaler()
classification_X_train_scaled = scaler.fit_transform(classification_X_train)
classification_X_test_scaled = scaler.transform(classification_X_test)


### Обучение

In [8]:
sk_clf = LogisticRegression(max_iter=5000)
sk_clf.fit(classification_X_train_scaled, classification_y_train)

my_clf = MyLogisticRegression(learning_rate=0.01)
my_clf.fit(classification_X_train_scaled, classification_y_train)

<__main__.MyLogisticRegression at 0x128bb3750>

### Предсказание

In [9]:
classification_y_pred_sk = sk_clf.predict(classification_X_test_scaled)

classification_y_pred_my = my_clf.predict(classification_X_test_scaled)

### Метрики

In [10]:
print(f"Sklearn accuracy={accuracy_score(classification_y_test, classification_y_pred_sk):.4f}, f1={f1_score(classification_y_test, classification_y_pred_sk, average='weighted'):.4f}, recall={recall_score(classification_y_test, classification_y_pred_sk, pos_label=1)}")
print(f"Custom  accuracy={accuracy_score(classification_y_test, classification_y_pred_my):.4f}, f1={f1_score(classification_y_test, classification_y_pred_my, average='weighted'):.4f}, recall={recall_score(classification_y_test, classification_y_pred_my, pos_label=1)}")

Sklearn accuracy=0.9619, f1=0.9549, recall=0.13636363636363635
Custom  accuracy=0.9670, f1=0.9591, recall=0.1590909090909091


### Регрессия

### Подготовка данных

In [None]:
df = pd.read_csv('regression.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

regression_X = df.drop(columns=['salary_in_usd', 'salary', 'salary_currency'], axis=1)
regression_y = df['salary_in_usd'].to_numpy()

num_cols = ['work_year', 'remote_ratio']
cat_cols = ['employment_type', 'job_title', 
            'employee_residence', 'company_location', 'company_size', 'experience_level']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', CountEncoder(), cat_cols),
    ],
    remainder='passthrough',
    sparse_threshold=0 
)
regression_X_preprocessed = preprocessor.fit_transform(regression_X)

selector = VarianceThreshold(threshold=1e-10)


regression_X_train, regression_X_test, regression_y_train, regression_y_test = train_test_split(
    regression_X, regression_y, test_size=0.2, random_state=42
)

regression_X_train_clean = selector.fit_transform(regression_X_train).astype(np.float64)
regression_X_test_clean = selector.transform(regression_X_test).astype(np.float64)

### Обучение

In [12]:
sk_reg = LinearRegression()
sk_reg.fit(regression_X_train_clean, regression_y_train)

my_reg = MyLinearRegression()
my_reg.fit(regression_X_train_clean, regression_y_train)

<__main__.MyLinearRegression at 0x1290c02d0>

### Предсказание

In [13]:
regression_y_pred_sk = sk_reg.predict(regression_X_test_clean)

regression_y_pred_my = my_reg.predict(regression_X_test_clean)

### Метрики

In [14]:
print(f"SkLearn RMSE={np.sqrt(mean_squared_error(regression_y_test, regression_y_pred_sk)):.4f}, R2={r2_score(regression_y_test, regression_y_pred_sk):.4f}")
print(f"Custom  RMSE={np.sqrt(mean_squared_error(regression_y_test, regression_y_pred_my)):.4f}, R2={r2_score(regression_y_test, regression_y_pred_my):.4f}")

SkLearn RMSE=58855.4693, R2=0.0962
Custom  RMSE=66762.6873, R2=-0.1630
