### Импорт

In [1]:
# импортируем библиотеки
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw diabetes dataset.
# The path is built relative to the project root instead of a machine-specific
# absolute path, so the notebook runs on any checkout of the repository.
# Assumes the notebook is executed from a directory directly below the project
# root (the same convention the sys.path setup further down relies on).
DATA_PATH = Path.cwd().parent / "raw_data" / "diabetes.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Separate the feature matrix from the target column, which is 'Outcome'.
X = df.drop(columns='Outcome')  # features
y = df['Outcome']               # target

# Hold out 30% of the rows for testing (70% train / 30% test).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=14,  # fixed seed so the split is reproducible
    stratify=y        # keep the target's class proportions in both parts
)

In [4]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
318,3,115,66,39,140,38.1,0.15,28
469,6,154,78,41,140,46.1,0.571,27
154,8,188,78,0,0,47.9,0.137,43
755,1,128,88,39,110,36.5,1.057,37
599,1,109,38,18,120,23.1,0.407,26


In [5]:
import sys
import os

# The project root is taken to be one level above the current working
# directory (the notebook is expected to run from notebooks/).
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Register the root only once: re-running this cell must not keep appending
# duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data.preprocessing import DataPreprocessing

In [6]:
# Run the project's preprocessing pipeline on both splits.
# NOTE(review): presumably fit_transform fits on X_train only and applies the
# same transformation to X_test — confirm in src/data/preprocessing.py.
# NOTE(review): X_train / X_test are overwritten here, so re-running this cell
# passes the already-processed frames back into the pipeline.
Preprocessing = DataPreprocessing()
X_train, X_test = Preprocessing.fit_transform(X_train, X_test)

In [7]:
# Remove the three 'Blood_Pressure_category_*' columns from both splits.
drop_column = [
    'Blood_Pressure_category_norm',
    'Blood_Pressure_category_prehypertension',
    'Blood_Pressure_category_hypertension',
]
X_train = X_train.drop(columns=drop_column)
X_test = X_test.drop(columns=drop_column)

In [8]:
# Convert the splits to plain NumPy arrays for the hand-written model below.
X_train = np.array(X_train)
X_test = np.array(X_test)
# Training labels. `y_train` is the accurate name; `y_pred` is kept as an
# alias because later cells still read the training labels through it.
y_train = np.array(Y_train)
y_pred = y_train
y_test = np.array(Y_test)

In [9]:
def sigmoid(z):
    """Element-wise logistic function 1 / (1 + exp(-z)).

    The input is converted to a float array and clipped to [-500, 500]
    before exponentiation so that np.exp never overflows.
    """
    logits = np.clip(np.asarray(z, dtype=float), -500, 500)
    denominator = 1 + np.exp(-logits)
    return 1 / denominator

def binary_cross_entropy_vectorized(p_true, p_pred):
    """
    Vectorised binary cross-entropy, averaged over the samples.

    Parameters
    ----------
    p_true : array-like
        Either one-hot encoded labels of shape (n, 2), e.g. [[0, 1], [1, 0]],
        where column 1 marks y=1 and column 0 marks y=0, or a 1-D vector of
        0/1 labels of shape (n,).
    p_pred : array-like of shape (n,)
        Predicted probability of the positive class, e.g. [0.8, 0.2].

    Returns
    -------
    float
        Mean of -(y * log(p) + (1 - y) * log(1 - p)).
    """
    # Coerce to arrays so plain Python lists (as in the examples above) work;
    # list objects do not support the [:, 1] indexing used below.
    p_true = np.asarray(p_true, dtype=float)

    # Guard against log(0) by keeping probabilities strictly inside (0, 1).
    p_pred = np.clip(np.asarray(p_pred, dtype=float), 1e-15, 1 - 1e-15)

    if p_true.ndim == 1:
        # Plain 0/1 label vector.
        positive, negative = p_true, 1.0 - p_true
    else:
        # One-hot layout: column 1 is y=1, column 0 is y=0.
        positive, negative = p_true[:, 1], p_true[:, 0]

    loss = -np.mean(positive * np.log(p_pred) + negative * np.log(1 - p_pred))

    return loss



In [10]:
# Initial parameters and hyperparameters for the hand-written logistic regression.
w = np.full(X_train.shape[-1], -21, dtype='float')  # one weight per feature column, every weight starts at -21
b = 1  # bias term
learning_rate_w = 0.7  # step size for the weight update
learning_rate_b = 0.7  # step size for the bias update
k = 530  # number of rows sampled per training iteration
lm_l2 = 0.1  # NOTE(review): presumably an L2 regularisation strength; the training loop below never reads it — confirm
w

array([-21., -21., -21., -21., -21., -21., -21., -21., -21., -21., -21.,
       -21., -21., -21.])

In [11]:
n_epochs = 700

# Seeded generator: the mini-batch sequence, and therefore the learned
# weights, are the same on every run.
sgd_rng = np.random.default_rng(14)

# Read the training labels straight from the split rather than from `y_pred`,
# which a later cell rebinds to the test-set predictions.
train_labels = np.asarray(Y_train)

# NOTE: w and b are updated in place, so re-running this cell without
# re-running the initialisation cell continues from the current values.
for epoch in range(n_epochs):
    # draw a mini-batch of k distinct rows
    rand_index = sgd_rng.choice(X_train.shape[0], k, replace=False)
    sample_X = X_train[rand_index]
    sample_Y = train_labels[rand_index]

    # predicted probabilities for the batch
    linear_model = w @ sample_X.T + b
    pred = sigmoid(linear_model)

    # loss, printed every 100 epochs
    y_one_hot = np.eye(2)[sample_Y]
    loss = binary_cross_entropy_vectorized(y_one_hot, pred)
    if epoch % 100 == 0:
        print(loss)

    # gradients of the mean cross-entropy w.r.t. w and b
    dz = pred - sample_Y
    dw = (1 / k) * np.dot(sample_X.T, dz)
    db = (1 / k) * np.sum(dz)

    # Cast the gradients to float before the in-place update
    # (presumably because X_train may carry a non-float dtype — confirm).
    w -= learning_rate_w * np.asarray(dw, dtype=float)
    b -= learning_rate_b * np.asarray(db, dtype=float)

w

19.021928989188943
5.782969464312757
1.9779161091292305
1.1661495469520684
0.6682054495689975
0.5683398971798163
0.5584297386488131


array([  0.2972017 ,   0.07786079,  -0.74786885,   0.22516086,
         0.02725329,   0.96316118,   0.20611164, -14.37926247,
       -13.52243435, -12.90889392, -21.22322111, -19.81077633,
       -10.68222064, -10.09437265])

In [12]:
# Training accuracy: threshold the predicted probabilities at 0.5 and compare
# with the training labels. The labels are read from Y_train rather than from
# `y_pred`, because `y_pred` is rebound to the test predictions further down.
prediction = np.where(sigmoid(w @ X_train.T + b) > 0.5, 1, 0)
np.mean(prediction == np.asarray(Y_train))

np.float64(0.7597765363128491)

In [13]:
# Test accuracy: threshold the predicted probabilities at 0.5 and compare
# with the held-out labels.
test_probabilities = sigmoid(w @ X_test.T + b)
pred_test = np.where(test_probabilities > 0.5, 1, 0)
np.mean(pred_test == y_test)

np.float64(0.7748917748917749)

In [14]:
# Rebinds y_pred: it was assigned the training labels earlier
# (np.array(Y_train)); from here on it holds the test-set predictions.
y_pred = pred_test

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Binary-classification metrics on the test split.
# The predictions are read from `pred_test` directly, so this cell does not
# depend on `y_pred`, a name that holds different data at different points
# of the notebook.
accuracy = accuracy_score(y_test, pred_test)
precision = precision_score(y_test, pred_test)
recall = recall_score(y_test, pred_test)
f1 = f1_score(y_test, pred_test)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, pred_test))

# Full per-class report
print("\nClassification Report:")
print(classification_report(y_test, pred_test))

Accuracy: 0.7749
Precision: 0.7302
Recall: 0.5679
F1-score: 0.6389

Confusion Matrix:
[[133  17]
 [ 35  46]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.89      0.84       150
           1       0.73      0.57      0.64        81

    accuracy                           0.77       231
   macro avg       0.76      0.73      0.74       231
weighted avg       0.77      0.77      0.77       231

