### Импорт

In [1]:
# импортируем библиотеки
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw dataset.
# The path is resolved relative to the project root (taken to be the parent of
# the notebook's working directory) instead of a machine-specific absolute
# path, so the notebook runs on any checkout of the repository.
DATA_PATH = Path.cwd().parent / "raw_data" / "diabetes.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
import sys
import os

# Make the project root (one level above the notebook's working directory,
# e.g. notebooks/) importable so that the `src` package resolves.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data_processing.preprocessing import DataPreprocessing

### Разделение данных

In [4]:
# Separate the feature columns from the target column 'Outcome'.
X = df.drop(columns='Outcome')  # features
y = df['Outcome']               # target

# Hold out 30% of the rows for testing (70% train / 30% test).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=22,  # fixed seed for reproducibility
    stratify=y        # keep the class ratio of the target in both subsets
)

In [5]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
395,2,127,58,24,275,27.7,1.6,25
158,2,88,74,19,53,29.0,0.229,22
246,10,122,68,0,0,31.2,0.258,41
18,1,103,30,38,83,43.3,0.183,33
577,2,118,80,0,0,42.9,0.693,21


In [6]:
# Instantiate the project's preprocessing helper and apply it to both splits.
# NOTE(review): presumably fit_transform fits on X_train only and applies the
# same transformation to X_test — confirm in src/data_processing/preprocessing.py.
# X_train / X_test are rebound here, so re-running this cell on its own would
# pass the already-transformed data back into fit_transform.
Preprocessing = DataPreprocessing()
X_train, X_test = Preprocessing.fit_transform(X_train, X_test)

In [7]:
# Convert the splits to plain NumPy arrays for the hand-written model below.
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(Y_train)  # training labels under an accurate name
y_test = np.array(Y_test)

# Backward-compatible alias: other cells read the training labels as `y_pred`.
# The name is misleading (these are ground-truth labels, not predictions);
# prefer `y_train` in new code.
y_pred = y_train

In [8]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise.

    The input is converted to a float array and clipped to [-500, 500]
    before exponentiation so that np.exp never overflows.
    """
    logits = np.clip(np.asarray(z, dtype=float), -500, 500)
    return 1 / (1 + np.exp(-logits))

def binary_cross_entropy_vectorized(p_true, p_pred):
    """Mean binary cross-entropy between true labels and predicted probabilities.

    Parameters
    ----------
    p_true : array-like
        Either one-hot encoded labels of shape (n, 2), where column 1 marks
        y=1 and column 0 marks y=0 (e.g. [[0, 1], [1, 0], ...]), or a 1-D
        sequence of 0/1 labels of length n.
    p_pred : array-like
        Predicted probabilities of the positive class, e.g. [0.8, 0.2, ...].

    Returns
    -------
    float
        -mean(y * log(p) + (1 - y) * log(1 - p)).
    """
    # Accept plain lists as well as arrays.
    p_true = np.asarray(p_true, dtype=float)
    # Clip the probabilities away from 0 and 1 to avoid log(0).
    p_pred = np.clip(np.asarray(p_pred, dtype=float), 1e-15, 1 - 1e-15)

    if p_true.ndim == 1:
        # Plain 0/1 labels: derive the y=0 indicator from the labels.
        positive = p_true
        negative = 1.0 - p_true
    else:
        # One-hot labels: column 1 corresponds to y=1, column 0 to y=0.
        positive = p_true[:, 1]
        negative = p_true[:, 0]

    loss = -np.mean(positive * np.log(p_pred) + negative * np.log(1 - p_pred))

    return loss



In [9]:
# Initial parameters and hyperparameters of the logistic-regression model
w = np.full(X_train.shape[-1], 0.1, dtype='float')  # one weight per feature column, all starting at 0.1
b = 1  # bias term
learning_rate_w = 0.7  # step size for the weight update
learning_rate_b = 0.7  # step size for the bias update
k = 530  # number of rows sampled per iteration of the training loop
lm_l2 = 0.1  # NOTE(review): presumably an L2 penalty strength; no visible cell reads it — confirm or remove
w  # display the initial weight vector

array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
       0.1])

In [10]:
# Mini-batch gradient descent for logistic regression.
# A seeded generator makes the sampled batches (and therefore the learned
# weights) reproducible on Restart & Run All.
rng = np.random.default_rng(22)
# Read the labels from Y_train rather than from `y_pred`: `y_pred` is rebound
# to model predictions further down, which would break a re-run of this cell.
train_labels = np.asarray(Y_train)
n_samples = X_train.shape[0]
# Sampling without replacement cannot draw more rows than exist.
batch_size = min(k, n_samples)

for epoch in range(700):
    # Draw a random batch of distinct rows.
    rand_index = rng.choice(n_samples, size=batch_size, replace=False)
    sample_X = X_train[rand_index]
    sample_Y = train_labels[rand_index]

    # Predicted probability of the positive class.
    linear_model = w @ sample_X.T + b
    pred = sigmoid(linear_model)

    # Batch loss (the loss helper takes one-hot labels); logged every 100 epochs.
    y_one_hot = np.eye(2)[sample_Y]
    loss = binary_cross_entropy_vectorized(y_one_hot, pred)
    if epoch % 100 == 0:
        print(loss)

    # Gradients of the mean cross-entropy with respect to w and b.
    dz = pred - sample_Y
    dw = (1 / batch_size) * np.dot(sample_X.T, dz)
    db = (1 / batch_size) * np.sum(dz)

    # Coerce the gradients to float before the in-place update so the update
    # does not depend on the dtype of X_train.
    w -= learning_rate_w * np.asarray(dw, dtype=float)
    b -= learning_rate_b * np.asarray(db, dtype=float)

w

0.9786451233283168
0.49183457561764243
0.49055730638426365
0.49032341935554896
0.48702840766484656
0.49204381201275066
0.4840201313872225


array([ 0.40834708,  0.1331082 , -0.66622012,  0.22557893, -0.24901604,
        0.79772422,  0.45971369, -1.31108423, -0.23987401,  0.35277455,
       -0.04677463, -1.15665278, -0.27507405,  0.38031778])

In [11]:
# Training-set accuracy of the fitted logistic regression (threshold 0.5).
# Compare against Y_train directly: `y_pred` is rebound to test-set
# predictions in a later cell, so using it here would break on re-run.
prediction = np.where(sigmoid(w @ X_train.T + b) > 0.5, 1, 0)
np.mean(prediction == np.asarray(Y_train))

np.float64(0.750465549348231)

In [12]:
# Test-set accuracy: label a row 1 when the predicted probability exceeds 0.5,
# then take the fraction of rows whose label matches y_test.
pred_test = np.where(sigmoid(w @ X_test.T + b) > 0.5, 1, 0)
np.mean(pred_test == y_test)

np.float64(0.7922077922077922)

In [13]:
# Use the logistic-regression test predictions as `y_pred` for the metrics below.
# Note: this rebinds `y_pred`, which an earlier cell bound to the training labels.
y_pred = pred_test

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Headline binary-classification metrics, evaluated in a fixed order
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Full per-class report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7922
Precision: 0.7089
Recall: 0.6914
F1-score: 0.7000

Confusion Matrix:
[[127  23]
 [ 25  56]]

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.85      0.84       150
           1       0.71      0.69      0.70        81

    accuracy                           0.79       231
   macro avg       0.77      0.77      0.77       231
weighted avg       0.79      0.79      0.79       231



In [15]:
# Baseline for comparison: an RBF-kernel support vector classifier from scikit-learn
from sklearn import svm
clf = svm.SVC(kernel='rbf', gamma=0.01, C=100)
clf.fit(X_train, Y_train)
predict = clf.predict(X_test)
# Fraction of test rows where the predicted label equals the true label
acc = np.mean(predict == Y_test)
acc

np.float64(0.7965367965367965)

In [17]:
# Rebind y_pred to the SVM predictions and score them with the same metrics
y_pred = predict

accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

print(classification_report(y_test, y_pred))

Accuracy: 0.7965
Precision: 0.7237
Recall: 0.6790
F1-score: 0.7006
              precision    recall  f1-score   support

           0       0.83      0.86      0.85       150
           1       0.72      0.68      0.70        81

    accuracy                           0.80       231
   macro avg       0.78      0.77      0.77       231
weighted avg       0.79      0.80      0.79       231

