In [143]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import zscore

In [150]:
df = pd.read_csv('diabetes.csv')
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Outcome column:
- 1 - yes
- 0 - no

In [151]:
df = df.replace(0, np.nan)
df['Outcome'] = df['Outcome'].fillna(0)

Используем Z-Score (3-5) для заполнения данных в Glucose, Blood Pressure, SkinThickness, BMI

In [152]:
# Задание з-скоров и параметров границы
z_scores = df[['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']].apply(
    lambda x: np.abs(zscore(x, nan_policy='omit'))
)
soft_threshold = 3
hard_threshold = 5

# Пробор по нижней границе
means = df[['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']][(z_scores < soft_threshold).all(axis=1)].mean()
df.loc[:, ('Glucose', 'BloodPressure', 'SkinThickness', 'BMI')].fillna(means, inplace=True)
# Пробор по верхней границе
for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']:
    df.loc[z_scores[col] >= hard_threshold, col] = means[col]

In [153]:
df.head(20)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,,33.6,0.627,50,1.0
1,1.0,85.0,66.0,29.0,,26.6,0.351,31,0.0
2,8.0,183.0,64.0,,,23.3,0.672,32,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,0.0
4,,137.0,40.0,35.0,168.0,43.1,2.288,33,1.0
5,5.0,116.0,74.0,,,25.6,0.201,30,0.0
6,3.0,78.0,50.0,32.0,88.0,31.0,0.248,26,1.0
7,10.0,115.0,,,,35.3,0.134,29,0.0
8,2.0,197.0,70.0,45.0,543.0,30.5,0.158,53,1.0
9,8.0,125.0,96.0,,,,0.232,54,1.0


Используем IQR (1.5, 3) для заполнения данных в Pregnancies, Insulin, DiabetesPedigreeFunction, Age

In [156]:
# Задание Q1, Q3 для IQR
Q1 = df[['Pregnancies', 'Insulin', 'DiabetesPedigreeFunction', 'Age']].quantile(0.25)
Q3 = df[['Pregnancies', 'Insulin', 'DiabetesPedigreeFunction', 'Age']].quantile(0.75)
IQR = Q3 - Q1

# Определение мягких и жестких границ для работы с (1.5, 3) значениями
lower_soft_bound = Q1 - 1.5 * IQR
upper_soft_bound = Q3 + 1.5 * IQR
lower_hard_bound = Q1 - 3.0 * IQR
upper_hard_bound = Q3 + 3.0 * IQR

# Подсчёт медианных и средних из значений не сильно выходящих за границу
non_outlier_mask = ~((df[['Pregnancies', 'Insulin', 'DiabetesPedigreeFunction', 'Age']] < lower_soft_bound) |
                    (df[['Pregnancies', 'Insulin', 'DiabetesPedigreeFunction', 'Age']] > upper_soft_bound)
                ) & df.notna()

medians = df[non_outlier_mask].median(numeric_only=True)
means = df[non_outlier_mask].mean(numeric_only=True)

display('before', medians, means)

for col in ['Pregnancies', 'Insulin', 'DiabetesPedigreeFunction', 'Age']:
    if pd.isna(medians[col]):
        medians[col] = df[col].median(skipna=True)
    if pd.isna(means[col]):
        means[col] = df[col].mean(skipna=True)

display('after', medians, means)

# Замена NaN значений медианами полученными выше
df.loc[:, ('Pregnancies', 'Insulin', 'DiabetesPedigreeFunction', 'Age')].fillna(medians, inplace=True)

# Замена средне выходящих за границу значений (между 1.5 и 3 IQR) медианой
for col in ['Pregnancies', 'Insulin', 'DiabetesPedigreeFunction', 'Age']:
    mild_outliers_mask = \
        ((df[col] < lower_soft_bound[col]) & (df[col] <= lower_hard_bound[col])) | \
        ((df[col] > upper_soft_bound[col]) & (df[col] >= upper_hard_bound[col]))

    df.loc[mild_outliers_mask, col] = medians[col]

# Замена сильно выходящих за границу значений (>= 3 IQR) средним
for col in ['Pregnancies', 'Insulin', 'DiabetesPedigreeFunction', 'Age']:
    extreme_outliers_mask = (df[col] < lower_hard_bound[col]) | (df[col] > upper_hard_bound[col])

    df.loc[extreme_outliers_mask, col] = means[col]

'before'

Pregnancies                   4.000
Glucose                         NaN
BloodPressure                   NaN
SkinThickness                   NaN
Insulin                     120.000
BMI                             NaN
DiabetesPedigreeFunction      0.356
Age                          29.000
Outcome                         NaN
dtype: float64

Pregnancies                   4.459542
Glucose                            NaN
BloodPressure                      NaN
SkinThickness                      NaN
Insulin                     132.610811
BMI                                NaN
DiabetesPedigreeFunction      0.429832
Age                          32.805007
Outcome                            NaN
dtype: float64

'after'

Pregnancies                   4.000
Glucose                         NaN
BloodPressure                   NaN
SkinThickness                   NaN
Insulin                     120.000
BMI                             NaN
DiabetesPedigreeFunction      0.356
Age                          29.000
Outcome                         NaN
dtype: float64

Pregnancies                   4.459542
Glucose                            NaN
BloodPressure                      NaN
SkinThickness                      NaN
Insulin                     132.610811
BMI                                NaN
DiabetesPedigreeFunction      0.429832
Age                          32.805007
Outcome                            NaN
dtype: float64

  df.loc[extreme_outliers_mask, col] = means[col]


In [115]:
df.head(20)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.0,,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,,,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,,137.0,40.0,35.0,168.0,43.1,0.3725,33.0,1.0
5,5.0,116.0,74.0,,,25.6,0.201,30.0,0.0
6,3.0,78.0,50.0,32.0,88.0,31.0,0.248,26.0,1.0
7,10.0,115.0,,,,35.3,0.134,29.0,0.0
8,2.0,197.0,70.0,45.0,125.0,30.5,0.158,53.0,1.0
9,8.0,125.0,96.0,,,,0.232,54.0,1.0
