In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import altair as alt
# Disables maximum rows allowed for altair plots
# alt.data_transformers.disable_max_rows()
# Uncomment below to re-enable max rows
# alt.data_transformers.enable('default', max_rows=5000)

In [8]:
url = "https://drive.google.com/file/d/1dTmTAiRGM5skZzMb9NwpOkcQrY0Dpq6t/view?usp=sharing"
url = 'https://drive.google.com/uc?id=' + url.split('/')[-2]
diabetes = pd.read_csv(url) #read data
display(diabetes.head(50))
display(diabetes.info())
diabetes["diabetes"].value_counts(normalize = True) #show classification variable distribution

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
5,Female,20.0,0,0,never,27.32,6.6,85,0
6,Female,44.0,0,0,never,19.31,6.5,200,1
7,Female,79.0,0,0,No Info,23.86,5.7,85,0
8,Male,42.0,0,0,never,33.64,4.8,145,0
9,Female,32.0,0,0,never,27.32,5.0,100,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


None

0    0.915
1    0.085
Name: diabetes, dtype: float64

Next we need to resample the data to create an even distribution of positive and negative labels.

In [9]:
np.random.seed(1) # set seed

diabetes_negative = diabetes[diabetes["diabetes"] == 0] #create even amounts of positive and negative labels
diabetes_positive = diabetes[diabetes["diabetes"] == 1]
diabetes_positive_upscaled = resample(
    diabetes_positive, n_samples = diabetes_negative.shape[0]
)
diabetes_positive_upscaled.shape[0]
diabetes_upsampled = pd.concat((diabetes_negative, diabetes_positive_upscaled))
diabetes_upsampled["diabetes"].value_counts(normalize = True)

0    0.5
1    0.5
Name: diabetes, dtype: float64

Now that the data is resampled we can create the train/test split

In [4]:
diabetes_train, diabetes_test = train_test_split(
    diabetes_upsampled, train_size = .75, stratify = (diabetes_upsampled["diabetes"]) # split data
)
display(diabetes_train.info())
diabetes_train["diabetes"].value_counts(normalize = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137250 entries, 94511 to 85718
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               137250 non-null  object 
 1   age                  137250 non-null  float64
 2   hypertension         137250 non-null  int64  
 3   heart_disease        137250 non-null  int64  
 4   smoking_history      137250 non-null  object 
 5   bmi                  137250 non-null  float64
 6   HbA1c_level          137250 non-null  float64
 7   blood_glucose_level  137250 non-null  int64  
 8   diabetes             137250 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 10.5+ MB


None

1    0.5
0    0.5
Name: diabetes, dtype: float64

Now we filter the data to the numeric columns to aggregate and observe trends, and then do the same with categorical values using ``value_counts``

In [5]:
diabetes_stats_upsample = diabetes_upsampled.drop(["gender", "hypertension", "smoking_history", "diabetes"], axis=1) # find mean values
display(diabetes_stats_upsample.agg(["mean","std"]))
diabetes_stats = diabetes_train.drop(["gender", "hypertension", "smoking_history", "diabetes"], axis=1) # find mean values
display(diabetes_stats.agg(["mean","std"])) #show average + variability demographics for survey
#display(diabetes["gender"].value_counts(normalize = True))
#display(diabetes["hypertension"].value_counts(normalize = True))
#display(diabetes["smoking_history"].value_counts(normalize = True))

Unnamed: 0,age,heart_disease,bmi,HbA1c_level,blood_glucose_level
mean,50.544479,0.089246,29.424498,6.163949,163.496246
std,21.532575,0.285099,7.435361,1.280604,56.956808


Unnamed: 0,age,heart_disease,bmi,HbA1c_level,blood_glucose_level
mean,50.515819,0.08937,29.42797,6.162753,163.420546
std,21.54606,0.285278,7.44374,1.280485,56.88576


Next we'll make the preprocessor to use in K Means Classification

In [6]:
diabetes_preprocessor = make_column_transformer(
    (StandardScaler(), ["age", "bmi", "HbA1c_level", "blood_glucose_level"]),
)
diabetes_preprocessor

In [7]:
diabetes_preprocessor.fit(diabetes)
diabetes_scaled = diabetes_preprocessor.transform(diabetes)
diabetes_scaled_df = pd.DataFrame(diabetes_scaled)#, columns=diabetes.feature_names)
diabetes_scaled_df

Unnamed: 0,0,1,2,3
0,1.692704,-0.321056,1.001706,0.047704
1,0.538006,-0.000116,1.001706,-1.426210
2,-0.616691,-0.000116,0.161108,0.489878
3,-0.261399,-0.583232,-0.492690,0.416183
4,1.515058,-1.081970,-0.679490,0.416183
...,...,...,...,...
99995,1.692704,-0.000116,0.628107,-1.180558
99996,-1.771388,-1.499343,0.908306,-0.934905
99997,1.070944,0.076729,0.161108,0.416183
99998,-0.794336,1.220361,-1.426688,-0.934905


Now, we'll make a GridSearch CV object and a range of potential K values to find the best K value

In [None]:
diabetes_grid = {
    "kneighborsclassifier__n_neighbors"  : range(
        1, 1_000, 200),
}
diabetes_pipe = make_pipeline(diabetes_preprocessor, KNeighborsClassifier())
diabetes_grid = GridSearchCV(
    estimator = diabetes_pipe,
    param_grid = diabetes_grid,
    cv = 5
)
accuracies_grid = pd.DataFrame(
    diabetes_grid.fit(
        diabetes_train[["age", "bmi", "HbA1c_level", "blood_glucose_level"]],
        diabetes_train["diabetes"],
    )
)
accuracies_grid.info()

In [None]:
accuracies_grid = (
    accuracies_grid[[
        "param_kneighborsclassifier__n_neighbors",
        "mean_test_score",
        "std_test_score"
    ]]
    .assign(sem_test_score=accuracies_grid["std_test_score"] / 10**(1/2))
    .rename(columns={"param_kneighborsclassifier__n_neighbors": "n_neighbors"})
    .drop(columns=["std_test_score"])
)
accuracies_grid