In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import altair as alt
# Disables maximum rows allowed for altair plots
# alt.data_transformers.disable_max_rows()
# Uncomment below to re-enable max rows
# alt.data_transformers.enable('default', max_rows=5000)

In [29]:
url = "https://drive.google.com/file/d/1dTmTAiRGM5skZzMb9NwpOkcQrY0Dpq6t/view?usp=sharing"
url = 'https://drive.google.com/uc?id=' + url.split('/')[-2]
diabetes = pd.read_csv(url) #read data
display(diabetes)
display(diabetes.info())
diabetes["diabetes"].value_counts(normalize = True) #show classification variable distribution

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


None

0    0.915
1    0.085
Name: diabetes, dtype: float64

Next we need to resample the data to create an even distribution of positive and negative labels.

In [31]:
np.random.seed(1) # set seed

diabetes_negative = diabetes[diabetes["diabetes"] == 0] #create even amounts of positive and negative labels
diabetes_positive = diabetes[diabetes["diabetes"] == 1]
diabetes_negative_downscaled = resample(
    diabetes_negative, n_samples = diabetes_positive.shape[0]
)
diabetes_negative_downscaled.shape[0]
diabetes_downsampled = pd.concat((diabetes_positive, diabetes_negative_downscaled))
display(diabetes_downsampled["diabetes"].value_counts(normalize = True))
diabetes_downsampled.info()

1    0.5
0    0.5
Name: diabetes, dtype: float64

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17000 entries, 6 to 50426
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               17000 non-null  object 
 1   age                  17000 non-null  float64
 2   hypertension         17000 non-null  int64  
 3   heart_disease        17000 non-null  int64  
 4   smoking_history      17000 non-null  object 
 5   bmi                  17000 non-null  float64
 6   HbA1c_level          17000 non-null  float64
 7   blood_glucose_level  17000 non-null  int64  
 8   diabetes             17000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 1.3+ MB


Now that the data is resampled we can create the train/test split

In [35]:
diabetes_train, diabetes_test = train_test_split(
    diabetes_downsampled, train_size = .75, stratify = (diabetes_downsampled["diabetes"]) # split data
)
display(diabetes_train.info())
diabetes_train["diabetes"].value_counts(normalize = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12750 entries, 54906 to 55958
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               12750 non-null  object 
 1   age                  12750 non-null  float64
 2   hypertension         12750 non-null  int64  
 3   heart_disease        12750 non-null  int64  
 4   smoking_history      12750 non-null  object 
 5   bmi                  12750 non-null  float64
 6   HbA1c_level          12750 non-null  float64
 7   blood_glucose_level  12750 non-null  int64  
 8   diabetes             12750 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 996.1+ KB


None

1    0.5
0    0.5
Name: diabetes, dtype: float64

Now we filter the data to the numeric columns to aggregate and observe trends, and then do the same with categorical values using ``value_counts``

In [36]:
diabetes_stats_upsample = diabetes_upsampled.drop(["gender", "hypertension", "smoking_history", "diabetes"], axis=1) # find mean values
display(diabetes_stats_upsample.agg(["mean","std"]))
diabetes_stats = diabetes_train.drop(["gender", "hypertension", "smoking_history", "diabetes"], axis=1) # find mean values
display(diabetes_stats.agg(["mean","std"])) #show average + variability demographics for survey
#display(diabetes["gender"].value_counts(normalize = True))
#display(diabetes["hypertension"].value_counts(normalize = True))
#display(diabetes["smoking_history"].value_counts(normalize = True))

Unnamed: 0,age,heart_disease,bmi,HbA1c_level,blood_glucose_level
mean,50.535948,0.089294,29.432829,6.164053,163.706529
std,21.495405,0.285176,7.459182,1.2833,56.797977


Unnamed: 0,age,heart_disease,bmi,HbA1c_level,blood_glucose_level
mean,50.615997,0.090745,29.449009,6.162588,163.433647
std,21.48049,0.287258,7.462296,1.27819,56.894316


Next we'll make the preprocessor to use in K Means Classification

In [37]:
diabetes_preprocessor = make_column_transformer(
    (StandardScaler(), ["age", "bmi", "HbA1c_level", "blood_glucose_level"]),
)
diabetes_preprocessor

In [38]:
diabetes_preprocessor.fit(diabetes)
diabetes_scaled = diabetes_preprocessor.transform(diabetes)
diabetes_scaled_df = pd.DataFrame(diabetes_scaled)#, columns=diabetes.feature_names)
diabetes_scaled_df

Unnamed: 0,0,1,2,3
0,1.692704,-0.321056,1.001706,0.047704
1,0.538006,-0.000116,1.001706,-1.426210
2,-0.616691,-0.000116,0.161108,0.489878
3,-0.261399,-0.583232,-0.492690,0.416183
4,1.515058,-1.081970,-0.679490,0.416183
...,...,...,...,...
99995,1.692704,-0.000116,0.628107,-1.180558
99996,-1.771388,-1.499343,0.908306,-0.934905
99997,1.070944,0.076729,0.161108,0.416183
99998,-0.794336,1.220361,-1.426688,-0.934905


Now, we'll make a GridSearch CV object and a range of potential K values to find the best K value

In [41]:
diabetes_grid = {
    "kneighborsclassifier__n_neighbors"  : range(
        1,60, 2),
}
diabetes_pipe = make_pipeline(diabetes_preprocessor, KNeighborsClassifier())
diabetes_grid = GridSearchCV(
    estimator = diabetes_pipe,
    param_grid = diabetes_grid,
    cv = 5
)
accuracies_grid = pd.DataFrame(
    diabetes_grid.fit(
        diabetes_train[["age", "bmi", "HbA1c_level", "blood_glucose_level"]],
        diabetes_train["diabetes"],
    ).cv_results_
)
accuracies_grid = (
    accuracies_grid[[
        "param_kneighborsclassifier__n_neighbors",
        "mean_test_score",
        "std_test_score"
    ]]
    .assign(sem_test_score=accuracies_grid["std_test_score"] / 10**(1/2))
    .rename(columns={"param_kneighborsclassifier__n_neighbors": "n_neighbors"})
    .drop(columns=["std_test_score"])
)
accuracies_grid

Unnamed: 0,n_neighbors,mean_test_score,sem_test_score
0,1,0.867216,0.001743
1,3,0.877176,0.001706
2,5,0.882039,0.000639
3,7,0.885176,0.001346
4,9,0.886667,0.001669
5,11,0.888235,0.001429
6,13,0.88698,0.00129
7,15,0.887608,0.001057
8,17,0.888784,0.001111
9,19,0.89051,0.001362


In [45]:
accuracy = alt.Chart(accuracies_grid).mark_line(point=True).encode(
    x = alt.X("n_neighbors"),
    y = alt.Y("mean_test_score"),
)
accuracy