In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import altair as alt
from sklearn.model_selection import cross_val_score

# Disables maximum rows allowed for altair plots
# alt.data_transformers.disable_max_rows()
# Uncomment below to re-enable max rows
# alt.data_transformers.enable('default', max_rows=5000)

In [None]:
# Load the diabetes dataset from a publicly shared Google Drive file.
# The "view" share link is not a direct download, so the file id (the
# second-to-last path segment) is extracted and appended to the `uc?id=` endpoint.
share_url = "https://drive.google.com/file/d/1dTmTAiRGM5skZzMb9NwpOkcQrY0Dpq6t/view?usp=sharing"
url = 'https://drive.google.com/uc?id=' + share_url.split('/')[-2]
diabetes = pd.read_csv(url)  # read data

display(diabetes)
# DataFrame.info() prints its summary and returns None, so it is called directly
# instead of being wrapped in display(), which would render a stray "None".
diabetes.info()
# Show the distribution of the classification variable as proportions.
diabetes["diabetes"].value_counts(normalize=True)

Next we need to resample the data to create an even distribution of positive and negative labels.

In [None]:
np.random.seed(1)  # set seed; resample() below is called without random_state

# Split the rows by label so the two classes can be balanced.
diabetes_negative = diabetes[diabetes["diabetes"] == 0]
diabetes_positive = diabetes[diabetes["diabetes"] == 1]

# Downsample the negative class to the size of the positive class.
# replace=False matters: resample() samples WITH replacement by default, which
# would put duplicated rows into the balanced set, and those duplicates could
# then end up on both sides of the later train/test split.
# NOTE(review): assumes there are at least as many negative rows as positive
# rows; with replace=False, resample() raises ValueError otherwise.
diabetes_negative_downscaled = resample(
    diabetes_negative, replace=False, n_samples=diabetes_positive.shape[0]
)
# Sanity check: both classes now contribute the same number of rows.
assert diabetes_negative_downscaled.shape[0] == diabetes_positive.shape[0]

diabetes_downsampled = pd.concat((diabetes_positive, diabetes_negative_downscaled))
display(diabetes_downsampled["diabetes"].value_counts(normalize=True))
diabetes_downsampled.info()

Now that the data is resampled we can create the train/test split

In [None]:
# Split the balanced data 75% / 25% into training and test sets. Stratifying on
# the label keeps the class proportions equal in both parts, and random_state
# makes the split repeatable.
diabetes_train, diabetes_test = train_test_split(
    diabetes_downsampled,
    train_size=.75,
    stratify=diabetes_downsampled["diabetes"],
    random_state=42,
)
# DataFrame.info() prints its summary and returns None, so it is not wrapped in
# display() (which would render a stray "None").
diabetes_train.info()
diabetes_train["diabetes"].value_counts(normalize=True)

Now we filter the data to the numeric columns to aggregate and observe trends, and then do the same with categorical values using ``value_counts``

In [None]:
# Columns left out of the mean / standard-deviation summary.
# NOTE(review): `heart_disease` is not in this list although a later cell treats
# it as a binary feature alongside `hypertension` — confirm it should be summarised.
excluded_columns = ["gender", "hypertension", "smoking_history", "diabetes"]

diabetes_stats_downsample = diabetes_downsampled.drop(columns=excluded_columns)
diabetes_stats = diabetes_train.drop(columns=excluded_columns)

# Average and variability of the remaining columns: first for the whole
# balanced set, then for the training split only.
for summary_source in (diabetes_stats_downsample, diabetes_stats):
    display(summary_source.agg(["mean", "std"]))

# Category proportions for the excluded columns (currently disabled):
#display(diabetes["gender"].value_counts(normalize = True))
#display(diabetes["hypertension"].value_counts(normalize = True))
#display(diabetes["smoking_history"].value_counts(normalize = True))

Next, we'll make the preprocessor to use in K-Nearest Neighbors (KNN) classification.

In [None]:
# The four columns that are standardised and fed to the classifier.
feature_names = [
    "age",
    "bmi",
    "HbA1c_level",
    "blood_glucose_level",
]

# Column transformer with a single step: StandardScaler applied to `feature_names`.
scaler_step = (StandardScaler(), feature_names)
diabetes_preprocessor = make_column_transformer(scaler_step)
diabetes_preprocessor

In [None]:
# Fit the preprocessor and transform the same frame in one step, then wrap the
# resulting array in a DataFrame labelled with the scaled feature names.
# NOTE(review): this fits on the full `diabetes` frame rather than on the
# training split — confirm that is intended for this preview.
diabetes_scaled = diabetes_preprocessor.fit_transform(diabetes)
diabetes_scaled_df = pd.DataFrame(data=diabetes_scaled, columns=feature_names)
diabetes_scaled_df

Using our preprocessor, we can train a new model.

In [None]:
# Training inputs: the four numeric features and the target label.
X_train = diabetes_train[["age", "bmi", "HbA1c_level", "blood_glucose_level"]]
y_train = diabetes_train["diabetes"]

# Baseline model: preprocessing followed by KNN with its default settings.
diabetes_pipe_knn = make_pipeline(diabetes_preprocessor, KNeighborsClassifier())

# 5-fold cross-validated accuracy on the training data.
cv_scores = cross_val_score(
    diabetes_pipe_knn,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
)

# Spread of the per-fold accuracies.
cv_scores_std = cv_scores.std()
print("average cv score:", cv_scores.mean(), "±", cv_scores_std)

Here, we see a score of __ . This is fine, however let's see if we can improve our score by tuning the `n_neighbors` hyperparameter in `KNeighborsClassifier()`.

### Hyperparameter Optimization
Now, we'll make a GridSearch CV object and a range of potential K values to find the best K value

In [None]:
# Number of cross-validation folds; also used for the standard error below so
# the two can never disagree.
n_folds = 5

# Candidate K values: odd numbers from 1 to 59.
knn_param_grid = {
    "kneighborsclassifier__n_neighbors": range(1, 60, 2),
}

diabetes_pipe = make_pipeline(diabetes_preprocessor, KNeighborsClassifier())
# `diabetes_grid` now only ever names the GridSearchCV object (the parameter
# dictionary has its own name above).
diabetes_grid = GridSearchCV(
    estimator=diabetes_pipe,
    param_grid=knn_param_grid,
    cv=n_folds,
)
diabetes_grid.fit(
    diabetes_train[["age", "bmi", "HbA1c_level", "blood_glucose_level"]],
    diabetes_train["diabetes"],
)

# Keep K, the mean fold accuracy, and the standard error of that mean.
# The standard error is std / sqrt(number of folds); dividing by sqrt(10) with
# 5 folds would understate it.
accuracies_grid = (
    pd.DataFrame(diabetes_grid.cv_results_)[[
        "param_kneighborsclassifier__n_neighbors",
        "mean_test_score",
        "std_test_score",
    ]]
    .assign(sem_test_score=lambda results: results["std_test_score"] / np.sqrt(n_folds))
    .rename(columns={"param_kneighborsclassifier__n_neighbors": "n_neighbors"})
    .drop(columns=["std_test_score"])
)
accuracies_grid

In [None]:
# Line chart (with point markers) of mean cross-validation score against K.
x_encoding = alt.X("n_neighbors")
# zero=False lets the y-axis start near the data instead of at 0.
y_encoding = alt.Y(
    "mean_test_score",
    title='Mean Test Score',
    scale=alt.Scale(zero=False),
)

accuracy = (
    alt.Chart(accuracies_grid)
    .mark_line(point=True)
    .encode(x=x_encoding, y=y_encoding)
    .properties(
        title='Ideal n_neighbors value based off Mean Test Score',
        width=600,
    )
)
accuracy

Based on `accuracies_grid` and our accuracy plot, we can see that `KNeighborsClassifier()` performs best when `n_neighbors = 31`, judged by mean test score. Although `n_neighbors = 31` has a high `sem_test_score`, meaning there may be more variability in its mean test score, it will likely perform better than all other `n_neighbors` values we tried (odd values from 1 to 59). There is a possibility of a better `n_neighbors` value beyond this range; however, we are unable to determine such a value due to limited computational power.

In [None]:
# Training inputs: the four numeric features and the target label.
X_train = diabetes_train[["age", "bmi", "HbA1c_level", "blood_glucose_level"]]
y_train = diabetes_train["diabetes"]

# Same preprocessing as before, with K fixed at 31.
diabetes_pipe_knn_31 = make_pipeline(
    diabetes_preprocessor,
    KNeighborsClassifier(n_neighbors=31),
)

# 5-fold cross-validated accuracy on the training data.
cv_scores = cross_val_score(
    diabetes_pipe_knn_31,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
)

# Spread of the per-fold accuracies.
cv_scores_std = cv_scores.std()
print("average cv score:", cv_scores.mean(), "±", cv_scores_std)

After performing 5-fold cross-validation (`cv=5`), our model has a mean accuracy of 0.89655 with a standard deviation of 0.00167 across folds. The high accuracy suggests that our model makes correct predictions most of the time, and the low standard deviation indicates that its performance is stable from fold to fold; on its own, however, this does not rule out underfitting or overfitting.

### Feature Selection
So far, we have found a feature set that allows us to train a relatively decent model. However, we would like to see whether there is another pool of features which may produce a more accurate model. To do this, we will separate our data into numerical features, binary features, and categorical features. We will also drop 'gender', as it may introduce bias into our final model. We could choose to represent 'gender' as a binary feature or an ordinal feature, but we believe it is best not to include it at all.

We will pass all numerical features through `StandardScaler()` as before, but this time we will pass categorical features (`smoking_history`) through `OrdinalEncoder()`, as we choose to interpret smoking history as different levels of smoking intensity (never < former < current). Note that `OrdinalEncoder()` orders categories alphabetically by default, so this ordering only holds if it is passed explicitly through the `categories` argument. We would also like to note that 'no info' may affect our results, and using `OneHotEncoder()` is another option to consider in the future.

As for our binary features, we will use 'passthrough' as they do not need any additional processing to be used in `KNeighborsClassifier()`.

Lastly, we need to consider that the `n_neighbors=31` hyperparameter we found previously is specific to the feature set containing only numerical features. Thus, we will use the default value of `n_neighbors=5` in our `KNeighborsClassifier()`.

In [None]:
# Feature matrices use every column except the target and `gender`.
X_train = diabetes_train.drop(columns=['diabetes', 'gender'])
y_train = diabetes_train['diabetes']
# The held-out frames come from diabetes_test, not from the training split.
X_test = diabetes_test.drop(columns=['diabetes', 'gender'])
y_test = diabetes_test['diabetes']

numerical_features = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
binary_features = ['hypertension', 'heart_disease'] ## No need to perform any preprocessing
categorical_features = ['smoking_history', ]

# NOTE(review): OrdinalEncoder() is used without `categories`, so the integer
# codes follow the encoder's default category order rather than an explicit
# smoking-intensity order — confirm this is acceptable.
preprocessor_all = make_column_transformer(
    (StandardScaler(), numerical_features),
    (OrdinalEncoder(), categorical_features),
    ('passthrough', binary_features)
)

# Fit the preprocessing on the training data only, then apply those fitted
# parameters to the test data. Re-fitting on the test frame would scale and
# encode it with its own statistics.
# NOTE(review): transform() raises if the test split holds a `smoking_history`
# value that is absent from the training split.
X_train_transformed = preprocessor_all.fit_transform(X_train)
X_test_transformed = preprocessor_all.transform(X_test)
y_train_transformed = y_train
y_test_transformed = y_test

pipe = make_pipeline(preprocessor_all, KNeighborsClassifier())

# 5-fold cross-validated accuracy on the training data.
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')

cv_scores_std = np.std(cv_scores)
print("average cv score:", cv_scores.mean(), "±", cv_scores_std)

Our score has decreased from our previous model. Let's see if we can select an ideal combination of features by manually sifting through all combinations and finding the set with the best score.

In [None]:
from sklearn.model_selection import cross_val_score
from itertools import combinations

## Used ChatGPT to help iterate over all possible combinations of features as to select the top ten feature sets with the greatest score

def score_feature_combinations(X, y, model, preprocessors):
    """Cross-validate `model` on every non-empty subset of the columns of `X`.

    For each subset a column transformer is built that scales the subset's
    numerical columns, ordinal-encodes its categorical columns and passes its
    binary columns through unchanged. Column roles are looked up in the
    module-level lists `numerical_features`, `categorical_features` and
    `binary_features`.

    Parameters
    ----------
    X : DataFrame whose columns are the candidate features.
    y : target labels aligned with `X`.
    model : estimator placed after the column transformer in each pipeline.
    preprocessors : accepted but not used by this function.

    Returns
    -------
    list of (feature_combination, mean_accuracy) tuples, sorted from highest
    to lowest mean 5-fold cross-validation accuracy.
    """
    def _members(candidates, group):
        # Columns of this combination that belong to the given feature group.
        return [name for name in candidates if name in group]

    candidate_columns = X.columns
    results = []

    for subset_size in range(1, len(candidate_columns) + 1):
        for feature_combination in combinations(candidate_columns, subset_size):
            column_transformer = make_column_transformer(
                (StandardScaler(), _members(feature_combination, numerical_features)),
                (OrdinalEncoder(), _members(feature_combination, categorical_features)),
                ('passthrough', _members(feature_combination, binary_features)),
            )
            candidate_pipe = make_pipeline(column_transformer, model)

            fold_scores = cross_val_score(candidate_pipe, X, y, cv=5, scoring='accuracy')
            results.append((feature_combination, fold_scores.mean()))

    # Best-scoring combinations first.
    return sorted(results, key=lambda entry: entry[1], reverse=True)

# Score every feature combination with a default KNN classifier.
knn_model = KNeighborsClassifier()
feature_combination_scores = score_feature_combinations(
    X_train,
    y_train,
    knn_model,
    preprocessor_all,
)

# Report the ten best-scoring combinations (the list is already sorted).
top_feature_sets = feature_combination_scores[:10]
for feature_combination, score in top_feature_sets:
    print(f"Feature Combination: {feature_combination}, Score: {score}")

The feature combination of `'age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level'` produces a model with a score of 0.89098, slightly outperforming both the model which considers all features (0.88988) and the model which considers only numerical features with hyperparameter optimization (0.88306).

There is other information we can infer from our feature combinations. `age`, `blood_glucose_level` and `HbA1c_level` seem to be the most important features to consider when predicting whether a patient has diabetes, as these three features appear in all of the top ten feature combinations. However, the addition of `hypertension`, `bmi` and/or `smoking_history` is necessary to help improve the accuracy of our models.

In [None]:
# Feature matrices for the selected feature set: everything except the target,
# `gender` and `smoking_history`.
X_train_opt = diabetes_train.drop(columns=['diabetes', 'gender', 'smoking_history'])
y_train_opt = diabetes_train['diabetes']
# The held-out frames come from diabetes_test, not from the training split.
X_test_opt = diabetes_test.drop(columns=['diabetes', 'gender', 'smoking_history'])
y_test_opt = diabetes_test['diabetes']

numerical_features_opt = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
binary_features_opt = ['hypertension', 'heart_disease'] ## No need to perform any preprocessing

preprocessor_opt = make_column_transformer(
    (StandardScaler(), numerical_features_opt),
    ('passthrough', binary_features_opt)
)

# Fit the scaler on the training data only and reuse those parameters for the
# test data; re-fitting on the test frame would scale it with its own statistics.
X_train_transformed_opt = preprocessor_opt.fit_transform(X_train_opt)
X_test_transformed_opt = preprocessor_opt.transform(X_test_opt)
y_train_transformed_opt = y_train_opt
y_test_transformed_opt = y_test_opt

pipe_opt = make_pipeline(preprocessor_opt, KNeighborsClassifier())

# 5-fold cross-validated accuracy on the training data.
cv_scores_opt = cross_val_score(pipe_opt, X_train_opt, y_train_opt, cv=5, scoring='accuracy')

# Spread of THIS model's fold scores (cv_scores_opt, not the earlier cv_scores).
cv_scores_opt_std = np.std(cv_scores_opt)
print("average cv score:", cv_scores_opt.mean(), "±", cv_scores_opt_std)

To further improve our model accuracy, we could perform hyperparameter optimization on `n_neighbors`. However, due to time constraints, we will continue working with our model using the default value of `n_neighbors=5`.

TODO:  compare test score with numerical features only model, use model on new observation(s), write a conclusion

In [None]:
#TODO: test scores for numerical features only, knn=31, and for optimized feature set

# Select the training columns explicitly instead of relying on whatever
# X_train / y_train were last assigned by an earlier cell, so that the model is
# fitted and scored on frames with the same four columns.
numeric_feature_columns = ["age", "bmi", "HbA1c_level", "blood_glucose_level"]
diabetes_pipe_knn_31.fit(
    diabetes_train[numeric_feature_columns],
    diabetes_train["diabetes"],
)

X_test = diabetes_test[numeric_feature_columns]
y_test = diabetes_test["diabetes"]

# Accuracy of the fitted K=31 pipeline on the held-out test split.
test_score = diabetes_pipe_knn_31.score(X_test, y_test)
print("Test set accuracy:", test_score)

In [None]:
# TODO: new observation(s) for both numerical features only, and for optimized feature set, discuss findings and results

# new_observation = pd.DataFrame({
#     "age": [26.0],  
#     "bmi": [25.5],
#     "HbA1c_level": [1.8],
#     "blood_glucose_level": [170.0]
# })

# knn_fit = make_pipeline(diabetes_preprocessor, KNeighborsClassifier(n_neighbors=33)).fit(
#     diabetes_downsampled[feature_names],
#     diabetes_downsampled["diabetes"]
# )

# new_prediction = knn_fit.predict(new_observation)
# new_prediction