In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.discriminant_analysis import StandardScaler
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [2]:
# Read in the data
df = pd.read_csv('/Users/shahadaleissa/hyper_code/Dataset/cleaned_hypertension_data.csv')

In [3]:
def splitting_data(df, sampling):
    if sampling == 'none':
        X = df.drop(['Class'], axis=1)
        y = df['Class']
        return X, y
    elif sampling == 'SMOTEENN':
        smote_enn = SMOTEENN(random_state=0)
        X = df.drop(['Class'], axis=1)
        y = df['Class']
        X_resampled, y_resampled = smote_enn.fit_resample(X, y)
        return X_resampled, y_resampled
    elif sampling == 'SMOTE':
        smote = SMOTE(random_state=0)
        X = df.drop(['Class'], axis=1)
        y = df['Class']
        X_resampled, y_resampled = smote.fit_resample(X, y)
        return X_resampled, y_resampled
    elif sampling == 'under':
        rus = RandomUnderSampler(random_state=0)
        X = df.drop(['Class'], axis=1)
        y = df['Class']
        X_resampled, y_resampled = rus.fit_resample(X, y)
        return X_resampled, y_resampled
        

In [4]:
def training(X_train, y_train, k):
    # Create a KNN classifier with 5 neighbors
    knn = KNeighborsClassifier(n_neighbors=k)
    # Fit the classifier to the data
    knn.fit(X_train, y_train)
    return knn

In [5]:
def predict(knn, X_test ,y_test):
    # Predict the labels for the training data X
    y_pred = knn.predict(X_test)
    cr=classification_report(y_test, y_pred)
    print(cr)

In [6]:
def optimize(X,y):
    k_values = [i for i in range (1,31)]
    scores = []

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        score = cross_val_score(knn, X, y, cv=5)
        scores.append(np.mean(score))

    best_index = np.argmax(scores)
    best_k = k_values[best_index]

    return best_k, scores
        

In [7]:
def optimize_with_grid(X_train,y_train):
    knn = KNeighborsClassifier()
    param_grid = {'n_neighbors': np.arange(1, 31)}
    knn_cv = GridSearchCV(knn, param_grid, cv=5)
    knn_cv.fit(X_train, y_train)
    print(knn_cv.best_params_)
    print(knn_cv.best_score_)
    return knn_cv.best_estimator_

<h1> KNN on original data with optimization </h1>

In [8]:
# using function with no sampling 
X, y= splitting_data(df, 'none')
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
# Scale the features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

Number of observations in each class in the training set:
Class
0    1013
1     987
Name: count, dtype: int64


In [10]:
knn1 =training(X_train, y_train, 5)
y_pred = predict(knn1, X_test, y_test)

              precision    recall  f1-score   support

           0       0.68      0.84      0.75       191
           1       0.81      0.63      0.71       209

    accuracy                           0.73       400
   macro avg       0.75      0.74      0.73       400
weighted avg       0.75      0.73      0.73       400



In [11]:
best_k, scores = optimize(X,y)
knn2 = training(X_train, y_train, best_k)
prediction = predict(knn2, X_test, y_test)

              precision    recall  f1-score   support

           0       0.71      0.92      0.80       191
           1       0.90      0.66      0.76       209

    accuracy                           0.78       400
   macro avg       0.80      0.79      0.78       400
weighted avg       0.81      0.78      0.78       400



In [12]:
best_knn = optimize_with_grid(X_train, y_train)
prediction = predict(best_knn, X_test, y_test)

{'n_neighbors': 21}
0.76625
              precision    recall  f1-score   support

           0       0.71      0.91      0.80       191
           1       0.89      0.67      0.76       209

    accuracy                           0.78       400
   macro avg       0.80      0.79      0.78       400
weighted avg       0.80      0.78      0.78       400



<h1> KNN using SMOTE sampling </h1>

In [13]:
X,y = splitting_data(df, 'SMOTE')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

Number of observations in each class in the training set:
Class
1    1013
0    1013
Name: count, dtype: int64


In [15]:
knn3 =training(X_train, y_train, 5)
y_pred = predict(knn3, X_test, y_test)

              precision    recall  f1-score   support

           0       0.74      0.82      0.78       211
           1       0.78      0.69      0.73       195

    accuracy                           0.76       406
   macro avg       0.76      0.75      0.75       406
weighted avg       0.76      0.76      0.75       406



In [16]:
best_k, scores = optimize(X,y)
knn4 = training(X_train, y_train, best_k)
prediction = predict(knn4, X_test, y_test)

              precision    recall  f1-score   support

           0       0.75      0.89      0.81       211
           1       0.85      0.69      0.76       195

    accuracy                           0.79       406
   macro avg       0.80      0.79      0.79       406
weighted avg       0.80      0.79      0.79       406



In [17]:
best_knn = optimize_with_grid(X_train, y_train)
prediction = predict(best_knn, X_test, y_test)

{'n_neighbors': 15}
0.7666666666666666
              precision    recall  f1-score   support

           0       0.77      0.88      0.82       211
           1       0.84      0.71      0.77       195

    accuracy                           0.80       406
   macro avg       0.81      0.79      0.80       406
weighted avg       0.80      0.80      0.80       406



<h1> KNN using SMOTEENN sampling </h1>

In [18]:
X,y = splitting_data(df, 'SMOTEENN')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
Class
1    151
0    137
Name: count, dtype: int64


In [20]:
knn5 =training(X_train, y_train, 5)
y_pred = predict(knn5, X_test, y_test)

              precision    recall  f1-score   support

           0       0.64      0.72      0.68        25
           1       0.77      0.70      0.73        33

    accuracy                           0.71        58
   macro avg       0.70      0.71      0.70        58
weighted avg       0.71      0.71      0.71        58



In [21]:
best_k, scores = optimize(X,y)
knn6 = training(X_train, y_train, best_k)
prediction = predict(knn6, X_test, y_test)

              precision    recall  f1-score   support

           0       0.63      0.76      0.69        25
           1       0.79      0.67      0.72        33

    accuracy                           0.71        58
   macro avg       0.71      0.71      0.71        58
weighted avg       0.72      0.71      0.71        58



In [22]:
best_knn = optimize_with_grid(X_train, y_train)
prediction = predict(best_knn, X_test, y_test)

{'n_neighbors': 21}
0.7478260869565218
              precision    recall  f1-score   support

           0       0.67      0.80      0.73        25
           1       0.82      0.70      0.75        33

    accuracy                           0.74        58
   macro avg       0.74      0.75      0.74        58
weighted avg       0.75      0.74      0.74        58



<h1> KNN on Random undersampling </h1>

In [23]:
X,y = splitting_data(df, 'under')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
Class
0    987
1    987
Name: count, dtype: int64


In [25]:
knn7 =training(X_train, y_train, 5)
y_pred = predict(knn7, X_test, y_test)

              precision    recall  f1-score   support

           0       0.74      0.82      0.78       200
           1       0.80      0.70      0.75       195

    accuracy                           0.76       395
   macro avg       0.77      0.76      0.76       395
weighted avg       0.77      0.76      0.76       395



In [26]:
best_k, scores = optimize(X,y)
knn8 = training(X_train, y_train, best_k)
prediction = predict(knn8, X_test, y_test)

              precision    recall  f1-score   support

           0       0.76      0.88      0.81       200
           1       0.85      0.71      0.77       195

    accuracy                           0.79       395
   macro avg       0.80      0.79      0.79       395
weighted avg       0.80      0.79      0.79       395



In [27]:
best_knn = optimize_with_grid(X_train, y_train)
prediction = predict(best_knn, X_test, y_test)

{'n_neighbors': 13}
0.7643982318665863
              precision    recall  f1-score   support

           0       0.75      0.85      0.80       200
           1       0.82      0.71      0.76       195

    accuracy                           0.78       395
   macro avg       0.79      0.78      0.78       395
weighted avg       0.79      0.78      0.78       395

