In [None]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the diabetes dataset as plain arrays and hold out 20% of the rows
# for testing; random_state pins the shuffle so the split is repeatable.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target  # feature matrix, target vector
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each piece of the split.
for label, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, part.shape)

X_train shape: (353, 10)
X_test shape: (89, 10)
y_train shape: (353,)
y_test shape: (89,)


In [None]:
# Reload the dataset, this time as a pandas DataFrame that carries the
# feature columns together with the 'target' column.
data = load_diabetes(as_frame=True)
df = data.frame

# Peek at the first rows to check the columns and value ranges.
print(df.head())

# Features: every column except 'target'.
X = df.drop(columns='target')
# Binary label: 1 when the target is strictly above its median, otherwise 0.
y = df['target'].gt(df['target'].median()).astype(int)




        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  target  
0 -0.002592  0.019907 -0.017646   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  


In [None]:
# Hold out 30% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize the features for KNN. The scaler is fitted on the training
# rows only and then applied to both splits, so no test-set statistics
# leak into the model.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit a KNN classifier that votes among the 4 nearest training points.
# NOTE(review): an earlier comment here said "k=5" while the code uses 4;
# an even k can also produce tied votes with two classes - confirm the
# intended value.
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)

# Predicted class for every held-out row.
y_pred = knn.predict(X_test)

In [None]:
# Fraction of held-out rows whose predicted class equals the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy knn: {accuracy:.2f}")

Accuracy knn: 0.71


In [None]:
# Cross-tabulate true classes (rows) against predicted classes (columns)
# for the current y_test / y_pred pair.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix knn:\n", conf_matrix)

Confusion Matrix knn:
 [[59 13]
 [26 35]]


In [None]:
# Rebuild the features and the binary label from the DataFrame:
# y is 1 when the target is strictly above its median, otherwise 0.
X = df.drop(columns='target')
y = df['target'].gt(df['target'].median()).astype(int)

# Hold out 30% of the rows for testing.
# NOTE(review): this uses random_state=50 while the KNN cell uses 42, so
# the two models are scored on different test rows - confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=50
)

# Standardize the features for logistic regression; the scaler is fitted
# on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Train a logistic regression model with the library's default settings.
log_reg = LogisticRegression().fit(X_train, y_train)

# Predicted class for every held-out row.
y_pred = log_reg.predict(X_test)

# Fraction of held-out rows classified correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy lr: {accuracy:.2f}")



Accuracy lr: 0.72


In [None]:
# Cross-tabulate true classes (rows) against predicted classes (columns)
# for the logistic regression predictions held in y_pred.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix lr:\n", conf_matrix)

Confusion Matrix lr:
 [[51 13]
 [24 45]]


In [None]:
# Rebuild the scaled train/test split of the KNN baseline instead of
# inheriting whatever X_train / X_test the previously executed cell left
# behind. The logistic-regression cell rebinds those names to a
# random_state=50 split, so without this step a top-to-bottom run would
# score "after PCA" on different rows than the KNN baseline
# (random_state=42), and the result would depend on cell execution order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # scaling statistics come from training rows only
X_test = scaler.transform(X_test)

# Reduce the standardized features to their leading principal components.
# PCA is fitted on the training rows only and then applied to the test rows.
pca = PCA(n_components=5)  # Start with 5 principal components, can be adjusted
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train the KNN model on reduced data
# NOTE(review): the baseline KNN cell uses n_neighbors=4 while this one uses 5,
# so the comparison changes k as well as the feature space - confirm intended.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_pca, y_train)

# Predict on the test data
y_pred = knn.predict(X_test_pca)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy after PCA: {accuracy:.2f}")


Accuracy after PCA: 0.62


In [None]:
# Cross-tabulate true classes (rows) against predicted classes (columns)
# for the predictions of the KNN model trained on PCA-reduced features.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix after PCA:\n", conf_matrix)

Confusion Matrix after PCA:
 [[52 20]
 [18 43]]
