In [1]:
# Part 1: Import Libraries and Load Data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
df = pd.read_csv('diabetes_prediction_dataset.csv')

# Display the first few rows of the dataset
print(df.head())

# Part 2: Data Preprocessing
# Handle missing data
df = df.dropna()  # Dropping rows with missing values for simplicity

# Encode categorical variables
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])
df['hypertension'] = le.fit_transform(df['hypertension'])
df['heart_disease'] = le.fit_transform(df['heart_disease'])
df['smoking_history'] = le.fit_transform(df['smoking_history'])

# Prepare data for classification
X = df.drop(['diabetes', 'blood_glucose_level'], axis=1)  # Exclude both target and blood glucose level for features
y = df['diabetes']  # Target variable

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Display processed features and target
print("Features and target prepared.")

# Part 3: Train-Test Split
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("Train-test split completed.")

# Part 4: Model Training with k-NN
# Initialize k-NN classifier with k neighbors (e.g., k=5)
k = 5
knn = KNeighborsClassifier(n_neighbors=k)

# Train k-NN classifier
knn.fit(X_train, y_train)
print("k-NN model training completed.")

# Part 5: Model Evaluation
# Make predictions on the test set
y_pred = knn.predict(X_test)

# Evaluate the k-NN model
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Part 6: Predict for New Data
# Prepare new data for prediction (without the blood glucose level)
new_data = pd.DataFrame({
    'gender': [1],  # Male
    'age': [45],
    'hypertension': [1],  # Yes
    'heart_disease': [0],  # No
    'smoking_history': [0],  # No
    'bmi': [27.5],
    'HbA1c_level': [6.2]
})

# Preprocess and scale the new data using the same scaler
new_data_scaled = scaler.transform(new_data)  # No need to drop blood glucose as it's not in new_data

# Predict diabetes for the new data
diabetes_prediction = knn.predict(new_data_scaled)
print("Diabetes Prediction (1=Yes, 0=No):", diabetes_prediction)

# Example case where diabetes is likely to be 1
new_data_high_risk = pd.DataFrame({
    'gender': [1],  # Male
    'age': [60],
    'hypertension': [1],  # Yes
    'heart_disease': [1],  # Yes
    'smoking_history': [0],  # No
    'bmi': [30.5],  # Obese
    'HbA1c_level': [7.5]  # Poor blood sugar control
})

# Preprocess and scale the new data using the same scaler
new_data_high_risk_scaled = scaler.transform(new_data_high_risk)

# Predict diabetes for the new high-risk data
diabetes_prediction_high_risk = knn.predict(new_data_high_risk_scaled)
print("Diabetes Prediction for High-Risk Case (1=Yes, 0=No):", diabetes_prediction_high_risk)


   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  
Features and target prepared.
Train-test split completed.
k-NN model training completed.
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     18292
           1       0.77      0.45      0.57      1708

    accu