In [1]:
# Part 1: Import Libraries and Load Data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read the diabetes dataset (the CSV is expected in the current working directory)
df = pd.read_csv(filepath_or_buffer='diabetes_prediction_dataset.csv')

# Preview the top five rows (five is also head()'s default)
print(df.head(n=5))


   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  


In [2]:
# Part 2: Data Preprocessing
# Handle missing data
df = df.dropna()  # Dropping rows with missing values for simplicity

# Encode categorical variables.
# Every column gets its OWN fitted LabelEncoder, kept in `encoders`, so the
# string -> integer mapping of each column stays available afterwards
# (e.g. encoders['gender'].classes_ or encoders['gender'].transform([...])).
# Refitting one shared encoder for all columns would discard every mapping
# except the last one.
# LabelEncoder numbers categories in sorted order of their values, so for a
# multi-valued column such as smoking_history the codes are NOT "0 = no,
# 1 = yes"; always look codes up through the fitted encoder.
CATEGORICAL_COLUMNS = ['gender', 'hypertension', 'heart_disease', 'smoking_history']
encoders = {}
for column in CATEGORICAL_COLUMNS:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le
# After the loop `le` is bound to the smoking_history encoder (the last column).

# Prepare data for classification
X = df.drop(['diabetes', 'blood_glucose_level'], axis=1)  # Exclude both target and blood glucose level for features
y = df['diabetes']  # Target variable

# Scale features.
# The scaler is fitted ONLY on the rows that end up in the training set, so
# the mean/variance used for scaling carry no information from the test rows.
# train_test_split picks rows from the row count and the seed alone, so calling
# it here with the same test_size/random_state as the train-test split cell
# selects exactly the same training rows.
# Keep these two arguments in sync with that cell.
X_fit = train_test_split(X, test_size=0.2, random_state=42)[0]
scaler = StandardScaler()
scaler.fit(X_fit)

# Transform every row with the training-set statistics; the next cell splits X_scaled.
X_scaled = scaler.transform(X)

# Display processed features and target
print("Features and target prepared.")


Features and target prepared.


In [3]:
# Part 3: Train-Test Split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    random_state=42,
    test_size=0.2,
)

print("Train-test split completed.")


Train-test split completed.


In [4]:
# Part 4: Model Training
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (default hyperparameters) on the training split.
# fit() returns the estimator itself, so construction and training chain into one expression.
clf = LogisticRegression().fit(X_train, y_train)

print("Model training completed.")


Model training completed.


In [5]:
# Part 5: Model Evaluation
from sklearn.metrics import classification_report, accuracy_score

# Score the held-out rows with the fitted classifier
y_pred = clf.predict(X_test)

# Report per-class precision / recall / F1, followed by overall accuracy
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))
print("Accuracy Score:", accuracy_score(y_true=y_test, y_pred=y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     18292
           1       0.84      0.43      0.57      1708

    accuracy                           0.94     20000
   macro avg       0.90      0.71      0.77     20000
weighted avg       0.94      0.94      0.94     20000

Accuracy Score: 0.94465


In [6]:
# Part 6: Predict for New Data
# Categorical features must be supplied as the integer codes produced during
# preprocessing. `le` was last fitted on smoking_history in that cell, so the
# code is looked up through it instead of being hard-coded: LabelEncoder
# numbers categories in sorted order, so code 0 is NOT "never smoked" (the
# data preview already shows 'No Info' and 'current', which both sort before
# 'never').
smoking_never = int(le.transform(['never'])[0])

# Prepare new data for prediction (without the blood glucose level).
# Column names and order must match the features the scaler was fitted on.
new_data = pd.DataFrame({
    'gender': [1],  # Male -- NOTE(review): assumes sorted label order Female=0, Male=1; confirm against the data
    'age': [45],
    'hypertension': [1],  # Yes
    'heart_disease': [0],  # No
    'smoking_history': [smoking_never],  # Never smoked
    'bmi': [27.5],
    'HbA1c_level': [6.2]
})

# Preprocess and scale the new data using the same scaler
new_data_scaled = scaler.transform(new_data)  # No need to drop blood glucose as it's not in new_data

# Predict diabetes for the new data
diabetes_prediction = clf.predict(new_data_scaled)
print("Diabetes Prediction (1=Yes, 0=No):", diabetes_prediction)


Diabetes Prediction (1=Yes, 0=No): [0]


In [7]:
# Example case where diabetes is likely to be 1
# smoking_history must be given as the encoder's integer code. `le` was last
# fitted on smoking_history during preprocessing, so look the code up through
# it: LabelEncoder numbers categories in sorted order, which means code 0 is
# NOT "never smoked" ('No Info' and 'current' both sort before 'never').
smoking_never = int(le.transform(['never'])[0])

new_data_high_risk = pd.DataFrame({
    'gender': [1],  # Male -- NOTE(review): assumes sorted label order Female=0, Male=1; confirm against the data
    'age': [60],
    'hypertension': [1],  # Yes
    'heart_disease': [1],  # Yes
    'smoking_history': [smoking_never],  # Never smoked
    'bmi': [30.5],  # Obese
    'HbA1c_level': [7.5]  # Poor blood sugar control
})

# Preprocess and scale the new data using the same scaler
new_data_high_risk_scaled = scaler.transform(new_data_high_risk)

# Predict diabetes for the new high-risk data
diabetes_prediction_high_risk = clf.predict(new_data_high_risk_scaled)
print("Diabetes Prediction for High-Risk Case (1=Yes, 0=No):", diabetes_prediction_high_risk)


Diabetes Prediction for High-Risk Case (1=Yes, 0=No): [1]


In [8]:
# Scenario set: four hand-built patients scored with the fitted scaler + classifier.
#
# smoking_history must be given as the encoder's integer code. `le` was last
# fitted on smoking_history during preprocessing, so the codes are looked up
# through it rather than hard-coded: LabelEncoder numbers categories in sorted
# order, so code 0 is NOT "never smoked" ('No Info' and 'current' both sort
# before 'never').
smoking_never = int(le.transform(['never'])[0])
smoking_current = int(le.transform(['current'])[0])
# NOTE(review): gender codes below assume sorted label order Female=0, Male=1;
# confirm against the data.

# Case 1: Middle-Aged Obese Female
case_1 = pd.DataFrame({
    'gender': [0],  # Female
    'age': [55],
    'hypertension': [1],  # Yes
    'heart_disease': [0],  # No
    'smoking_history': [smoking_never],  # Never smoked
    'bmi': [32.0],  # Obese
    'HbA1c_level': [8.0]  # Poor blood sugar control
})

# Case 2: 40-Year-Old Male Smoker with High HbA1c
case_2 = pd.DataFrame({
    'gender': [1],  # Male
    'age': [40],
    'hypertension': [1],  # Yes
    'heart_disease': [0],  # No
    'smoking_history': [smoking_current],  # Current smoker
    'bmi': [29.0],  # Overweight
    'HbA1c_level': [7.0]  # Elevated blood sugar
})

# Case 3: Older Female with High BMI and HbA1c
case_3 = pd.DataFrame({
    'gender': [0],  # Female
    'age': [65],
    'hypertension': [1],  # Yes
    'heart_disease': [1],  # Yes
    'smoking_history': [smoking_never],  # Never smoked
    'bmi': [34.0],  # Obese
    'HbA1c_level': [9.0]  # Very poor blood sugar control
})

# Case 4: Middle-Aged Male with Hypertension and Heart Disease
# (the feature set has no family-history column, so it cannot be modelled here)
case_4 = pd.DataFrame({
    'gender': [1],  # Male
    'age': [50],
    'hypertension': [1],  # Yes
    'heart_disease': [1],  # Yes
    'smoking_history': [smoking_never],  # Never smoked
    'bmi': [31.5],  # Obese
    'HbA1c_level': [7.8]  # Poor blood sugar control
})

# List of cases
cases = [case_1, case_2, case_3, case_4]

# Loop through cases to predict diabetes
for i, case in enumerate(cases):
    # Preprocess and scale the new data using the same scaler
    case_scaled = scaler.transform(case)

    # Predict diabetes for the new data
    diabetes_prediction = clf.predict(case_scaled)
    print(f"Diabetes Prediction for Case {i + 1} (1=Yes, 0=No):", diabetes_prediction)


Diabetes Prediction for Case 1 (1=Yes, 0=No): [1]
Diabetes Prediction for Case 2 (1=Yes, 0=No): [0]
Diabetes Prediction for Case 3 (1=Yes, 0=No): [1]
Diabetes Prediction for Case 4 (1=Yes, 0=No): [1]


In [9]:
# Scenario set, second run: identical to the previous scenario cell except
# that case_1 uses HbA1c_level 7.0 instead of 8.0.
#
# smoking_history must be given as the encoder's integer code. `le` was last
# fitted on smoking_history during preprocessing, so the codes are looked up
# through it rather than hard-coded: LabelEncoder numbers categories in sorted
# order, so code 0 is NOT "never smoked" ('No Info' and 'current' both sort
# before 'never').
smoking_never = int(le.transform(['never'])[0])
smoking_current = int(le.transform(['current'])[0])
# NOTE(review): gender codes below assume sorted label order Female=0, Male=1;
# confirm against the data.

# Case 1: Middle-Aged Obese Female
case_1 = pd.DataFrame({
    'gender': [0],  # Female
    'age': [55],
    'hypertension': [1],  # Yes
    'heart_disease': [0],  # No
    'smoking_history': [smoking_never],  # Never smoked
    'bmi': [32.0],  # Obese
    'HbA1c_level': [7.0]  # Elevated blood sugar (lowered from 8.0 for this run)
})

# Case 2: 40-Year-Old Male Smoker with High HbA1c
case_2 = pd.DataFrame({
    'gender': [1],  # Male
    'age': [40],
    'hypertension': [1],  # Yes
    'heart_disease': [0],  # No
    'smoking_history': [smoking_current],  # Current smoker
    'bmi': [29.0],  # Overweight
    'HbA1c_level': [7.0]  # Elevated blood sugar
})

# Case 3: Older Female with High BMI and HbA1c
case_3 = pd.DataFrame({
    'gender': [0],  # Female
    'age': [65],
    'hypertension': [1],  # Yes
    'heart_disease': [1],  # Yes
    'smoking_history': [smoking_never],  # Never smoked
    'bmi': [34.0],  # Obese
    'HbA1c_level': [9.0]  # Very poor blood sugar control
})

# Case 4: Middle-Aged Male with Hypertension and Heart Disease
# (the feature set has no family-history column, so it cannot be modelled here)
case_4 = pd.DataFrame({
    'gender': [1],  # Male
    'age': [50],
    'hypertension': [1],  # Yes
    'heart_disease': [1],  # Yes
    'smoking_history': [smoking_never],  # Never smoked
    'bmi': [31.5],  # Obese
    'HbA1c_level': [7.8]  # Poor blood sugar control
})

# List of cases
cases = [case_1, case_2, case_3, case_4]

# Loop through cases to predict diabetes
for i, case in enumerate(cases):
    # Preprocess and scale the new data using the same scaler
    case_scaled = scaler.transform(case)

    # Predict diabetes for the new data
    diabetes_prediction = clf.predict(case_scaled)
    print(f"Diabetes Prediction for Case {i + 1} (1=Yes, 0=No):", diabetes_prediction)


Diabetes Prediction for Case 1 (1=Yes, 0=No): [1]
Diabetes Prediction for Case 2 (1=Yes, 0=No): [0]
Diabetes Prediction for Case 3 (1=Yes, 0=No): [1]
Diabetes Prediction for Case 4 (1=Yes, 0=No): [1]
