In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Load the dataset. Make sure that when you use the dataset has clean data. This means the latest information
#for every paiteent, if the the data set has things missing, make sure to choose the last measure for that patient, 
# if the patient is fully missing one measure, make sure to use statistically correct processes to not squew the model
# Example dataset
data = {
    'age': [25, 45, 35, 50, 30, 60, 40, 55],
    'gender_at_birth': ['Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Male', 'Female'],
    'bmi': [22.5, 27.8, 25.4, 31.2, 23.9, 29.4, 26.1, 28.7],
    'family_history': [0, 1, 0, 1, 0, 1, 0, 1],  # 0: No, 1: Yes
    'sdoh_index': [1, 3, 2, 4, 1, 5, 2, 4],  # Social Determinants of Health index (1-5 scale)
    'hba1c': [5.4, 6.5, 5.9, 7.2, 5.6, 7.8, 6.1, 6.9],  # HbA1C levels
    'fasting_sugar': [90, 110, 100, 130, 95, 145, 105, 125],  # Last fasting sugar levels
    'fever': [98.6, 101.2, 99.1, 100.5, 103.3, 98.7, 100.1, 102.1],
    'cough': [0, 1, 1, 1, 1, 0, 0, 1],
    'fatigue': [0, 1, 0, 1, 1, 0, 0, 1],
    'headache': [0, 1, 1, 1, 1, 0, 1, 1],
    'diabetes': [0, 1, 0, 1, 0, 1, 0, 1]  # 0: No diabetes, 1: Diabetes
}

df = pd.DataFrame(data)

# Step 2: Prepare the data
# Encode categorical variables
le_gender_at_birth = LabelEncoder()
df['gender_at_birth'] = le_gender_at_birth.fit_transform(df['gender_at_birth'])

# Define features and target variable
X = df[['age', 'gender_at_birth', 'bmi', 'family_history', 'sdoh_index', 'hba1c', 'fasting_sugar', 'fever', 'cough', 'fatigue', 'headache']]
y = df['diabetes']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 3: Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Make predictions
y_pred = model.predict(X_test)

# Step 5: Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Step 6: Segment patients based on risk
# Predict probabilities for each patient
y_prob = model.predict_proba(X)[:, 1]

# Define risk segments based on probability thresholds
df['risk_segment'] = pd.cut(y_prob, bins=[0, 0.33, 0.66, 1], labels=['Low', 'Medium', 'High'])

# Add patient identifiers (A-Z) for segmentation
df['patient_id'] = [chr(65 + i) for i in range(len(df))]

# Display the segmentation results
print(df[['patient_id', 'age', 'gender_at_birth', 'bmi', 'family_history', 'sdoh_index', 'hba1c', 'fasting_sugar', 'risk_segment']])

# Step 7: Use the model for a new prediction
new_patient = [[45, le_gender_at_birth.transform(['Male'])[0], 28.0, 1, 4, 6.8, 120, 100.0, 1, 0, 1]]  # Example: Patient details
new_patient_scaled = scaler.transform(new_patient)
new_prediction = model.predict(new_patient_scaled)
new_probability = model.predict_proba(new_patient_scaled)[0][1]

# Determine risk segment for the new patient
if new_probability < 0.33:
    risk_segment = 'Low'
elif new_probability < 0.66:
    risk_segment = 'Medium'
else:
    risk_segment = 'High'

print(f"The new patient is predicted to be at {risk_segment} risk of developing diabetes.")


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

  patient_id  age  gender_at_birth   bmi  family_history  sdoh_index  hba1c  \
0          A   25                1  22.5               0           1    5.4   
1          B   45                0  27.8               1           3    6.5   
2          C   35                0  25.4               0           2    5.9   
3          D   50                1  31.2               1           4    7.2   
4          E   30                1  23.9               0           1    5.6   
5          F   60                0  29.4               1           5    7.8   
6          G   40                1  26.1               0           2    6.1   
7          H   55                0  28.7               1 

