In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle
import numpy as np

In [2]:
# Read the raw diabetes records from the CSV file into a DataFrame.
dataset_path = 'data/diabetes.csv'
data = pd.read_csv(dataset_path)

In [3]:
# Handle missing values: in these measurement columns a value of 0 is not a
# valid reading, so zeros are treated as missing data.
columns_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Mark the zeros with np.nan rather than pd.NA. pd.NA converts these numeric
# columns to object dtype, which makes the subsequent fillna depend on pandas'
# deprecated object-downcasting behaviour (it emits a FutureWarning). np.nan
# keeps the columns as float64 throughout.
data[columns_with_zeros] = data[columns_with_zeros].replace(0, np.nan)

# Impute each missing entry with the mean of its column (NaN entries are
# skipped when the mean is computed). The result is reassigned instead of
# using inplace=True.
data = data.fillna(data.mean())

  data.fillna(data.mean(), inplace=True)


In [4]:
# Separate the label column from the predictors:
# y holds 'Outcome', X holds every remaining column.
y = data['Outcome']
X = data.drop(columns='Outcome')

In [5]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# Persist the fitted scaler to 'scaler.pkl' so the Flask app can load it.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print("Preprocessing complete!")

Preprocessing complete!


In [8]:
# Binary FamilyHistory flag: 1 when DiabetesPedigreeFunction is strictly
# greater than the threshold, otherwise 0.
pedigree_threshold = 0.5
has_family_history = data['DiabetesPedigreeFunction'].gt(pedigree_threshold)
data['FamilyHistory'] = has_family_history.astype(int)

In [9]:
# Simulate additional lifestyle features with a fixed seed.
# The draw order (sugar, activity, gender, smoking, drinking) determines the
# generated values, so it must not be rearranged.
np.random.seed(42)
n_rows = len(data)

# Continuous features drawn uniformly from a range.
data['DailySugarIntake'] = np.random.uniform(low=10, high=100, size=n_rows)  # Grams/day
data['PhysicalActivity'] = np.random.uniform(low=0, high=20, size=n_rows)  # Hours/week

# Binary features: column name -> probabilities of drawing [0, 1].
binary_feature_probabilities = {
    'Gender': [0.5, 0.5],           # 0 = female, 1 = male
    'SmokingHistory': [0.7, 0.3],   # 0 = non-smoker, 1 = smoker
    'DrinkingHistory': [0.8, 0.2],  # 0 = non-drinker, 1 = drinker
}
for column, probabilities in binary_feature_probabilities.items():
    data[column] = np.random.choice([0, 1], size=n_rows, p=probabilities)

In [12]:
# Rebuild 'Outcome' as a synthetic risk score derived from the simulated
# lifestyle features: start every row at 0.0, add a fixed increment for each
# risk factor that applies, then cap the total to the [0, 1] range.
# NOTE(review): this replaces the values previously stored in 'Outcome' with a
# continuous score; confirm that downstream consumers of the column expect that.
risk_increments = [
    (data['DailySugarIntake'] > 70, 0.4),
    (data['PhysicalActivity'] < 5, 0.3),
    (data['SmokingHistory'] == 1, 0.7),
    (data['DrinkingHistory'] == 1, 0.6),
]

data['Outcome'] = 0.0
for applies, increment in risk_increments:
    data.loc[applies, 'Outcome'] += increment

# Several factors can sum past 1, so cap the score.
data['Outcome'] = data['Outcome'].clip(lower=0, upper=1)

In [13]:
# Write the augmented frame to CSV without the index column.
augmented_path = 'data/diabetes_augmented.csv'
data.to_csv(augmented_path, index=False)
print("Augmented dataset saved as data/diabetes_augmented.csv")

Augmented dataset saved as data/diabetes_augmented.csv


Model Development

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [9]:
# Fit a logistic-regression classifier on the training split, with the
# iteration limit set to 200.
logreg_params = {'max_iter': 200}
model = LogisticRegression(**logreg_params)
model.fit(X_train, y_train)

In [10]:
# Score the classifier on the held-out split: overall accuracy first, then
# the per-class precision/recall/F1 report.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.2f}")

class_labels = ['Low Risk', 'High Risk']
report = classification_report(y_test, y_pred, target_names=class_labels)
print(report)

Accuracy: 0.75
              precision    recall  f1-score   support

    Low Risk       0.80      0.83      0.81        99
   High Risk       0.67      0.62      0.64        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.73       154
weighted avg       0.75      0.75      0.75       154



In [11]:
# Serialize the trained classifier to 'model.pkl'.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Preview the first five rows of the frame.
data.head(n=5)