In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load dataset
data = pd.read_csv('data/diabetes.csv')

In [3]:
# Derive FamilyHistory from DiabetesPedigreeFunction
data['FamilyHistory'] = (data['DiabetesPedigreeFunction'] > 0.5).astype(int)

In [4]:
# Simulate new features
np.random.seed(42)
data['DailySugarIntake'] = np.random.uniform(10, 100, len(data))  # Grams/day
data['PhysicalActivity'] = np.random.uniform(0, 20, len(data))  # Hours/week
data['Gender'] = np.random.choice([0, 1], len(data), p=[0.5, 0.5])  # 0 = female, 1 = male
data['SmokingHistory'] = np.random.choice([0, 1], len(data), p=[0.7, 0.3])  # 0 = non-smoker, 1 = smoker
data['DrinkingHistory'] = np.random.choice([0, 1], len(data), p=[0.8, 0.2])  # 0 = non-drinker, 1 = drinker

In [6]:
# Start with base probability
data['Outcome'] = 0.0

# Add risk factors
data.loc[data['DailySugarIntake'] > 70, 'Outcome'] += 0.4
data.loc[data['PhysicalActivity'] < 5, 'Outcome'] += 0.3
data.loc[data['SmokingHistory'] == 1, 'Outcome'] += 0.7
data.loc[data['DrinkingHistory'] == 1, 'Outcome'] += 0.6

# Clip to [0, 1] range
data['Outcome'] = data['Outcome'].clip(0, 1)

In [7]:
# Save augmented dataset
data.to_csv('data/diabetes_augmented.csv', index=False)
print("Augmented dataset saved as data/diabetes_augmented.csv")

Augmented dataset saved as data/diabetes_augmented.csv


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [9]:
# Load augmented dataset
data = pd.read_csv('data/diabetes_augmented.csv')

In [10]:
# Select features
features = ['Glucose', 'BMI', 'Age', 'FamilyHistory', 'BloodPressure', 
            'DailySugarIntake', 'PhysicalActivity', 'Gender', 'SmokingHistory', 'DrinkingHistory']
X = data[features]
y = data['Outcome']

In [17]:
# # Handle missing values (zeros in Glucose, BMI, BloodPressure)
# X.loc[:, ['Glucose', 'BMI', 'BloodPressure']] = X[['Glucose', 'BMI', 'BloodPressure']].replace(0, pd.NA)
# X.fillna(X.mean(), inplace=True)

# Handle missing values (zeros in Glucose, BMI, BloodPressure)
X.loc[:, ['Glucose', 'BMI', 'BloodPressure']] = X[['Glucose', 'BMI', 'BloodPressure']].replace(0, np.nan)
X.fillna(X.mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.mean(), inplace=True)


In [23]:
# # Split data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
# Convert continuous target to binary classes
y_train_binary = (y_train > 0.5).astype(int)
y_test_binary = (y_test > 0.5).astype(int)

In [25]:
# # Scale features
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [26]:
# # Train model
# model = LogisticRegression(max_iter=200)
# model.fit(X_train, y_train)

# from sklearn.linear_model import LinearRegression
# model = LinearRegression()
# model.fit(X_train, y_train)

# Train model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train_binary)

In [27]:
# # Evaluate model
# y_pred = model.predict(X_test)
# print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
# print(classification_report(y_test, y_pred, target_names=['Low Risk', 'High Risk']))

y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test_binary, y_pred):.2f}")
print(classification_report(y_test_binary, y_pred, target_names=['Low Risk', 'High Risk']))

Accuracy: 0.99
              precision    recall  f1-score   support

    Low Risk       0.99      1.00      0.99        80
   High Risk       1.00      0.99      0.99        74

    accuracy                           0.99       154
   macro avg       0.99      0.99      0.99       154
weighted avg       0.99      0.99      0.99       154



In [28]:
# Save model and scaler
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("Model and scaler saved with new features!")

Model and scaler saved with new features!
