In [4]:
pip install pandas

Collecting pandas
  Downloading pandas-2.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.3.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (62 kB)
Downloading pandas-2.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading numpy-2.3.1-cp312-cp312-manylinux_2_28_x86_64.whl (16.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [pandas]2m1/2[0m [pandas]
[1A[2KSuccessfully installed numpy-2.3.1 pandas-2.3.1
Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Downloading scikit_learn-1.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.5/12.5 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading joblib-1.5.1-py3-none-any.whl (307 kB)
Installing collected packages: joblib, scikit-learn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [scikit-learn][0m [scikit-learn]
[1A[2KSuccessfully installed joblib-1.5.1 scikit-learn-1.7.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [8]:
# Load dataset
data = pd.read_csv('data/diabetes.csv')

In [9]:
# Derive FamilyHistory from DiabetesPedigreeFunction
data['FamilyHistory'] = (data['DiabetesPedigreeFunction'] > 0.5).astype(int)

In [10]:
# Simulate new features
np.random.seed(42)
data['DailySugarIntake'] = np.random.uniform(10, 100, len(data))  # Grams/day
data['PhysicalActivity'] = np.random.uniform(0, 20, len(data))  # Hours/week
data['Gender'] = np.random.choice([0, 1], len(data), p=[0.5, 0.5])  # 0 = female, 1 = male
data['SmokingHistory'] = np.random.choice([0, 1], len(data), p=[0.7, 0.3])  # 0 = non-smoker, 1 = smoker
data['DrinkingHistory'] = np.random.choice([0, 1], len(data), p=[0.8, 0.2])  # 0 = non-drinker, 1 = drinker

In [11]:
# Start with base probability
data['Outcome'] = 0.0

# Add risk factors
data.loc[data['DailySugarIntake'] > 70, 'Outcome'] += 0.4
data.loc[data['PhysicalActivity'] < 5, 'Outcome'] += 0.3
data.loc[data['SmokingHistory'] == 1, 'Outcome'] += 0.7
data.loc[data['DrinkingHistory'] == 1, 'Outcome'] += 0.6

# Clip to [0, 1] range
data['Outcome'] = data['Outcome'].clip(0, 1)

In [7]:
# Save augmented dataset
data.to_csv('data/diabetes_augmented.csv', index=False)
print("Augmented dataset saved as data/diabetes_augmented.csv")

Augmented dataset saved as data/diabetes_augmented.csv


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [13]:
# Load augmented dataset
data = pd.read_csv('data/diabetes_augmented.csv')

In [15]:
# Select features
features = ['Glucose', 'BMI', 'Age', 'FamilyHistory', 'BloodPressure', 
            'DailySugarIntake', 'PhysicalActivity', 'Gender', 'SmokingHistory', 'DrinkingHistory']
X = data[features]
y = data['Outcome']

In [16]:
# # Handle missing values (zeros in Glucose, BMI, BloodPressure)
# X.loc[:, ['Glucose', 'BMI', 'BloodPressure']] = X[['Glucose', 'BMI', 'BloodPressure']].replace(0, pd.NA)
# X.fillna(X.mean(), inplace=True)

# Handle missing values (zeros in Glucose, BMI, BloodPressure)
X.loc[:, ['Glucose', 'BMI', 'BloodPressure']] = X[['Glucose', 'BMI', 'BloodPressure']].replace(0, np.nan)
X.fillna(X.mean(), inplace=True)

 166. 100. 118. 107. 103. 115. 126.  99. 196. 119. 143. 125. 147.  97.
 145. 117. 109. 158.  88.  92. 122. 103. 138. 102.  90. 111. 180. 133.
 106. 171. 159. 180. 146.  71. 103. 105. 103. 101.  88. 176. 150.  73.
 187. 100. 146. 105.  84. 133.  44. 141. 114.  99. 109. 109.  95. 146.
 100. 139. 126. 129.  79.  nan  62.  95. 131. 112. 113.  74.  83. 101.
 137. 110. 106. 100. 136. 107.  80. 123.  81. 134. 142. 144.  92.  71.
  93. 122. 163. 151. 125.  81.  85. 126.  96. 144.  83.  95. 171. 155.
  89.  76. 160. 146. 124.  78.  97.  99. 162. 111. 107. 132. 113.  88.
 120. 118. 117. 105. 173. 122. 170.  84.  96. 125. 100.  93. 129. 105.
 128. 106. 108. 108. 154. 102.  57. 106. 147.  90. 136. 114. 156. 153.
 188. 152.  99. 109.  88. 163. 151. 102. 114. 100. 131. 104. 148. 120.
 110. 111. 102. 134.  87.  79.  75. 179.  85. 129. 143. 130.  87. 119.
  nan  73. 141. 194. 181. 128. 109. 139. 111. 123. 159. 135.  85. 158.
 105. 107. 109. 148. 113. 138. 108.  99. 103. 111. 196. 162.  96. 184.
  81. 

In [17]:
# # Split data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Convert continuous target to binary classes
y_train_binary = (y_train > 0.5).astype(int)
y_test_binary = (y_test > 0.5).astype(int)

In [23]:
# # Scale features
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_scaled = scaler.fit_transform(X)

In [25]:
# # Train model
# model = LogisticRegression(max_iter=200)
# model.fit(X_train, y_train)

# from sklearn.linear_model import LinearRegression
# model = LinearRegression()
# model.fit(X_train, y_train)

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train_binary)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [26]:
# # Evaluate model
# y_pred = model.predict(X_test)
# print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
# print(classification_report(y_test, y_pred, target_names=['Low Risk', 'High Risk']))

y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test_binary, y_pred):.2f}")
print(classification_report(y_test_binary, y_pred, target_names=['Low Risk', 'High Risk']))

Accuracy: 0.99
              precision    recall  f1-score   support

    Low Risk       0.99      1.00      0.99        80
   High Risk       1.00      0.99      0.99        74

    accuracy                           0.99       154
   macro avg       0.99      0.99      0.99       154
weighted avg       0.99      0.99      0.99       154



In [28]:
# Save model and scaler
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("Model and scaler saved with new features!")

Model and scaler saved with new features!


In [29]:
import joblib

In [30]:
joblib.dump (scaler, 'scakel.pkl')
joblib.dump (model, 'model.pkl')

print("Scaler and model regenerated with scikit-learn 1.4.2")

Scaler and model regenerated with scikit-learn 1.4.2
