In [2]:
import pandas as pd

# Load dataset
data = pd.read_csv("diabetes (1).csv")

# Check first 5 rows
print(data.head())

# Check missing values and basic statistics
print(data.info())
print(data.describe())


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768

In [3]:
cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
for col in cols:
    data[col] = data[col].replace(0, data[col].median())


In [4]:
X = data.drop("Outcome", axis=1)
y = data["Outcome"]


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Create and train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]  # Probability of being diabetic

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7597402597402597
              precision    recall  f1-score   support

           0       0.80      0.83      0.82        99
           1       0.67      0.64      0.65        55

    accuracy                           0.76       154
   macro avg       0.74      0.73      0.74       154
weighted avg       0.76      0.76      0.76       154



In [9]:
def predict_diabetes(input_data):
    # input_data is a list of 8 values
    import numpy as np
    data_np = np.array(input_data).reshape(1, -1)
    
    probability = model.predict_proba(data_np)[0][1] * 100  # % risk
    prediction = model.predict(data_np)[0]
    
    if prediction == 1:
        return f"Diabetic (Risk: {probability:.2f}%)"
    else:
        return f"Non-Diabetic (Risk: {probability:.2f}%)"


In [11]:
input_data = [5, 166, 72, 19, 175, 25.8, 0.587, 51]  # Sample patient
print(predict_diabetes(input_data))


Diabetic (Risk: 66.19%)




In [15]:
import joblib
joblib.dump(model, "diabetes_model.pkl")


['diabetes_model.pkl']

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are already prepared

# Create and train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Check accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy*100:.2f}%")


Model Accuracy: 75.97%


In [21]:
import pickle

with open('diabetes_model.pkl', 'wb') as f:
    pickle.dump(model, f)
