In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
from sklearn.metrics import accuracy_score, classification_report

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
import pickle

In [8]:
data = pd.read_csv('diabetes.csv')

In [9]:
print(data.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [10]:
X = data.drop('Outcome', axis=1)
y = data['Outcome']

In [11]:
print(X.head())
print(y.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64


In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Display the shapes of the training and testing sets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(614, 8) (154, 8) (614,) (154,)


In [14]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
# Save the scaler
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)


In [16]:
# Initialize the logistic regression model
model = LogisticRegression(max_iter=2000)

In [17]:
# Train the model
model.fit(X_train, y_train)


In [18]:
# Make predictions on the test set
y_pred = model.predict(X_test)


In [19]:
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


Accuracy: 0.7532467532467533
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154



In [20]:
# Example new data for prediction
new_data = pd.DataFrame({
    'Pregnancies': [2],
    'Glucose': [120],
    'BloodPressure': [70],
    'SkinThickness': [20],
    'Insulin': [80],
    'BMI': [25.0],
    'DiabetesPedigreeFunction': [0.5],
    'Age': [30]
})

# Make predictions on new data
new_prediction = model.predict(new_data)
result = "Diabetic" if new_prediction[0] == 1 else "Not Diabetic"

print(f"Prediction for new data: {result}")


Prediction for new data: Diabetic




In [21]:
# Save the model to a file
with open('diabetes_model.pkl', 'wb') as file:
    pickle.dump(model, file)
print("Model saved successfully!")


Model saved successfully!


In [22]:
# Load the model from the file
with open('diabetes_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Use the loaded model to make predictions
data = {"features": [2, 3, 4, 5, 6, 7, 8, 9]}  # Ensure 8 features
features = np.array(data["features"]).reshape(1, -1)
prediction = loaded_model.predict(features)
result = "Diabetic" if prediction[0] == 1 else "Not Diabetic"

print(f"Prediction: {result}")


Prediction: Diabetic


In [23]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred,zero_division=0)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


Accuracy: 0.7532467532467533
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154

