In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,precision_score,recall_score,f1_score
import joblib

In [2]:
# Location of the diabetes dataset CSV.  The default is the author's local
# copy; set the DIABETES_DATASET_PATH environment variable to run the notebook
# on another machine without editing this cell.
DATA_PATH=os.environ.get(
    "DIABETES_DATASET_PATH",
    "C:/Users/priya/Desktop/Major project/Multiple Disease Prediction System/Datasets/diabetes_prediction_dataset.csv",
)
df=pd.read_csv(DATA_PATH)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [3]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [15]:
# Encode the two string-valued columns as integer codes, keeping one fitted
# encoder per column so the same mapping can be saved and reused later.
#
# The encoded values overwrite the original columns in `df`, so an unguarded
# re-run of this cell would refit each encoder on the integer codes and lose
# the string-to-code mapping.  The numeric-dtype check makes the cell safe to
# re-run: an already-encoded column is left alone and the encoder fitted on
# the first run is kept.
# NOTE(review): LabelEncoder numbers the categories by sorted class order,
# which gives these features an arbitrary ordering - confirm this is
# acceptable for the models trained on them, or consider one-hot encoding.
if not pd.api.types.is_numeric_dtype(df['gender']):
    encoder_gender=LabelEncoder()
    df['gender']=encoder_gender.fit_transform(df['gender'])
if not pd.api.types.is_numeric_dtype(df['smoking_history']):
    encoder_sh=LabelEncoder()
    df['smoking_history']=encoder_sh.fit_transform(df['smoking_history'])

In [16]:
# Separate the target column from the predictor columns.
y = df['diabetes']
X = df.drop(columns=['diabetes'])

In [17]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
# Fit a logistic-regression classifier on the scaled training features.
# `fit` returns the estimator itself, so construction and training chain.
log_model = LogisticRegression(max_iter=2000).fit(X_train_scaled, y_train)
# Bare last expression so the notebook displays the fitted estimator.
log_model


In [20]:
# Predict class labels for the scaled test features with the logistic-regression model.
y_pred_log=log_model.predict(X_test_scaled)
# Bare last expression so the notebook displays the prediction array.
y_pred_log

array([0, 0, 0, ..., 0, 0, 0], shape=(20000,))

In [21]:
# Fit a 200-tree random forest with unlimited depth on the scaled training
# features; the fixed seed keeps the fitted forest repeatable.
rf_model = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42)
rf_model = rf_model.fit(X_train_scaled, y_train)
# Bare last expression so the notebook displays the fitted estimator.
rf_model

In [22]:
# Predict class labels for the scaled test features with the random-forest model.
y_pred_rf=rf_model.predict(X_test_scaled)
# Bare last expression so the notebook displays the prediction array.
y_pred_rf

array([0, 0, 0, ..., 0, 0, 0], shape=(20000,))

In [23]:
# Compute the held-out metrics for the logistic-regression predictions first,
# then report them.
log_accuracy = accuracy_score(y_test, y_pred_log)
log_confusion = confusion_matrix(y_test, y_pred_log)
log_report = classification_report(y_test, y_pred_log)

print("Evaluate Model: Logistic Regression")
print("Accuracy:", log_accuracy)
print("Confusion Matrix: \n", log_confusion)
print("Classification Report: \n", log_report)

Evaluate Model: Logistic Regression
Accuracy: 0.95865
Confusion Matrix: 
 [[18127   165]
 [  662  1046]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.99      0.98     18292
           1       0.86      0.61      0.72      1708

    accuracy                           0.96     20000
   macro avg       0.91      0.80      0.85     20000
weighted avg       0.96      0.96      0.96     20000



In [24]:
# Compute the held-out metrics for the random-forest predictions first,
# then report them.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

print("Evaluate Model: Random Forest")
print("Accuracy:", rf_accuracy)
print("Confusion Matrix: \n", rf_confusion)
print("Classification Report: \n", rf_report)

Evaluate Model: Random Forest
Accuracy: 0.97055
Confusion Matrix: 
 [[18234    58]
 [  531  1177]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      1.00      0.98     18292
           1       0.95      0.69      0.80      1708

    accuracy                           0.97     20000
   macro avg       0.96      0.84      0.89     20000
weighted avg       0.97      0.97      0.97     20000



In [25]:
# Persist both classifiers together with the fitted scaler and the two label
# encoders.  Dict insertion order is the order in which the files are written.
artifacts = {
    "diabetes_logistic_regression.pkl": log_model,
    "diabetes_random_forest.pkl": rf_model,
    "diabetes_scaler.pkl": scaler,
    "diabetes_gender_encoder.pkl": encoder_gender,
    "diabetes_smoke_encoder.pkl": encoder_sh,
}
written = [joblib.dump(obj, filename) for filename, obj in artifacts.items()]
# Display the value returned by the final dump call.
written[-1]

['diabetes_smoke_encoder.pkl']