In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# ---------------------------------------------------------------------------
# Baseline model: logistic regression on scaled numeric + one-hot categorical
# features, evaluated on a stratified 30% hold-out.
# ---------------------------------------------------------------------------

# Read the CSV; the path is relative to the notebook's working directory.
file_path = 'cleaned_diabetes_data.csv'
data = pd.read_csv(file_path)

# Column groups handed to the preprocessing step below. Columns of X that
# appear in neither list are not forwarded to the classifier
# (ColumnTransformer is used with its default `remainder` setting).
numerical_features = [
    'age',
    'BMI',
    'diabetes_pedigree_function',
    'weight',
    'sleep_duration',
    'pregnancies',
]
categorical_features = [
    'gender',
    'diet_type',
    'social_media_usage',
    'stress_level',
    'physical_activity_level',
    'alcohol_consumption',
]

# Target is the 'diabetes' column; everything else is a candidate feature.
y = data['diabetes']
X = data.drop(columns=['diabetes'])

# Hold out 30% of the rows. `stratify=y` keeps the class ratio the same in
# both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Preprocessing: standardise the numeric columns, one-hot encode the
# categorical ones (dropping the first level of each).
scale_numeric = ('num', StandardScaler(), numerical_features)
encode_categorical = ('cat', OneHotEncoder(drop='first'), categorical_features)
preprocessor = ColumnTransformer(transformers=[scale_numeric, encode_categorical])

# Chain preprocessing and classifier so both are fitted on the training
# partition only.
logreg_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
    ]
)
logreg_pipeline.fit(X_train, y_train)

# Hard labels for accuracy / the report; second predict_proba column as the
# score for ROC AUC.
y_pred = logreg_pipeline.predict(X_test)
y_pred_proba = logreg_pipeline.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
classification_rep = classification_report(y_test, y_pred)

# Collect the metrics; the bare name on the last line is the cell's output.
results = {'Accuracy': accuracy, 'ROC AUC': roc_auc, 'Classification Report': classification_rep}
results


{'Accuracy': 0.9718068469086079,
 'ROC AUC': np.float64(0.9567525390164416),
 'Classification Report': '              precision    recall  f1-score   support\n\n         0.0       0.84      0.46      0.60      1053\n         1.0       0.98      1.00      0.99     22286\n\n    accuracy                           0.97     23339\n   macro avg       0.91      0.73      0.79     23339\nweighted avg       0.97      0.97      0.97     23339\n'}

In [5]:
from sklearn.impute import SimpleImputer

# Re-run the logistic regression with missing values imputed.
#
# The split happens BEFORE the imputers are fitted, so the fill values (column
# means / most frequent categories) are learned from the training rows only.
# Fitting the imputers on the full X would let test-set statistics leak into
# training and bias the hold-out metrics.
#
# Same split arguments as the first split, so the partitions contain the same
# rows as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Work on copies so X is left untouched and the cell can be re-run safely.
# NOTE(review): X itself is no longer imputed in place; later cells that need
# imputed features should use X_train / X_test.
X_train = X_train.copy()
X_test = X_test.copy()

# Mean for numeric columns, most frequent value for categorical columns.
numerical_imputer = SimpleImputer(strategy="mean")
categorical_imputer = SimpleImputer(strategy="most_frequent")

# Learn the fill values on the training partition ...
X_train[numerical_features] = numerical_imputer.fit_transform(X_train[numerical_features])
X_train[categorical_features] = categorical_imputer.fit_transform(X_train[categorical_features])

# ... and only apply them (no refit) to the test partition.
X_test[numerical_features] = numerical_imputer.transform(X_test[numerical_features])
X_test[categorical_features] = categorical_imputer.transform(X_test[categorical_features])

# Refit the existing preprocessing + logistic regression pipeline on the
# imputed training data.
logreg_pipeline.fit(X_train, y_train)

# Predictions and evaluation
y_pred = logreg_pipeline.predict(X_test)
y_pred_proba = logreg_pipeline.predict_proba(X_test)[:, 1]  # second predict_proba column

# Metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
classification_rep = classification_report(y_test, y_pred)

# Display results
results = {
    'Accuracy': accuracy,
    'ROC AUC': roc_auc,
    'Classification Report': classification_rep
}
results


{'Accuracy': 0.9718068469086079,
 'ROC AUC': np.float64(0.9567525390164416),
 'Classification Report': '              precision    recall  f1-score   support\n\n         0.0       0.84      0.46      0.60      1053\n         1.0       0.98      1.00      0.99     22286\n\n    accuracy                           0.97     23339\n   macro avg       0.91      0.73      0.79     23339\nweighted avg       0.97      0.97      0.97     23339\n'}

In [6]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.base import clone

# Random forest on the same preprocessing recipe and the same train/test
# partitions as the logistic regression, so the two models are comparable.
#
# The forest gets its own unfitted copy of the preprocessing step. Pipeline
# fits the step objects it is given rather than copying them, so passing
# `preprocessor` directly would make this fit re-fit the very same
# ColumnTransformer object that `logreg_pipeline` holds.
random_forest_model = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100))
])

# Train the Random Forest model
random_forest_model.fit(X_train, y_train)

# Hard labels for accuracy / the report; second predict_proba column as the
# score for ROC AUC.
y_pred_rf = random_forest_model.predict(X_test)
y_pred_proba_rf = random_forest_model.predict_proba(X_test)[:, 1]

# Metrics
accuracy_rf = accuracy_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_proba_rf)
classification_rep_rf = classification_report(y_test, y_pred_rf)

# Display results (bare name on the last line is the cell's output)
rf_results = {
    'Accuracy': accuracy_rf,
    'ROC AUC': roc_auc_rf,
    'Classification Report': classification_rep_rf
}
rf_results


{'Accuracy': 0.9714212262736193,
 'ROC AUC': np.float64(0.9563299271262418),
 'Classification Report': '              precision    recall  f1-score   support\n\n         0.0       0.85      0.45      0.59      1053\n         1.0       0.97      1.00      0.99     22286\n\n    accuracy                           0.97     23339\n   macro avg       0.91      0.72      0.79     23339\nweighted avg       0.97      0.97      0.97     23339\n'}