In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Read the diabetes records from a CSV file located in the current working
# directory; every later cell works from this DataFrame.
dataset = pd.read_csv(filepath_or_buffer="diabetes_dataset.csv")


In [3]:
# Count NaNs per column as they appear in the raw CSV. This is taken *before*
# zeros are converted to NaN below, so it does not include zero placeholders.
missing_values = dataset.isnull().sum()

# In these columns a value of 0 is treated as "not recorded" and turned into
# NaN so it can be imputed (presumably because 0 is not a plausible
# measurement here -- confirm against the dataset's documentation).
zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
dataset[zero_as_missing] = dataset[zero_as_missing].replace(0, np.nan)

# Per-column fill value: mean for Glucose and BloodPressure, median for the
# remaining columns. pandas skips NaN when computing mean()/median().
# NOTE(review): these statistics are computed over every row, i.e. before the
# train/test split done later in the notebook, so held-out rows influence the
# imputed values.
fill_values = {
    "Glucose": dataset["Glucose"].mean(),
    "BloodPressure": dataset["BloodPressure"].mean(),
    "SkinThickness": dataset["SkinThickness"].median(),
    "Insulin": dataset["Insulin"].median(),
    "BMI": dataset["BMI"].median(),
}

# Assign the filled columns back to the frame instead of calling
# `dataset[col].fillna(..., inplace=True)`: an in-place method on a chained
# selection emits a FutureWarning on pandas 2.x and no longer updates
# `dataset` at all once Copy-on-Write is in effect.
dataset[zero_as_missing] = dataset[zero_as_missing].fillna(value=fill_values)

# Target is the "Outcome" column; every other column is used as a feature.
y = dataset["Outcome"]
X = dataset.drop(columns="Outcome")

# Hold out 20% of the rows for testing. The fixed seed keeps the partition
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training rows only,
# and the statistics it learned there are then applied to both subsets.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [4]:
# Fit three classifiers on the same scaled training data so their test
# accuracy can be compared in the following cells.

# Logistic regression with scikit-learn's default settings.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Decision tree. Tree construction involves randomness, so the seed is pinned
# to make the fitted model (and its reported accuracy) repeatable on re-run.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

# Random forest. Bootstrapping and feature sub-sampling are random, so without
# a seed the accuracy -- and therefore which model is declared "best" -- can
# differ between runs of the notebook.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)


In [5]:
# Predict the held-out rows with each fitted model, in the order
# logistic regression -> decision tree -> random forest.
logreg_preds, dtree_preds, rf_preds = (
    model.predict(X_test) for model in (logreg, dtree, rf)
)

# Fraction of test rows each model labelled correctly.
logreg_accuracy, dtree_accuracy, rf_accuracy = (
    accuracy_score(y_test, preds)
    for preds in (logreg_preds, dtree_preds, rf_preds)
)

# Report one line per model.
for label, score in (
    ("Logistic Regression Accuracy:", logreg_accuracy),
    ("Decision Tree Accuracy:", dtree_accuracy),
    ("Random Forest Accuracy:", rf_accuracy),
):
    print(label, score)


Logistic Regression Accuracy: 0.0
Decision Tree Accuracy: 0.0
Random Forest Accuracy: 0.0


In [6]:
# Highest test accuracy among the three models.
best_accuracy = max(logreg_accuracy, dtree_accuracy, rf_accuracy)

# Pick the first model (in this fixed order) whose accuracy equals the best
# score, so ties resolve to logistic regression, then decision tree. If
# neither matches, the random forest is the answer.
best_algorithm = next(
    (
        name
        for name, score in (
            ("Logistic Regression", logreg_accuracy),
            ("Decision Tree", dtree_accuracy),
        )
        if best_accuracy == score
    ),
    "Random Forest",
)

print("Best Algorithm:", best_algorithm)


Best Algorithm: Logistic Regression
