In [2]:
!pip install statsmodels




In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from statsmodels.stats.weightstats import ztest


# Load dataset
url = "/content/diabetes_data_upload.csv"
df = pd.read_csv(url)

# Encode categorical variables
le = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    df[col] = le.fit_transform(df[col])

# Split features and target
X = df.drop(columns=['class'])
y = df['class']

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

results = {}
conf_matrices = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    conf_matrices[name] = cm
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "False Positive Rate": cm[0][1] / (cm[0][1] + cm[0][0]),  # Type I error
        "False Negative Rate": cm[1][0] / (cm[1][0] + cm[1][1])   # Type II error
    }

# Display results
results_df = pd.DataFrame(results).T
print("Model Performance:\n", results_df)

# Identify model with lowest Type II error (false negatives)
best_model = min(results, key=lambda x: results[x]["False Negative Rate"])
print(f"\nBest model for minimizing undiagnosed diabetes cases: {best_model}")

# Z-Test on mean age of correctly vs. misclassified diabetic patients
y_pred_lr = models["Logistic Regression"].predict(X_test)
misclassified_age = X_test[:, 0][(y_test != y_pred_lr)]
correctly_classified_age = X_test[:, 0][(y_test == y_pred_lr)]

z_stat, p_value = ztest(misclassified_age, correctly_classified_age)
print(f"\nZ-Test on mean age difference (Correct vs. Misclassified): Z={z_stat:.2f}, p={p_value:.4f}")

# Significance check
if p_value < 0.05:
    print("There is a significant age difference. Consider adjusting model features.")
else:
    print("No significant age difference. Age may not strongly influence classification.")

# Z-Test on false positive rate (Random Forest)
rf_fpr = results["Random Forest"]["False Positive Rate"]
z_stat_rf, p_value_rf = ztest([rf_fpr], value=0.20)
print(f"\nZ-Test for Random Forest FPR > 20%: Z={z_stat_rf:.2f}, p={p_value_rf:.4f}")

if p_value_rf < 0.05:
    print("Random Forest false positive rate is significantly different from 20%. Consider feature selection or threshold tuning.")
else:
    print("No significant difference from 20%. Adjustments may not be necessary.")

# Compare Type II errors (false negatives) for Logistic Regression, SVM, and KNN
fn_lr = results["Logistic Regression"]["False Negative Rate"]
fn_svm = results["SVM"]["False Negative Rate"]
fn_knn = results["KNN"]["False Negative Rate"]

z_stat_fn, p_value_fn = ztest([fn_lr], [fn_svm])
print(f"\nZ-Test on False Negative Rate (Logistic Regression vs. SVM): Z={z_stat_fn:.2f}, p={p_value_fn:.4f}")

# Final model recommendation
best_overall_model = min(results, key=lambda x: results[x]["False Negative Rate"] + results[x]["False Positive Rate"])
print(f"\nRecommended model for medical deployment: {best_overall_model}")


Model Performance:
                      Accuracy  Precision    Recall  False Positive Rate  \
Logistic Regression  0.923077   0.931507  0.957746             0.151515   
Decision Tree        0.942308   0.985075  0.929577             0.030303   
Random Forest        0.990385   1.000000  0.985915             0.000000   
SVM                  0.990385   0.986111  1.000000             0.030303   
KNN                  0.894231   0.954545  0.887324             0.090909   
Gradient Boosting    0.971154   1.000000  0.957746             0.000000   

                     False Negative Rate  
Logistic Regression             0.042254  
Decision Tree                   0.070423  
Random Forest                   0.014085  
SVM                             0.000000  
KNN                             0.112676  
Gradient Boosting               0.042254  

Best model for minimizing undiagnosed diabetes cases: SVM

Z-Test on mean age difference (Correct vs. Misclassified): Z=-1.77, p=0.0770
No significant a

  var = x1_var / (nobs1 - ddof)
  var /= nobs1 + nobs2 - 2 * ddof
