In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from statsmodels.stats.proportion import proportions_ztest

# Load dataset
df = pd.read_csv('/content/diabetes_data_upload.csv')

# Encode categorical values
le = LabelEncoder()
for col in df.columns:
    df[col] = le.fit_transform(df[col])

# Split data into features and target
X = df.drop(columns=['class'])  # Features
y = df['class']  # Target


In [5]:
X,y

(     Age  Gender  Polyuria  Polydipsia  sudden weight loss  weakness  \
 0     16       1         0           1                   0         1   
 1     34       1         0           0                   0         1   
 2     17       1         1           0                   0         1   
 3     21       1         0           0                   1         1   
 4     36       1         1           1                   1         1   
 ..   ...     ...       ...         ...                 ...       ...   
 515   15       0         1           1                   1         0   
 516   24       0         1           1                   1         1   
 517   34       0         1           1                   1         1   
 518    8       0         0           0                   0         1   
 519   18       1         0           0                   0         0   
 
      Polyphagia  Genital thrush  visual blurring  Itching  Irritability  \
 0             0               0              

In [6]:

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [7]:

### 1. Train Logistic Regression, Decision Tree, and Random Forest ###
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'False Negative Rate (Type II Error)': fn / (fn + tp)
    }

# Find model with lowest Type II Error
best_model = min(results, key=lambda x: results[x]['False Negative Rate (Type II Error)'])
print(f'Best model for diabetes detection (low Type II error): {best_model}\n', results)


Best model for diabetes detection (low Type II error): Random Forest
 {'Logistic Regression': {'Accuracy': 0.9230769230769231, 'Precision': 0.9315068493150684, 'Recall': 0.9577464788732394, 'False Negative Rate (Type II Error)': np.float64(0.04225352112676056)}, 'Decision Tree': {'Accuracy': 0.9519230769230769, 'Precision': 0.9852941176470589, 'Recall': 0.9436619718309859, 'False Negative Rate (Type II Error)': np.float64(0.056338028169014086)}, 'Random Forest': {'Accuracy': 0.9903846153846154, 'Precision': 1.0, 'Recall': 0.9859154929577465, 'False Negative Rate (Type II Error)': np.float64(0.014084507042253521)}}


In [8]:

### 2. Z-Test: Mean Age of Correctly vs. Misclassified Diabetic Patients ###
y_pred_logistic = LogisticRegression().fit(X_train, y_train).predict(X_test)
correct_indices = np.where(y_pred_logistic == y_test)[0]
incorrect_indices = np.where(y_pred_logistic != y_test)[0]

test_stat, p_value = stats.ttest_ind(X_test[correct_indices, 0], X_test[incorrect_indices, 0], equal_var=False)
print(f'Z-Test on mean age: p-value = {p_value}')
if p_value < 0.05:
    print("Significant difference in mean age of correctly vs. misclassified cases.")


Z-Test on mean age: p-value = 0.00023728463474212143
Significant difference in mean age of correctly vs. misclassified cases.


In [9]:

### 3. Type I Error Analysis for Random Forest ###
y_pred_rf = RandomForestClassifier().fit(X_train, y_train).predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rf).ravel()
false_positive_rate = fp / (fp + tn)
if false_positive_rate > 0.2:
    z_stat, p_val = stats.norm.cdf(false_positive_rate, loc=0.2, scale=np.std(false_positive_rate))
    print(f'One-Sample Z-Test for FPR > 20%: p-value = {p_val}')


In [15]:

### 4. Compare Type II Errors of SVM, KNN, and Logistic Regression ###
models_2 = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression()
}

fn_rates = {}
for name, model in models_2.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    fn_rates[name] = fn / (fn + tp)

# Get false negatives and total positives for SVM and Logistic Regression
fn_svm = fn_rates['SVM'] * (y_test == 1).sum()
fn_lr = fn_rates['Logistic Regression'] * (y_test == 1).sum()
total_pos = (y_test == 1).sum()

# Perform Z-Test for proportions
count = [fn_svm, fn_lr]
nobs = [total_pos, total_pos]  # Total number of actual positive cases

z_stat, p_value = proportions_ztest(count, nobs, alternative='two-sided')

print(f'Z-Test on Type II Error Rates: Z-Statistic = {z_stat:.4f}, p-value = {p_value:.4f}')



Z-Test on Type II Error Rates: Z-Statistic = -1.7506, p-value = 0.0800


In [16]:

### 5. Gradient Boosting Model Misclassification Analysis ###
y_pred_gbm = GradientBoostingClassifier().fit(X_train, y_train).predict(X_test)
correct_indices_gbm = np.where(y_pred_gbm == y_test)[0]
incorrect_indices_gbm = np.where(y_pred_gbm != y_test)[0]

test_stat_gbm, p_value_gbm = stats.ttest_ind(X_test[correct_indices_gbm, 0], X_test[incorrect_indices_gbm, 0], equal_var=False)
print(f'Gradient Boosting Z-Test on mean age: p-value = {p_value_gbm}')


Gradient Boosting Z-Test on mean age: p-value = 0.1998955572525326


In [17]:

## 6. Compare Three Models on Type I & II Errors ###

from statsmodels.stats.proportion import proportions_ztest

final_models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier()
}

errors = {}

for name, model in final_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    errors[name] = {
        'Type I Error': fp / (fp + tn),
        'Type II Error': fn / (fn + tp),
        'False Positives': fp,
        'False Negatives': fn,
        'Total Negatives': (y_test == 0).sum(),
        'Total Positives': (y_test == 1).sum()
    }

# Z-Test for Type I Errors (False Positives)
count_type1 = [errors['Logistic Regression']['False Positives'], errors['Random Forest']['False Positives']]
nobs_type1 = [errors['Logistic Regression']['Total Negatives'], errors['Random Forest']['Total Negatives']]

z_stat_type1, p_value_type1 = proportions_ztest(count_type1, nobs_type1, alternative='two-sided')
print(f'Z-Test on Type I Errors: Z-Statistic = {z_stat_type1:.4f}, p-value = {p_value_type1:.4f}')

# Z-Test for Type II Errors (False Negatives)
count_type2 = [errors['SVM']['False Negatives'], errors['Random Forest']['False Negatives']]
nobs_type2 = [errors['SVM']['Total Positives'], errors['Random Forest']['Total Positives']]

z_stat_type2, p_value_type2 = proportions_ztest(count_type2, nobs_type2, alternative='two-sided')
print(f'Z-Test on Type II Errors: Z-Statistic = {z_stat_type2:.4f}, p-value = {p_value_type2:.4f}')

print("Final Model Selection Based on Statistical Analysis")


Z-Test on Type I Errors: Z-Statistic = 2.3259, p-value = 0.0200
Z-Test on Type II Errors: Z-Statistic = -1.0035, p-value = 0.3156
Final Model Selection Based on Statistical Analysis
