In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

# Seed NumPy's global RNG so the random draws below are repeatable
np.random.seed(42)

# Population of 10,000 people, 1% of whom carry the disease
n = 10000
n_diseased = int(n * 0.01)
n_healthy = n - n_diseased

# Diseased group: the test reads positive (1) with probability 0.99
diseased_results = np.random.choice([1, 0], size=n_diseased, p=[0.99, 0.01])
diseased = pd.DataFrame({'has_disease': 1, 'test_result': diseased_results})

# Healthy group: the test reads positive (1) with probability 0.01
healthy_results = np.random.choice([1, 0], size=n_healthy, p=[0.01, 0.99])
healthy = pd.DataFrame({'has_disease': 0, 'test_result': healthy_results})

# Stack both groups, shuffle every row, then renumber the index from 0
df = (
    pd.concat([diseased, healthy], ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)

# Show how many rows fall in each test outcome and in each true label
for column in ('test_result', 'has_disease'):
    print(df[column].value_counts())


test_result
0    9806
1     194
Name: count, dtype: int64
has_disease
0    9900
1     100
Name: count, dtype: int64


In [None]:
# Display the combined, shuffled dataset (columns: has_disease, test_result)
df

Unnamed: 0,has_disease,test_result
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
9995,0,0
9996,0,0
9997,0,0
9998,0,0


In [None]:
from sklearn.model_selection import train_test_split

# Features and target: the only predictor is the test outcome, the label is true disease status
X = df[['test_result']]
y = df['has_disease']

# Stratified 80/20 train-test split so the rare positive class lands in both parts.
# random_state pins the split, so re-running this cell on its own selects the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Logistic Regression with default settings (no class weighting)
model = LogisticRegression()
model.fit(X_train, y_train)

# Hard class predictions, plus the predicted probability of class 1 (has_disease == 1)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("This model has a high accuracy overall, but is not picking up the minority class because this class only accounts for a small percentage of the data set soliciting a fallacy pitfall.")
# zero_division=0 reports 0.00 for a class that is never predicted instead of
# raising an UndefinedMetricWarning for every affected metric.
print(classification_report(y_test, y_pred, target_names=["Healthy", "Diseased"], zero_division=0))

# Mean predicted disease probability over the test rows that truly have the disease.
# Rows are selected on the true label (y_test == 1), not on a positive test result.
positive_test = X_test[y_test == 1]
positive_preds = model.predict_proba(positive_test)[:, 1]

print(f"Average predicted probability for actual disease cases: {positive_preds.mean():.2f}")


This model has a high accuracy overall, but is not picking up the minority class because this class only accounts for a small percentage of the data set soliciting a fallacy pitfall.
              precision    recall  f1-score   support

     Healthy       0.99      1.00      0.99      1980
    Diseased       0.00      0.00      0.00        20

    accuracy                           0.99      2000
   macro avg       0.49      0.50      0.50      2000
weighted avg       0.98      0.99      0.99      2000

Average predicted probability for actual disease cases: 0.49


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:



# 1. Combine and shuffle (re-binds df, this time with a pinned shuffle order)
df = pd.concat([diseased, healthy], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# 2. Undersample the majority class down to 10 healthy rows per diseased row
#    (this reduces the imbalance; it does not make the classes equal in size)
df_majority = df[df['has_disease'] == 0]
df_minority = df[df['has_disease'] == 1]
df_majority_downsampled = df_majority.sample(n=len(df_minority)*10, random_state=42)
df_balanced = pd.concat([df_majority_downsampled, df_minority])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# 3. Split data into training and testing sets
# NOTE(review): the split is taken from the undersampled frame, so the test set
# carries the same 10:1 mix rather than the original 1% prevalence. The metrics
# below therefore describe that mix; consider splitting before undersampling if
# population-level precision is what should be reported.
X = df_balanced[['test_result']]
y = df_balanced['has_disease']
# random_state pins the split, matching the other seeded sampling steps in this cell
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# 4. Train logistic regression model with class weighting
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

# 5. Make predictions: hard labels and the predicted probability of class 1
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# 6. Evaluate performance (each metric is computed once here and printed below)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, target_names=["Healthy", "Diseased"])
roc_auc = roc_auc_score(y_test, y_prob)

# Print metrics
print("This model has a slightly lower accuracy overall, but is now picking up the minority class because of added weights and added undersampling.")
print(class_report)
print("Confusion matrix (rows = actual, columns = predicted):")
print(conf_matrix)
print(f"ROC AUC: {roc_auc:.2f}")

# Mean predicted disease probability over the test rows that truly have the disease.
# Rows are selected on the true label (y_test == 1), not on a positive test result.
positive_test = X_test[y_test == 1]
positive_preds = model.predict_proba(positive_test)[:, 1]

print(f"Average predicted probability for actual disease cases: {positive_preds.mean():.2f}")


This model has a slightly lower accuracy overall, but is now picking up the minority class because of added weights and added undersampling.
              precision    recall  f1-score   support

     Healthy       1.00      0.98      0.99       200
    Diseased       0.83      1.00      0.91        20

    accuracy                           0.98       220
   macro avg       0.92      0.99      0.95       220
weighted avg       0.98      0.98      0.98       220

Average predicted probability for actual disease cases: 0.98


In [None]:
# Display df, which the previous cell rebuilt from `diseased` + `healthy` and re-shuffled with random_state=42
df

Unnamed: 0,has_disease,test_result
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
9995,0,0
9996,0,0
9997,0,0
9998,0,0
