# Data Analysis Of Medical Data

## 1. Data Analysis

In [31]:
# Standard library
import time

# Data handling and analysis
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning: preprocessing, decomposition and model selection
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Machine learning: estimators
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier

# Machine learning: metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Needed by the extended evaluate_model (macro-F1 and balanced accuracy)
from sklearn.metrics import balanced_accuracy_score, f1_score




In [None]:
# Load the semicolon-separated CSV export and take a first look at it
file_path = "T49.2_Sep2025_1_StGallen.csv"
df = pd.read_csv(file_path, sep=";")

# Dimensions first, then the first five rows
for overview in (df.shape, df.head()):
    print(overview)

# Relative frequency of every outcome class, expressed in percent
print("Value Counts (in %):")
class_shares = df["OUTCOME_3Kat_KHK"].value_counts(normalize=True).mul(100)
print(class_shares.round(2))

## 2. Machine Learning Models

## 2.1 Logistic Regression

1. FIRST TRY

In [36]:

# 1. Features (X) and target (y)
X = df.drop("OUTCOME_3Kat_KHK", axis=1)
y = df["OUTCOME_3Kat_KHK"]

# 2. One-hot encode categorical features, if any; drop_first removes one
#    level per variable so the dummy columns are not perfectly collinear
X = pd.get_dummies(X, drop_first=True)

# 3. Stratified 80/20 train-test split: both parts keep the class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4. Train the baseline multinomial logistic regression model.
#    `multi_class` is deliberately not passed: the parameter is deprecated in
#    current scikit-learn, and with solver="lbfgs" a target with more than two
#    classes is already fitted as a multinomial model, so the result is the same.
model = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
)
model.fit(X_train, y_train)

# 5. Predictions on the held-out test set
y_pred = model.predict(X_test)

# 6. Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.5584415584415584

Confusion Matrix:
 [[  9  19  45]
 [  4  29  93]
 [  9  34 220]]

Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.12      0.19        73
           1       0.35      0.23      0.28       126
           2       0.61      0.84      0.71       263

    accuracy                           0.56       462
   macro avg       0.46      0.40      0.39       462
weighted avg       0.51      0.56      0.51       462



### Accuracy  
An accuracy of about 56% sounds alright, but we know that **class 2 makes up 57% of the data**.  
So our model is no better than a simple **“always class 2” predictor**, which would already reach about 57% accuracy.  

---

### Confusion Matrix  

Class 0 – *No Stenosis* (15.7% of Data)  
- Of the 73 real cases, only **9** are recognized correctly.  
- **45** are incorrectly categorized as class 2.  

Class 1 – *Light Stenosis* (27% of Data)  
- **29 of 126** correct → recall **23%**.  
- Most real class-1 cases are also categorized as class 2.  

Class 2 – *Stenosis* (57% of Data)  
- **220 of 263** correct → recall **84%**.  
- Most misclassifications are confused with class 1.  


2. SECOND TRY

In [35]:

# 1. Load data (reloaded here so this cell can be run on its own)
file_path = "T49.2_Sep2025_1_StGallen.csv"
df = pd.read_csv(file_path, sep=";")

# 2. Features (X) and target (y)
X = df.drop("OUTCOME_3Kat_KHK", axis=1)
y = df["OUTCOME_3Kat_KHK"]

# 3. One-hot encode categorical features, if any
X = pd.get_dummies(X, drop_first=True)

# 4. Stratified 80/20 train-test split (same seed as the first try, so both
#    tries are evaluated on the same test rows)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Train multinomial logistic regression model with BALANCED CLASS WEIGHTS.
#    `multi_class` is deliberately not passed: the parameter is deprecated in
#    current scikit-learn, and with solver="lbfgs" a target with more than two
#    classes is already fitted as a multinomial model, so the result is the same.
model = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
    class_weight="balanced",   # NEW MEASURE: weights classes inversely to their frequency
)
model.fit(X_train, y_train)

# 6. Predictions on the held-out test set
y_pred = model.predict(X_test)

# 7. Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))




Accuracy: 0.5324675324675324

Confusion Matrix:
 [[ 10  22  41]
 [  8  36  82]
 [ 16  47 200]]

Classification Report:
               precision    recall  f1-score   support

           0       0.29      0.14      0.19        73
           1       0.34      0.29      0.31       126
           2       0.62      0.76      0.68       263

    accuracy                           0.53       462
   macro avg       0.42      0.39      0.39       462
weighted avg       0.49      0.53      0.50       462



## 2.2 XGBoost

In [None]:


# 1. Train XGBoost classifier on the split created in the logistic regression section.
#    Two arguments of the earlier draft are intentionally gone:
#    - use_label_encoder: deprecated, and ignored with a warning by current XGBoost releases
#    - scale_pos_weight: balances positive vs. negative weights, i.e. a binary-classification
#      knob; for this three-class target, class imbalance would have to be handled through
#      `sample_weight` in fit() instead
xgb_model = XGBClassifier(
    n_estimators=200,        # number of trees
    learning_rate=0.1,       # shrinkage step
    max_depth=5,             # tree depth
    subsample=0.8,           # fraction of rows sampled per tree
    colsample_bytree=0.8,    # fraction of features sampled per tree
    random_state=42,
    eval_metric="mlogloss",  # multiclass log loss
)
xgb_model.fit(X_train, y_train)

# 2. Predictions on the held-out test set
y_pred_xgb = xgb_model.predict(X_test)

# 3. Evaluation
print("Accuracy (XGBoost):", accuracy_score(y_test, y_pred_xgb))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))


## 2.3 Support Vector Machine (SVM)

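Kernel Support Vector Machines are notoriously slow to train: their cost grows quickly with the number of samples, and a high-dimensional feature space makes every kernel evaluation more expensive. We therefore compare a fast linear SVM on all features with an RBF-kernel SVM on PCA-reduced features.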

In [None]:
# --- SVM comparison: LinearSVC vs. SVC (RBF) with PCA ---

def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    """Fit ``model``, print its test-set metrics, and return the accuracy.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for all printed metrics.

    Returns
    -------
    float
        Accuracy of the predictions on ``X_test``.
    """
    # perf_counter is a monotonic, high-resolution clock; time.time() follows
    # the system wall clock and can jump if that clock is adjusted mid-fit.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f"\n=== {name} ===")
    print(f"Train time: {train_time:.2f} s")
    print(f"Accuracy:   {acc:.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return acc

# 1) Linear SVM (fast, good for many features)
linear_svm = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),   # scale only, no centering: works with sparse one-hot; safe for dense too
    ("svm", LinearSVC(
        C=1.0,
        class_weight="balanced",   # handle imbalance
        max_iter=10000,
        dual="auto",
        random_state=42
    ))
])

# 2) RBF SVM with PCA (reduce 649 dims -> ~100; speeds up and helps generalization)
# Cap at 100 components, but never ask PCA for more components than the data
# supports: at most n_features, and at most n_samples - 1 (and at least 1).
# The previous max(10, n_samples - 1) floor could exceed that limit on a tiny
# training set and make PCA raise.
n_pca = max(1, min(100, X_train.shape[1], X_train.shape[0] - 1))
svm_pca_rbf = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("pca", PCA(n_components=n_pca, random_state=42)),
    ("svm", SVC(
        kernel="rbf",
        C=1.0,
        gamma="scale",
        class_weight="balanced",   # handle imbalance
        probability=False,         # keep False for speed
        random_state=42
    ))
])

# Fit and report both pipelines on the same train/test split
acc_lin = evaluate_model("LinearSVC (scaled)", linear_svm, X_train, y_train, X_test, y_test)
acc_rbf = evaluate_model(f"SVC RBF + PCA({n_pca})", svm_pca_rbf, X_train, y_train, X_test, y_test)

print("\nSummary:")
print(f"- LinearSVC Accuracy:     {acc_lin:.4f}")
print(f"- SVC RBF + PCA Accuracy: {acc_rbf:.4f}")


In [None]:


def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    """Fit ``model``, print a test-set report, and return the key metrics.

    Besides plain accuracy this reports macro-F1 and balanced accuracy, which
    weight every class equally and are therefore more informative than
    accuracy when the classes are imbalanced.

    Parameters
    ----------
    name : str
        Label printed in the report header and stored under ``"model"``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for all metrics.

    Returns
    -------
    dict
        Keys ``"model"``, ``"accuracy"``, ``"macro_f1"``, ``"balanced_acc"``
        and ``"train_time_s"``.
    """
    # perf_counter is a monotonic, high-resolution clock; time.time() follows
    # the system wall clock and can jump if that clock is adjusted mid-fit.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # zero_division=0: a class that is never predicted scores 0 instead of warning
    macro_f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    bal_acc = balanced_accuracy_score(y_test, y_pred)

    print("\n" + "="*78)
    print(f"=== {name} ===")
    print(f"Train time: {train_time:.2f} s")
    print(f"Accuracy:   {acc:.4f} | Macro-F1: {macro_f1:.4f} | Balanced Acc: {bal_acc:.4f}")

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report:")
    # digits=3 prints one more decimal place than the default report
    report_str = classification_report(y_test, y_pred, digits=3, zero_division=0)
    print(report_str)
    print("="*78)
    return {"model": name, "accuracy": acc, "macro_f1": macro_f1, "balanced_acc": bal_acc, "train_time_s": train_time}

# 1) Linear SVM (fast, good for many features)
linear_svm = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),   # scale only, no centering: safe for one-hot / sparse-like design
    ("svm", LinearSVC(
        C=1.0,
        class_weight="balanced",
        max_iter=10000,
        dual="auto",
        random_state=42
    ))
])

# 2) RBF SVM with PCA (reduce 649 -> ~100 comps to speed up & improve generalization)
# Cap at 100 components, but never ask PCA for more components than the data
# supports: at most n_features, and at most n_samples - 1 (and at least 1).
# The previous max(10, n_samples - 1) floor could exceed that limit on a tiny
# training set and make PCA raise.
n_pca = max(1, min(100, X_train.shape[1], X_train.shape[0] - 1))
svm_pca_rbf = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("pca", PCA(n_components=n_pca, random_state=42)),
    ("svm", SVC(
        kernel="rbf",
        C=1.0,
        gamma="scale",
        class_weight="balanced",
        probability=False,    # keep False for speed
        random_state=42
    ))
])

# Run & collect metrics (each call returns a dict of metrics)
res_lin = evaluate_model("LinearSVC (scaled)", linear_svm, X_train, y_train, X_test, y_test)
res_rbf = evaluate_model(f"SVC RBF + PCA({n_pca})", svm_pca_rbf, X_train, y_train, X_test, y_test)

# Compact summary: one row per model, best macro-F1 first
# (pandas is already imported as pd in the notebook's import cell)
summary = pd.DataFrame([res_lin, res_rbf]).sort_values("macro_f1", ascending=False)
print("\nSummary (sorted by Macro-F1):")
print(summary.to_string(index=False))
