# Data Analysis Of Medical Data

## 1. Data Analysis

In [26]:
# Data Handling and Analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

###
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier



In [None]:
# Load the semicolon-separated export and take a first look at it
file_path = "T49.2_Sep2025_1_StGallen.csv"
df = pd.read_csv(file_path, sep=";")

# Dimensions first, then the leading rows
print(df.shape, df.head(), sep="\n")

# Relative frequency of each outcome class, expressed in percent
print("Value Counts (in %):")
class_shares = df["OUTCOME_3Kat_KHK"].value_counts(normalize=True).mul(100)
print(class_shares.round(2))



## 2. Machine Learning Models

## 2.1 Logistic Regression

1. FIRST TRY

In [None]:

# Baseline: plain multinomial logistic regression on all available features.
# Expects `df` to have been loaded by the data-loading cell above.

# 2. Features (X) and target (y): every column except the outcome is a predictor
X = df.drop("OUTCOME_3Kat_KHK", axis=1)
y = df["OUTCOME_3Kat_KHK"]

# 3. Encode categorical features if any (drop_first drops one dummy per feature)
X = pd.get_dummies(X, drop_first=True)

# 4. Train-test split (80/20, stratified so each split keeps the class proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Train multinomial logistic regression model.
#    `multi_class` is deliberately not passed: the argument is deprecated in
#    recent scikit-learn releases, and with the lbfgs solver a target with more
#    than two classes is fitted as a multinomial model by default anyway.
model = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
)
model.fit(X_train, y_train)

# 6. Predictions
y_pred = model.predict(X_test)

# 7. Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

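### Accuracy  
An accuracy of about 56% (258 of 462 test cases correct) sounds alright, but we know that **class 2 makes up 57% of the data**.  
Our model is therefore no better than a trivial **“always class 2” predictor**, which would reach roughly 57% accuracy on its own.  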

---

### Confusion Matrix  

Class 0 – *No Stenosis* (15.7% of Data)  
- Of the 73 real cases, only **9** are recognized correctly → recall **12%**.  
- **45** are incorrectly categorized as class 2; the remaining **19** are categorized as class 1.  

Class 1 – *Light Stenosis* (27% of Data)  
- **29 of 126** correct → recall **23%**.  
- Most real class-1 cases are also categorized as class 2.  

Class 2 – *Stenosis* (57% of Data)  
- **220 of 263** correct → recall **84%**.  
- Most misclassifications are confused with class 1.  


2. SECOND TRY

In [None]:

# Second attempt: same pipeline as the first try, but with balanced class
# weights so that errors on the rarer classes count more during training.

# 1. Load data (re-read here so this cell does not depend on earlier cells)
file_path = "T49.2_Sep2025_1_StGallen.csv"
df = pd.read_csv(file_path, sep=";")

# 2. Features (X) and target (y)
X = df.drop("OUTCOME_3Kat_KHK", axis=1)
y = df["OUTCOME_3Kat_KHK"]

# 3. Encode categorical features if any
X = pd.get_dummies(X, drop_first=True)

# 4. Train-test split (same seed and stratification as the first try,
#    so both attempts are evaluated on the same test rows)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Train multinomial logistic regression model with BALANCED CLASS WEIGHTS.
#    `multi_class` is deliberately not passed: the argument is deprecated in
#    recent scikit-learn releases, and with the lbfgs solver a target with more
#    than two classes is fitted as a multinomial model by default anyway.
model = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
    class_weight="balanced",  # NEW MEASURE: weight classes inversely to their frequency
)
model.fit(X_train, y_train)

# 6. Predictions
y_pred = model.predict(X_test)

# 7. Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


## 2.2 XGBoost

In [27]:


# Gradient-boosted trees on the same split as the logistic regression.
# Relies on X_train / X_test / y_train / y_test created by the
# logistic-regression cells above.

# 1. Train XGBoost classifier
#    - `use_label_encoder` is no longer passed: current XGBoost does not use it
#      and only emits a "Parameters ... are not used" warning.
#    - `scale_pos_weight` is no longer passed: it was set to None (the default)
#      and, per the XGBoost docs, is a positive/negative balance knob for binary
#      targets. NOTE(review): for this three-class target, class imbalance would
#      instead be handled via `sample_weight` in fit() - confirm before adding.
xgb_model = XGBClassifier(
    n_estimators=200,        # number of trees
    learning_rate=0.1,       # shrinkage step
    max_depth=5,             # tree depth
    subsample=0.8,           # fraction of rows sampled per tree
    colsample_bytree=0.8,    # fraction of features sampled per tree
    random_state=42,
    eval_metric="mlogloss",  # multiclass log loss
)
xgb_model.fit(X_train, y_train)

# 2. Predictions
y_pred_xgb = xgb_model.predict(X_test)

# 3. Evaluation
print("Accuracy (XGBoost):", accuracy_score(y_test, y_pred_xgb))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.



Accuracy (XGBoost): 0.5692640692640693

Confusion Matrix:
 [[  9  21  43]
 [  6  20 100]
 [  7  22 234]]

Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.12      0.19        73
           1       0.32      0.16      0.21       126
           2       0.62      0.89      0.73       263

    accuracy                           0.57       462
   macro avg       0.45      0.39      0.38       462
weighted avg       0.50      0.57      0.50       462

