# Data Analysis Of Medical Data

## 1. Data Analysis

In [3]:
# Data Handling and Analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

### NEW
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



In [None]:
# Load the semicolon-separated export into a DataFrame
file_path = "T49.2_Sep2025_1_StGallen.csv"
df = pd.read_csv(file_path, sep=";")

# First look: (rows, columns) on one line, then the leading rows
print(df.shape, df.head(), sep="\n")

# Relative frequency of each outcome class, expressed in percent
print("Value Counts (in %):")
class_shares = df["OUTCOME_3Kat_KHK"].value_counts(normalize=True).mul(100)
print(class_shares.round(2))



## 2. Machine Learning Models

## 2.1 Logistic Regression

In [18]:

# 2. Features (X) and target (y)
# Every column except the outcome is used as a predictor.
# NOTE(review): confirm the CSV holds no ID or outcome-derived columns,
# otherwise they end up in X as features.
X = df.drop(columns="OUTCOME_3Kat_KHK")
y = df["OUTCOME_3Kat_KHK"]

# 3. Encode categorical features if any
# One-hot encode non-numeric columns; drop_first removes one redundant
# dummy column per categorical feature.
X = pd.get_dummies(X, drop_first=True)

# 4. Train-test split
# 80/20 split, stratified on y so train and test keep the same class
# proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Train multinomial logistic regression model
# `multi_class` is intentionally not passed: the argument is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a target with more
# than two classes is already fitted as a multinomial model by default.
model = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
)
model.fit(X_train, y_train)

# 6. Predictions
y_pred = model.predict(X_test)

# 7. Evaluation
# Confusion matrix: rows are the true classes, columns the predicted ones.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.5584415584415584

Confusion Matrix:
 [[  9  19  45]
 [  4  29  93]
 [  9  34 220]]

Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.12      0.19        73
           1       0.35      0.23      0.28       126
           2       0.61      0.84      0.71       263

    accuracy                           0.56       462
   macro avg       0.46      0.40      0.39       462
weighted avg       0.51      0.56      0.51       462



### Accuracy  
An accuracy of about 56% sounds alright, but we know that **class 2 makes up 57% of the data**.  
Our model therefore isn’t better than a simple **“always class 2” predictor**, which would already reach about 57% accuracy.  

---

### Confusion Matrix  

Class 0 – *No Stenosis* (15.7% of Data)  
- Of the 73 real cases, only **9** are recognized correctly.  
- **45** are incorrectly categorized as class 2.  

Class 1 – *Light Stenosis* (27% of Data)  
- **29 of 126** correct → recall **23%**.  
- Most real class-1 cases are also categorized as class 2.  

Class 2 – *Stenosis* (57% of Data)  
- **220 of 263** correct → recall **84%**.  
- Most misclassified class-2 cases are predicted as class 1 (**34 of 43**).  


In [None]:

# 1. Load data
# Re-read the raw CSV so this cell runs on its own with an unmodified frame.
file_path = "T49.2_Sep2025_1_StGallen.csv"
df = pd.read_csv(file_path, sep=";")

# 2. Features (X) and target (y)
# Every column except the outcome is used as a predictor.
X = df.drop(columns="OUTCOME_3Kat_KHK")
y = df["OUTCOME_3Kat_KHK"]

# 3. Encode categorical features if any
# One-hot encode non-numeric columns; drop_first removes one redundant
# dummy column per categorical feature.
X = pd.get_dummies(X, drop_first=True)

# 4. Train-test split
# Same 80/20 stratified split and seed as the unweighted model, so both
# models are evaluated on the same test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Train multinomial logistic regression model with BALANCED CLASS WEIGHTS
# `multi_class` is intentionally not passed: the argument is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a target with more
# than two classes is already fitted as a multinomial model by default.
model = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
    # NEW MEASURE: weight each class inversely to its frequency in y_train,
    # so errors on the rare classes cost more during fitting.
    class_weight="balanced",
)
model.fit(X_train, y_train)

# 6. Predictions
y_pred = model.predict(X_test)

# 7. Evaluation
# Confusion matrix: rows are the true classes, columns the predicted ones.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))




Accuracy: 0.5324675324675324

Confusion Matrix:
 [[ 10  22  41]
 [  8  36  82]
 [ 16  47 200]]

Classification Report:
               precision    recall  f1-score   support

           0       0.29      0.14      0.19        73
           1       0.34      0.29      0.31       126
           2       0.62      0.76      0.68       263

    accuracy                           0.53       462
   macro avg       0.42      0.39      0.39       462
weighted avg       0.49      0.53      0.50       462

