In [2]:
import pandas as pd
import panel as pn
import numpy as np
pn.extension('tabulator')
import hvplot.pandas
from matplotlib.figure import Figure

In [3]:
hd_data = pd.read_csv("data/heart_disease.csv", index_col=0)

In [4]:
hd_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200000 entries, 1 to 200000
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Herzinfarkt            200000 non-null  object 
 1   Hoher_Blutdruck        200000 non-null  int64  
 2   Hoher_Cholspiegel      199999 non-null  float64
 3   Cholspiegel_Check      200000 non-null  int64  
 4   BMI                    199998 non-null  float64
 5   Rauchen                199999 non-null  float64
 6   Schlaganfall           199998 non-null  float64
 7   Diabetes               199997 non-null  float64
 8   Phys_Aktivität         199999 non-null  float64
 9   Essgewohnheit_Obst     199999 non-null  float64
 10  Essgewohnheit_Gemuese  199999 non-null  float64
 11  Starker_Alkkonsum      199998 non-null  float64
 12  Gesundheitsvorsorge    200000 non-null  int64  
 13  Fin_Schwierigkeit      199998 non-null  float64
 14  Allg_Gesundheit        199998 non-null  o

In [5]:
for column in hd_data.columns:
    unique_values = hd_data[column].unique()
    print(f"Uniques in '{column}':")
    print(unique_values)
    print()

Uniques in 'Herzinfarkt':
['kein Infarkt' 'Infarkt' 'unbekannt']

Uniques in 'Hoher_Blutdruck':
[  1   0 630 565 379 609]

Uniques in 'Hoher_Cholspiegel':
[  1.   0. 867. 473. 526.  nan 793.]

Uniques in 'Cholspiegel_Check':
[1 0]

Uniques in 'BMI':
[40. 25. 28. 27. 24. 30. 34. 26. 33. 21. 23. 22. 38. 32. 37. 31. 29. 20.
 35. 45. 39. 19. 47. 18. 36. 43. 55. 49. 42. 17. 16. 41. 44. 50. 59. 48.
 52. 46. 54. 57. 53. 14. 15. 51. 58. 63. 61. 56. 74. 62. 64. 66. 73. 85.
 60. 67. 65. nan 70. 82. 79. 92. 68. 72. 88. 96. 13. 81. 71. 75. 12. 77.
 69. 76. 87. 89. 84. 95. 98. 91. 86. 83. 80. 90.]

Uniques in 'Rauchen':
[  1.   0. 266.  nan 796. 816. 833.]

Uniques in 'Schlaganfall':
[ 0.  1. nan]

Uniques in 'Diabetes':
[ 0.  2.  1. nan]

Uniques in 'Phys_Aktivität':
[  0.   1. 404. 815. 891. 480.  nan]

Uniques in 'Essgewohnheit_Obst':
[ 0.  1. nan]

Uniques in 'Essgewohnheit_Gemuese':
[ 1.  0. nan]

Uniques in 'Starker_Alkkonsum':
[ 0.  1. nan]

Uniques in 'Gesundheitsvorsorge':
[1 0]

Uniques i

In [6]:
missing = hd_data.isnull().sum(axis=1)
missing_total = len(missing[missing > 0])
print(f"Total number of NAs: {missing_total}")

Total number of NAs: 25


In [7]:
hd_data = hd_data[~hd_data.astype(str).isin(['unbekannt', 'keine Angabe']).any(axis=1)]

In [8]:
hd_data = hd_data.dropna()

In [9]:
binary_cols = ["Hoher_Blutdruck", "Hoher_Cholspiegel", "Rauchen", "Phys_Aktivität"]

hd_data[binary_cols] = hd_data[binary_cols].applymap(lambda x: x if x in (0 ,1) else np.nan)
hd_data[binary_cols] = hd_data[binary_cols].fillna(pd.NA).astype('Int64')

In [10]:
for column in hd_data.columns:
    unique_values = hd_data[column].unique()
    print(f"Uniques in '{column}':")
    print(unique_values)
    print()

Uniques in 'Herzinfarkt':
['kein Infarkt' 'Infarkt']

Uniques in 'Hoher_Blutdruck':
<IntegerArray>
[0, 1, <NA>]
Length: 3, dtype: Int64

Uniques in 'Hoher_Cholspiegel':
<IntegerArray>
[0, 1, <NA>]
Length: 3, dtype: Int64

Uniques in 'Cholspiegel_Check':
[0 1]

Uniques in 'BMI':
[25. 28. 27. 24. 30. 34. 26. 33. 21. 23. 22. 38. 32. 37. 31. 29. 20. 35.
 45. 39. 19. 40. 47. 18. 36. 43. 55. 49. 42. 17. 16. 41. 44. 50. 59. 48.
 52. 46. 54. 57. 53. 14. 15. 51. 58. 63. 61. 56. 74. 62. 64. 66. 73. 85.
 60. 67. 65. 70. 82. 79. 92. 68. 72. 88. 96. 13. 81. 71. 75. 12. 77. 69.
 76. 87. 89. 84. 95. 98. 91. 86. 83. 80. 90.]

Uniques in 'Rauchen':
<IntegerArray>
[1, 0, <NA>]
Length: 3, dtype: Int64

Uniques in 'Schlaganfall':
[0. 1.]

Uniques in 'Diabetes':
[0. 2. 1.]

Uniques in 'Phys_Aktivität':
<IntegerArray>
[1, 0, <NA>]
Length: 3, dtype: Int64

Uniques in 'Essgewohnheit_Obst':
[0. 1.]

Uniques in 'Essgewohnheit_Gemuese':
[0. 1.]

Uniques in 'Starker_Alkkonsum':
[0. 1.]

Uniques in 'Gesundheitsvor

In [11]:
hd_data = hd_data.dropna()
hd_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199828 entries, 2 to 200000
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Herzinfarkt            199828 non-null  object 
 1   Hoher_Blutdruck        199828 non-null  Int64  
 2   Hoher_Cholspiegel      199828 non-null  Int64  
 3   Cholspiegel_Check      199828 non-null  int64  
 4   BMI                    199828 non-null  float64
 5   Rauchen                199828 non-null  Int64  
 6   Schlaganfall           199828 non-null  float64
 7   Diabetes               199828 non-null  float64
 8   Phys_Aktivität         199828 non-null  Int64  
 9   Essgewohnheit_Obst     199828 non-null  float64
 10  Essgewohnheit_Gemuese  199828 non-null  float64
 11  Starker_Alkkonsum      199828 non-null  float64
 12  Gesundheitsvorsorge    199828 non-null  int64  
 13  Fin_Schwierigkeit      199828 non-null  float64
 14  Allg_Gesundheit        199828 non-null  o

In [12]:
ihd = hd_data.interactive()

In [13]:
hd_data.hvplot.box(y='BMI',
                   by='Herzinfarkt')


In [14]:
hd_data['Herzinfarkt'].value_counts().hvplot.bar(title='Verteilung Herzinfarkt')

In [15]:
hd_data['BMI'].hvplot.hist(title='Verteilung BMI', bins=30)

## Split train & test

In [16]:
from sklearn.model_selection import train_test_split
from pandas import get_dummies

y = hd_data["Herzinfarkt"].values
hd_x = pd.get_dummies(hd_data.drop("Herzinfarkt", axis = 1))
x = hd_x.values

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = .1, random_state = 187)

## Downsample

In [17]:
sample_size = 5000

# Downsample hd_data DataFrame based on the 'Herzinfarkt' column
downsampled_hd_data = hd_data.groupby('Herzinfarkt').apply(lambda x: x.sample(n=sample_size)).reset_index(drop=True)

# Split the downsampled dataset into features (x) and target (y)
downsampled_y = downsampled_hd_data['Herzinfarkt'].values
downsampled_x = pd.get_dummies(downsampled_hd_data.drop('Herzinfarkt', axis=1)).values

# Split the downsampled dataset into training and testing sets
downsampled_x_train, downsampled_x_test, downsampled_y_train, downsampled_y_test = train_test_split(downsampled_x, downsampled_y, train_size=0.1, random_state=187)


## Random Forest Model

In [18]:
from sklearn.ensemble import RandomForestClassifier

steps = [("rf", RandomForestClassifier(n_estimators = 100))]

### Setting up a pipeline

In [19]:
from sklearn.pipeline import Pipeline

# Create the pipeline
pipeline = Pipeline(steps)

# Fit the pipeline on the downsampled training data
pipeline.fit(downsampled_x_train, downsampled_y_train)

# Predict the target variable on the testing data
y_pred = pipeline.predict(x_test)

### Evaluation

In [20]:
from sklearn.metrics import classification_report

# Generate predictions on the testing data
y_pred = pipeline.predict(x_test)

# Generate the classification report
rf_report = classification_report(y_test, y_pred)
print(rf_report)

              precision    recall  f1-score   support

     Infarkt       0.23      0.77      0.35     16988
kein Infarkt       0.97      0.73      0.83    162858

    accuracy                           0.73    179846
   macro avg       0.60      0.75      0.59    179846
weighted avg       0.90      0.73      0.79    179846



## Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

steps = [("logreg", LogisticRegression())]

# Pipeline
pipeline = Pipeline(steps)
pipeline.fit(downsampled_x_train, downsampled_y_train)

#Evaluation
y_pred = pipeline.predict(x_test)
lr_report = classification_report(y_test, y_pred)
print(lr_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

     Infarkt       0.25      0.74      0.38     16988
kein Infarkt       0.97      0.77      0.86    162858

    accuracy                           0.77    179846
   macro avg       0.61      0.76      0.62    179846
weighted avg       0.90      0.77      0.81    179846



## Decision Tree

In [22]:
from sklearn.tree import DecisionTreeClassifier

# Define the desired parameters for the decision tree classifier
parameters = {
    'max_depth': 5,  # Maximum depth of the tree (default: None)
    'criterion': 'gini',  # Criterion for splitting: 'gini' or 'entropy' (default: 'gini')
    'class_weight': None  # Option to enable class weights for imbalanced data
}

# Create the decision tree classifier with the desired parameters
decision_tree = DecisionTreeClassifier(**parameters)

# Update the pipeline step with the decision tree classifier
steps = [("decision_tree", decision_tree)]
pipeline = Pipeline(steps)

# Fit the pipeline on the downsampled training data
pipeline.fit(downsampled_x_train, downsampled_y_train)

# Predict the target variable on the testing data
y_pred = pipeline.predict(x_test)

# Generate the classification report
dt_report = classification_report(y_test, y_pred)
print(dt_report)

              precision    recall  f1-score   support

     Infarkt       0.20      0.73      0.31     16988
kein Infarkt       0.96      0.69      0.80    162858

    accuracy                           0.69    179846
   macro avg       0.58      0.71      0.56    179846
weighted avg       0.89      0.69      0.76    179846



## XGBoost

In [23]:
from xgboost import XGBClassifier

# Convert string labels to integers
label_mapping = {'kein Infarkt': 0, 'Infarkt': 1}
downsampled_y_train_bool = [label_mapping[label] for label in downsampled_y_train]

# Define the desired parameters for the XGBoost classifier
parameters = {
    'n_estimators': 100,  # Number of trees (default: 100)
    'learning_rate': 0.1,  # Learning rate (default: 0.1)
    'max_depth': 3,  # Maximum depth of each tree (default: 3)
    'reg_alpha': 0,  # L1 regularization term on weights (default: 0)
    'reg_lambda': 1,  # L2 regularization term on weights (default: 1)
    'scale_pos_weight': 1  # Control the balance of positive and negative weights (default: 1)
}

# Create the XGBoost classifier with the desired parameters
xgb = XGBClassifier(**parameters)

# Update the pipeline step with the XGBoost classifier
steps = [("xgb", xgb)]
pipeline = Pipeline(steps)

# Fit the pipeline on the downsampled training data
pipeline.fit(downsampled_x_train, downsampled_y_train_bool)

# Predict the target variable on the testing data
y_pred = pipeline.predict(x_test)

predicted_labels = ['Infarkt' if label == 1 else 'kein Infarkt' for label in y_pred]
# Generate the classification report
xgb_report = classification_report(y_test, predicted_labels)
print(xgb_report)

              precision    recall  f1-score   support

     Infarkt       0.25      0.74      0.37     16988
kein Infarkt       0.97      0.77      0.86    162858

    accuracy                           0.76    179846
   macro avg       0.61      0.75      0.61    179846
weighted avg       0.90      0.76      0.81    179846



In [24]:
print(f'Random Forest:\n{rf_report}')
print(f'Logistic Regression:\n{lr_report}')
print(f'Decision Tree:\n{dt_report}')
print(f'XGBoost:\n{xgb_report}')

Random Forest:
              precision    recall  f1-score   support

     Infarkt       0.23      0.77      0.35     16988
kein Infarkt       0.97      0.73      0.83    162858

    accuracy                           0.73    179846
   macro avg       0.60      0.75      0.59    179846
weighted avg       0.90      0.73      0.79    179846

Logistic Regression:
              precision    recall  f1-score   support

     Infarkt       0.25      0.74      0.38     16988
kein Infarkt       0.97      0.77      0.86    162858

    accuracy                           0.77    179846
   macro avg       0.61      0.76      0.62    179846
weighted avg       0.90      0.77      0.81    179846

Decision Tree:
              precision    recall  f1-score   support

     Infarkt       0.20      0.73      0.31     16988
kein Infarkt       0.96      0.69      0.80    162858

    accuracy                           0.69    179846
   macro avg       0.58      0.71      0.56    179846
weighted avg       0.89

## Skipping things

In [28]:
lr_model = pipeline.named_steps['logreg']

In [29]:
lr_model