# Patient Analysis

This notebook analyzes patient metadata (age, gender, medication, comments) and beat-class statistics for the MIT-BIH records.

In [7]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import io as sio
from scipy import signal as sps
from scipy import linalg as spl
import matplotlib.pyplot as plt
import seaborn as sns
from os.path import join as osj

import pickle
import wfdb


## Get patient infos

In [8]:
# Directory of the locally mirrored PhysioNet "mitdb" 1.0.0 files, relative to this notebook.
ROOT = osj("..", "physionet.org/files/mitdb/1.0.0")
# Path of the RECORDS file inside ROOT — presumably the list of record names; verify before relying on it.
RECORDS = osj(ROOT, "RECORDS")

In [9]:
def get_ecg_info(patient_ids):
    """
    Collect the free-text header comments of each MIT-BIH record.

    Only the record header is read; the signal samples are not loaded, because
    the comments are stored in the header file.

    Parameters
    ----------
    patient_ids : iterable
        Record identifiers. Each one is converted with ``str`` and resolved
        relative to ``ROOT``.

    Returns
    -------
    dict
        Maps every id to the list of comment lines of its record. For this
        dataset the lines carry age/sex, medication and optional remarks.
    """
    return {
        id_: wfdb.io.rdheader(osj(ROOT, str(id_))).comments
        for id_ in patient_ids
    }

In [10]:
# Read the id list (no header row) and flatten it into a 1-D array.
patient_ids_csv = osj("..", "files", "patient_ids.csv")
patient_ids = pd.read_csv(patient_ids_csv, header=None).to_numpy().reshape(-1)

# Fetch the header comments for every listed record.
ecg_info = get_ecg_info(patient_ids)

In [11]:
# Show the raw header comment lines collected per record id.
ecg_info

{100: ['69 M 1085 1629 x1', 'Aldomet, Inderal'],
 101: ['75 F 1011 654 x1', 'Diapres'],
 102: ['84 F 1525 167 x1',
  'Digoxin',
  'The rhythm is paced with a demand pacemaker.  The PVCs are multiform.'],
 103: ['-1 M 742 654 x1', 'Diapres, Xyloprim'],
 104: ['66 F 1567 694 x1',
  'Digoxin, Pronestyl',
  'The rate of paced rhythm is close to that of the underlying sinus rhythm,',
  'resulting in many pacemaker fusion beats.  The PVCs are multiform.  Several',
  'bursts of muscle noise occur, but the signals are generally of good quality.'],
 105: ['73 F 1624 1629 x1',
  'Digoxin, Nitropaste, Pronestyl',
  'The PVCs are uniform.  The predominant feature of this tape is',
  'high-grade noise and artifact.'],
 106: ['24 F 1581 654 x1', 'Inderal', 'The PVCs are multiform.'],
 107: ['63 M 1340 167 x1',
  'Digoxin',
  'Complete heart block is present. The PVCs are multiform.'],
 108: ['87 F 1227 654 x1',
  'Digoxin, Quinaglute',
  'There is borderline first degree AV block and sinus arrhythmi

In [12]:
dict_patient_info = {}

for patient_id, entries in ecg_info.items():

    # Get age and gender from the first line ("<age> <sex> ...").
    # Splitting on any whitespace run avoids empty tokens when spaces repeat.
    first_entry = entries[0].split()
    # An age of -1 occurs in the data (e.g. record 103) and is kept unchanged;
    # presumably it marks an unknown age — confirm before using it numerically.
    age = int(first_entry[0])
    gender = first_entry[1]

    # Get medication from the second line (comma-separated).
    # Records without that line get an empty list instead of raising IndexError,
    # and blank names are dropped.
    medication_raw = entries[1] if len(entries) > 1 else ""
    medication = [med.strip() for med in medication_raw.split(',') if med.strip()]

    # Get comments: every remaining line, joined into one string.
    # Slicing past the end already yields an empty list, so no length check is needed.
    comments = entries[2:]
    comment_string = " ".join(comments)

    # Store in new dictionary
    dict_patient_info[patient_id] = {
        "age": age,
        "gender": gender,
        "medication": medication,
        "comment": comment_string
    }

In [13]:
# Show the parsed per-patient dictionary (age, gender, medication, comment).
dict_patient_info

{100: {'age': 69,
  'gender': 'M',
  'medication': ['Aldomet', 'Inderal'],
  'comment': ''},
 101: {'age': 75, 'gender': 'F', 'medication': ['Diapres'], 'comment': ''},
 102: {'age': 84,
  'gender': 'F',
  'medication': ['Digoxin'],
  'comment': 'The rhythm is paced with a demand pacemaker.  The PVCs are multiform.'},
 103: {'age': -1,
  'gender': 'M',
  'medication': ['Diapres', 'Xyloprim'],
  'comment': ''},
 104: {'age': 66,
  'gender': 'F',
  'medication': ['Digoxin', 'Pronestyl'],
  'comment': 'The rate of paced rhythm is close to that of the underlying sinus rhythm, resulting in many pacemaker fusion beats.  The PVCs are multiform.  Several bursts of muscle noise occur, but the signals are generally of good quality.'},
 105: {'age': 73,
  'gender': 'F',
  'medication': ['Digoxin', 'Nitropaste', 'Pronestyl'],
  'comment': 'The PVCs are uniform.  The predominant feature of this tape is high-grade noise and artifact.'},
 106: {'age': 24,
  'gender': 'F',
  'medication': ['Inderal'],

In [14]:
def save_patient_infos(dict_patient_info, path=None):
    """
    Pickle the parsed patient dictionary to disk.

    Parameters
    ----------
    dict_patient_info : dict
        The object to serialize; it is passed to ``pickle.dump`` unchanged.
    path : str or path-like, optional
        Destination file. Defaults to ``../files/patient_infos.pkl`` so that
        existing calls without this argument keep writing to the same place.

    Returns
    -------
    None
    """
    if path is None:
        path = osj("..", "files", "patient_infos.pkl")
    with open(path, "wb") as f:
        pickle.dump(dict_patient_info, f)

In [None]:
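# Uncomment to (re)write ../files/patient_infos.pkl; the "Get patient files" section below loads that file.
# save_patient_infos(dict_patient_info)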

## Get patient files

In [None]:
def _read_flat_csv(filename):
    """Read ``../files/<filename>`` without a header row and flatten all cells into a 1-D array."""
    return pd.read_csv(osj("..", "files", filename), header=None).to_numpy().reshape(-1)


patient_ids = _read_flat_csv("patient_ids.csv")

valid_patients = _read_flat_csv("valid_patients.csv")
paced_patients = _read_flat_csv("paced_patients.csv")
excluded_patients = _read_flat_csv("excluded_patients.csv")
# Also read with header=None, so any header names in this file end up as the
# leading entries of the flat array.
class_counts = _read_flat_csv("patient_beat_class_counts.csv")

# A context manager closes the file handle right after loading.
# NOTE: unpickling can execute arbitrary code — only load pickle files you created yourself.
with open(osj("..", "files", "patient_infos.pkl"), "rb") as f:
    patient_infos = pickle.load(f)

In [3]:
# Sizes of the full id list and of the paced / excluded / valid subsets.
len(patient_ids), len(paced_patients), len(excluded_patients), len(valid_patients)

(48, 4, 10, 34)

In [4]:
# Show the record ids loaded from valid_patients.csv.
valid_patients

array([100, 101, 103, 106, 108, 109, 111, 112, 113, 115, 116, 117, 118,
       119, 121, 122, 123, 124, 200, 203, 205, 208, 210, 212, 214, 215,
       219, 220, 221, 228, 230, 231, 232, 233])

## Get class statistics

In [5]:
# class_counts is a flat array: its first three entries are the column names,
# everything after them is row data.
columns, data_only = class_counts[:3], class_counts[3:]

# Cut the flat data back into rows of three values each.
triplets = [data_only[start:start + 3] for start in range(0, len(data_only), 3)]

# Build the frame and cast all three columns to integers in one step.
df_class_counts = pd.DataFrame(triplets, columns=columns).astype(
    {"patient_id": int, "num_healthy": int, "num_arrhythmia": int}
)

df_class_counts

Unnamed: 0,patient_id,num_healthy,num_arrhythmia
0,100,1870,34
1,101,1511,5
2,102,0,4
3,103,1725,2
4,104,0,19
5,105,2112,46
6,106,1222,520
7,107,0,57
8,108,1449,23
9,109,2065,40


In [6]:
# Keep only the rows whose patient id belongs to the valid set.
is_valid = df_class_counts["patient_id"].isin(valid_patients)
valid_class_counts = df_class_counts[is_valid]

# Beat totals per class over the valid patients, and their combined count.
total_healthy = valid_class_counts["num_healthy"].sum()
total_arrhythmia = valid_class_counts["num_arrhythmia"].sum()
total = total_healthy + total_arrhythmia

# Share of each class in percent.
perc_healthy = total_healthy / total * 100
perc_arrhythmia = total_arrhythmia / total * 100

print(
    f"Total number of valid beats: {total}",
    f"Sum of healthy beats: {total_healthy} ({perc_healthy:.2f}%)",
    f"Sum of arrhythmic beats: {total_arrhythmia} ({perc_arrhythmia:.2f}%)",
    sep="\n",
)

Total number of valid beats: 64391
Sum of healthy beats: 56428 (87.63%)
Sum of arrhythmic beats: 7963 (12.37%)
