In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the semicolon-separated CSV. The first row supplies the column
# names and the first column is used as the row index.
df = pd.read_csv(
    'coorteeqsrafva.csv',
    sep=';',
    header=0,
    index_col=0,
)
df.head()

Unnamed: 0,diagnosi,ecg_id,ritmi,patient_id,age,sex,height,weight,nurse,site,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
0,STACH,10900,VA,15654.0,54.0,0,,,0.0,0.0,...,False,,,,,,,6,records100/10000/10900_lr,records500/10000/10900_hr
1,AFLT,10900,AF,15654.0,54.0,0,,,0.0,0.0,...,False,,,,,,,6,records100/10000/10900_lr,records500/10000/10900_hr
2,SR,8209,SR,12281.0,55.0,0,,,1.0,2.0,...,True,,,,,,,10,records100/08000/08209_lr,records500/08000/08209_hr
3,STACH,17620,VA,2007.0,29.0,1,164.0,56.0,7.0,1.0,...,True,,,,,,,1,records100/17000/17620_lr,records500/17000/17620_hr
4,SBRAD,12967,VA,8685.0,57.0,0,,,0.0,0.0,...,False,,", I-AVR,",,,,,1,records100/12000/12967_lr,records500/12000/12967_hr


In [3]:
# Build sub_df from df without the columns listed below; df itself is
# left unchanged because drop() returns a new frame.
sub_df = df.drop(
    columns=[
        'diagnosi',
        'ecg_id',
        'patient_id',
        'nurse',
        'site',
        'device',
        'recording_date',
        'report',
        'scp_codes',
        'infarction_stadium1',
        'infarction_stadium2',
        'initial_autogenerated_report',
        'baseline_drift',
        'static_noise',
        'burst_noise',
        'electrodes_problems',
        'extra_beats',
        'filename_lr',
        'filename_hr',
    ]
)
sub_df.head()

Unnamed: 0,ritmi,age,sex,height,weight,heart_axis,validated_by,second_opinion,validated_by_human,pacemaker,strat_fold
0,VA,54.0,0,,,MID,,False,False,,6
1,AF,54.0,0,,,MID,,False,False,,6
2,SR,55.0,0,,,LAD,1.0,False,True,,10
3,VA,29.0,1,164.0,56.0,,0.0,False,True,,1
4,VA,57.0,0,,,MID,,False,False,,1


In [4]:
# Print a per-column summary of sub_df (non-null counts and dtypes) to see
# which columns contain missing values.
sub_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6428 entries, 0 to 6427
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ritmi               6428 non-null   object 
 1   age                 6394 non-null   float64
 2   sex                 6428 non-null   int64  
 3   height              1866 non-null   float64
 4   weight              2428 non-null   float64
 5   heart_axis          4124 non-null   object 
 6   validated_by        3676 non-null   float64
 7   second_opinion      6428 non-null   bool   
 8   validated_by_human  6428 non-null   bool   
 9   pacemaker           294 non-null    object 
 10  strat_fold          6428 non-null   int64  
dtypes: bool(2), float64(4), int64(2), object(3)
memory usage: 514.7+ KB


In [5]:
# Keep only the rows where both height and weight are present.
# The explicit .copy() makes sub_df an independent frame rather than a
# slice of the previous one, so the column assignments below neither raise
# SettingWithCopyWarning nor depend on view-vs-copy behaviour.
has_height_and_weight = sub_df['height'].notna() & sub_df['weight'].notna()
sub_df = sub_df[has_height_and_weight].copy()

# Fill missing values for age with the mean age of the remaining rows.
# The result is assigned back to the column: calling
# fillna(inplace=True) on a column selection operates on an intermediate
# object, which is deprecated in pandas 2.x and has no effect on sub_df
# when Copy-on-Write is enabled.
sub_df['age'] = sub_df['age'].fillna(sub_df['age'].mean())

# Fill missing values for validated_by with 0.
sub_df['validated_by'] = sub_df['validated_by'].fillna(0)

# Fill missing values for heart_axis with the placeholder 'Missing'.
sub_df['heart_axis'] = sub_df['heart_axis'].fillna('Missing')

# Fill missing values for pacemaker with the placeholder 'Missing'.
sub_df['pacemaker'] = sub_df['pacemaker'].fillna('Missing')

In [6]:
# Print the per-column summary of sub_df again (non-null counts and dtypes)
# to check the state of the frame at this point in the notebook.
sub_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1803 entries, 3 to 6426
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ritmi               1803 non-null   object 
 1   age                 1803 non-null   float64
 2   sex                 1803 non-null   int64  
 3   height              1803 non-null   float64
 4   weight              1803 non-null   float64
 5   heart_axis          1803 non-null   object 
 6   validated_by        1803 non-null   float64
 7   second_opinion      1803 non-null   bool   
 8   validated_by_human  1803 non-null   bool   
 9   pacemaker           1803 non-null   object 
 10  strat_fold          1803 non-null   int64  
dtypes: bool(2), float64(4), int64(2), object(3)
memory usage: 144.4+ KB


In [7]:
# Replace categorical values with integer codes, one column at a time.
# Each entry is (column name, {original value: code}). Series.map turns any
# value that is not a key of its mapping into NaN.
_bool_codes = {False: 0, True: 1}
_column_codes = [
    ('ritmi', {'SR': 0, 'AF': 1, 'VA': 2}),
    ('second_opinion', _bool_codes),
    ('validated_by_human', _bool_codes),
    ('heart_axis', {'Missing': 0, 'LAD': 1, 'AXL': 2, 'MID': 3, 'RAD': 4, 'ALAD': 5, 'AXR': 6}),
    ('pacemaker', {'Missing': 0, 'PACE????, nan': 0, '?, nan': 0, 'ja, pacemaker': 1}),
]
for _column, _codes in _column_codes:
    # .values assigns the mapped codes positionally as a plain array.
    sub_df[_column] = sub_df[_column].map(_codes).values

In [8]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old
# index labels instead of inserting them as a new column.
sub_df = sub_df.reset_index(drop=True)
# Show the resulting frame as the cell output.
sub_df

Unnamed: 0,ritmi,age,sex,height,weight,heart_axis,validated_by,second_opinion,validated_by_human,pacemaker,strat_fold
0,2,29.0,1,164.0,56.0,0,0.0,0,1,0,1
1,0,59.0,0,156.0,75.0,0,0.0,0,1,0,9
2,2,84.0,1,152.0,51.0,0,0.0,0,1,0,7
3,0,79.0,0,172.0,66.0,0,0.0,0,1,0,5
4,1,67.0,0,178.0,73.0,4,0.0,0,1,0,5
...,...,...,...,...,...,...,...,...,...,...,...
1798,0,76.0,0,166.0,67.0,0,0.0,0,1,0,4
1799,2,78.0,1,176.0,76.0,1,4.0,0,1,0,1
1800,2,81.0,0,178.0,70.0,1,0.0,0,1,0,4
1801,2,88.0,0,152.0,45.0,0,0.0,0,1,0,10


In [9]:
# Write sub_df to a CSV file; index=False leaves the row index out of the
# file so only the data columns are written.
sub_df.to_csv('training_11_features.csv', index=False)