Import Libraries

In [10]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

Data Preprocessing

In [11]:
# data collection + 0. removing unnecessary features ->
# Read the encoded heart-disease CSV and discard the 'id' column in one step:
# 'id' is only a row identifier and carries no meaningful information for model training.
df = pd.read_csv("../data/raw/heart_disease_encoded.csv").drop(columns=['id'])

In [12]:
# 1. HANDLING MISSING VALUES (NaN) ->

# Number of missing values for each column. display() is required here: a bare
# expression in the middle of a cell is evaluated and then silently discarded,
# only the last expression of a cell is rendered.
display(df.isna().sum())
# result : the following columns have missing values ->
# trestbps, chol, fbs, restecg, thalch, exang, oldpeak, slope, ca, thal

# columns that contain at least one NaN (recorded from the audit above)
missing_features = ['trestbps','chol','fbs','restecg','thalch','exang','oldpeak','slope','ca','thal']
# NOTE(review): 'target' does not match any column name (the label column is 'num(target)'),
# so this entry is ignored by any membership test against df.columns — confirm that is intended.
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']
# numeric features; these are also the ones capped for outliers and min-max scaled later on
continuous_features = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']


# CONTEXT -> the univariate-analysis plots in the images directory show that none of the
#            features follow a normal distribution.
# INFERENCE -> median -> measure of central tendency for continuous features
#                        (robust to skew and outliers, unlike the mean)
#              mode -> measure of central tendency for categorical features

# function to handle missing values ->
def fill_missing_values(df,categorical_f,continuous_f):
    """Impute missing values in ``df`` in place.

    Columns listed in ``categorical_f`` are filled with their mode (the smallest
    one when several values tie); columns listed in ``continuous_f`` are filled
    with their median. Columns that appear in neither list are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.
    categorical_f : collection of str
        Names of the categorical columns.
    continuous_f : collection of str
        Names of the continuous columns.

    Returns
    -------
    None
    """
    for feature in df.columns:
        column = df[feature]
        if not column.isna().any():
            continue  # nothing to impute in this column

        if feature in categorical_f:
            modes = column.mode()
            if modes.empty:
                # every value is missing, so there is no mode to fall back on;
                # leave the column as-is instead of failing on modes[0]
                continue
            fill_value = modes.iloc[0]
        elif feature in continuous_f:
            # median of an all-NaN column is NaN, which makes fillna a no-op
            fill_value = column.median()
        else:
            continue

        df[feature] = column.fillna(value=fill_value)

# impute every listed feature in place, then re-count the NaNs per column to confirm the result
fill_missing_values(
    df,
    categorical_f=categorical_features,
    continuous_f=continuous_features,
)

df.isna().sum()

age            0
sex            0
cp             0
trestbps       0
chol           0
fbs            0
restecg        0
thalch         0
exang          0
oldpeak        0
slope          0
ca             0
thal           0
num(target)    0
dtype: int64

In [13]:
# 2. DROPPING ANY DUPLICATE ROWS IN THE df ->
print(df.shape)
# originally -> there were 920 rows

# drop_duplicates() keeps the original row labels of the surviving rows, which leaves
# gaps in the index -> rebuild a clean 0..n-1 index right after dropping
df = df.drop_duplicates().reset_index(drop=True)

print(df.shape)
# current -> there are 918 rows -> 2 duplicate rows are dropped

(920, 14)
(918, 14)


In [14]:
# 3. OUTLIER HANDLING -> IQR method: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are capped
#                        to the nearest boundary instead of being dropped
#                     -> keeps extreme values from biasing the model while keeping every row

# concept -> only continuous features can have outlier values ->
print(continuous_features)

# NOTE(review): in the recorded run 172 'chol' values fell below the lower bound; that many
# identical low values may be placeholders for missing measurements rather than genuine
# outliers — verify against the raw data.
for feature in continuous_features:
    # both quartiles in a single call, then the boundaries derived from them
    Q1, Q3 = df[feature].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    LB = Q1 - 1.5*IQR
    UB = Q3 + 1.5*IQR

    # count the offending values before they are overwritten
    capped_upper = df[feature].gt(UB).sum()
    capped_lower = df[feature].lt(LB).sum()

    # clip to the boundaries; the explicit float cast keeps the column dtype float64
    # even when no value of an integer column needed capping
    df[feature] = df[feature].clip(lower=LB, upper=UB).astype("float64")

    print(f"{feature} IQR range between {LB:.2f} and {UB:.2f}")
    print(f"{feature}: capped {capped_upper} upper and {capped_lower} lower values")
    print("-"*30)

['age', 'trestbps', 'chol', 'thalch', 'oldpeak']
age IQR range between 27.50 and 79.50
age: capped 0 upper and 0 lower values
------------------------------
trestbps IQR range between 90.00 and 170.00
trestbps: capped 26 upper and 2 lower values
------------------------------
chol IQR range between 42.62 and 401.62
chol: capped 13 upper and 172 lower values
------------------------------
thalch IQR range between 66.38 and 209.38
thalch: capped 0 upper and 2 lower values
------------------------------
oldpeak IQR range between -2.25 and 3.75
oldpeak: capped 15 upper and 1 lower values
------------------------------


In [15]:
# 4. TRAIN-TEST SPLITTING OF DATA ->

# checking class imbalance for target label (percentage of rows per class) ->
# display() is required: a bare expression in the middle of a cell is never rendered
display(df['num(target)'].value_counts(normalize=True)*100)
# there is a class imbalance in the label -> there is a considerable difference in sample size
# b/w class 0 and class 4 -> we need to take that into consideration

# NOTE(review): the imputation values (step 1) and the IQR bounds (step 3) were computed on the
# full dataframe before this split, so test rows contributed to those statistics — the same kind
# of leakage the scaling step is careful to avoid. Consider moving those steps after the split.

display(df.shape)
X = df.drop(columns=['num(target)']) # X contains the features for model training
display(X.shape)
y = df['num(target)'] # y contains the target label

# 80/20 split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y, # keep the class proportions identical in both splits since the label is imbalanced
)

(918, 14)

(918, 13)

In [16]:
# 5. MIN-MAX FEATURE SCALING -> scaling continuous features to values from 0 to 1
# concept -> never apply min-max scaling before splitting -> scaling needs the min and max values,
# and taking them from the full dataset leaks test information into training and gives
# unrealistically high accuracy

display(X_train, X_test) # these are the dataframes to scale
print(continuous_features) # these are the features in the dataframe which we want to scale

scaler = MinMaxScaler(feature_range=(0,1))

# The frames returned by train_test_split are selections taken from X; assigning columns
# to them directly can raise pandas' SettingWithCopyWarning, so work on explicit copies.
X_train = X_train.copy()
X_test = X_test.copy()

# learn min and max from the training data only, then reuse them for the test data
# (test values outside the training range therefore land outside [0, 1])
X_train[continuous_features] = scaler.fit_transform(X_train[continuous_features])
X_test[continuous_features] = scaler.transform(X_test[continuous_features])

display(X_train)
display(X_test)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
636,53.0,1,2,120.0,42.625,0.0,0.0,95.0,0.0,0.0,2.0,0.0,3.0
352,41.0,1,2,120.0,295.000,0.0,0.0,170.0,0.0,0.0,2.0,0.0,3.0
136,70.0,1,4,145.0,174.000,0.0,0.0,125.0,1.0,2.6,3.0,0.0,7.0
40,65.0,0,4,150.0,225.000,0.0,2.0,114.0,0.0,1.0,2.0,3.0,7.0
529,39.0,1,4,110.0,280.000,0.0,0.0,150.0,0.0,0.0,2.0,0.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,64.0,1,3,125.0,309.000,0.0,0.0,131.0,1.0,1.8,2.0,0.0,7.0
230,52.0,0,3,136.0,196.000,0.0,2.0,169.0,0.0,0.1,2.0,0.0,3.0
769,55.0,1,3,120.0,42.625,0.0,1.0,125.0,1.0,2.5,2.0,0.0,7.0
281,47.0,1,3,130.0,253.000,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
528,38.0,1,4,110.0,223.000,0.0,0.0,150.0,1.0,1.0,2.0,0.0,3.0
710,68.0,1,4,135.0,42.625,0.0,1.0,120.0,1.0,0.0,1.0,0.0,7.0
917,62.0,1,2,120.0,254.000,0.0,2.0,93.0,1.0,0.0,2.0,0.0,3.0
146,57.0,1,4,165.0,289.000,1.0,2.0,124.0,0.0,1.0,2.0,3.0,7.0
886,69.0,1,3,130.0,271.000,0.0,2.0,140.0,0.0,0.5,2.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,51.0,1,1,125.0,213.000,0.0,2.0,125.0,1.0,1.4,1.0,1.0,3.0
571,55.0,1,2,160.0,292.000,1.0,0.0,143.0,1.0,2.0,2.0,0.0,3.0
686,61.0,1,4,150.0,42.625,0.0,0.0,117.0,1.0,2.0,2.0,0.0,7.0
661,57.0,1,4,140.0,42.625,0.0,0.0,120.0,1.0,2.0,2.0,0.0,6.0


['age', 'trestbps', 'chol', 'thalch', 'oldpeak']


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
636,0.510204,1,2,0.3750,0.000000,0.0,0.0,0.211060,0.0,0.375000,2.0,0.0,3.0
352,0.265306,1,2,0.3750,0.702994,0.0,0.0,0.764055,0.0,0.375000,2.0,0.0,3.0
136,0.857143,1,4,0.6875,0.365947,0.0,0.0,0.432258,1.0,0.808333,3.0,0.0,7.0
40,0.755102,0,4,0.7500,0.508008,0.0,2.0,0.351152,0.0,0.541667,2.0,3.0,7.0
529,0.224490,1,4,0.2500,0.661212,0.0,0.0,0.616590,0.0,0.375000,2.0,0.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,0.734694,1,3,0.4375,0.741992,0.0,0.0,0.476498,1.0,0.675000,2.0,0.0,7.0
230,0.489796,0,3,0.5750,0.427228,0.0,2.0,0.756682,0.0,0.391667,2.0,0.0,3.0
769,0.551020,1,3,0.3750,0.000000,0.0,1.0,0.432258,1.0,0.791667,2.0,0.0,7.0
281,0.387755,1,3,0.5000,0.586003,0.0,0.0,0.830415,0.0,0.375000,1.0,0.0,3.0


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
528,0.204082,1,4,0.2500,0.502437,0.0,0.0,0.616590,1.0,0.541667,2.0,0.0,3.0
710,0.816327,1,4,0.5625,0.000000,0.0,1.0,0.395392,1.0,0.375000,1.0,0.0,7.0
917,0.693878,1,2,0.3750,0.588788,0.0,2.0,0.196313,1.0,0.375000,2.0,0.0,3.0
146,0.591837,1,4,0.9375,0.686281,1.0,2.0,0.424885,0.0,0.541667,2.0,3.0,7.0
886,0.836735,1,3,0.5000,0.636142,0.0,2.0,0.542857,0.0,0.458333,2.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,0.469388,1,1,0.4375,0.474582,0.0,2.0,0.432258,1.0,0.608333,1.0,1.0,3.0
571,0.551020,1,2,0.8750,0.694638,1.0,0.0,0.564977,1.0,0.708333,2.0,0.0,3.0
686,0.673469,1,4,0.7500,0.000000,0.0,0.0,0.373272,1.0,0.708333,2.0,0.0,7.0
661,0.591837,1,4,0.6250,0.000000,0.0,0.0,0.395392,1.0,0.708333,2.0,0.0,6.0


In [17]:
# 6. HANDLING CLASS IMBALANCE -> using class weights
# weight of a class = total_samples / (number_of_classes * samples_in_that_class)
# -> the rarer the class, the larger its weight
class_counts = y_train.value_counts()
display(class_counts)
# our training dataset is imbalanced -> class 4 is a minority class

total_count = len(y_train)
k = len(class_counts) # number of distinct classes in the training labels

display(class_counts,total_count,k)

class_weights = {
    cls: total_count / (k * count)
    for cls, count in class_counts.items()
}

class_weights

num(target)
0    328
1    212
3     86
2     86
4     22
Name: count, dtype: int64

num(target)
0    328
1    212
3     86
2     86
4     22
Name: count, dtype: int64

734

5

{0: 0.4475609756097561,
 1: 0.6924528301886792,
 3: 1.7069767441860466,
 2: 1.7069767441860466,
 4: 6.672727272727273}

In [19]:
# SAVING THE TRAIN AND TESTING DATASET ->

# to_csv does not create missing folders, so make sure the target directory exists first
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# One CSV per split: X_train / X_test hold the scaled features, y_train / y_test the labels.
# index=False -> the row labels are not written to the files.
splits = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
}
for split_name, split_data in splits.items():
    split_data.to_csv(processed_dir / f"{split_name}.csv", index=False)