Import Libraries

In [21]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

Data Preprocessing

In [74]:
# Data collection and step 0 (removing unnecessary features) ->
# Read the encoded heart-disease CSV and immediately drop the `id` column:
# it is only a row identifier and carries no meaningful information for model training.
df = (
    pd.read_csv("../data/raw/heart_disease_encoded.csv")
    .drop(columns=["id"])
)

In [None]:
# 1. HANDLING MISSING VALUES (NaN) ->

# Number of missing values for each column. Wrapped in display() because a bare
# expression is only echoed when it is the LAST statement of a cell; in the middle
# of a cell its result would be silently discarded.
display(df.isna().sum())
# result : the following columns have missing values ->
# trestbps, chol, fbs, restecg, thalch, exang, oldpeak, slope, ca, thal

# Columns observed to contain NaNs (kept as a record of the inspection above).
missing_features = ['trestbps','chol','fbs','restecg','thalch','exang','oldpeak','slope','ca','thal']
# NOTE(review): the label column is accessed as 'num(target)' in the train-test split
# step, so the 'target' entry below presumably matches no column - confirm the intended name.
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']
# Numeric features; this list is reused for outlier capping and min-max scaling.
continuous_features = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']


# CONTEXT -> the plots in the univariate analysis folder of the images directory show that none of the
#            features follow a normal distribution.
# INFERENCE -> median -> measure of central tendency for continuous features (robust to skew and outliers)
#              mode -> measure of central tendency for categorical features

# function to handle missing values ->
def fill_missing_values(df, categorical_f, continuous_f):
    """Impute missing values in ``df`` in place.

    Categorical columns are filled with their most frequent value (mode) and
    continuous columns with their median. Columns that appear in neither
    collection are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.
    categorical_f : iterable of column names
        Columns to fill with the column mode.
    continuous_f : iterable of column names
        Columns to fill with the column median.

    Returns
    -------
    None
        The frame is updated in place.
    """
    categorical = set(categorical_f)
    continuous = set(continuous_f)

    for feature in df.columns:
        column = df[feature]
        if feature in categorical:
            # mode() ignores NaN, so a column that is entirely NaN yields an empty
            # result; indexing it would raise. There is nothing to impute with,
            # so leave such a column unchanged.
            modes = column.mode()
            if modes.empty:
                continue
            # When several values tie, mode() returns them sorted; take the first.
            df[feature] = column.fillna(value=modes.iloc[0])
        elif feature in continuous:
            # For an all-NaN column the median is NaN and fillna is a no-op.
            df[feature] = column.fillna(value=column.median())

# Impute the frame in place, then re-count NaNs per column to verify the result
# (every column that had missing values is covered by one of the two lists).
fill_missing_values(
    df,
    categorical_f=categorical_features,
    continuous_f=continuous_features,
)

df.isnull().sum()

In [None]:
# 2. DROPPING ANY DUPLICATE ROWS IN THE df ->
print(df.shape)  # originally -> there were 920 rows

# drop_duplicates() keeps the surviving rows' original index labels, leaving gaps,
# so the index is rebuilt as a clean 0..n-1 range in the same expression.
df = df.drop_duplicates().reset_index(drop=True)

print(df.shape)  # current -> there are 918 rows -> 2 duplicate rows are dropped

In [None]:
# 3. OUTLIER HANDLING -> IQR method: values beyond the IQR fences are capped at the fence value
#    -> keeps extreme values from biasing the model while retaining every row
# NOTE(review): the fences are computed on the full frame, i.e. before the train-test
# split performed later in this notebook - consider deriving them from the training
# split only, for the same leakage reason given for min-max scaling.

# concept -> only continuous features can have outlier values ->
print(continuous_features)
display(df.describe())  # summary statistics before capping

for feature in continuous_features:
    column = df[feature]

    # fences: 1.5 * IQR below the first quartile and above the third quartile
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread

    # boolean masks of the values lying outside the fences (computed before capping)
    above = column > upper_bound
    below = column < lower_bound
    n_above = above.sum()
    n_below = below.sum()

    # replace out-of-range values with the nearest fence, keep everything else
    df[feature] = np.where(above, upper_bound, np.where(below, lower_bound, column))

    print(f"{feature} IQR range between {lower_bound:.2f} and {upper_bound:.2f}")
    print(f"{feature}: capped {n_above} upper and {n_below} lower values")
    print("-"*30)

display(df.describe())  # summary statistics after capping

In [None]:
# 4. TRAIN-TEST SPLITTING OF DATA ->

# Class balance of the target label, in percent. Wrapped in display() because a bare
# expression in the middle of a cell is evaluated but never shown.
display(df['num(target)'].value_counts(normalize=True) * 100)
# there is a class imbalance in the label -> there is a considerable difference in sample size
# b/w class 0 and class 4 -> we need to take that into consideration

display(df.shape)
X = df.drop(columns=['num(target)'])  # X contains the features for model training
display(X.shape)
y = df['num(target)']  # y contains the target label

# train_test_split is imported in the import cell at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,     # hold out 20% of the rows for testing
    random_state=42,   # fixed seed so the split is reproducible
    stratify=y,        # keep class proportions equal in both splits (label is imbalanced)
)

# sanity check: every row ended up in exactly one of the two splits
assert len(X_train) + len(X_test) == len(X)
assert len(y_train) + len(y_test) == len(y)

In [None]:
# 5. MIN-MAX FEATURE SCALING -> scaling continuous features to values from 0 to 1
# concept -> never apply min-max scaling before splitting -> the scaler needs the min and max values,
# and learning them from rows that end up in the test set leaks information and inflates accuracy

# preview of the frames to scale (head only, to keep the output short)
display(X_train.head(), X_test.head())
print(continuous_features)  # these are the columns we want to scale

# The frames returned by train_test_split are selections taken from X; assigning
# columns on them can raise SettingWithCopyWarning on pandas versions without
# copy-on-write. Taking explicit copies makes the assignment below unambiguous.
X_train = X_train.copy()
X_test = X_test.copy()

# MinMaxScaler is imported in the import cell at the top of the notebook.
scaler = MinMaxScaler(feature_range=(0, 1))

# learn min and max values from the training data only
scaler.fit(X_train[continuous_features])

# Apply the training-set min/max to both splits. Test values outside the training
# range will therefore map slightly outside [0, 1].
X_train[continuous_features] = scaler.transform(X_train[continuous_features])
X_test[continuous_features] = scaler.transform(X_test[continuous_features])

display(X_train.head())
display(X_test.head())

In [None]:
# 6. HANDLING CLASS IMBALANCE -> using class weights
class_counts = y_train.value_counts()  # samples per class in the training labels
display(class_counts)
# our training dataset is imbalanced -> class 4 is a minority class

total_count = len(y_train)  # number of training samples
k = y_train.nunique()       # number of distinct classes

display(class_counts,total_count,k)

# weight of a class = total samples / (number of classes * samples in that class),
# so the rarer a class is, the larger its weight
class_weights = {
    label: total_count / (k * n_samples)
    for label, n_samples in class_counts.items()
}

class_weights