# 1. Data Exploration

In [None]:
import pandas as pd
# Load the dataset from a CSV file in the working directory into `df`,
# the DataFrame that every later cell reads and modifies.
df = pd.read_csv('heart_disease_uci.csv')

In [None]:
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [None]:
# Print column dtypes and non-null counts to see which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


# 2. Handling Missing Data

In [None]:
# Turn literal '?' cells into pd.NA so they count as missing values;
# the frame is modified in place.
df.replace(to_replace='?', value=pd.NA, inplace=True)

In [None]:
# Count the non-missing entries in each column.
df.notnull().sum()

Unnamed: 0,0
id,920
age,920
sex,920
dataset,920
cp,920
trestbps,861
chol,890
fbs,830
restecg,918
thalch,865


In [None]:
# Impute the missing values of 'ca' and 'thal' with each column's own most
# frequent value (its mode).
for col in ['ca', 'thal']:
    modes = df[col].mode()
    # mode() is empty when the column has no non-missing values; in that case
    # there is nothing to impute with, so the column is left untouched.
    if modes.empty:
        continue
    # Assign the filled column back to the frame. Calling
    # df[col].fillna(..., inplace=True) operates on an intermediate object,
    # which pandas warns about and which stops updating `df` in pandas 3.0.
    df[col] = df[col].fillna(modes[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True) # filling missing with mode of row


In [None]:
# Drop, in place, every row that still has at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

# 3. Feature Creation

In [None]:
# Bucket 'age' into three labelled groups.
# right=True means each bin includes its upper edge, so the intervals are
# (0, 40], (40, 60] and (60, 100]: an age of exactly 40 lands in '<40' and an
# age of exactly 60 lands in '40-60'.
bins = [0, 40, 60, 100]
labels = ['<40', '40-60', '>60']
df['AgeGroup'] = pd.cut(
    x=df['age'],
    bins=bins,
    labels=labels,
    right=True,
)

In [None]:
def cholesterol_level(chol):
    """Classify a cholesterol reading into 'Low', 'Normal' or 'High'.

    Parameters
    ----------
    chol : number or missing value
        The cholesterol reading to classify.

    Returns
    -------
    str or pd.NA
        'Low' for readings below 200, 'Normal' for readings from 200 up to
        (but not including) 240, 'High' for readings of 240 and above, and
        pd.NA when the reading itself is missing.
    """
    # Comparisons against NaN are all False, which would otherwise fall
    # through to 'High'; report a missing reading as missing instead.
    if pd.isna(chol):
        return pd.NA
    if chol < 200:
        return 'Low'
    # Use a half-open upper bound so fractional readings between 239 and 240
    # are not pushed into 'High'.
    if chol < 240:
        return 'Normal'
    return 'High'

# Label every row's 'chol' value with cholesterol_level and store the result
# in a new 'CholesterolLevel' column.
df['CholesterolLevel'] = df['chol'].apply(cholesterol_level)

In [None]:
# Binary flag: 1 when at least one of the three indicators is elevated, else 0.
# The cholesterol cut-off is inclusive (>= 240) so it agrees with
# cholesterol_level, which already labels a reading of exactly 240 as 'High'.
# The cut-offs are in raw units, so this must run before 'chol' and 'trestbps'
# are overwritten by the min-max scaling step later in the notebook.
df['IsRisk'] = (
    (df['chol'] >= 240)
    | (df['trestbps'] > 140)
    | (df['age'] > 60)
).astype(int)

# 4. Feature Transformation

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes from its own
# LabelEncoder, and keep every fitted encoder in `label_encoders` under the
# column's name.
# NOTE(review): LabelEncoder presumably numbers classes in sorted label order,
# so the codes for 'AgeGroup' may not follow age order — confirm that is
# acceptable for the downstream model.
label_encoders = {}
for col in ('sex', 'cp', 'thal', 'AgeGroup'):
    le = label_encoders[col] = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the listed numeric columns. The scaled values overwrite the
# originals, so every later cell sees scaled values rather than raw ones.
# The fitted `scaler` is kept so raw-unit values can be mapped into the same
# scale afterwards.
numerical_cols = ['chol', 'trestbps', 'thalch']
scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# 5. Feature Interaction

In [None]:
# Interaction feature: element-wise product of the 'chol' and 'trestbps'
# columns as they stand at this point in the notebook.
df['BP_Chol_Interaction'] = df['chol'] * df['trestbps']

In [None]:
# Binary flag: 1 when 'exang' is set and 'thalch' is below `threshold`.
# `threshold` is expressed in raw 'thalch' units, but df['thalch'] was
# overwritten with MinMaxScaler output in the feature-transformation step.
# Comparing scaled values against the raw 100 would be true for every row,
# so the cut-off is first mapped through the same fitted scaler.
threshold = 100
thalch_pos = numerical_cols.index('thalch')
# MinMaxScaler transforms each feature as X * scale_ + min_.
scaled_threshold = threshold * scaler.scale_[thalch_pos] + scaler.min_[thalch_pos]
df['ExerciseRisk'] = ((df['exang'] == 1) & (df['thalch'] < scaled_threshold)).astype(int)

# 6. Feature Selection