Task: Data Cleaning & Preprocessing

Description:
Prepare raw data so it can be used reliably for analysis and modeling.

Objectives:
Handle missing and incorrect values
Remove duplicate records
Convert data types where required
Prepare a clean dataset suitable for analysis.

Tools:
Python, pandas, NumPy

## import required libraries

In [11]:
import pandas as pd
import numpy as np

## load dataset

In [12]:
# Read the raw CSV into a DataFrame; the path is relative to the current
# working directory.
df = pd.read_csv(filepath_or_buffer="Heart_Disease_Prediction - Copy.csv")
# Preview the first 5 rows.
df.head(n=5)

Unnamed: 0,Age,Gender,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,57.0,1.0,2.0,124.0,,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,Presence
1,67.0,0.0,,115.0,564.0,0.0,2.0,160.0,0.0,1.6,,0.0,7.0,Absence
2,57.0,1.0,2.0,124.0,,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,Presence
3,,1.0,4.0,128.0,263.0,0.0,0.0,,1.0,0.2,2.0,1.0,7.0,Absence
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,Absence


## check missing values

In [13]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

Age                        31
Gender                     30
Chest pain type            31
BP                         30
Cholesterol                32
FBS over 120               30
EKG results                30
Max HR                     31
Exercise angina            30
ST depression              30
Slope of ST                31
Number of vessels fluro    30
Thallium                   30
Heart Disease               0
dtype: int64

## handle missing values

In [14]:
# Replace each NaN with the mean of its own column; numeric_only=True keeps
# the text column "Heart Disease" out of the mean computation.
# NOTE(review): a mean is fractional, so code-like columns can receive values
# that are not valid codes (the recorded preview shows 3.161654 under
# "Chest pain type" and 1.582707 under "Slope of ST"). These columns look
# categorical -- TODO confirm and consider mode/median imputation instead.
df = df.fillna(value=df.mean(numeric_only=True))
df.head(n=5)

Unnamed: 0,Age,Gender,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,57.0,1.0,2.0,124.0,248.445283,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,Presence
1,67.0,0.0,3.161654,115.0,564.0,0.0,2.0,160.0,0.0,1.6,1.582707,0.0,7.0,Absence
2,57.0,1.0,2.0,124.0,248.445283,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,Presence
3,54.503759,1.0,4.0,128.0,263.0,0.0,0.0,148.973684,1.0,0.2,2.0,1.0,7.0,Absence
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,Absence


In [15]:
# Re-count missing entries after imputation; every column should report 0.
df.isna().sum()

Age                        0
Gender                     0
Chest pain type            0
BP                         0
Cholesterol                0
FBS over 120               0
EKG results                0
Max HR                     0
Exercise angina            0
ST depression              0
Slope of ST                0
Number of vessels fluro    0
Thallium                   0
Heart Disease              0
dtype: int64

## remove duplicate records

In [16]:
# Count the rows that exactly repeat an earlier row (the first occurrence of
# each repeated row is not counted).
df.duplicated(keep="first").sum()

np.int64(53)

In [17]:
# Drop every row that repeats an earlier one, keeping the first occurrence.
# The frame is modified in place and the surviving rows keep their original
# index labels (no renumbering).
# NOTE(review): the duplicates were still present when the column means were
# computed for imputation, so repeated rows were counted more than once in
# those means -- consider de-duplicating before imputing.
df.drop_duplicates(keep="first", inplace=True)
df.head(n=5)

Unnamed: 0,Age,Gender,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,57.0,1.0,2.0,124.0,248.445283,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,Presence
1,67.0,0.0,3.161654,115.0,564.0,0.0,2.0,160.0,0.0,1.6,1.582707,0.0,7.0,Absence
3,54.503759,1.0,4.0,128.0,263.0,0.0,0.0,148.973684,1.0,0.2,2.0,1.0,7.0,Absence
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,Absence
5,65.0,1.0,4.0,120.0,177.0,0.0,0.0,140.0,0.0,0.4,1.0,0.0,7.0,Absence


In [18]:
# Confirm that no repeated rows remain; the expected count is 0.
df.duplicated(keep="first").sum()

np.int64(0)

## convert data types

In [19]:
# List each column's dtype to see which columns need conversion.
df.dtypes

Age                        float64
Gender                     float64
Chest pain type            float64
BP                         float64
Cholesterol                float64
FBS over 120               float64
EKG results                float64
Max HR                     float64
Exercise angina            float64
ST depression              float64
Slope of ST                float64
Number of vessels fluro    float64
Thallium                   float64
Heart Disease               object
dtype: object

In [20]:
# Cast "Gender" and "Age" from float64 to integer in a single call.
# NOTE(review): casting float to int drops the fractional part, so values that
# were mean-imputed earlier are truncated, not rounded (the recorded previews
# show Age 54.503759 becoming 54). "Gender" also had missing entries filled
# with a mean -- presumably a fraction between 0 and 1 that becomes 0 here;
# TODO confirm whether rounding or a different imputation is intended.
df = df.astype({"Gender": int, "Age": int})
df.head(n=5)

Unnamed: 0,Age,Gender,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,57,1,2.0,124.0,248.445283,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,Presence
1,67,0,3.161654,115.0,564.0,0.0,2.0,160.0,0.0,1.6,1.582707,0.0,7.0,Absence
3,54,1,4.0,128.0,263.0,0.0,0.0,148.973684,1.0,0.2,2.0,1.0,7.0,Absence
4,74,0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,Absence
5,65,1,4.0,120.0,177.0,0.0,0.0,140.0,0.0,0.4,1.0,0.0,7.0,Absence
