In [50]:
# Import necessary libraries
import numpy as np  # NumPy for numerical operations
import pandas as pd  # Pandas for data manipulation

In [51]:
# Load the heart dataset from its CSV file into the working DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
df = pd.read_csv(filepath_or_buffer="/home/bharat/DSBDA PRACTICALS/DATASETS/heart.csv")

# Preview the top five rows to confirm the file parsed as expected.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [52]:
#DATA CLEANING

In [53]:
# Keep only the first occurrence of every fully identical row.
# The original row labels are retained (the index is not renumbered).
df = df.drop_duplicates(keep="first")

# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0
mean,54.42053,0.682119,0.963576,131.602649,246.5,0.149007,0.52649,149.569536,0.327815,1.043046,1.397351,0.718543,2.31457,0.543046
std,9.04797,0.466426,1.032044,17.563394,51.753489,0.356686,0.526027,22.903527,0.470196,1.161452,0.616274,1.006748,0.613026,0.49897
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.25,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.5,1.0,1.0,130.0,240.5,0.0,1.0,152.5,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.75,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [54]:
# Print the column names, non-null counts, dtypes and memory footprint of `df`.
df.info(verbose=None)

<class 'pandas.core.frame.DataFrame'>
Index: 302 entries, 0 to 878
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       302 non-null    int64  
 1   sex       302 non-null    int64  
 2   cp        302 non-null    int64  
 3   trestbps  302 non-null    int64  
 4   chol      302 non-null    int64  
 5   fbs       302 non-null    int64  
 6   restecg   302 non-null    int64  
 7   thalach   302 non-null    int64  
 8   exang     302 non-null    int64  
 9   oldpeak   302 non-null    float64
 10  slope     302 non-null    int64  
 11  ca        302 non-null    int64  
 12  thal      302 non-null    int64  
 13  target    302 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 35.4 KB


In [55]:
# Per-column count of missing entries (`isnull` is the alias of `isna`).
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [56]:
#DATA INTEGRATION

In [57]:
# Preview the cleaned DataFrame before building the subsets that get merged below.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [58]:
# First column subset of `df`: age, cp, chol, thalach.
subSet1 = df.loc[:, ['age', 'cp', 'chol', 'thalach']]

# Second column subset of `df`: exang, slope, target.
subSet2 = df.loc[:, ['exang', 'slope', 'target']]

# Cross join: every row of subSet1 is paired with every row of subSet2,
# so the result has len(subSet1) * len(subSet2) rows.
# NOTE(review): both subsets come from the same rows of `df`; a cross join
# combines values that belonged to different rows. Confirm this is intended
# rather than an index-aligned join.
merged_df = pd.merge(subSet1, subSet2, how='cross')

# Preview the first five rows of the merged result.
merged_df.head(n=5)

Unnamed: 0,age,cp,chol,thalach,exang,slope,target
0,52,0,212,168,0,2,0
1,52,0,212,168,1,0,0
2,52,0,212,168,1,0,0
3,52,0,212,168,0,2,0
4,52,0,212,168,0,1,0


In [59]:
#ERROR CORRECTING

In [60]:
# Error correction: this notebook treats `ca == 4` as an erroneous entry and
# marks those cells as missing so they can be imputed in a later cell.
# The rows themselves are kept; only the `ca` value becomes NaN.
#
# `ca` is read as int64, which cannot hold NaN. Cast it to float64 explicitly
# instead of relying on silent upcasting during assignment (deprecated in
# recent pandas versions).
df = df.astype({'ca': 'float64'})

# Use `np.nan`: the `np.NaN` alias was removed in NumPy 2.0.
df.loc[df['ca'] == 4, 'ca'] = np.nan

In [61]:
# Re-count missing values per column; `ca` should now report the entries
# that were just marked as NaN.
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        0
target      0
dtype: int64

In [62]:
# Impute missing values with the median of their own column.
# `numeric_only=True` restricts the median to numeric columns, so this cell
# still runs if it is re-executed after columns have been converted to string
# labels (in pandas >= 2.0, `median()` raises on non-numeric columns otherwise).
# Columns without a computed median are left untouched by `fillna`.
df = df.fillna(df.median(numeric_only=True))

In [63]:
# Confirm the imputation worked: every column should now report zero
# missing values.
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [64]:
#MODEL BUILDING

In [65]:
# Quick look at the first five rows of `df` before it is split for modelling.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2.0,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0.0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0.0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1.0,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3.0,2,0


In [75]:
# scikit-learn helpers: the splitter used in this cell and the accuracy metric
# used when the model is evaluated.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 30% of the rows for testing; `random_state=0` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],  # features: every column except the last
    df.iloc[:, -1],   # target: the last column
    test_size=0.3,
    random_state=0,
)

In [76]:
# Shapes of the training features, test features and training target, in that order.
tuple(part.shape for part in (X_train, X_test, y_train))

((211, 13), (91, 13), (211,))

In [77]:
# Importing Gaussian Naive Bayes and Multinomial Naive Bayes classes 
# from scikit-learn's naive_bayes module.
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [78]:
# Gaussian Naive Bayes classifier with default settings.
gnb = GaussianNB()

# Train on the training split; `fit` returns the estimator, which is this cell's output.
gnb.fit(X=X_train, y=y_train)

In [79]:
# Class predictions of the fitted Gaussian Naive Bayes model for the held-out test features.
y_pred = gnb.predict(X=X_test)

# Show the predicted labels.
y_pred

array([1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1])

In [80]:
# Report the fraction of test rows predicted correctly, to four decimal places.
print(
    'Model accuracy score: {0:0.4f}'.format(
        accuracy_score(y_true=y_test, y_pred=y_pred)
    )
)

Model accuracy score: 0.8022


In [29]:
#DATA TRANSFORMATION

In [30]:
# Frequency of each distinct value in the 'sex' column, most frequent first.
sex_counts = df.loc[:, 'sex'].value_counts()

# Show the frequency table.
sex_counts

sex
1    206
0     96
Name: count, dtype: int64

In [31]:
# Turn the numeric codes in the 'sex' column into text labels: 1 -> "M", 0 -> "F".
# `replace` returns a new Series, which is assigned back to the column.
df["sex"] = df["sex"].replace({1: "M", 0: "F"})

In [32]:
# Frequency of each distinct value in the 'restecg' column.
resting_ecg_counts = df["restecg"].value_counts()

# Print the frequency table.
print(resting_ecg_counts)

restecg
1    151
0    147
2      4
Name: count, dtype: int64


In [33]:
# Turn the numeric codes in the 'restecg' column into text labels:
# 0 -> "Normal", 1 -> "ST", 2 -> "LVH".
# `replace` returns a new Series, which is assigned back to the column.
df["restecg"] = df["restecg"].replace({0: "Normal", 1: "ST", 2: "LVH"})

In [34]:
# Frequency of each distinct value in the 'exang' column.
exercise_angina_counts = df["exang"].value_counts()

# Print the frequency table.
print(exercise_angina_counts)

exang
0    203
1     99
Name: count, dtype: int64


In [35]:
# Turn the numeric codes in the 'exang' column into text labels: 0 -> "N", 1 -> "Y".
# `replace` returns a new Series, which is assigned back to the column.
df["exang"] = df["exang"].replace({0: "N", 1: "Y"})

In [36]:
# Frequency of each distinct value in the 'slope' column.
st_slope_counts = df["slope"].value_counts()

# Print the frequency table.
print(st_slope_counts)

slope
2    141
1    140
0     21
Name: count, dtype: int64


In [37]:
# Turn the numeric codes in the 'slope' column into text labels:
# 0 -> "Flat", 1 -> "Up", 2 -> "Down".
# NOTE(review): confirm this code-to-label order against the dataset's
# documentation; it cannot be verified from this notebook alone.
# `replace` returns a new Series, which is assigned back to the column.
df["slope"] = df["slope"].replace({0: "Flat", 1: "Up", 2: "Down"})

In [38]:
# Preview the first five rows of `df` with the text labels applied.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,M,0,125,212,0,ST,168,N,1.0,Down,2,3,0
1,53,M,0,140,203,1,Normal,155,Y,3.1,Flat,0,3,0
2,70,M,0,145,174,0,ST,125,Y,2.6,Flat,0,3,0
3,61,M,0,148,203,0,ST,161,N,0.0,Down,1,3,0
4,62,F,0,138,294,1,ST,106,N,1.9,Up,3,2,0
