# Naive Bayes Training Notebook

## Imports

In [1]:
import numpy as np
import pandas as pd

## Read CSV Data

In [2]:
heart_attack_df = pd.read_csv("AI_training_data.csv")
heart_attack_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0
1,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0
2,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0
3,30,0,1,170,237,0,1,170,0,0.0,?,?,6,0
4,31,0,2,100,219,0,1,150,0,0.0,?,?,?,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,52,1,4,160,331,0,0,94,1,2.5,?,?,?,1
290,54,0,3,130,294,0,1,100,1,0.0,2,?,?,1
291,56,1,4,155,342,1,0,150,1,3.0,2,?,?,1
292,58,0,2,180,393,0,0,110,1,1.0,2,?,7,1


## Clean up dataframe

Dropping data that we cannot use or simulate for our own project and renaming some columns.

In [3]:
heart_attack_df.drop('cp', inplace=True, axis=1)
heart_attack_df.drop('restecg', inplace=True, axis=1)
heart_attack_df.drop('exang', inplace=True, axis=1)
heart_attack_df.drop('oldpeak', inplace=True, axis=1)
heart_attack_df.drop('slope', inplace=True, axis=1)
heart_attack_df.drop('ca', inplace=True, axis=1)
heart_attack_df.drop('thal', inplace=True, axis=1)

heart_attack_df.rename(columns={"trestbps": "resting heart rate", "chol": "cholestrol", "fbs" : "fasting blood sugar > 120 mg/dl", "thalach" : "maximum heart rate achieved", "num" : "heart attack"}, inplace=True)

heart_attack_df

Unnamed: 0,age,sex,resting heart rate,cholestrol,fasting blood sugar > 120 mg/dl,maximum heart rate achieved,num
0,28,1,130,132,0,185,0
1,29,1,120,243,0,160,0
2,29,1,140,?,0,170,0
3,30,0,170,237,0,170,0
4,31,0,100,219,0,150,0
...,...,...,...,...,...,...,...
289,52,1,160,331,0,94,1
290,54,0,130,294,0,100,1
291,56,1,155,342,1,150,1
292,58,0,180,393,0,110,1


Removing rows that are missing data and thus are useless for training.

In [4]:
heart_attack_df = heart_attack_df.apply(pd.to_numeric, errors='coerce')
heart_attack_df = heart_attack_df.dropna()
heart_attack_df

Unnamed: 0,age,sex,resting heart rate,cholestrol,fasting blood sugar > 120 mg/dl,maximum heart rate achieved,num
0,28,1,130.0,132.0,0.0,185.0,0
1,29,1,120.0,243.0,0.0,160.0,0
3,30,0,170.0,237.0,0.0,170.0,0
4,31,0,100.0,219.0,0.0,150.0,0
5,32,0,105.0,198.0,0.0,165.0,0
...,...,...,...,...,...,...,...
289,52,1,160.0,331.0,0.0,94.0,1
290,54,0,130.0,294.0,0.0,100.0,1
291,56,1,155.0,342.0,1.0,150.0,1
292,58,0,180.0,393.0,0.0,110.0,1


In [5]:
training_set = heart_attack_df.to_numpy()
training_set

array([[ 28.,   1., 130., ...,   0., 185.,   0.],
       [ 29.,   1., 120., ...,   0., 160.,   0.],
       [ 30.,   0., 170., ...,   0., 170.,   0.],
       ...,
       [ 56.,   1., 155., ...,   1., 150.,   1.],
       [ 58.,   0., 180., ...,   0., 110.,   1.],
       [ 65.,   1., 130., ...,   0., 115.,   1.]])