## Importing the tools

In [None]:
#initial tools
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#plotting tools
%matplotlib inline
#show plots directly
import matplotlib.pyplot as plt # plotting in details
import seaborn as sns # to ease complex plots

# modeling and scoring tools
from sklearn.ensemble import RandomForestClassifier # the model we will use
from sklearn.model_selection import train_test_split # splitting the data int train and test tool
from sklearn.metrics import confusion_matrix,f1_score # scoring tools and technics

import warnings 
warnings.filterwarnings("ignore") # silence every warning for the rest of the session (no category filter is given, so this is a blanket ignore)

## Brief about the features

Age : Age of the patient

Sex : Sex of the patient

exng : exercise induced angina (1 = yes; 0 = no)

ca: number of major vessels (0-3)

cp : chest pain type
* Value 1: typical angina
* Value 2: atypical angina
* Value 3: non-anginal pain
* Value 4: asymptomatic

trtbps : resting blood pressure (in mm Hg)

chol : cholesterol in mg/dl fetched via BMI sensor

fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

rest_ecg : resting electrocardiographic results

* Value 0: normal
* Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
* Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

thalachh : maximum heart rate achieved

output (the target) : 0 = less chance of heart attack, 1 = more chance of heart attack

## Read the data

In [None]:
# Load the heart-attack dataset into a DataFrame and preview its first rows
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv"
)
df.head(n=5)

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes
df.info()

We see no null values; most of the columns are int64, but oldpeak is float.

## Simple EDA

In [None]:
# Use seaborn's "darkgrid" background for every plot that follows
sns.set_style(style="darkgrid")

In [None]:
# Distribution of patient ages, grouped into 10 bins
fig, ax = plt.subplots(figsize=(10, 6))  # 10x6-inch canvas
sns.histplot(x="age", bins=10, data=df, ax=ax);

In [None]:
# Exercise-induced angina (exng) plotted against age, one bar per age value
fig, ax = plt.subplots(figsize=(15, 10))  # wide canvas so every age label fits
sns.barplot(x="age", y="exng", data=df, ax=ax);

We see that patients aged 29, 34, 37, 39, 49, 65, 69, 71 and 76 do not experience angina during exercise.

In [None]:
# Scatter of age against maximum heart rate achieved, coloured by the
# heart-attack label ("output"); fit_reg=False leaves out the regression line.
sns.lmplot(
    x="age",
    y="thalachh",
    hue="output",
    data=df,
    fit_reg=False,
);

We see that the probability of having a heart attack increases once the maximum heart rate achieved exceeds 120.

## Creating features and Splitting the data

In [None]:
# Create one derived feature and attach one feature stored in a separate file

# Derived feature: chest-pain type divided by the maximum heart rate achieved
df["c/o"] = df["cp"] / df["thalachh"]

# Oxygen-saturation readings live in their own CSV.
# NOTE(review): o2Saturation.csv appears to be a single column with no header row - confirm.
# header=None keeps the first reading as data; with the default header inference
# read_csv would turn that first value into the column name and shift every
# remaining reading up by one row.
sat_values = pd.read_csv(
    "/kaggle/input/heart-attack-analysis-prediction-dataset/o2Saturation.csv",
    header=None,
).iloc[:, 0]  # take the only column as a 1-D Series

# Assign a Series (not a whole DataFrame); rows are matched on the default
# 0..n-1 index that both frames get from read_csv.
df["sat"] = sat_values

In [None]:
# Separate the label from the predictors
y = df["output"]  # dependent column (what we predict)
X = df.drop(columns=["output"])  # independent columns (everything else)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## Modeling

In [None]:
# Random forest classifier. Per the author's note, these hyper-parameters came
# out of an earlier GridSearchCV run and are hard-coded here to save time.
reg = RandomForestClassifier(
    n_estimators=40,
    min_samples_leaf=2,
    ccp_alpha=0.01,
    random_state=42,
    n_jobs=-1,
)

# Train on the training split
reg.fit(X_train, y_train)

## Scoring

In [None]:
# Mean accuracy of the fitted model on each split
train_accuracy = reg.score(X_train, y_train)
test_accuracy = reg.score(X_test, y_test)

print("accuracy score on the training set : ", train_accuracy)
print("accuracy score on the test set : ", test_accuracy)

In [None]:
# F1 score of the fitted model on each split
train_f1 = f1_score(y_train, reg.predict(X_train))
test_f1 = f1_score(y_test, reg.predict(X_test))

print("F1 score on the training set : ", train_f1)
print("F1 score on the test set : ", test_f1)

In [None]:
# Confusion matrix on the held-out test set
pred = reg.predict(X_test)

# confusion_matrix(y_true, y_pred): rows are the true classes, columns the predicted ones
cm = confusion_matrix(y_test, pred)

# fmt="d" prints each cell as a whole count (the default format switches to
# scientific notation for large counts); the labels make the orientation explicit.
ax = sns.heatmap(cm, annot=True, fmt="d")
ax.set(xlabel="Predicted label", ylabel="True label", title="Confusion matrix (test set)");

Out of 61 test examples, only 6 were predicted wrong.

#### thank you 