# 1. Introduction


## Importing library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the heart-attack CSV into a DataFrame. The relative `../input/...` path
# looks like the Kaggle notebook layout -- adjust it when running elsewhere.
dataset = pd.read_csv("../input/heart-attack-analysis-prediction-dataset/heart.csv")

## 2.3 Understanding the Dataset

In [None]:
# Preview the first rows to check the column names and value formats.
dataset.head()

In [None]:
# Summarise the dataset: its dimensions, its column labels and the age range.
n_rows_cols = dataset.shape
column_labels = list(dataset.columns)
youngest, oldest = dataset['age'].min(), dataset['age'].max()

print(f"The shape of given data is {n_rows_cols}")
print(f'The different column label:  {column_labels}')
print(f"The Sample Data coinsist of people of age between {youngest} to {oldest}")

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
dataset.describe()

In [None]:
# Hand-written split of the columns into categorical and non-categorical
# groups; the lists are hard-coded here, not derived from the DataFrame.
print("The categorial cols are : ", ['sex','exng','caa','cp','fbs','restecg','slp','thall', 'output'])
print("The non categorial cols are : ", ["age","trtbps","chol","thalachh","oldpeak"])

In [None]:
# Count the missing (NaN) values in each column.
dataset.isna().sum()

In [None]:
# Show the dtype pandas inferred for each column.
dataset.dtypes

# 3 Exploratory data analysis

## 3.1 Data visual of categorical data

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Pie charts showing how the rows are distributed over the codes of each
# categorical column, on a 2x4 grid. Pie traces need "domain"-type subplots.
fig = make_subplots(
    rows=2, cols=4,
    subplot_titles=("Blood sugar", "Included engina","Sex", "Electrocardiographic results", "Slope", "Target", "Chest pain type", "Number of major vessels"),
    specs=[[{"type": "domain"} for _col in range(4)] for _row in range(2)],
)

# One entry per pie: (column, ((code, slice label), ...), subplot row, subplot col).
# Every slice counts the rows where `column == code`, so the count always comes
# from the same column as the chart it belongs to.
pie_specs = [
    ('fbs', ((1, 'True'), (0, 'False')), 1, 1),
    ('exng', ((1, 'Yes'), (0, 'No')), 1, 2),
    ('restecg', ((1, 'T-T wave normality'), (0, 'Normal'), (2, 'Left ventricular hypertrophy')), 1, 4),
    ('sex', ((1, 'Male'), (0, 'Female')), 1, 3),
    ('cp', ((1, 'Atypical Angina'), (0, 'Typical Angina'), (2, 'Non-anginal Pain'), (3, 'Asymptomatic')), 2, 3),
    # The slope is a three-level code, so it is labelled by its raw codes and
    # includes slp == 2 (previously shown as Yes/No with code 2 left out).
    ('slp', ((1, '1'), (0, '0'), (2, '2')), 2, 1),
    ('output', ((1, 'Have diseases'), (0, 'Does not have diseases')), 2, 2),
    # The "0" slice now counts caa == 0 (it previously counted output == 0).
    ('caa', ((1, '1'), (0, '0'), (2, '2'), (3, '3'), (4, '4')), 2, 4),
]

for pie_column, coded_labels, pie_row, pie_col in pie_specs:
    slice_labels = [label for _code, label in coded_labels]
    slice_counts = [(dataset[pie_column] == code).sum() for code, _label in coded_labels]
    fig.add_trace(
        go.Pie(labels=slice_labels, values=slice_counts),
        row=pie_row, col=pie_col
    )

fig.update_layout(height=800, width=1100, title_text="Distribuation of Categorial Values", showlegend=False)
fig.show()

## 3.2 Data visual of non categorical data 

In [None]:
# Histograms of four continuous columns on a 2x2 subplot grid.
fig = make_subplots(rows=2, cols=2)

# One entry per panel: (column, trace name, subplot row, subplot col).
hist_panels = [
    ('age', 'Age of the patient', 1, 1),
    ('trtbps', 'Resting blood pressure', 1, 2),
    ('chol', ' Cholestoral', 2, 1),
    ('thalachh', 'Maximum heart rate', 2, 2),
]

for hist_column, trace_name, hist_row, hist_col in hist_panels:
    fig.add_trace(go.Histogram(x=dataset[hist_column], name=trace_name),
                  row=hist_row, col=hist_col)

# Each x axis is titled with the column name it shows.
for hist_column, trace_name, hist_row, hist_col in hist_panels:
    fig.update_xaxes(title_text=hist_column, row=hist_row, col=hist_col)

for hist_column, trace_name, hist_row, hist_col in hist_panels:
    fig.update_yaxes(title_text="count", row=hist_row, col=hist_col)

fig.update_layout(title_text="Distribution of Continuous features", showlegend=False)

fig.show()

## 3.3 Correlation matrix


In [None]:
import matplotlib.pyplot as plt
from seaborn.matrix import heatmap
# Annotated heatmap of the pairwise correlations between the dataset columns.
plt.figure(figsize=(17, 9))
correlations = dataset.corr()
heatmap(correlations, annot=True, cmap='Blues')
plt.title('Correlation Matrix');

## 3.4 Count of target variable according to categorical features

In [None]:
# Bar charts of the number of positive-target patients (output == 1) for each
# level of every categorical feature, on a 2x4 subplot grid.
fig = make_subplots(
    rows=2, cols=4,
)

# One entry per panel: (column, trace name, subplot row, subplot col).
bar_panels = [
    ('cp', 'Chest pain', 1, 1),
    ('sex', 'Sex of the patient', 1, 2),
    ('exng', 'Exercise induced angina', 1, 3),
    ('fbs', 'fasting blood sugar', 1, 4),
    ('caa', 'Number of major vessels', 2, 1),
    ('slp', 'Slope', 2, 2),
    ('restecg', 'Resting electrocardiographic', 2, 3),
    ('thall', 'Thalium Stress Test result', 2, 4),
]

for bar_column, trace_name, bar_row, bar_col in bar_panels:
    # Passing the raw columns would hand plotly one bar per patient. Summing
    # the 0/1 target per category gives a single bar per level whose height is
    # the number of positive patients.
    positives = dataset.groupby(bar_column)['output'].sum()
    fig.add_trace(go.Bar(x=positives.index, y=positives.values, name=trace_name),
                  row=bar_row, col=bar_col)
fig.update_layout(height=800, width=1100, title_text="Distribution of categorical features according to target variable", showlegend=False)

for bar_column, trace_name, bar_row, bar_col in bar_panels:
    fig.update_xaxes(title_text=bar_column, row=bar_row, col=bar_col)

# Only the left-most panel of each row carries the y-axis title.
fig.update_yaxes(title_text="Number of Target patient", row=1, col=1)
fig.update_yaxes(title_text="Number of Target patient", row=2, col=1)


fig.show()

## 3.5 Pairplot

In [None]:
import seaborn as sns
# Pairwise plots of every column, with points coloured by the target class.
sns.pairplot(data=dataset, hue='output')
plt.show()

# 4 Observation and Conclusion from data exploring and EDA



## From 2.3 Data understanding
1. There are no missing values in the data, so we can fit a model directly without imputing any missing data.

2. There is no alphabetical (string) data, so the data can be passed directly to scikit-learn classification models, which require numerical input.

3. There are some categorical columns, so we will use an encoder to preprocess the data. This is very important (we will use the data with and without encoding to compare the results).


## From EDA
1. According to the heatmap, there is no linear correlation between the continuous variables.

2. The scatterplot matrix and the heatmap suggest that there might be some correlation between output and cp, output and thalachh, and output and slp.

3. According to the distribution plots in 3.4:

  * People with non-anginal chest pain (cp = 2) have a higher number of heart attack cases.

  * People with sex = 1 have a higher number of heart attack cases.

  * People with no exercise-induced angina (exng = 0) have a higher number of heart attack cases.

  * People with 0 major vessels (caa = 0) have a higher number of heart attack cases.

  * People with thall = 2 have a much higher number of heart attack cases.


# 5 Modeling

## 5.1 Importing Packages


In [None]:
#Encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

## Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

## Model evaluators
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    # plot_roc_curve was removed in scikit-learn 1.2. Fall back to its
    # replacement (RocCurveDisplay.from_estimator / from_predictions) so this
    # import cell still runs on current releases.
    from sklearn.metrics import RocCurveDisplay

In [None]:
# Separate the label column from the predictor columns.
y = dataset['output']
X = dataset.drop(columns='output')

In [None]:
#Encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns holding category codes; each one is expanded into indicator columns.
cat_feature = ['sex','exng','caa','cp','fbs','restecg','slp','thall']

one_hot = OneHotEncoder()

# remainder='passthrough' keeps the columns not listed in cat_feature as they are.
# sparse_threshold=0 makes the transformer always return a dense array. With
# the default threshold the output type depends on how dense the data happens
# to be, and a sparse result would break the StandardScaler (mean-centering)
# applied to these features.
transformer = ColumnTransformer([('one_hot', one_hot, cat_feature)],
                                remainder='passthrough',
                                sparse_threshold=0)
transformed_X = transformer.fit_transform(X)
pd.DataFrame(transformed_X)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. random_state pins the shuffle so the
# split, and every score computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size = 0.3, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Baseline classifiers to compare, keyed by a display name. Every estimator is
# constructed with no arguments, i.e. with its default hyper-parameters.
model = {'LogisticRegression' : LogisticRegression(),
         'KNeighborsClassifier': KNeighborsClassifier(),
         
         'DecisionTreeClassifier': DecisionTreeClassifier(),
         'RandomForestClassifier' : RandomForestClassifier(),
         'Native_Bayes': GaussianNB()}

def model_fit_score(model, X_train, X_test, y_train, y_test):
    """Fit every estimator in ``model`` and score it on the test split.

    Args:
        model: Mapping of display name -> unfitted estimator. Each estimator
            only needs ``fit(X, y)`` and ``score(X, y)``; ``predict_proba`` is
            not required.
        X_train: Training features passed to ``fit``.
        X_test: Test features passed to ``score``.
        y_train: Training labels passed to ``fit``.
        y_test: Test labels passed to ``score``.

    Returns:
        dict mapping each name in ``model`` to the value returned by that
        estimator's ``score(X_test, y_test)``. Empty if ``model`` is empty.
    """
    model_score = {}
    # A separate loop name keeps the ``model`` mapping from being shadowed.
    for name, estimator in model.items():
        estimator.fit(X_train, y_train)
        model_score[name] = estimator.score(X_test, y_test)
    return model_score

In [None]:
# Fit each baseline model on the training split and show its test-split score.
model_fit_score(model, X_train, X_test, y_train, y_test)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
# Hyper-parameter search space for the random forest.
grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
         "max_depth": [None, 5, 10, 20, 30],
         # "auto" (an alias of "sqrt" for classifiers) was removed from
         # RandomForestClassifier in scikit-learn 1.3, and candidates using it
         # fail to fit there. Only currently valid options are listed.
         "max_features": ["sqrt", "log2"],
         "min_samples_split": [2,4,6],
         "min_samples_leaf": [1, 2, 4]}
# Seed NumPy's global RNG so the sampled candidates are repeatable.
np.random.seed(42)
clf = RandomForestClassifier(n_jobs=-1)
# Try 30 randomly sampled parameter combinations, each with 5-fold CV.
rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions = grid,
                            n_iter=30,
                            cv = 5,
                            verbose = 2)
rs_clf.fit(X_train, y_train);

In [None]:
# Parameter combination that scored best during the randomized search.
rs_clf.best_params_

In [None]:
def evaluate_preds(y_true, y_preds):
    """Print and return accuracy, precision, recall and F1 for predictions.

    The scikit-learn metric functions are called with their default settings.

    Args:
        y_true: Ground-truth labels.
        y_preds: Predicted labels, aligned with ``y_true``.

    Returns:
        dict with the unrounded metric values under the keys ``"accuracy"``,
        ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    metric_dict = {
        "accuracy": accuracy_score(y_true, y_preds),
        "precision": precision_score(y_true, y_preds),
        "recall": recall_score(y_true, y_preds),
        "f1": f1_score(y_true, y_preds),
    }
    # Let the format spec do the rounding. Rounding to 2 decimals before
    # scaling to a percentage made the printed accuracy always end in ".00".
    print(f"Acc : {metric_dict['accuracy'] * 100:.2f}%")
    print(f"Precision : {metric_dict['precision']:.2f}")
    print(f"recall : {metric_dict['recall']:.2f}")
    print(f"F1 score {metric_dict['f1']:.2f}")
    return metric_dict

   

In [None]:
# Evaluate the fitted randomized search's predictions on the test split.
evaluate_preds(y_test, rs_clf.predict(X_test))

In [None]:
from sklearn.metrics import  roc_curve

# Predict class probabilities for the test split (one column per class).
y_probs = rs_clf.predict_proba(X_test)

# Keep only column index 1 -- presumably the probability of the positive class
# (output == 1); confirm against rs_clf.classes_.
y_probs_positive = y_probs[:, 1]

# False-positive rates, true-positive rates and the thresholds that produce them.
fpr, tpr, threshold = roc_curve(y_test, y_probs_positive)

# Display the false-positive rates as the cell output.
fpr

In [None]:
import matplotlib.pyplot as plt
# Plot the ROC curve together with the reference diagonal from (0, 0) to (1, 1).
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1])
ax.set_title("ROC Curve")
ax.set_xlabel("fpr")
ax.set_ylabel("tpr")
plt.show()