### Heart Disease Data Dictionary

The following are the features we'll use to predict our target variable (heart disease or no heart disease).

1. age - age in years 
2. sex - (1 = male; 0 = female) 
3. cp - chest pain type 
    * 0: Typical angina: chest pain related to decreased blood supply to the heart
    * 1: Atypical angina: chest pain not related to heart
    * 2: Non-anginal pain: typically esophageal spasms (non heart related)
    * 3: Asymptomatic: chest pain not showing signs of disease
4. trestbps - resting blood pressure (in mm Hg on admission to the hospital)
    * anything above 130-140 is typically cause for concern
5. chol - serum cholesterol in mg/dl 
    * serum = LDL + HDL + .2 * triglycerides
    * above 200 is cause for concern
6. fbs - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 
    * '>126' mg/dL signals diabetes
7. restecg - resting electrocardiographic results
    * 0: Nothing to note
    * 1: ST-T Wave abnormality
        - can range from mild symptoms to severe problems
        - signals non-normal heart beat
    * 2: Possible or definite left ventricular hypertrophy
        - Enlarged heart's main pumping chamber
8. thalach - maximum heart rate achieved 
9. exang - exercise induced angina (1 = yes; 0 = no) 
10. oldpeak - ST depression induced by exercise relative to rest 
    * looks at stress of heart during exercise
    * unhealthy heart will stress more
11. slope - the slope of the peak exercise ST segment
    * 0: Upsloping: better heart rate with exercise (uncommon)
    * 1: Flatsloping: minimal change (typical healthy heart)
    * 2: Downsloping: signs of unhealthy heart
12. ca - number of major vessels (0-3) colored by fluoroscopy 
    * colored vessel means the doctor can see the blood passing through
    * the more blood movement the better (no clots)
13. thal - thallium stress result
    * 1,3: normal
    * 6: fixed defect: used to be defect but ok now
    * 7: reversible defect: no proper blood movement when exercising 
14. target - have disease or not (1=yes, 0=no) (= the predicted attribute)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score,recall_score, f1_score
from sklearn.metrics import plot_roc_curve
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Set matplotlib defaults (label/tick/legend font sizes and figure size) for all later plots
plt.rcParams.update({
    'axes.labelsize': 15.,
    'xtick.labelsize': 15.,
    'ytick.labelsize': 15.,
    'figure.figsize': [15., 8.],
    'legend.fontsize': 13.,
})

In [None]:
# Read the heart-disease CSV into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer="../input/heart-disease-uci/heart.csv")
df.head(n=5)

## EDA Univariate and Bivariate Analysis

In [None]:
# Dimensions of the dataset as (rows, columns)
df.shape

In [None]:
# Total number of cells in the DataFrame (rows * columns)
df.size

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Number of missing (NaN) values in each column
df.isna().sum()

### There are no missing values in the dataset

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Column labels of the DataFrame
df.columns

In [None]:
# Distribution of patient age with a kernel-density overlay
sns.histplot(data=df, x="age", kde=True, palette="magma");

### Most of the age group having heart disease is between 50 and 70

In [None]:
# Bar chart of how many patients fall into each sex category
sex_counts = df["sex"].value_counts()
sex_counts.plot.bar(color=["Salmon","lightblue"], xlabel="0= Female, 1= Male", ylabel="Counts");

## Male patients outnumber female patients in the dataset

In [None]:
# Age of each patient grouped by sex, with points coloured by the target label
sns.catplot(data=df, x="sex", y="age", hue="target", palette="husl");

In [None]:
# Count plots for the categorical features, laid out on a 2x3 grid
sns.set_theme(style='darkgrid')
categorical_columns = ['fbs', 'restecg', 'slope', 'ca', 'exang', 'thal']
for position, column in enumerate(categorical_columns, start=1):
    plt.subplot(2, 3, position)
    sns.countplot(data=df, x=column, palette='magma')

In [None]:
#plt.figure(figsize=(20,10))
# Same 2x3 grid of count plots as the previous cell (one panel per categorical feature)
sns.set_theme(style='darkgrid')
panel_features = ('fbs', 'restecg', 'slope', 'ca', 'exang', 'thal')
for panel_number, feature_name in enumerate(panel_features, start=1):
    plt.subplot(2, 3, panel_number)
    sns.countplot(data=df, x=feature_name, palette='magma')

In [None]:
# Histogram of every numeric column, arranged on a 5x3 grid of subplots
df.hist(figsize=(20,20), layout=(5,3));

## Analysing the relationship

cp - chest pain type:

0. Typical angina: chest pain related to decreased blood supply to the heart
1. Atypical angina: chest pain not related to heart
2. Non-anginal pain: typically esophageal spasms (non heart related)
3. Asymptomatic: chest pain not showing signs of disease

In [None]:
# Cross-tabulate chest-pain type against the target and draw it as grouped bars
cp_by_target = pd.crosstab(df.cp, df.target)
cp_axes = cp_by_target.plot(kind="bar", color=["Salmon","lightblue"])
cp_axes.set_xlabel("Types of Chest Pain")
cp_axes.set_ylabel("Count")
cp_axes.set_title("Chest Pain Type For Having A Heart Disease")
plt.xticks(rotation=0)
cp_axes.legend(["No Disease","Having Disease"]);

### Type 3 has fewer cases, but the probability of having heart disease is high

In [None]:
# Scatter of age against maximum heart rate, one colour per target class
plt.figure(figsize=(10,6))
for target_value, colour in ((0, "g"), (1, "y")):
    group = df[df.target == target_value]
    plt.scatter(group.age, group.thalach, c=colour)
plt.xlabel("Age")
plt.ylabel("Maximum Heart Rate")
plt.title("Age vs the Max Heart Rate for Heart Disease")
# Legend entries follow the order in which the two groups were drawn (0 first, then 1)
plt.legend(["No Disease", "Having Disease"]);

## A person with a maximum heart rate above 140 is most likely to have heart disease, considering the 40 to 70 age group

In [None]:
# Dimensions of the dataset as (rows, columns)
df.shape

In [None]:
# One box plot per feature column on a 5x3 grid, used to eyeball outliers
plt.figure(figsize=(20,20))
sns.set_theme(style="darkgrid")
feature_columns = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
                   "thalach", "exang", "oldpeak", "slope", "ca", "thal"]
for slot, feature in enumerate(feature_columns, start=1):
    plt.subplot(5, 3, slot)
    sns.boxplot(data=df, x=df[feature], hue="target")

## Comments
## The trestbps, chol, fbs, thalach, oldpeak, ca and thal columns have outliers

## Correlation Matrix between columns

In [None]:
# Pairwise correlation between all columns (pandas' default correlation method)
corr_mat = df.corr()
corr_mat

In [None]:
# Annotated heatmap of the correlation matrix on a large figure
fig, ax = plt.subplots(figsize=(20,15))
heatmap_options = dict(annot=True, linewidths=0.5, cmap="YlGnBu", fmt=".2f")
ax = sns.heatmap(corr_mat, **heatmap_options)

## Clearly cp and thalach have a high positive correlation with the target value,
## i.e. as the values of cp and thalach increase, the value of target also increases

## Preparing Data For The Model

In [None]:
# Preview the first rows before scaling
df.head()

## Scaling The Data

In [None]:
# Standardise the continuous columns in place with StandardScaler.
# NOTE(review): the scaler is fitted on the whole dataset before the train/test
# split further down, so test rows influence the scaling statistics; consider
# fitting on the training split only.
column_to_scale = ["age","trestbps","chol","thalach"]
standard_scaling = StandardScaler()
scaled_values = standard_scaling.fit_transform(df[column_to_scale])
df[column_to_scale] = scaled_values

In [None]:
# Preview the first rows after scaling
df.head()

In [None]:
# Target labels, and the feature matrix made of every column except the target
y = df["target"]
x = df.drop(columns=["target"])

In [None]:
# Preview the feature matrix
x.head()

In [None]:
# Preview the target labels
y.head()

## Split Data into Training and Test Set

In [None]:
# Seed NumPy's global RNG so the shuffle done by train_test_split (no random_state
# is passed, so it draws from the global RNG) is repeatable
np.random.seed(42)

# Hold out 30% of the rows as the test set
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)

In [None]:
# Number of rows in the training features and in the training labels
len(x_train), len(y_train)

## Modeling

In [None]:
## Put all models into a dictionary mapping a display label to an unfitted estimator.
## The labels end up as keys of the score dictionary and as bar labels in the
## comparison plot, so they are spelled to match the estimator actually used
## (LinearSVC is a different estimator from sklearn.svm.SVC).

models = {"Logistic Regression": LogisticRegression(),
          "RandomForestClassifier": RandomForestClassifier(),
          "Linear SVC": LinearSVC(),
          "Naive Bayes": GaussianNB(),
          "Decision Tree": DecisionTreeClassifier(),
          "KNN": KNeighborsClassifier()
         }
def fit_and_score(models, x_train, x_test, y_train, y_test):
    """Fit each model on the training data and score it on the test data.

    Parameters:
        models: dict mapping a label to an estimator exposing ``fit`` and ``score``.
        x_train, y_train: data passed to each estimator's ``fit``.
        x_test, y_test: data passed to each estimator's ``score``.

    Returns:
        dict mapping each label to the value returned by that estimator's
        ``score``, in the same order as ``models``.

    NumPy's global RNG is re-seeded with 42 before any model is fitted, and
    the estimators in ``models`` are fitted in place.
    """
    np.random.seed(42)

    def _evaluate(estimator):
        # Fit first, then score, one estimator at a time
        estimator.fit(x_train, y_train)
        return estimator.score(x_test, y_test)

    return {label: _evaluate(estimator) for label, estimator in models.items()}

In [None]:
# Fit every candidate model and collect its test-set score, keyed by model label
score = fit_and_score(models, x_train, x_test, y_train, y_test)
score

## KNN and Naive Bayes perform better than other models

## Compare Scores (Initial Score, i.e. Before Hyperparameter Tuning)

In [None]:
# One-row table of scores (row label "accuracy"), transposed so each model becomes a bar
compare_score = pd.DataFrame(score, index=["accuracy"])
compare_score.transpose().plot(kind="bar")

## Hyperparameter Tuning For Models

### Logistic Regression 

In [None]:
## Create a parameter grid for Logistic Regression.
# GridSearchCV accepts a list of dicts and searches each dict as its own grid.
# The grid is split so that only solver/penalty pairs LogisticRegression
# supports are tried; crossing every penalty with every solver produces many
# candidates that can never be fitted.
# The string 'none' (rather than None) is kept for the unpenalised case because
# this notebook imports plot_roc_curve, which only exists in scikit-learn
# releases that still use the 'none' spelling.
log_reg_C = np.logspace(-4, 4, 20)
log_reg_max_iter = [100, 1000, 2500, 5000]

log_reg_grid = [
    # l2 is supported by every solver
    {'penalty': ['l2'],
     'C': log_reg_C,
     'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
     'max_iter': log_reg_max_iter},
    # l1 is only supported by liblinear and saga
    {'penalty': ['l1'],
     'C': log_reg_C,
     'solver': ['liblinear', 'saga'],
     'max_iter': log_reg_max_iter},
    # elasticnet is only supported by saga and requires l1_ratio to be set
    {'penalty': ['elasticnet'],
     'C': log_reg_C,
     'l1_ratio': [0.25, 0.5, 0.75],
     'solver': ['saga'],
     'max_iter': log_reg_max_iter},
    # no penalty: C has no effect, so it is left out of this sub-grid
    {'penalty': ['none'],
     'solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
     'max_iter': log_reg_max_iter},
]

In [None]:
# Tune Logistic Regression with an exhaustive, 5-fold cross-validated grid search.
# n_jobs=-1 is set on the search (as in the random-forest search) so candidate
# fits run in parallel; on the estimator itself it has no effect for a binary
# target. random_state is set on the estimator because the global NumPy seed
# does not carry over into parallel worker processes.
np.random.seed(42)
gs_log_reg = GridSearchCV(LogisticRegression(random_state=42),
                          param_grid=log_reg_grid,
                          cv=5,
                          n_jobs=-1,
                          verbose=True)
gs_log_reg.fit(x_train,y_train)

In [None]:
# Hyperparameter combination with the best cross-validated score
gs_log_reg.best_params_

In [None]:
# Score of the tuned logistic regression on the held-out test set
gs_log_reg.score(x_test,y_test)

In [None]:
# Make predictions on the test data with the tuned logistic regression
y_preds = gs_log_reg.predict(x_test)

In [None]:
# Predicted labels for the test rows
y_preds

In [None]:
# True labels for the test rows, for comparison with the predictions
y_test

In [None]:
# Plot the ROC curve of the tuned logistic regression on the test set.
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer releases the replacement is RocCurveDisplay.from_estimator.
plot_roc_curve(gs_log_reg,x_test,y_test)

### RandomForestClassifier

In [None]:
# Number of trees in the random forest: 10 evenly spaced values from 10 to 80,
# truncated to integers
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]
# Number of features to consider at every split.
# For a classifier 'auto' means exactly the same as 'sqrt', so listing both
# only doubled the number of fits without widening the search; 'auto' is also
# deprecated in newer scikit-learn releases.
max_features = ['sqrt']
# Maximum number of levels in each tree
max_depth = [2,4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Whether each tree is trained on a bootstrap sample (True) or the full training set (False)
bootstrap = [True, False]

In [None]:
# Collect the candidate value lists into the grid searched for the random forest
rf_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [None]:
# Tune the random forest with an exhaustive, 5-fold cross-validated grid search.
# random_state is set on the estimator itself: with n_jobs=-1 the fits run in
# worker processes, where the global NumPy seed below does not apply, so
# without it the forests would differ from run to run.
np.random.seed(42)
gs_rf_grid = GridSearchCV(RandomForestClassifier(random_state=42),
                          param_grid=rf_grid,
                          cv=5,
                          verbose=True,
                          n_jobs=-1)
gs_rf_grid.fit(x_train,y_train)

In [None]:
# Hyperparameter combination with the best cross-validated score
gs_rf_grid.best_params_

In [None]:
# Score of the tuned random forest on the held-out test set
gs_rf_grid.score(x_test,y_test)

In [None]:
# Plot the ROC curve of the tuned random forest on the test set.
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer releases the replacement is RocCurveDisplay.from_estimator.
plot_roc_curve(gs_rf_grid, x_test, y_test)

## KNN

In [None]:
# Candidate values of n_neighbors to try: 1 through 20
neighbors = range(1, 21)

# A single estimator instance is reused; only n_neighbors changes between fits
knn = KNeighborsClassifier()

# Scores on the training and test sets, one entry per candidate value
train_scores, test_scores = [], []

for i in neighbors:
    # Update the neighbour count and refit on the training data
    knn.set_params(n_neighbors=i).fit(x_train, y_train)

    # Record how this setting scores on both splits
    train_scores.append(knn.score(x_train, y_train))
    test_scores.append(knn.score(x_test, y_test))

In [None]:
# Train and test score as a function of the number of neighbours
for curve, caption in ((train_scores, "Train score"), (test_scores, "Test score")):
    plt.plot(neighbors, curve, label=caption)
plt.xticks(np.arange(1, 21, 1))
plt.xlabel("Number of neighbors")
plt.ylabel("Model score")
plt.legend()

# Report the best test score seen across all neighbour counts
print(f"Maximum KNN score on the test data: {max(test_scores)*100:.2f}%")