In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Main heart-attack table, then the separate O2-saturation file shipped with the same dataset.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
)
df_o2 = pd.read_csv(
    filepath_or_buffer='/kaggle/input/heart-attack-analysis-prediction-dataset/o2Saturation.csv'
)

In [None]:
# Preview the first five rows of the loaded table.
df.head(n=5)

## About this dataset

* Age : Age of the patient

* Sex : Sex of the patient

* exng : exercise induced angina (1 = yes; 0 = no)

* caa : number of major vessels (0-3)

* cp : chest pain type

    * Value 1: typical angina
    * Value 2: atypical angina
    * Value 3: non-anginal pain
    * Value 4: asymptomatic

* trtbps : resting blood pressure (in mm Hg)

* chol : cholesterol in mg/dl fetched via BMI sensor

* fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

* restecg : resting electrocardiographic results

    * Value 0: normal
    * Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    * Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

* thalachh : maximum heart rate achieved

* output : 0 = less chance of heart attack, 1 = more chance of heart attack

## Basic EDA

In [None]:
# Show the column labels of the loaded table.
df.columns

In [None]:
# Optional (currently disabled): cast the categorical columns to the pandas 'category' dtype.
#df = df.astype({'sex':'category', 'exng':'category', 'cp':'category', 'fbs':'category', 'restecg':'category', 'caa':'category', 'output':'category'})

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
df.info()

# Descriptive statistics of the columns, rendered as a rich table.
display(df.describe())

In [None]:
# Draw one histogram per column on a single 16x8 figure.
df.hist(figsize=(16,8))
plt.show()

In [None]:
# Pairwise correlation of all columns, each cell annotated with its coefficient.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df.corr(), annot=True, ax=ax)
ax.set_title('Feature correlation matrix')
# Finish with plt.show() like the other plotting cells so the Axes repr is not
# echoed as the cell output.
plt.show()

In [None]:
# Class balance of the target column.
# The column is named by keyword: seaborn >= 0.12 treats the first positional
# argument as `data` (everything after it is keyword-only), and older releases
# already warned about passing `x` positionally.
sns.countplot(x='output', data=df)
plt.show()

#### we don't need smoothing here

In [None]:
# Age histogram, coloured by the value of `output`.
sns.displot(data=df, x='age', hue='output', alpha=0.6)
plt.show()

In [None]:
# Rows whose `output` is 1. Take an explicit copy so the subset is an
# independent frame that can be modified without SettingWithCopyWarning.
attack = df[df['output'] == 1].copy()

# Kernel density estimate of age within that subset.
sns.displot(attack.age, kind='kde')
plt.show()

In [None]:
# Empirical CDF of age within the `attack` subset, with a grid for reading off quantiles.
sns.displot(data=attack['age'], kind='ecdf')
plt.grid(True)
plt.show()

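#### The ECDF shows that about 80% of the patients in the heart-attack group (`output == 1`) are younger than 60. In other words, roughly 20% of the heart-attack cases are older than 60 and roughly 60% are older than 50. Note that these are shares of the heart-attack group, not the probability that a person of a given age gets a heart attack.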

In [None]:
# Bin edges (right-inclusive, pd.cut's default) and one label per bin.
ranges = [0, 30, 40, 50, 60, 70, np.inf]
labels = ['0-30', '30-40', '40-50', '50-60', '60-70', '70+']

# Rebuild the subset from `df` and attach the binned column with assign():
#  * nothing is written into a slice of `df` (no SettingWithCopyWarning), and
#  * the cell can be re-run, because it never bins an already-binned column.
attack = df[df['output'] == 1].assign(
    age=lambda frame: pd.cut(frame['age'], bins=ranges, labels=labels)
)
attack['age'].head()

In [None]:
# Number of rows of `attack` per value of its `age` column.
# The column is named by keyword because seaborn >= 0.12 interprets the first
# positional argument as `data`, not as `x`.
sns.countplot(x='age', data=attack)
plt.show()

#### Age does affect the chance of a heart attack in this data: most heart-attack cases are between 40 and 60 years old, and only about 20 percent of them are older than 60.

In [None]:
# Counts of `attack` rows per sex, split by the `age` column.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=attack, x='sex', hue='age', ax=ax)
# NOTE(review): the relabelling presumes sex is coded 0 = female, 1 = male — confirm against the dataset.
ax.set_xticklabels(['Female', 'Male'])
ax.legend(loc='upper left')
plt.show()

#### Males have higher counts here, but that is because there are more males in this dataset; we still have to find out which gender has the higher chance of a heart attack.

In [None]:
# Re-derive the subset with `output` equal to 1 from `df`, then compare the
# age density of the two sexes.
is_attack = df['output'] == 1
attack = df[is_attack]
sns.displot(data=attack, x='age', hue='sex', kind='kde')
plt.grid(True)
plt.show()

In [None]:
# Empirical CDF of age in the `attack` subset, one curve per sex.
sns.displot(data=attack, x='age', hue='sex', kind='ecdf')
plt.grid(True)
plt.show()

#### Both genders have similar chances overall. The only difference is that males have a high chance of a heart attack at an earlier age (i.e. after age 30), whereas for females it comes somewhat later; after age 70, however, the chance is higher for females.

## Classifier 1

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [None]:
# Build the model matrix in a new frame `df1`; `df` itself is not modified
# because get_dummies returns a new DataFrame.
scaler = StandardScaler()

# define the columns to be encoded and scaled
cat_cols = ['sex','exng','caa','cp','fbs','restecg','slp','thall']
con_cols = ["age","trtbps","chol","thalachh","oldpeak"]

# encoding the categorical columns (first level of each dropped)
df1 = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# scaling the continuous features
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split in the next cell, so test-set statistics influence the training
# features. Consider fitting it on the training part only.
df1[con_cols] = scaler.fit_transform(df1[con_cols])

# defining the features and target
X = df1.drop(columns='output')
# 1-D Series rather than a one-column DataFrame: estimators expect a 1-D target
# and emit DataConversionWarning for a column vector.
y = df1['output']

print("The first 5 rows of X are")
print(X.head())

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each of the four parts.
for caption, part in (
    ("The shape of X_train is      ", X_train),
    ("The shape of X_test is       ", X_test),
    ("The shape of y_train is      ", y_train),
    ("The shape of y_test is       ", y_test),
):
    print(caption, part.shape)

In [None]:
# Logistic regression with default hyper-parameters and a fixed seed.
lr = LogisticRegression(random_state=42)

# k-nearest neighbours: search over neighbourhood sizes 1..49 with 5-fold CV.
# (The search objects in this cell are only constructed; none is fitted here.)
knn = KNeighborsClassifier()
para_knn = dict(n_neighbors=np.arange(1, 50))
grid_knn = GridSearchCV(estimator=knn, param_grid=para_knn, cv=5)

# Decision tree: split criterion x depth 1..49 x minimum leaf size.
dt = DecisionTreeClassifier()
para_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=np.arange(1, 50),
    min_samples_leaf=[1, 2, 4, 5, 10, 20, 30, 40, 80, 100],
)
grid_dt = GridSearchCV(estimator=dt, param_grid=para_dt, cv=5)

# Random forest: number of trees x minimum leaf size.
rf = RandomForestClassifier()
params_rf = dict(
    n_estimators=[100, 350, 500],
    min_samples_leaf=[2, 10, 30],
)
grid_rf = GridSearchCV(estimator=rf, param_grid=params_rf, cv=5)

In [None]:
# Replace the default estimators with fixed hyper-parameters.
# NOTE(review): presumably these values came from running the grid searches — confirm.
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=9,
    min_samples_leaf=10,
    random_state=42,
)
knn = KNeighborsClassifier(n_neighbors=3)
rf = RandomForestClassifier(
    n_estimators=500,
    min_samples_leaf=2,
    random_state=42,
)

In [None]:
# (display name, estimator) pairs for the individual models to compare.
classifiers = [
    ('Logistic Regression', lr),
    ('K Nearest Neighbours', knn),
    ('Classification Tree', dt),
    ('Random Forest', rf),
]

In [None]:
# Fit every classifier on the training part and report its test-set accuracy.
for clf_name, clf in classifiers:

    # np.ravel guarantees a 1-D target, so a one-column DataFrame does not
    # trigger DataConversionWarning inside fit().
    clf.fit(X_train, np.ravel(y_train))

    # Predict the held-out rows
    y_pred = clf.predict(X_test)

    # accuracy_score is documented as (y_true, y_pred); keep that order so the
    # call stays correct if it is ever swapped for an asymmetric metric.
    accuracy = accuracy_score(y_test, y_pred)

    print('{:s} : {:.3f}'.format(clf_name, accuracy))

In [None]:
# Members of the voting ensemble (the classification tree is left out).
classifiers = [
    ('Logistic Regression', lr),
    ('Random Forest', rf),
    ('K Nearest Neighbours', knn),
]

In [None]:
# Soft-voting ensemble over the (name, estimator) pairs in `classifiers`.
# VotingClassifier is already imported in the first cell of the notebook.
vc = VotingClassifier(estimators=classifiers, voting='soft')

# np.ravel guarantees a 1-D target (avoids DataConversionWarning for a
# one-column DataFrame).
vc.fit(X_train, np.ravel(y_train))

# Evaluate the test set predictions
y_pred = vc.predict(X_test)

# accuracy_score expects (y_true, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print('Voting Classifier: {:.3f}'.format(accuracy))

In [None]:
# AdaBoost over the tuned random forest (AdaBoostClassifier is imported in the
# first cell of the notebook).
# The weak learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed, while the first positional slot works on both sides of the rename.
ada = AdaBoostClassifier(rf, n_estimators=100, random_state=1)

# np.ravel guarantees a 1-D target (avoids DataConversionWarning).
ada.fit(X_train, np.ravel(y_train))

y_pred = ada.predict(X_test)

# accuracy_score expects (y_true, y_pred); shown as the cell output.
accuracy_score(y_test, y_pred)

In [None]:
# Importance of each training column according to the fitted random forest,
# ordered ascending so the largest bar ends up at the top of the chart.
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
importances_sorted = importances.sort_values()

plt.figure(figsize=(10, 10))
importances_sorted.plot.barh(color='lightgreen')
plt.title('Features Importances')
plt.show()

In [None]:
# Imports used by this cell
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

# Standardise, then run a full PCA, chained in one pipeline
scaler, pca = StandardScaler(), PCA()
pipeline = make_pipeline(scaler, pca)

# Fit on the training features
pipeline.fit(X_train)

# Bar chart: explained variance of each principal component
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_)
plt.xticks(features)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.show()

In [None]:
# Import used by this cell
from sklearn.metrics import roc_curve

# Predicted probability of class 1 on the test rows, per model
y_pred_prob_ada = ada.predict_proba(X_test)[:, 1]
y_pred_prob_lr = lr.predict_proba(X_test)[:, 1]
y_pred_prob_vc = vc.predict_proba(X_test)[:, 1]

# ROC points per model (fpr = false positive rate, tpr = true positive rate);
# `thresholds` is overwritten each time and holds the last model's values.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob_ada)
fpr1, tpr1, thresholds = roc_curve(y_test, y_pred_prob_lr)
fpr2, tpr2, thresholds = roc_curve(y_test, y_pred_prob_vc)

# One curve per model plus the dashed chance diagonal
plt.figure(figsize=(10, 10))
plt.plot([0, 1], [0, 1], 'k--')
for x_vals, y_vals, tag in (
    (fpr, tpr, 'ADA'),
    (fpr1, tpr1, 'LR'),
    (fpr2, tpr2, 'VC'),
):
    plt.plot(x_vals, y_vals, label=tag, alpha=0.7)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

## Classifier 2

In [None]:
# Every column of `df` except the target.
samples = df.drop(columns='output')
samples.head()

In [None]:
# Imports used by this cell
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

# Standardise, then run a full PCA, chained in one pipeline
scaler, pca = StandardScaler(), PCA()
pipeline = make_pipeline(scaler, pca)

# Fit on `samples` (all columns except the target)
pipeline.fit(samples)

# Bar chart: explained variance of each principal component
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_)
plt.xticks(features)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.show()

In [None]:
# Standardise every column of `samples`.
scaled_samples = StandardScaler().fit_transform(samples)

# Keep the first 10 principal components: fit on the scaled data, then project it.
pca = PCA(n_components=10)
pca_features = pca.fit(scaled_samples).transform(scaled_samples)

# Shape of the projected data
print(pca_features.shape)

In [None]:
# Wrap the PCA projection in a DataFrame (default integer column labels) and preview it.
df2 = pd.DataFrame(data=pca_features)
df2.head()

In [None]:
from sklearn.cluster import KMeans

# Elbow plot: inertia of KMeans on the PCA features for k = 1..7.
ks = range(1, 8)
inertias = []

for k in ks:
    # Fixed seed: KMeans starts from random centroids, so without it the curve
    # changes from run to run.
    model = KMeans(n_clusters=k, random_state=42)

    # Fit model to the PCA features
    model.fit(pca_features)

    # Append the inertia to the list of inertias
    inertias.append(model.inertia_)

# Plot ks vs inertias
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

In [None]:
# Two-cluster KMeans on the PCA features.
# * random_state makes the clustering repeatable.
# * fit and predict use the same array (`pca_features`) instead of mixing the
#   DataFrame wrapper with the raw array.
model2 = KMeans(n_clusters=2, random_state=42)
model2.fit(pca_features)
labels = model2.predict(pca_features)

# Cluster ids are arbitrary: cluster 0 may just as well correspond to output 1.
# The cells below compare `labels` with `df.output` directly, so flip the ids
# when they agree with the target on fewer than half of the rows.
if (labels == df['output'].to_numpy()).mean() < 0.5:
    labels = 1 - labels

In [None]:
# Cluster label next to the true target, then a contingency table
# (rows: Output, columns: Labels).
sub = pd.DataFrame({'Labels': labels, 'Output': df.output})
pd.crosstab(index=sub['Output'], columns=sub['Labels'])

In [None]:
from sklearn.metrics import classification_report

# Precision / recall / F1 of the cluster labels measured against the true target.
report = classification_report(df.output, labels)
print(report)

In [None]:
# ROC of the hard cluster labels against the true target (roc_curve is imported
# in the earlier ROC cell).
fpr, tpr, thresholds = roc_curve(sub.Output, sub.Labels)
plt.plot([0, 1], [0, 1], 'k--')
# The curve needs a label: plt.legend() otherwise finds no labelled artist,
# emits a warning and draws an empty legend.
plt.plot(fpr, tpr, alpha=0.7, label='KMeans (k=2)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()