In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Read the two CSV files of the heart-attack dataset into DataFrames.
heart_csv_path = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
o2_csv_path = '../input/heart-attack-analysis-prediction-dataset/o2Saturation.csv'

heart = pd.read_csv(heart_csv_path)
o2 = pd.read_csv(o2_csv_path)

In [None]:
# Show the first five rows as a quick look at the columns.
heart.head(5)

### Analyzing output Variable

In [None]:
# Count the rows in each class of the target column and keep the result as a one-column frame.
x = heart['output'].value_counts().to_frame()
x

In [None]:
# Build the label/count table inside this cell so the chart does not depend on
# the row order of value_counts() or on the name pandas gives the count column
# (pandas >= 2.0 names it 'count', so values='output' no longer selects the counts).
# Labels follow the original chart, where the first (most frequent) class was
# shown as 'Yes': output == 1 -> 'Yes', output == 0 -> 'No'.
output_counts = (
    heart['output']
    .map({1: 'Yes', 0: 'No'})
    .value_counts()
    .rename_axis('heart_disease')
    .reset_index(name='count')
)
px.pie(output_counts, values='count', names='heart_disease', title='Number of people with heart disease')

* The two classes are of comparable size, so class imbalance should not be a problem.

In [None]:
# Heatmap of the pairwise correlations between all columns.
corr_matrix = heart.corr()
px.imshow(corr_matrix)

In [None]:
# Bar chart of the 'output' row of the correlation matrix, i.e. how strongly
# each column correlates with the target (the row includes 'output' itself).
output_corr = heart.corr().loc['output', :]
fig = px.bar(
    output_corr,
    title='Correlation of heart disease with all the columns',
    color_discrete_sequence=['red'],
)
fig.update_layout(xaxis_title='Columns', yaxis_title='Correlation value')

### Insights
* Ignoring the output column, the columns most correlated with the output are: chest pain (cp), max heart rate (thalachh), exercise-induced angina (exng), oldpeak, slope (slp), number of major vessels (caa), and thall.

# Analyzing the Dataset

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line; call it directly.
heart.info()
print('-'*50)
# Number of distinct values per column (helps spot binary/categorical columns).
print(heart.nunique())

### Insights
* There are no outright null values
* All the columns are numerical 
* Only 303 rows, not a lot of data to go on
* 'sex', 'fbs', 'exng', 'output' are binary (Columns only contain a 0 or 1)

In [None]:
# One box plot per continuous column, split by the target class.
# The loop variable is called `col` rather than `x` so it does not overwrite
# the `x` DataFrame of class counts that an earlier cell creates and plots.
for col in ['age','trtbps','chol','thalachh','oldpeak']:
    sns.boxplot(x=heart['output'], y=heart[col])
    # Title each figure so it can be identified when skimming the output.
    plt.title(f'{col} by output')
    plt.show()

### Insights
* Apart from cholesterol, all the other columns show a clear relationship with the output

In [None]:
# Skewness of the continuous columns.
numeric_cols = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
heart[numeric_cols].skew()

### Insights
* The numerical columns resting blood pressure (trtbps), cholesterol (chol), and oldpeak are heavily skewed, which will need to be corrected later.

In [None]:
# Age against max heart rate, coloured by the target, with box plots on both margins.
px.scatter(
    heart,
    x='age',
    y='thalachh',
    color='output',
    marginal_x='box',
    marginal_y='box',
    title='Age vs Max Heart Rate',
)

In [None]:
# Print the mean of the two columns shown in the scatter plot.
for col_name in ('age', 'thalachh'):
    print(f'Avg of the column {col_name} is:', heart[col_name].mean())

### Insights
* We can see a negative correlation between age and max heart rate achieved. 
* People with heart conditions tend to be younger and to reach a higher max heart rate.
* The difference between the average and the median isn't significant for either column.

In [None]:
# Age against cholesterol, coloured by the target, with box plots on both margins.
px.scatter(
    heart,
    x='age',
    y='chol',
    color='output',
    marginal_x='box',
    marginal_y='box',
    title='Age vs Cholesterol',
)

In [None]:
# Pairwise plots of the selected columns, coloured by the target, with KDE curves on the diagonal.
pairplot_cols = ['cp', 'chol', 'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall', 'output']
sns.pairplot(heart[pairplot_cols], diag_kind='kde', hue='output')

### Insights
* The diagonal columns are the most interesting. They show the relationship between the people with heart disease and without it for each of the columns.
* For example, for cp, most people with no chest pain don't have heart disease, whereas most people with the other kinds of chest pain do.
* This trend repeats with most columns except for cholesterol, where a pattern is hard to detect.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer

In [None]:
# Preprocessing steps: standardise the features, then apply a power transform.
scaler = StandardScaler()
pt = PowerTransformer()

# Classifier: k-nearest neighbours voting over the 2 closest training points.
# NOTE(review): an even n_neighbors can produce tied votes on a binary target;
# consider tuning this value.
knn = KNeighborsClassifier(n_neighbors=2)

In [None]:
# Target is the 'output' column; features are every other column.
y = heart['output']
X = heart.drop(columns=['output'])

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Standardise the features: fit the scaler on the training rows only, then
# apply the same fitted scaler to the test rows.
# The original row index is passed through so the scaled frames stay aligned
# by label with y_train / y_test (rebuilding the frame without `index=` would
# silently replace it with 0..n-1).
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

In [None]:
# Apply the power transform: fit on the training rows only, then reuse the
# fitted transformer on the test rows.
# The current row index of each frame is carried over so the result keeps the
# same row labels as its input instead of being renumbered 0..n-1.
X_train = pd.DataFrame(pt.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(pt.transform(X_test), columns=X.columns, index=X_test.index)

In [None]:
# Train the classifier on the preprocessed training data.
knn.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy of the fitted classifier on the held-out test data.
knn.score(X=X_test, y=y_test)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Cross-validate with the preprocessing inside a pipeline, so the scaler and
# power transformer are re-fitted on the training part of every fold.
# Fitting them on the full X up front leaks validation-fold statistics into
# training and re-fits the shared `scaler` / `pt` objects on all rows; the
# pre-transformed frame X2 is therefore no longer created.
# cross_val_score clones the estimator it receives, so `scaler`, `pt` and
# `knn` themselves are left as they were.
from sklearn.pipeline import make_pipeline

cv_pipeline = make_pipeline(scaler, pt, knn)
cv = cross_val_score(cv_pipeline, X, y)

In [None]:
# Average of the per-fold cross-validation scores.
np.mean(cv)