In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file under the Kaggle input directory,
# then print them one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install dmba

**Loading data**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

# Load the heart-attack dataset from the Kaggle input directory and preview it.
heart_csv_path = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(heart_csv_path)

df.head()

# **Columns that will be dropped**

In [None]:
# Remove the columns this analysis does not use. Reassigning the result
# (rather than inplace=True) together with errors='ignore' keeps the cell
# safe to re-run: a second execution no longer raises KeyError for columns
# that are already gone.
unused_cols = ['restecg', 'oldpeak', 'slp', 'caa', 'thall']
df = df.drop(columns=unused_cols, errors='ignore')

# Make sure no column name contains a space.
df.columns = [c.replace(' ', '_') for c in df.columns]
df.head(10)

**Descriptive statistics**

In [None]:
# (number of rows, number of columns) of the working frame.
df.shape

In [None]:
# Arithmetic mean of every column.
df.mean()

In [None]:
# Largest value in every column.
df.max()

In [None]:
# Smallest value in every column.
df.min()

In [None]:
# Standard deviation of every column.
df.std()

**Missing values and Outliers**

In [None]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

To handle missing values, we first need code that detects them; in our case there are none. We then describe the data to see whether there are any outliers that do not make sense; in our case we have a total of 5 outliers. These outliers will be kept in order to show that very high cholesterol levels can be a cause of a heart attack.

In [None]:
# Shows a copy of df without any column that contains a missing value.
# The result is only displayed, not assigned, so df itself is left unchanged.
df.dropna(axis='columns')

In [None]:
# Summary statistics for each column, used to look for implausible values.
df.describe()

In [None]:
# Two column subsets of the working frame; only the second one is plotted here.
df_1 = df.loc[:, ['cp', 'trtbps']]
df_2 = df.loc[:, ['chol', 'fbs', 'thalachh']]

# Horizontal box plots, one per column of df_2, to make outliers visible.
ax = sns.boxplot(data=df_2, orient="h", palette="Set2")

Chol has a couple of outliers

In [None]:
# categorical columns
# NOTE(review): 'restecg' and 'thall' are listed here although the earlier
# column-drop cell removes them from df, so df[at_cols] would raise a KeyError.
# The name is presumably a typo for `cat_cols` — confirm before using this list.
at_cols = ["sex", "exng", "cp", "fbs", "restecg", "thall"]

# continuous column
con_cols = ["age", "trtbps", "chol", "thalachh"]

# target 
target_col = ["output"]

In [None]:
def replace_outliers_with_mean(values, whisker=1.5):
    """Return a copy of ``values`` with IQR outliers replaced by the mean.

    A value is treated as an outlier when it lies below ``Q1 - whisker * IQR``
    or above ``Q3 + whisker * IQR``.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to clean.
    whisker : float, default 1.5
        Multiple of the inter-quartile range that defines the fences.

    Returns
    -------
    pandas.Series
        New series (same index) in which every outlier has been replaced by
        the mean of ``values``. The input series is not modified.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_lim = q1 - whisker * iqr
    upper_lim = q3 + whisker * iqr
    is_outlier = (values < lower_lim) | (values > upper_lim)

    # The replacement value is the mean of all values, outliers included,
    # which is what the previous implementation used as well.
    return values.mask(is_outlier, values.mean())


#replacing outliers with the mean
# Series.mask builds a new column instead of mutating a temporary one-column
# DataFrame in place, so the result does not depend on whether that temporary
# was a view or a copy of df, and pre-existing NaNs are not touched.
for col in con_cols:
    df[col] = replace_outliers_with_mean(df[col])

In [None]:
# Summary statistics of the continuous columns after outlier replacement,
# one row per column.
df[con_cols].describe().T

**Data Visualization**

In [None]:
from matplotlib import pyplot as plt

In [None]:
data_num = df[['age', 'trtbps', 'chol', 'thalachh']]
data_cat = df[['cp', 'fbs']]

# One histogram per numeric column, each drawn on its own figure and
# titled with the column name.
for col_name, col_values in data_num.items():
    plt.hist(col_values)
    plt.title(col_name)
    plt.show()

* age = Distribution of column age shows that there are outliers on the left side of the distribution.
* trtbps = Distribution of column trtbps shows outliers on both left as well as right side
* chol = Distribution of column chol shows that there are outliers on the right side of the distribution.
* thalachh = Distribution of column thalachh shows that there are outliers on the left side of the distribution

**Bar Charts and Aggregation**

In [None]:
import pandas as pd
 
import matplotlib.pyplot as plt

In [None]:
# Prints a fixed list of categorical variable names (a literal string; it is
# not derived from the current columns of df).
print("Categorical Variables: 'sex', 'cp', 'fbs', 'restecg', 'exng', 'thall', 'caa', 'slp' ")

In [None]:
# Re-reads the original CSV, so this frame holds the raw values: all columns,
# without the outlier replacement applied to df above.
# NOTE(review): the name df_age_chol is rebound to df[['age', 'chol']] a few
# cells further down — confirm which version each plot is meant to use.
df_age_chol=pd.read_csv('../input/heart-attack-analysis-prediction-dataset/heart.csv')
 
df_age_chol

In [None]:
# Bar chart of cholesterol (bar height) against age (bar position).
ages = df_age_chol['age']
chol_levels = df_age_chol['chol']
plt.bar(x=ages, height=chol_levels)

plt.xlabel("AGE", fontsize=10)
plt.ylabel("CHOLESTORAL", fontsize=10)

In [None]:
# Keep only the age and cholesterol columns of the working frame and preview them.
df_age_chol = df.loc[:, ['age', 'chol']]
df_age_chol.head()

In [None]:
# Scatter plot of cholesterol against age; fit_reg=False turns off the
# regression line and alpha=0.6 makes overlapping points partly transparent.
sns.regplot(x = "age", y="chol", data=df, fit_reg = False, scatter_kws={"alpha": 0.6})

In [None]:
# Line plot of resting blood pressure (trtbps) against age on a 10x7 figure.
bp_fig, bp_ax = plt.subplots(figsize=(10, 7))
sns.lineplot(data=df, x="age", y="trtbps", ax=bp_ax)
bp_ax.set_title("BLOOD PRESSURE WITH AGE", fontsize=10)
bp_ax.set_xlabel("AGE", fontsize=10)
bp_ax.set_ylabel("BLOOD PRESSURE", fontsize=10)
plt.show()

In [None]:
# Histogram of cholesterol in 40 bins, coloured by the value of `output`.
chol_ax = sns.histplot(data=df, x='chol', hue='output', bins=40)
chol_ax.set_title("Heart Attack Counts w.r.t Cholesterol Level in mg/dl");

* Cholesterol levels in the range of 200 - 270 have higher chances of heart attack
* output: 0 = low chance of heart attack, 1 = high chance of heart attack

In [None]:
# Count of records per `output` value, split by chest-pain type (cp).
cp_fig, cp_ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='output', hue='cp', ax=cp_ax)
cp_ax.set_title("Heart Attack Counts w.r.t Chest Pain");

* Non-anginal chest pain leads to more heart attacks than the other types of pain. Typical pain might strongly indicate a lower chance of heart attack.
* 0 = asymptomatic 1 = non-anginal 2 = atypical 3 = typical

**Correlation Analysis**

In [None]:
# Pairwise correlation between all columns of df, printed as plain text.
corrMatrix = df.corr()
print (corrMatrix)

In [None]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt


# Columns shown in the heatmap.
heatmap_cols = ['age', 'cp', 'trtbps', 'chol', 'fbs', 'thalachh', 'output']
df_corr = pd.DataFrame(df, columns=heatmap_cols)
corrMatrix = df_corr.corr()

# Annotated heatmap of the pairwise correlations on a 14x8 figure.
plt.figure(figsize=(14, 8))
sn.heatmap(corrMatrix, annot=True)
plt.show()

* Based on the correlation data, chest pain (cp) and thalachh (maximum heart rate) have high positive correlations with a higher chance of heart attack.
* Similarly, exng has a high negative correlation with the chance of heart attack.
* Age seems to moderately affect the chance of heart attack.
* Cholesterol and fbs (fasting blood sugar) seem to have only a slight effect on the chance of heart attack.

It seems like multicollinearity is an issue here, because there are several predictor variables that overlap so much in what they measure that their effects are indistinguishable.

**Using VIF here checks to see if multicollinearity is a factor - which it is**

In [None]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Predictors are every column except the target. The previous positional
# slice df.columns[:-2] also discarded 'exng', which the KNN model below
# uses as a predictor, so its VIF was never reported.
X = df.drop(columns=['output'])

# variance_inflation_factor regresses each column on the other columns exactly
# as given and does not add an intercept itself. Without a constant column the
# auxiliary regressions are forced through the origin, which inflates the VIFs
# of variables with a non-zero mean, so add one explicitly.
X_const = sm.add_constant(X, has_constant='add')

# Position 0 is the constant; report VIFs for the real predictors only.
# NOTE(review): these values differ from the intercept-free ones — re-check the
# multicollinearity conclusion in the surrounding text against this table.
vif_info = pd.DataFrame()
vif_info['VIF'] = [variance_inflation_factor(X_const.values, i)
                   for i in range(1, X_const.shape[1])]
vif_info['Column'] = X.columns
vif_info.sort_values('VIF', ascending=False)

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
import matplotlib.pylab as plt
from dmba import plotDecisionTree, classificationSummary, regressionSummary

**Dummies**

Dummies need to be created for the 'cp' column because it is a categorical variable with 4 categories: "typical angina," "atypical angina," "non-anginal pain," and "asymptomatic". One dummy column is dropped because its value can be derived from the other dummy columns, and it is therefore redundant.

In [None]:
#dummies needed for 'cp'
df['cp'] = df['cp'].astype('category')
new_categories = {1: 'typical angina', 2: 'atypical angina', 3: 'non-anginal pain', 3: 'asymptomatic'}
df.cp.cat.rename_categories(new_categories, inplace=True)

df = pd.get_dummies(df, prefix_sep='_', drop_first=True)
df.head(10)

**Normalization**

Independent variables in the KNN training set will need to be normalized so that the differences in range of values can be scaled in relation to one another. Independent variables are normalized in the Neural net because categories are not equidistant and performance is better when variables are centered about 0, as opposed to 0-1. Normalization is not used in the Random forest because an average of estimates is used to make predictions.

Furthermore, the reason we didn't normalize is that each column is on its own scale: the columns are either binary indicators, age, or measurements in mg/dl (cholesterol) or mm Hg, so normalizing them doesn't make sense and wouldn't make the data easier to read. Just using correlation makes us understand that there will be outliers in our data.

**Modeling: KNN**

In [None]:
#Partitioning
trainData, validData = train_test_split(df, test_size=0.4, random_state=26)
print(trainData.shape, validData.shape)

**Preprocessing**

Data is fit to a training dataframe using the preprocessing StandardScaler from sklearn. The scaler data is transformed into a normalized dataframe and concatenated with output data, then partitioned into training and validation sets. The sets are then split into x and y, and passed to KNN classifier from sklearn.

In [None]:
# Predictor columns to standardise, and the names of their z-scored versions
# ('z' + original name).
predictors = ['age', 'cp_typical angina', 'cp_atypical angina', 'cp_asymptomatic',
              'trtbps', 'chol', 'fbs', 'thalachh', 'exng']
z_columns = ['z' + name for name in predictors]

# Fit the scaler on the training rows only.
scaler = preprocessing.StandardScaler()
scaler.fit(trainData[predictors])

# Transform the full dataset. The scaled frame is given df's own index so the
# concat with the target column lines up row by row even when df's index is
# not the default 0..n-1.
heartNorm = pd.concat(
    [pd.DataFrame(scaler.transform(df[predictors]), columns=z_columns, index=df.index),
     df[['output']]],
    axis=1,
)

# Then repartition into train and test using the row index labels.
# trainData.index / validData.index hold labels, so select with .loc;
# .iloc would treat them as positions.
trainNorm = heartNorm.loc[trainData.index]
validNorm = heartNorm.loc[validData.index]

**More partitioning**

In [None]:
# Standardised predictor columns fed to the classifier; 'output' is the target.
feature_cols = ['zage', 'zcp_typical angina', 'zcp_atypical angina', 'zcp_asymptomatic',
                'ztrtbps', 'zchol', 'zfbs', 'zthalachh', 'zexng']

train1_X, train1_y = trainNorm[feature_cols], trainNorm['output']
valid1_X, valid1_y = validNorm[feature_cols], validNorm['output']

**Run KNN**

In [None]:
# Fit one KNN classifier for each k from 1 to 14 and record its accuracy on
# the validation set.
accuracy_records = []
for k in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train1_X, train1_y)
    valid_pred = knn.predict(valid1_X)
    accuracy_records.append({'k': k, 'accuracy': accuracy_score(valid1_y, valid_pred)})

# Convert results to a pandas data frame
results = pd.DataFrame(accuracy_records)
print(results)

We will choose k=3 for KNN because it is the smallest value of k that provides the highest accuracy.

In [None]:
# training performance
classificationSummary(train1_y, knn.predict(train1_X))

# validation performance
classificationSummary(valid1_y, knn.predict(valid1_X))