In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory,
# in the same order os.walk visits them, then print one path per line.
input_file_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
df = pd.read_csv('/kaggle/input/heart-disease-uci/heart.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().value_counts()

In [None]:
# drop_duplicates() returns a new frame and leaves `df` untouched, so the
# result has to be assigned back; otherwise the duplicated rows counted in the
# previous cell stay in the data used by every later cell.
df = df.drop_duplicates()
# Show the resulting (rows, columns) instead of dumping the whole frame.
df.shape

## Exploratory Data Analysis

### Analysing the target feature

In [None]:
# Target column; the messages printed in the next cell treat 0 as
# "without heart problems" and 1 as "with heart problems".
y = df["target"]

# Pass the series by keyword: positional data arguments are deprecated in
# seaborn 0.11 and are no longer interpreted as `x` from 0.12 on.
sns.countplot(x=y)

# Absolute number of patients in each class.
target_temp = df.target.value_counts()

print(target_temp)

In [None]:
# Derive the denominator from the counts themselves instead of the hard-coded
# 303, so the percentages stay correct when rows are added or removed
# (for example after de-duplication).
total_patients = target_temp.sum()
pct_without = round(target_temp.loc[0] * 100 / total_patients, 2)
pct_with = round(target_temp.loc[1] * 100 / total_patients, 2)

# "patience" in the original messages was a typo for "patients".
print("Percentage of patients without heart problems: " + str(pct_without))
print("Percentage of patients with heart problems: " + str(pct_with))

The two classes have similar counts (see the percentages above), so the dataset is reasonably balanced.

### Analysing the 'Sex' feature

In [None]:
df['sex'].value_counts()

In [None]:
sns.barplot(df["sex"],y)

### Analysing the 'Chest Pain Type' feature

The chest pain experienced (Value 1: typical angina, Value 2: atypical angina, Value 3: non-anginal pain, Value 4: asymptomatic)

In [None]:
# Count of patients per chest-pain code, split by `sex`.
ax = sns.countplot(data=df, x="cp", hue="sex")
# Label through the Axes object rather than the pyplot state machine.
ax.set_title('Heart Disease count according To Chest Pain Type')
ax.set_xlabel('Chest Pain Type')
ax.set_ylabel('Count')
plt.show()

From this it can be inferred that type 4 chest pain is more common in men (male) and type 3 in women (female).

### Analysing the FBS feature

In [None]:
df['fbs'].describe()

In [None]:
df['fbs'].unique()

In [None]:
sns.barplot(df["fbs"],y)

Nothing significant can be inferred.

### Analysing the restecg feature

Resting electrocardiographic measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria)

In [None]:
df['restecg'].unique()

In [None]:
sns.countplot(df['restecg'])

In [None]:
sns.barplot(df['restecg'],y)

Nothing very significant; however, patients with type 1 (having ST-T wave abnormality) appear more prone to heart disease. The data for type 2 is insufficient to arrive at a conclusion.

### Analysing the Slope feature

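Value 1: upsloping, Value 2: flat, Value 3: downsloping. (Note: the discussion below refers to slope values 0, 1 and 2, so confirm the encoding actually used in this file against the output of `unique()`.)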

In [None]:
df['slope'].unique()

In [None]:
sns.countplot(df['slope'])

In [None]:
sns.barplot(df['slope'],y)

We observe that patients with slope 2 are more likely to have heart problems than those with slope 1 or slope 0.

### Analysing the number of major(ca) vessels feature
The number of major vessels (0-3)

In [None]:
df['ca'].unique()

In [None]:
sns.countplot(df["ca"])

In [None]:
sns.barplot(df['ca'],y)

### Analysing the 'thal' feature
A blood disorder called thalassemia 

In [None]:
df['thal'].unique()

In [None]:
sns.countplot(df['thal'])

In [None]:
sns.barplot(df['thal'],y)

### Analysing the 'exang' feature


In [None]:
df['exang'].unique()

In [None]:
sns.countplot(df['exang'])

In [None]:
sns.barplot(df['exang'],y)

### Age distribution

In [None]:
import warnings 
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices from later cells; consider narrowing it.
warnings.filterwarnings('ignore')

# Global plot styling; both settings also apply to every later figure.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (7,5)
# distplot is deprecated in seaborn; a density-normalised histogram with a KDE
# overlay is the documented replacement for its default output.
sns.histplot(data=df, x='age', kde=True, stat='density')
plt.title('Distribution of Age')
plt.show()

### Analysing the 'trestbps' feature
The person's resting blood pressure (mm Hg on admission to the hospital)


In [None]:
df['trestbps'].describe()

In [None]:
sns.boxplot(df['target'],df['trestbps'])
plt.title('Relation btw restbp and target')
plt.show()

The above bivariate plot shows the relation between trestbps (the resting blood pressure of the patient) and the target (whether or not the person is suffering from heart disease). The plot suggests that patients with heart disease have slightly lower resting blood pressure than patients without heart disease.

### Analysing the 'chol' feature

In [None]:
df['chol'].describe()

In [None]:
plt.rcParams['figure.figsize'] = (10, 9)
sns.violinplot(df['target'], df['chol'])
plt.title('Relation of Cholestrol with Target')
plt.show()

The above violin plot shows the relation between cholesterol and the target variable. The data shows that people with heart disease have slightly higher cholesterol levels than those without heart disease.

### Analysing the 'thalach' feature
The person's maximum heart rate achieved

In [None]:
df['thalach'].describe()

In [None]:
sns.boxplot(df['target'],df['thalach'])
plt.title('Relation btw max heart rate and target')
plt.show()

In [None]:
sns.violinplot(df['target'], df['thalach'])
plt.title('Relation btw max heart rate and target')
plt.show()

It is evident from the above bivariate plots that the maximum heart rate of individuals with heart disease is noticeably higher than that of individuals without heart disease. Hence the maximum heart rate is positively correlated with the target variable, i.e. heart disease.

### Correlation

In [None]:
# Pairwise correlation between the columns of df.
corrmat = df.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
# Plot the matrix computed above instead of recomputing the same correlation
# a second time from df[top_corr_features].
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")
g.set_title('Correlation between features')
plt.show()

In [None]:
df = df.drop(columns=(['exang']))

In [None]:
df = pd.get_dummies(df, columns = ['sex','cp','restecg','slope','thal','ca','fbs'])

### StandardScaling

In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
columns_to_scale = ['age','trestbps','chol','oldpeak']
df[columns_to_scale] = standardScaler.fit_transform(df[columns_to_scale])

In [None]:
df.head(20)

In [None]:
#train-test split
y = df['target']
X = df.drop(['target'], axis = 1)
X_train, X_test,y_train, y_test = train_test_split(X,y,test_size=0.3,random_state = 10, shuffle=True)

### K-Nearest Neighbour:

In [None]:
from sklearn import neighbors 
clf = neighbors.KNeighborsClassifier(n_neighbors=2, p=1)
clf.fit(X_train,y_train)

In [None]:
k_range = range(1,26)
scores={}

for k in k_range:
    knn=KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train,y_train)
    predict_knn = knn.predict(X_test)
    scores[k]=accuracy_score(y_test,predict_knn)
scores

In [None]:
accuracy = clf.score(X_test,y_test)
accuracy

### XGBoost:

In [None]:
from xgboost import XGBClassifier
model_xgb = XGBClassifier()
model_xgb.fit(X_train,y_train)

In [None]:
# Predicting the model
y_predict_xgb = model_xgb.predict(X_test)
# Finding accuracy, precision, recall and confusion matrix
print(accuracy_score(y_test,y_predict_xgb))
print(classification_report(y_test,y_predict_xgb))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(max_depth=3)
classifier.fit(X_train, y_train)


In [None]:
# Predicting the Test set results
y_pred_rf = classifier.predict(X_test)

print(accuracy_score(y_test,y_pred_rf))

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=3,min_samples_split=3,max_features=13)
dt.fit(X_train, y_train)

In [None]:
y_pred_df = dt.predict(X_test)

print(accuracy_score(y_test, y_pred_df))

### Support Vector Machine

In [None]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train,y_train)

In [None]:
y_pred_svm = svm.predict(X_test)

print(accuracy_score(y_test, y_pred_svm))