In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# Set seaborn's default colour palette to 'pastel'.
sns.set_palette('pastel')

### Problem Statement: 
To identify whether a patient is at risk of heart disease based on age, sex, and medical parameters



### Summary of background research:
Heart attacks are induced by blood clots, which prevent blood flow. These blood clots form when plaque that has built up within blood vessels becomes dislodged. Plaque is essentially a build-up of cholesterol and fat on blood vessel walls.

Factors that promote plaque build-up include high cholesterol, unhealthy diets, and lack of exercise. These factors can be quantified by attributes such as blood pressure, cholesterol levels, heart rate and so forth, which give an indication of the health of the patient's heart and circulatory system.

### Step One: Loading the data

In [None]:
# Read the heart-disease CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('../input/heart-disease-uci/heart.csv')

### Step Two: Check dataset (Missing values / Outliers etc.)

In [None]:
# Summary statistics per column, used to eyeball value ranges and possible outliers.
data.describe()

In [None]:
# Number of missing entries in each column.
data.isna().sum()

In [None]:
# Show the rows whose cholesterol reading exceeds 380.
data.loc[data['chol'] > 380]

In [None]:
# Remove the rows with a cholesterol reading above 380 (modifies `data` in place).
high_chol_rows = data.index[data['chol'] > 380]
data.drop(index=high_chol_rows, inplace=True)

Observations with a cholesterol level above 380 are removed in the cell above. These values do not correspond to serum cholesterol levels, and are most likely triglyceride values.

Other values seem about right.

No missing values.

### Step Three: Exploring the data

In [None]:
# Class balance: how many rows fall under each target value.
data['target'].value_counts()

### Comment on Dataset:
The data is well balanced, thus classification inaccuracy due to unbalanced data is not a risk

In [None]:
# Summary statistics again, now that the rows with cholesterol above 380 have been dropped.
data.describe()

In [None]:
# Histogram of each column (30 bins each) drawn on one large 30x30 figure.
data.hist(bins=30, figsize=(30,30))

In [None]:
# Overlay the age distributions of the two target groups, drawing target 0 first
# and target 1 second so they line up with the legend entries in that order.
# NOTE(review): the legend treats target 0 as 'Heart Disease' - confirm this
# encoding against the dataset documentation.
plt.figure(figsize=(10,5))
for target_value in (0, 1):
    sns.distplot(data.age[data.target==target_value])
plt.legend(labels=['Heart Disease','No Heart Disease'])
plt.xlabel('Age')
plt.title('Distribution of Age')
plt.show()

In [None]:
# Count of observations per sex, split by target value.
plt.figure(figsize=(8,5))
# Pass the column as `x=` explicitly: recent seaborn releases accept only `data`
# positionally, so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=data.sex, hue=data.target)
plt.legend(labels=['Heart Disease','No Heart Disease'])
plt.title('Count of Observations by Sex')
plt.xlabel('Sex')
# Sex is coded 0/1; relabel the two tick positions as Female/Male.
plt.xticks(np.arange(2),('Female','Male'))

In [None]:
# Share of each sex that falls in the target == 0 group, expressed in percent.
sex_total = data.sex.value_counts()
sex_heartdisease = data.sex[data.target==0].value_counts()

# The two count Series are divided label-by-label (aligned on the sex code) and
# scaled to 0-100 so the values match the 'Percentage (%)' axis they are plotted on.
# reindex([0, 1]) fixes the order to sex code 0 then 1, matching the
# ['Female', 'Male'] labels used when plotting.
percentage_hd_sex = (sex_heartdisease / sex_total * 100).reindex([0, 1]).tolist()

In [None]:
# Bar chart of the per-sex values held in `percentage_hd_sex` (Female first, Male second).
plt.figure(figsize=(6,5))
sns.barplot(x=['Female','Male'], y=pd.Series(percentage_hd_sex))
plt.ylabel('Percentage (%)')
plt.title('Percentage of Patients with Heart Disease, by Sex')
plt.xlabel('Sex')

### Observations on Sex:
1. There are a lot more Male observations than Female observations in this dataset
2. From the dataset, a higher percentage of Males have heart disease in comparison to Females. This may indicate that men have a higher risk of suffering from heart disease.

In [None]:
# Number of rows per chest-pain type code.
data.cp.value_counts()

In [None]:
# Count of observations per chest-pain type, split by target value.
plt.figure(figsize=(10,8))
# Pass the column as `x=` explicitly: recent seaborn releases accept only `data`
# positionally, so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=data.cp, hue=data.target)
plt.legend(labels=['Heart Disease','No Heart Disease'])
plt.xlabel('Chest Pain')
plt.title('Chest Pain experienced by Patients')

### Comment on chestpain:
Patients with heart disease are more likely to experience pain type 0 than any other pain type classification

In [None]:
# Overlay the blood-pressure (trestbps) distributions of the two target groups,
# target 0 first and target 1 second to match the legend order.
plt.figure(figsize=(18,6))
for target_value in (0, 1):
    sns.distplot(data.trestbps[data.target==target_value])
plt.legend(labels=['Heart Disease','No Heart Disease'])
plt.xticks(np.arange(60, 221, 5))
plt.title('Distribution of Blood Pressure')
plt.xlabel('Blood Pressure')

In [None]:
# Mean blood pressure (trestbps) of the target == 0 group, then of the target == 1 group.
mean_bp_by_target = [data.trestbps[data.target==value].mean() for value in (0, 1)]
print(*mean_bp_by_target)

In [None]:
# Box plot of blood pressure (trestbps) for each target value, with y ticks every 5 units.
plt.figure(figsize=(4,8))
sns.boxplot(y=data['trestbps'], x=data['target'])
plt.yticks(np.arange(90, 210, 5))
plt.title('Boxplot displaying Blood Pressure by target')
plt.ylabel('Blood Pressure')
plt.xlabel('Heart Disease (Y/N)')

### Comment on blood pressure:
1. Patients with heart disease appear to have a higher blood pressure relative to the patients without heart disease in this dataset, as demonstrated in the distribution and boxplots. 
2. That being said, only 25% of those without heart disease had normal blood pressure levels (<120). 
3. Observations of Hypertensive Crisis (>180) were only seen in patients with heart disease.
4. The third quartile group for heart disease patients spans a larger range than for those without heart disease, further reinforcing that there is a positive correlation between blood pressure and the risk of heart disease.

In [None]:
# Overlay the cholesterol distributions of the two target groups,
# target 0 first and target 1 second to match the legend order.
plt.figure(figsize=(18,6))
for target_value in (0, 1):
    sns.distplot(data.chol[data.target==target_value])
plt.legend(labels=['Heart Disease','No Heart Disease'])
plt.xticks(np.arange(50, 650, 50))
plt.title('Distribution of Cholestrol')
plt.xlabel('Cholestrol')

In [None]:
# Mean cholesterol of the target == 0 group (reported as HD) and of the target == 1 group.
hd_chol_mean = data.chol[data.target==0].mean()
no_hd_chol_mean = data.chol[data.target==1].mean()
print('HD Cholestrol Average: ', hd_chol_mean, ', No HD Cholestrol Average: ', no_hd_chol_mean)

In [None]:
# Rows with cholesterol above 500. Rows above 380 were already dropped from `data`
# earlier in the notebook, so on a top-to-bottom run this selection is empty.
data[data.chol>500]

### Comment on Cholesterol:
1. There is a clear indication that patients with heart disease are more likely to have higher cholesterol levels.
2. That being said, both groups have, on average, high cholesterol (a healthy cholesterol level is <200; high cholesterol is >240).
3. The healthy group is more skewed to the right; its high average is most likely influenced by the outlier above 500 (the 564 case dropped below), which could be an error or an incredibly unusual case.

In [None]:
# Drop any row with cholesterol above 500 (the 564 case), then renumber the index 0..n-1.
# NOTE(review): rows above 380 were already removed earlier in the notebook, so on a
# top-to-bottom run nothing is left to drop here and only the renumbering has an effect.
extreme_chol_rows = data.index[data['chol'] > 500]
data.drop(index=extreme_chol_rows, inplace=True)
data.reset_index(drop=True, inplace=True)

In [None]:
# Redraw the cholesterol distributions of the two target groups after the outlier removal,
# target 0 first and target 1 second to match the legend order.
plt.figure(figsize=(18,6))
for target_value in (0, 1):
    sns.distplot(data.chol[data.target==target_value])
plt.legend(labels=['Heart Disease','No Heart Disease'])
plt.xticks(np.arange(50, 650, 50))
plt.title('Distribution of Cholestrol')
plt.xlabel('Cholestrol')

In [None]:
# Recompute the per-group cholesterol means after the outlier removal.
hd_chol_mean = data.chol[data.target==0].mean()
no_hd_chol_mean = data.chol[data.target==1].mean()
print('HD Cholestrol Average: ', hd_chol_mean, ', No HD Cholestrol Average: ', no_hd_chol_mean)

In [None]:
# Preview the first rows of the cleaned, re-indexed frame.
data.head()

In [None]:
# Three side-by-side count plots (fbs, restecg, exang), each split by target value.
# Each column is passed as `x=` explicitly: recent seaborn releases accept only `data`
# positionally, so a positional Series is no longer interpreted as the x variable.
fig = plt.figure(figsize=(10,5))

fig.add_subplot(131)
plt.title('Fasting Blood Sugar')
sns.countplot(x=data.fbs, hue=data.target)

fig.add_subplot(132)
plt.title('RECG results')
sns.countplot(x=data.restecg, hue=data.target)

fig.add_subplot(133)
plt.title('Ex. induced Angina')
sns.countplot(x=data.exang, hue=data.target)

### Comment on Fasting Blood Sugar >120mg/dl, RECG, Angina:
1. No clear correlation between heart disease and fasting blood sugar > 120mg/dl
2. No clear correlation between RECG results and heart disease
3. Those with heart disease more likely to experience exercise induced angina

In [None]:
# Overlay the max-heart-rate (thalach) distributions of the two target groups,
# target 0 first and target 1 second to match the legend order.
plt.figure(figsize=(12,6))
for target_value in (0, 1):
    sns.distplot(data.thalach[data.target==target_value])
plt.legend(labels=['Heart Disease','No Heart Disease'])
plt.title('Distribution of Max Heart Rate Achieved')
plt.xlabel('Max Heart Rate')

### Comments on Max Heart Rate:
Patients with heart disease on average have a much lower achievable max heart rate.

## Step Four: Feature Selection

For feature selection, we will use a significance level of 0.05. A p-value greater than 0.05 indicates that the null hypothesis cannot be rejected, and thus the feature is treated as independent of the target.

In [None]:
from sklearn.feature_selection import chi2

# Chi-squared statistic (F) and p-value of every non-target column against the target.
y = data['target']
X = data.drop(columns='target')
F, p_values = chi2(X,y)

In [None]:
# Label each p-value with its feature name and order them from largest to smallest.
p_values = pd.Series(p_values, index=X.columns).sort_values(ascending=False)

## Step Five: Training & Model Selections

In [None]:
# Keep only the features whose chi-squared p-value is below the 0.05 significance level.
features = p_values[p_values < 0.05].index

In [None]:
# Display the names of the selected features.
features

In [None]:
# Rebuild the design matrix from the selected feature columns only; y is the target column.
X = data.loc[:, features]
y = data['target']

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every selected feature with a min-max scaler; X is replaced by the scaled values.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so the test
# rows influence the scaling - consider fitting it on the training split only.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [None]:
# Hold out 20% of the rows for testing.
# A fixed random_state makes the split (and therefore every metric reported below)
# reproducible across runs; stratify=y keeps the target proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

The problem statement requires a classification model. The following classification algorithms will be tested:
1. Random Forest
2. Logistic Regression
3. Naive Bayes (multinomial)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model  import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Candidate classifiers, all with default hyper-parameters.
# The random forest is seeded so that its reported scores are reproducible between runs.
RF = RandomForestClassifier(random_state=42)
lr = LogisticRegression()
NB = MultinomialNB()
models = [RF,lr,NB]

In [None]:
def model_test(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training split and print its classification report.

    Prints the model itself first, then fits it on (X_train, y_train),
    predicts on X_test and prints the classification report of those
    predictions against y_test. Returns nothing; `model` is fitted in place.
    """
    print(model)
    model.fit(X_train, y_train)
    predicted_labels = model.predict(X_test)
    report = classification_report(y_test, predicted_labels)
    print(report)

In [None]:
# Fit and report every candidate model on the same train/test split.
# The loop variable `model` stays bound to the last entry of `models` after the loop ends.
for model in models:
    model_test(model,X_train,X_test,y_train,y_test)

From the above results, Logistic Regression will be selected as the classification algorithm for this model. This model produced the best F1-Score results.

In [None]:
# Refit the selected logistic regression model and plot its confusion matrix on the test split.
lr.fit(X_train, y_train)
# Predict with `lr` itself. `model` is the leftover variable from the evaluation loop and
# refers to the last entry of `models`, so using it here would plot a different classifier.
y_predict = lr.predict(X_test)
cm = confusion_matrix(y_test,y_predict)
sns.heatmap(cm, annot=True)

### Step Six: The Final Model
Since the model has been trained and tested, it will now be refit on the entire dataset rather than on the training split only.

In [None]:
# Refit the logistic regression on every row of the scaled feature matrix X and target y.
lr.fit(X,y)