In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Apply matplotlib's 'ggplot' style sheet to every figure drawn after this point.
plt.style.use('ggplot')
# Last expression of the cell: displays the installed seaborn version string.
sns.__version__

## Load dataset

In [None]:
# Read the heart CSV into `df`, the DataFrame every later cell works from.
df = pd.read_csv(filepath_or_buffer='../input/heart.csv')

## Description of columns

1. age: in years
2. sex: (1 = male; 0 = female)
3. cp: chest pain type
4. trestbps: resting blood pressure (in mm Hg on admission to the hospital)
5. chol: serum cholesterol in mg/dl
6. fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
7. restecg: resting electrocardiographic results
8. thalach: maximum heart rate achieved
9. exang: exercise induced angina (1 = yes; 0 = no)
10. oldpeak: ST depression induced by exercise relative to rest
11. slope: the slope of the peak exercise ST segment
12. ca: number of major vessels (0-3) colored by fluoroscopy
13. thal: 3 = normal; 6 = fixed defect; 7 = reversible defect
14. target: 1 or 0

## Print values for each feature of first 5 samples

In [None]:
# Show the first rows of the frame; five rows is DataFrame.head's default.
df.head()

## Print total number of features available and names

In [None]:
# Every column except the label column 'target' counts as a feature.
feature_names = df.columns.drop('target').tolist()
n_features = df.shape[1] - 1

print('total number of features: ', n_features)
print("Feature names: ", feature_names)

## Unique values in all columns

In [None]:
# Number of distinct values per column, drawn as a horizontal bar chart.
columns = df.columns
unique = df.nunique()

fig = plt.figure(figsize=(8, 8))
ax = fig.gca()
ax.barh(columns, unique)
ax.set_xlabel('Unique values')
ax.set_ylabel('Feature names')
plt.show()

In [None]:
# Count the distinct label values in the 'target' column.
n_target_values = df['target'].nunique()
print('Unique values in Target: ', n_target_values)

### Hence, this is a binary classification problem. 

## Descriptive statistics for each column

In [None]:
# Per-column summary statistics; the bare name on the last line displays the table.
summary_stats = df.describe()
summary_stats

## Check any NaN value in dataset

In [None]:
# True if at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

## Print columns' information

In [None]:
# Print the frame's column names, non-null counts and dtypes.
df.info()

## Find the significant features for predicting target

In [None]:
# Spearman rank correlation of every column with 'target'; selecting with a
# list keeps the result as a one-column DataFrame.
df.corr(method='spearman')[['target']]

### From above, chest pain type (cp), maximum heart rate achieved (thalach) are top 2 features.

### For the KDE plot, strip plot and pairplot, four features that are positively correlated with the target variable are used.

## Strip plot

In [None]:
# One strip plot per feature, each against the target label, on a 2x2 grid.
# A plain tuple named `strip_features` replaces the old `vars` array so the
# builtin `vars` is no longer shadowed.
strip_features = ('cp', 'oldpeak', 'thalach', 'slope')

plt.figure(figsize=(10, 10))
# enumerate(start=1) yields the 1-based subplot position directly, instead of
# searching the array for each name.
for position, feature in enumerate(strip_features, start=1):
    ax = plt.subplot(2, 2, position)
    sns.stripplot(x="target", y=feature, data=df, ax=ax)

plt.suptitle('Figure - Strip plot', x=0.5, y=0.9, verticalalignment='center', fontsize= 18)
plt.show()

##  Kernel density estimation plot

In [None]:
# Bivariate KDE of the target (x axis) against each feature (y axis), 2x2 grid.
# Requires seaborn >= 0.11: `x`/`y` must be passed by keyword (seaborn >= 0.12
# rejects a second positional argument) and `fill` replaces the deprecated
# `shade` option.
kde_features = ('cp', 'oldpeak', 'thalach', 'slope')

plt.figure(figsize=(10, 10))
# enumerate(start=1) gives the 1-based subplot position; no builtin is shadowed.
for position, feature in enumerate(kde_features, start=1):
    ax = plt.subplot(2, 2, position)
    sns.kdeplot(x=df['target'], y=df[feature], fill=True, cut=4, ax=ax)

plt.suptitle('Figure - Kernel density estimation plot', x=0.5, y=0.9, verticalalignment='center', fontsize= 18)
plt.show()

### Both of the above plots show the distribution of the highly correlated features with respect to the target, but the KDE plot shows it as a continuous density.

## Scatter plot

In [None]:
# Pairwise scatter plots of four features, coloured by target, with KDE curves
# on the diagonal. `fill=True` replaces the deprecated `shade=True` that was
# forwarded to kdeplot through `diag_kws` (requires seaborn >= 0.11).
sns.pairplot(
    data=df,
    vars=['cp', 'restecg', 'thalach', 'slope'],
    hue='target',
    height=4,
    diag_kind='kde',
    markers=["D", "s"],
    diag_kws={"fill": True},
)
plt.suptitle('Figure - Scatter plot of features ', x=0.5, y=1.01, verticalalignment='center', fontsize= 20)
plt.show()

## Histogram for dispersed features

In [None]:
# Density histogram + KDE curve + rug marks for each continuous feature, laid
# out on a 2x3 grid (five panels used).
# `sns.distplot` is deprecated and scheduled for removal; `histplot` with
# `kde=True, stat="density"` plus `rugplot` is its documented replacement
# (requires seaborn >= 0.11). Bin choice and styling may differ slightly.
hist_features = ('age', 'oldpeak', 'thalach', 'chol', 'trestbps')

plt.figure(1, figsize=(20, 10))
# enumerate(start=1) gives the 1-based subplot position; no builtin is shadowed.
for position, feature in enumerate(hist_features, start=1):
    ax = plt.subplot(2, 3, position)
    sns.histplot(x=df[feature], kde=True, stat="density", color='blue', ax=ax)
    sns.rugplot(x=df[feature], color='blue', ax=ax)

plt.suptitle('Figure - Histograms', x=0.5, y=0.9, verticalalignment='center', fontsize= 18)
plt.show()

## Barplot of age and sex wrt target variable

In [None]:
# Side-by-side count plots of sex (left) and age (right), bars split by target.
plt.figure(figsize=(24, 12))
for position, column in enumerate(("sex", "age"), start=1):
    ax = plt.subplot(1, 2, position)
    sns.countplot(x=column, hue='target', data=df, ax=ax)

plt.suptitle('Figure - Count plot of sex and age with target grouping variable',
             x=0.5, y=0.9, verticalalignment='center', fontsize= 18)
# Explicit show(), as in the other plotting cells, so the cell does not echo
# the Text object returned by suptitle.
plt.show()

### The left plot shows that just over 20 females and more than 100 males have no heart problem. The right plot suggests that people over the age of 54 have fewer heart problems.

In [None]:
# Bivariate KDE of sex (left) and age (right) against the target label.
# Requires seaborn >= 0.11: `x`/`y` are passed by keyword (seaborn >= 0.12
# rejects a second positional argument) and `fill` replaces the deprecated
# `shade` option.
plt.figure(figsize=(14, 7))
for position, column in enumerate(('sex', 'age'), start=1):
    ax = plt.subplot(1, 2, position)
    sns.kdeplot(x=df[column], y=df['target'], fill=True, cut=3, ax=ax)

plt.suptitle('Figure - KDE plot of sex and age wrt target', x=0.5, y=1, verticalalignment='center', fontsize= 18)
plt.show()

### The above plots confirm the statements made earlier.

## Plot important features wrt sex

In [None]:
# Overlaid male/female density histograms (with KDE curves) for four features
# on a 2x2 grid. In this dataset sex == 1 is male and sex == 0 is female.
# `sns.distplot` is deprecated and scheduled for removal; `histplot` with
# `kde=True, stat="density"` is its documented replacement (seaborn >= 0.11).
# Bin choice and styling may differ slightly from the old output.
sex_features = ('cp', 'oldpeak', 'thalach', 'slope')
is_male = df['sex'] == 1
is_female = df['sex'] == 0

plt.figure(figsize=(15, 15))
# enumerate(start=1) gives the 1-based subplot position; no builtin is shadowed.
for position, feature in enumerate(sex_features, start=1):
    ax = plt.subplot(2, 2, position)
    sns.histplot(x=df.loc[is_male, feature], kde=True, stat="density",
                 color='blue', label='male', ax=ax)
    sns.histplot(x=df.loc[is_female, feature], kde=True, stat="density",
                 label='female', ax=ax)
    ax.legend()

plt.suptitle('Figure - Histogram of features wrt sex', x=0.5, y=0.9, verticalalignment='center', fontsize= 18)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
import xgboost as xgb

## Create features and labels 

In [None]:
# Separate the frame into the feature matrix `x` (all columns but 'target')
# and the label vector `y` (the 'target' column).
x = df.drop('target', axis=1)
y = df.loc[:, 'target']

# Support Vector Classification using Linear Kernel

## Normalising dataset

In [None]:
# Standardise the feature matrix. The scaler is fitted on all rows of `x`,
# i.e. before any train/test split is made.
scaler = StandardScaler()
rescaledX = scaler.fit_transform(x)

In [None]:
# Split the raw features first, then learn the scaling from the training rows
# only. Splitting the already-scaled `rescaledX` lets the test rows influence
# the statistics used for scaling (data leakage) and inflates the test score.
# The same random_state on the same number of rows yields the same row partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(x, y, random_state=5)

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print("X train: ", X_train.shape)
print("X test: ", X_test.shape)
print("y train: ", y_train.shape)
print("y test: ", y_test.shape)

In [None]:
# Despite the name `lr`, this is a linear-kernel support vector classifier.
# The fit call stays on its own line so the cell still displays the estimator.
lr = svm.SVC(kernel='linear')
lr.fit(X=X_train, y=y_train)

In [None]:
# Keep the test-set predictions for the metric cells that follow, then show
# the classifier's score on the test split.
y_pred = lr.predict(X_test)
lr.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix of true test labels against the current predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Precision, recall, F-score and support, using binary averaging.
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred, average='binary')

## Without normalising dataset

In [None]:
# Split the unscaled features with the same seed used for the scaled run.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=5)

# Report the shape of each of the four pieces.
for label, part in (
    ("X train: ", X_train),
    ("X test: ", X_test),
    ("y train: ", y_train),
    ("y test: ", y_test),
):
    print(label, part.shape)

In [None]:
# Linear-kernel SVC again (still bound to the name `lr`), this time trained on
# the unscaled split created in the previous cell.
lr = svm.SVC(kernel='linear')
lr.fit(X=X_train, y=y_train)

In [None]:
# Store test-set predictions for the metric cells, then display the test score.
y_pred = lr.predict(X_test)
lr.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix of true test labels against the current predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Precision, recall, F-score and support, using binary averaging.
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred, average='binary')

# Logistic regression 

In [None]:
# 70/30 split of the unscaled features for the logistic-regression model.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=5
)

# Report the shape of each of the four pieces.
for label, part in (
    ("X train: ", X_train),
    ("X test: ", X_test),
    ("y train: ", y_train),
    ("y test: ", y_test),
):
    print(label, part.shape)

In [None]:
# Logistic regression with the liblinear solver and C=0.1.
# The fit call stays on its own line so the cell still displays the estimator.
lr = LogisticRegression(solver='liblinear', C=0.1)
lr.fit(X=X_train, y=y_train)

In [None]:
# Store test-set predictions for the metric cells, then display the test score.
y_pred = lr.predict(X_test)
lr.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix of true test labels against the current predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Precision, recall, F-score and support, using binary averaging.
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred, average='binary')

# Stochastic Gradient Descent (SGD)

In [None]:
# Standardise the feature matrix for the SGD section. The scaler is fitted on
# all rows of `x`, i.e. before any train/test split is made.
scaler = StandardScaler()
rescaledX = scaler.fit_transform(x)

In [None]:
# Split the raw features first (85/15), then learn the scaling from the
# training rows only. Splitting the already-scaled `rescaledX` lets the test
# rows influence the statistics used for scaling (data leakage).
# The same random_state on the same number of rows yields the same row partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.15, random_state=5
)

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print("X train: ", X_train.shape)
print("X test: ", X_test.shape)
print("y train: ", y_train.shape)
print("y test: ", y_test.shape)

In [None]:
# Linear classifier trained by stochastic gradient descent: at most 50 passes,
# seeded for repeatable results. The fit call stays on its own line so the
# cell still displays the estimator.
sgd = SGDClassifier(random_state=5, max_iter=50)
sgd.fit(X=X_train, y=y_train)

In [None]:
# Predictions must come from `sgd`. `lr` still holds the logistic-regression
# model from the previous section, so predicting with it would make the
# confusion matrix and precision/recall cells below describe the wrong model.
y_pred = sgd.predict(X_test)
sgd.score(X_test, y_test)

In [None]:
# Confusion matrix of true test labels against the current predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Precision, recall, F-score and support, using binary averaging.
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred, average='binary')

# XGBoost Model

In [None]:
# 90/10 split of the unscaled features for the XGBoost model.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=5
)

# Report the shape of each of the four pieces.
for label, part in (
    ("X train: ", X_train),
    ("X test: ", X_test),
    ("y train: ", y_train),
    ("y test: ", y_test),
):
    print(label, part.shape)

In [None]:
# Train one XGBoost classifier per tree depth (1..9) and record its accuracy
# on the held-out split.
# `min_samples_leaf` was dropped: it is a scikit-learn tree option, not an
# XGBoost parameter, so XGBoost never used it and newer versions warn about it.
# NOTE(review): the depth is chosen on the same split that is later reported
# as the test score; a separate validation split or cross-validation would
# avoid an optimistic estimate.
max_dep = range(1, 10)
accuracy = []

for depth in max_dep:
    xg = xgb.XGBClassifier(max_depth=depth)
    xg.fit(X_train, y_train)
    accuracy.append(xg.score(X_test, y_test))

print('List of accuracy: ', accuracy)

In [None]:
# Line chart of the recorded accuracy for each max_depth tried above.
fig, ax = plt.subplots()
ax.plot(max_dep, accuracy, label='Accuracy of validation set')
ax.set_xlabel('Max Depth')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
# Final XGBoost model with depth-3 trees.
# `min_samples_leaf` was dropped: it is a scikit-learn tree option, not an
# XGBoost parameter, so XGBoost never used it and newer versions warn about it.
xg = xgb.XGBClassifier(max_depth=3)
xg.fit(X_train, y_train)

In [None]:
# Store test-set predictions for the metric cells, then display the test score.
y_pred = xg.predict(X_test)
xg.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix of true test labels against the current predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Precision, recall, F-score and support, using binary averaging.
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred, average='binary')