In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the heart-disease CSV into the DataFrame used by the rest of the notebook.
heart_csv_path = '../input/heart-disease-uci/heart.csv'
df = pd.read_csv(heart_csv_path)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summary statistics for every column (include='all' covers non-numeric columns too).
df.describe(include='all')

In [None]:

# Count missing values per column.
df.isna().sum()

In [None]:
# 'age': distribution (left) and spread per target class (right).
# plt.subplots(12, ...) would request 12 stacked rows of axes; the two panels
# need a 1x2 grid, and passing ax= pins each plot to its own axes.
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(12, 6))
sns.distplot(df['age'], color='pink', ax=ax_dist)
sns.boxplot(x=df['target'], y=df['age'], ax=ax_box)

If we look at the age distribution, a huge chunk of people are actually over 40. Moreover, looking at the boxplot, we realise that younger people are more prone to heart disease, while older people are less prone to it.

In [None]:

# Share of each 'sex' value in the dataset.
sex_counts = df['sex'].value_counts()
sex_counts.plot(kind='pie')

# Row-normalised crosstab: proportion of each target class within each 'sex' value.
target_share_by_sex = pd.crosstab(df.sex, df.target, normalize='index')
target_share_by_sex.plot(kind='bar')



In [None]:
# 'trestbps': distribution (left) and spread per target class (right).
# A 1x2 grid replaces plt.subplots(12, ...), which would create 12 stacked
# rows of axes; ax= pins each plot to its own panel.
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(12, 6))
sns.distplot(df['trestbps'], color='pink', ax=ax_dist)
sns.boxplot(x=df['target'], y=df['trestbps'], ax=ax_box)

In [None]:
# 'chol': distribution (left) and spread per target class (right).
# A 1x2 grid replaces plt.subplots(12, ...), which would create 12 stacked
# rows of axes; ax= pins each plot to its own panel.
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(12, 6))
sns.distplot(df['chol'], color='pink', ax=ax_dist)
sns.boxplot(x=df['target'], y=df['chol'], ax=ax_box)

In [None]:

# Share of each 'cp' value in the dataset.
cp_counts = df['cp'].value_counts()
cp_counts.plot(kind='pie')

# Row-normalised crosstab: proportion of each target class within each 'cp' value.
target_share_by_cp = pd.crosstab(df.cp, df.target, normalize='index')
target_share_by_cp.plot(kind='bar')


In [None]:
# 'cp': value share (pie, left) and counts split by target class (right).
# plt.subplots(12, ...) would create 12 stacked rows of axes, and the 2x2
# positions 221/222 only ever used the top half of the figure; a single
# 1x2 grid gives both panels the full height.
fig, (ax_pie, ax_count) = plt.subplots(1, 2, figsize=(10, 6))
df['cp'].value_counts().plot(kind='pie', ax=ax_pie)
sns.countplot(x=df['cp'], hue=df['target'], ax=ax_count)

In [None]:
# Count of each 'fbs' value, split by target class.
sns.countplot(data=df, x='fbs', hue='target')

In [None]:
# Row-normalised crosstab: proportion of each target class within each 'restecg' value.
target_share_by_restecg = pd.crosstab(df['restecg'], df['target'], normalize='index')
target_share_by_restecg.plot(kind='bar')

Looking at the plot, we can make quite a few observations. For example, the resting ECG result (`restecg`) is directly correlated with a higher rate of heart disease.

In [None]:
# 'thalach': overall distribution, spread by 'sex', and spread by target class.
# plt.subplots(13, ...) would request 13 stacked rows of axes; the three
# panels need a 1x3 grid, and ax= pins each plot to its own panel.
fig, (ax_dist, ax_by_sex, ax_by_target) = plt.subplots(1, 3, figsize=(14, 6))
sns.distplot(df["thalach"], ax=ax_dist)
sns.boxplot(y=df["thalach"], x=df['sex'], ax=ax_by_sex)
sns.boxplot(y=df["thalach"], x=df['target'], ax=ax_by_target)

The above diagrams show the distribution of heart rate. From the second diagram from the left, we can observe that males and females have similar heart rates. Moreover, from the diagram on the right, we can also infer that people with heart disease have a higher heart rate.

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
plt.figure(figsize=(14, 7))
sns.heatmap(corr_matrix, annot=True, cmap='viridis')

In [None]:
# Number of distinct values in each column.
df.nunique()

There are hardly any features that have a high direct correlation with the target. Furthermore, the data does not seem to have any multicollinearity as the correlation values are low

# Feature Engineering

In [None]:
# One-hot encode the listed columns into a NEW frame. Overwriting `df` would
# remove the very columns get_dummies is asked to encode, so re-running this
# cell would raise a KeyError and the raw data would be lost.
categorical_columns = ['cp', 'restecg', 'slope', 'ca', 'thal']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Label vector and feature matrix used by the modelling cells.
y = df_encoded.target
X = df_encoded.drop("target", axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split is the same on every run; stratify keeps the
# proportion of each target class equal in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [None]:
from sklearn.preprocessing import StandardScaler

num_columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Work on explicit copies so the column assignments below do not write into
# slices of X (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Fit ONE scaler on all numeric training columns at once. Re-fitting the same
# scaler column by column leaves it holding only the last column's statistics,
# so every test column would be standardised with the 'oldpeak' mean/std.
scaler = StandardScaler()
X_train[num_columns] = scaler.fit_transform(X_train[num_columns])

# Reuse the training-set statistics for the test set (transform only, no re-fit).
X_test[num_columns] = scaler.transform(X_test[num_columns])

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
)

# L2-penalised logistic regression fitted on the training split.
lr = LogisticRegression(penalty='l2', C=1.0)
lr.fit(X_train, y_train)

# 3-fold cross-validation on the training data.
lr_cv_scores = cross_val_score(lr, X_train, y_train, cv=3)
print("The cross validation score mean is ", lr_cv_scores.mean())

# Held-out evaluation: text report plus confusion-matrix heatmap.
pred = lr.predict(X_test)
lr_report = classification_report(y_test, pred)
print(lr_report)
sns.heatmap(confusion_matrix(y_test, pred), annot=True)

# Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings, fitted on the training split.
svc = SVC()
svc.fit(X_train, y_train)

# Cross-validation (default folds) on the training data.
svc_cv_scores = cross_val_score(svc, X_train, y_train)
print("The cross validation score is ", svc_cv_scores.mean())

# Held-out evaluation: text report plus confusion-matrix heatmap.
pred = svc.predict(X_test)
svc_report = classification_report(y_test, pred)
print(svc_report)
sns.heatmap(confusion_matrix(y_test, pred), annot=True)

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: without random_state the fitted tree, and therefore the
# scores below, can differ between runs.
dc = DecisionTreeClassifier(random_state=42)
dc.fit(X_train, y_train)

# Cross-validation (default folds) on the training data.
print("The cross validation score is ", cross_val_score(dc, X_train, y_train).mean())

# Held-out evaluation: text report plus confusion-matrix heatmap.
pred = dc.predict(X_test)
print(classification_report(y_test, pred))
sns.heatmap(confusion_matrix(y_test, pred), annot=True)

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: bootstrap and feature sampling are random, so without
# random_state the scores and feature importances change on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Cross-validation (default folds) on the training data.
print("The cross validation score is ", cross_val_score(rf, X_train, y_train).mean())

# Held-out evaluation: text report plus confusion-matrix heatmap.
pred = rf.predict(X_test)
print(classification_report(y_test, pred))
sns.heatmap(confusion_matrix(y_test, pred), annot=True)

In [None]:

# Table of random-forest feature importances, indexed by feature name.
importance_frame = pd.DataFrame(
    rf.feature_importances_,
    index=X_train.columns,
    columns=['Feature Importance'],
)

# Keep the five largest importances and draw them as horizontal bars.
top_five = importance_frame.sort_values("Feature Importance", ascending=False).head(5)
top_five.plot(kind='barh')

# Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Seed the booster so repeated runs give the same model and scores.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)

# Cross-validation (default folds) on the training data.
print("The cross validation score is ", cross_val_score(gbc, X_train, y_train).mean())

# Held-out evaluation: text report plus confusion-matrix heatmap.
pred = gbc.predict(X_test)
print(classification_report(y_test, pred))
sns.heatmap(confusion_matrix(y_test, pred), annot=True)

# AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 50 estimators; seeded so repeated runs give the same model.
abc = AdaBoostClassifier(n_estimators=50,
                         learning_rate=1,
                         random_state=42)
abc.fit(X_train, y_train)

# Cross-validation (default folds) on the training data, as for the other models.
print("The cross validation score is ", cross_val_score(abc, X_train, y_train).mean())

# The predictions must be assigned: if the result of predict() is discarded,
# `pred` still holds the previous model's output and the report and heatmap
# below would describe that model instead of AdaBoost.
pred = abc.predict(X_test)
print(classification_report(y_test, pred))
sns.heatmap(confusion_matrix(y_test, pred), annot=True)


# XGBoost

In [None]:
import xgboost as xgb

# Wrap the full (unsplit) feature matrix X and labels y in an XGBoost DMatrix.
data_dmatrix = xgb.DMatrix(X, label=y)

In [None]:
# XGBoost classifier for the binary target. 'binary:logistic' is the
# classification objective; 'reg:logistic' is the regression variant and does
# not belong on an XGBClassifier. random_state makes the column subsampling
# (colsample_bytree < 1) repeatable. The variable keeps the name `xg_reg`
# because the following cells call it by that name.
xg_reg = xgb.XGBClassifier(objective='binary:logistic', colsample_bytree=0.3, learning_rate=0.1,
                max_depth=5, alpha=10, n_estimators=10, random_state=42)

In [None]:
# Fit the XGBoost model on the training split, then predict the held-out set
# (fit() returns the estimator itself, so the calls can be chained).
pred = xg_reg.fit(X_train, y_train).predict(X_test)

In [None]:
# Held-out evaluation of the current `pred`: text report plus confusion-matrix heatmap.
xgb_report = classification_report(y_test, pred)
print(xgb_report)

xgb_confusion = confusion_matrix(y_test, pred)
sns.heatmap(xgb_confusion, annot=True)