In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier

In [None]:
# Read the heart dataset CSV into a DataFrame and preview its first rows.
HEART_CSV = '../input/health-care-data-set-on-heart-attack-possibility/heart.csv'
df = pd.read_csv(HEART_CSV)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Per-column summary: non-null counts and dtypes.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Shape shown again; none of the inspection calls since the first check modify df.
df.shape

Exploratory Data Analysis of all the variables

In [None]:
# Column labels of the frame.
df.columns

Attribute Information
1) age
2) sex
3) chest pain type (4 values)
4) resting blood pressure
5) serum cholesterol in mg/dl
6) fasting blood sugar > 120 mg/dl
7) resting electrocardiographic results (values 0,1,2)
8) maximum heart rate achieved
9) exercise induced angina
10) oldpeak = ST depression induced by exercise relative to rest
11) the slope of the peak exercise ST segment
12) number of major vessels (0-3) colored by fluoroscopy
13) thal: 0 = normal; 1 = fixed defect; 2 = reversible defect
14) target: 0 = less chance of heart attack; 1 = more chance of heart attack

In [None]:
# Column groups used by the plotting cells:
# categorical features first, then continuous (numeric) features.
catcols = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exang',
    'slope',
    'ca',
    'thal',
]
numcols = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]

In [None]:
# One bar plot per categorical feature: the bar height is the mean of the
# feature (y) within each target class (x).
# x / y are passed by keyword because seaborn >= 0.12 treats the first
# positional argument as `data` and rejects further positional vectors.
fig = plt.figure(figsize=(10, 15))
for position, col in enumerate(catcols, start=1):
    ax = fig.add_subplot(4, 2, position)
    sns.barplot(x='target', y=col, data=df, ax=ax)

In [None]:
# One count plot per categorical feature, with bars split by target class.
# The column is named via x= so it does not collide with data=df
# (a positional vector plus data= raises TypeError on seaborn >= 0.12).
fig = plt.figure(figsize=(10, 15))
for position, col in enumerate(catcols, start=1):
    ax = fig.add_subplot(4, 2, position)
    sns.countplot(x=col, hue='target', data=df, ax=ax)

In [None]:
# Distribution of each numeric feature: density-scaled histogram with a KDE
# curve on top. histplot replaces the deprecated sns.distplot; stat='density'
# keeps the y-axis on the density scale that distplot used (bin edges may
# differ slightly because the default binning rule is not the same).
fig = plt.figure(figsize=(10, 15))
for position, col in enumerate(numcols, start=1):
    ax = fig.add_subplot(4, 2, position)
    sns.histplot(x=col, data=df, kde=True, stat='density', ax=ax)

In [None]:
# Horizontal box plot of each numeric feature, used to eyeball outliers.
# The column is bound to x= explicitly: a bare positional Series is read as
# `x` by older seaborn but as `data` by seaborn >= 0.12, which changes the
# orientation of the box.
fig = plt.figure(figsize=(10, 15))
for position, col in enumerate(numcols, start=1):
    ax = fig.add_subplot(4, 2, position)
    sns.boxplot(x=col, data=df, ax=ax)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(corr_matrix, annot=True, ax=ax)

In [None]:
# Distribution of thalach within each target class.
fig, ax = plt.subplots(figsize=(8, 4))
sns.violinplot(x='target', y='thalach', data=df, ax=ax)
# Title spelling corrected ("violen" -> "violin").
ax.set_title('violin plot for target and thalach variable')

In [None]:
# Line plot of thalach against the target class.
fig, ax = plt.subplots(figsize=(8, 4))
sns.lineplot(data=df, x='target', y='thalach', ax=ax)
ax.set_title('line plot for target and thalach variable')

In [None]:
# Bar plot of cp (y) for each target class (x).
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=df, x='target', y='cp', ax=ax)
ax.set_title('bar plot for target and cp variable')

The dataset is small, but I am still removing outliers from the dataset to get a better model.

In [None]:
# Absolute z-score of every value in df (all columns, including target).
# `stats` and `np` come from the notebook's top import cell, so this cell no
# longer carries its own mid-notebook imports.
z = np.abs(stats.zscore(df))
# Print the (row positions, column positions) of the entries with |z| > 3.
print(np.where(z > 3))

In [None]:
# Keep only the rows in which every column's |z-score| is below 3.
keep_row = (z < 3).all(axis=1)
df_withoutoutlier = df[keep_row]

In [None]:
# Split the filtered frame into the label (y) and the feature matrix (X).
# drop() returns a new frame, so df_withoutoutlier itself is left untouched.
y = df_withoutoutlier['target']
X = df_withoutoutlier.drop(columns='target')

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=100,
)

In [None]:
# Shapes of the four split parts, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Logistic-regression baseline: fit on the training split, predict the test
# split and print per-class precision / recall / F1.
# max_iter is raised from the default of 100 because the features are not
# scaled, and the default solver can hit the iteration limit before
# converging (ConvergenceWarning), leaving a partially fitted model.
model_LG = LogisticRegression(max_iter=1000)
model_LG.fit(X_train, y_train)
y_pred = model_LG.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the predictions currently held in y_pred.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

In [None]:
# Random forest on the same training split.
# random_state is fixed (same seed as the train/test split) so the fitted
# trees, the feature importances and the test report are reproducible on a
# fresh kernel; without it every run gives different numbers.
model_RF = RandomForestClassifier(random_state=100)
model_RF.fit(X_train, y_train)

In [None]:
# Label each random-forest feature importance with its training column name.
data = pd.Series(
    data=model_RF.feature_importances_,
    index=X_train.columns,
)

In [None]:
# Order the importances from smallest to largest.
data = data.sort_values()

Important Features of the dataset

In [None]:
# Horizontal bar chart of the sorted feature importances.
data.plot(kind='barh')

In [None]:
# Score the random forest on the held-out split.
# y_pred is overwritten here and now holds the forest's predictions.
y_pred = model_RF.predict(X_test)
rf_report = classification_report(y_test, y_pred)
print(rf_report)

Logistic Regression gives the best results on the above dataset. Alternatively, we can build our model using only some of the best features.