# Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
%matplotlib inline

# Importing dataset

In [None]:
# Read the heart-disease CSV into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv("../input/heart-disease-uci/heart.csv")
# Preview the first rows.
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
data.shape

In [None]:
# Count of missing entries in each column
data.isna().sum()

##### NO NULL VALUES!

In [None]:
# Column names, non-null counts and dtypes
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) per column
data.describe()

In [None]:
# One histogram per column on a single large figure; assigning the returned
# axes array keeps its repr out of the cell output.
hist_axes = data.hist(figsize=(20, 20))

# Checking which features correlate most with the target

In [None]:
# Pairwise correlation matrix with its rows ordered from the highest to the
# lowest correlation with `target`, rounded to two decimals.
corr_matrix = data.corr()
corr = corr_matrix.sort_values(by='target', ascending=False).round(2)
print(corr['target'])

**fbs** and **chol** have the weakest correlation with the target variable (their coefficients are closest to zero).

In [None]:
# Annotated heatmap of the (target-sorted) correlation matrix
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr, annot=True, ax=heatmap_ax)

In [None]:
# Number of rows in each target class
data['target'].value_counts()

# Exploratory Data Analysis

In [None]:
# Check whether the dataset is balanced: rows per target class
balance_ax = sns.countplot(data=data, x='target')
balance_ax.set_title('Affected vs Unaffected');

In [None]:
# Target class counts, split by the `sex` column
gender_ax = sns.countplot(data=data, x='sex', hue='target')
gender_ax.set_title('Based on Gender')
gender_ax.set_xlabel('sex')

**X-axis**: Here 1 is male, 0 is female

**target**: 0 unaffected, 1 affected

In [None]:
# Number of patients at each age value
age_fig, age_ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=data, x='age', ax=age_ax)
age_ax.set_title('Age')

In [None]:
# Minimum, maximum and mean of the age column.
# The Series' own vectorised reductions are used instead of the Python
# built-ins min()/max(): the built-ins iterate element by element and give
# order-dependent results if a NaN is ever present, whereas Series.min()/max()
# skip missing values.
age = data['age']
print('Min age: ', age.min())
print('Max age: ', age.max())
print('Average age: ', age.mean())

In [None]:
# Bar chart: number of patients per chest pain type (`cp`)
cp_labels = ('typical angina', 'atypical angina', 'non-anginal pain', 'asymptomatic')
cp_counts = data.groupby('cp')['target'].count()
cp_counts.plot.bar(title='Chest Pain Types', figsize=(8, 6))
plt.xlabel('Chest Pain Types')
# Replace the numeric tick positions 0..3 with readable names
plt.xticks(np.arange(len(cp_labels)), cp_labels, rotation=0)
plt.show()

In [None]:
# Grouped bars: chest pain type (`cp`) against the target class
cp_labels = ('typical angina', 'atypical angina', 'non-anginal pain', 'asymptomatic')
cp_by_target = pd.crosstab(data['cp'], data['target'])
cp_by_target.plot.bar(figsize=(8, 6))
plt.title('Heart Disease Frequency According to Chest Pain Type')
plt.xlabel('Chest Pain Type')
plt.ylabel('Frequency')
# Replace the numeric tick positions 0..3 with readable names
plt.xticks(np.arange(len(cp_labels)), cp_labels, rotation=0)
plt.show()

In [None]:
# Histogram of the `trestbps` column (blood pressure)
data['trestbps'].plot.hist(title='Blood Pressure in mm Hg', figsize=(8, 6))
plt.show()

In [None]:
# Histogram of the `chol` column (serum cholesterol)
data['chol'].plot.hist(title='Serum Cholestoral in mg/dl', figsize=(8, 6))
plt.show()

In [None]:
# Bar chart: number of patients per fasting blood sugar flag (`fbs`)
fbs_labels = ('fbs < 120 mg/dl', 'fbs > 120 mg/dl')
fbs_counts = data.groupby('fbs')['target'].count()
fbs_counts.plot.bar(title='Fasting Blood Sugar', figsize=(8, 6))
plt.xticks(np.arange(len(fbs_labels)), fbs_labels, rotation=0)
plt.show()

In [None]:
# Bar chart: number of patients per resting ECG category (`restecg`)
restecg_labels = ('normal', 'ST-T wave abnormality', 'probable or left ventricular hypertrophy')
restecg_counts = data.groupby('restecg')['target'].count()
restecg_counts.plot.bar(title='Resting Electrocardiographic Results', figsize=(8, 6))
plt.xticks(np.arange(len(restecg_labels)), restecg_labels)
plt.show()

In [None]:
# Grouped bars: resting ECG category (`restecg`) against the target class
restecg_labels = ('normal', 'ST-T wave abnormality', 'probable or left ventricular hypertrophy')
restecg_by_target = pd.crosstab(data['restecg'], data['target'])
restecg_by_target.plot.bar(figsize=(8, 6))
plt.title('Heart Disease Frequency According to Resting Electrocardiographic Results')
plt.xticks(np.arange(len(restecg_labels)), restecg_labels)
plt.xlabel('Resting Electrocardiographic Results')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Histogram of the `thalach` column (maximum heart rate achieved)
data['thalach'].plot.hist(title='Maximum Heart Rate Achieved', figsize=(8, 6))
plt.show()

In [None]:
# Bar chart: number of patients with / without exercise induced angina (`exang`)
exang_labels = ('No', 'Yes')
exang_counts = data.groupby('exang')['target'].count()
exang_counts.plot.bar(title='Exercise Induced Angina', figsize=(8, 6))
plt.xticks(np.arange(len(exang_labels)), exang_labels, rotation=0)
plt.show()

In [None]:
# Grouped bars: exercise induced angina (`exang`) against the target class
exang_labels = ('No', 'Yes')
exang_by_target = pd.crosstab(data['exang'], data['target'])
exang_by_target.plot.bar(figsize=(8, 6))
plt.title('Heart Disease Frequency According to Exercise Induced Angina')
plt.xlabel('Exercise Induced Angina')
plt.ylabel('Frequency')
plt.xticks(np.arange(len(exang_labels)), exang_labels, rotation=0)
plt.show()

# Relationships among the most highly correlated features

In [None]:
# Pair plot over the first seven entries of the target-sorted correlation
# column. `target` itself must be among them because it is used as the hue.
top_corr_cols = corr['target'].head(7).index
sns.pairplot(data[top_corr_cols], hue='target')

In [None]:
# Separate the label column from the feature columns
y = data['target']
X = data.drop(columns=['target'])
X.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

# Training on different models

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings.
# It is trained and evaluated on the standardised features (X_train_sc /
# X_test_sc) produced by the scaling cell: kernel SVMs are sensitive to feature
# magnitudes, and the scaled arrays were otherwise never used.
# NOTE: the variable names (including their spelling) are kept unchanged so
# that any other cell referring to them keeps working.
svc_classifier = SVC()
svc_classifier.fit(X_train_sc, y_train)
y_pred_scv = svc_classifier.predict(X_test_sc)
accuarcy_svm = accuracy_score(y_test, y_pred_scv)
print(accuarcy_svm)

In [None]:
from sklearn.linear_model import LogisticRegression

# L1-regularised logistic regression (liblinear solver) with a fixed seed.
# It is fitted on the unscaled features, so the fitted model expects raw
# (unscaled) inputs at prediction time.
lr_classifier = LogisticRegression(random_state=51, penalty='l1', solver='liblinear')
lr_classifier.fit(X_train, y_train)
y_pred_lr = lr_classifier.predict(X_test)
# Compute the accuracy once and keep it (the earlier duplicate call whose
# result was discarded has been removed).
accuarcy_lr = accuracy_score(y_test, y_pred_lr)
print(accuarcy_lr)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree using the entropy criterion and a fixed seed, scored on the
# held-out split.
dt_classifier = DecisionTreeClassifier(criterion='entropy', random_state=51)
dt_classifier = dt_classifier.fit(X_train, y_train)  # fit() returns the estimator itself
y_pred_dt = dt_classifier.predict(X_test)
accuarcy_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
print(accuarcy_dt)

In [None]:
# Random forest with default hyper-parameters.
# A fixed random_state is supplied (the same seed the other seeded classifiers
# use) so that the reported accuracy is reproducible across runs; without it
# every Restart & Run All yields a different forest and a different score.
rf_model = RandomForestClassifier(random_state=51)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuarcy_rf = accuracy_score(y_test, y_pred_rf)
print(accuarcy_rf)

### Logistic Regression performed best on this dataset

# Confusion Matrix

In [None]:
# Confusion matrix of the logistic regression predictions. This is the model
# the notebook reports as best and saves to disk, so it is the one evaluated
# here (previously the random forest predictions were used).
# confusion_matrix puts true labels on the rows and predicted labels on the columns.
cm = confusion_matrix(y_test, y_pred_lr)
plt.title('Heatmap of Confusion Matrix', fontsize=15)
# fmt='d' prints the cell counts as plain integers
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

In [None]:
# Dump the trained logistic regression model to a pickle file for future use.
import pickle

MODEL_PATH = 'heart_disease_detector.pickle'

# Context managers guarantee the file is flushed and closed before it is
# reopened for reading (the bare open() calls never closed their handles).
with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(lr_classifier, model_file)

# Load the model back from the file just written.
# NOTE: pickle.load can execute arbitrary code; only unpickle files you trust.
with open(MODEL_PATH, 'rb') as model_file:
    heart_disease_detector_model = pickle.load(model_file)

## Thank You
## If you find this kernel useful, please **upvote** it
## Feel free to ask questions or share suggestions

[Check this link to see how to deploy the model as an end-to-end web app](https://medium.com/analytics-vidhya/the-lifecycle-to-build-a-web-app-for-prediction-from-scratch-bec1632b5f27)