In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file available under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-disease CSV from the Kaggle input directory and preview its first 100 rows.
heart_csv = '/kaggle/input/heart-disease-uci/heart.csv'
data = pd.read_csv(heart_csv)
data.head(n=100)

In [None]:
# Print a concise summary of the frame: columns, dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Histogram of every column of the dataset, to get a feel for each feature's distribution.
import matplotlib.pyplot as plt

data.hist(edgecolor='black', linewidth=1.2)
fig = plt.gcf()  # grab the figure that data.hist() just drew on so it can be enlarged
fig.set_size_inches(25, 18)
# plt.show must be *called*; the bare attribute `plt.show` only evaluates to the
# function object and never asks matplotlib to render the figure.
plt.show()

In [None]:
# Correlation heatmap: a quick look at how the listed columns (features and target)
# correlate with one another.
import seaborn as sns

sns.set()
heatmap_columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]
correlations = data.loc[:, heatmap_columns].corr()
plt.figure(figsize=(16, 10))
sns.heatmap(correlations, annot=True)

In [None]:
# Bar chart: how many patients with heart disease (target == 1) there are at each age.
diseased_ages = data.loc[data['target'] == 1, 'age']
age_counts = diseased_ages.value_counts().sort_index()
ax = age_counts.plot.bar(
    title='Frequency v/s Age of Patient with Heart Disease',
    color=['red', 'blue', 'purple', 'green', 'lavender'],  # five colours, cycled across the bars
    figsize=(16, 6),
    fontsize=14,
    rot=0,
)
# Re-apply the title and set the axis labels with a larger font.
ax.set_title('Frequency v/s Age of Patient with Heart Disease', fontsize=20)
ax.set_xlabel('Age', fontsize=20)
ax.set_ylabel('Frequency', fontsize=20)

In [None]:
# Bar chart: for each age, the percentage of the whole dataset made up of
# heart-disease patients (target == 1) of that age.
# The denominator is len(data), i.e. all patients, so the bars add up to the overall
# share of heart-disease patients in the dataset rather than to 100.
diseased_age_counts = data[data['target'] == 1]['age'].value_counts()
age_percentages = ((diseased_age_counts / len(data)) * 100).sort_index()
ax = age_percentages.plot.bar(
    figsize=(16, 6),
    fontsize=14,
    title='Percentage v/s Age of Patient with Heart Disease',
    rot=0,
    color=['red', 'blue', 'purple', 'green', 'lavender'],
)
ax.set_title('Percentage v/s Age of Patient with Heart Disease', fontsize=20)
ax.set_xlabel('Age', fontsize=20)
# The plotted values are percentages, so label the axis accordingly (it previously read 'Frequency').
ax.set_ylabel('Percentage', fontsize=20)

In [None]:
# Split the frame into the model inputs (every column except 'target') and the label column.
y = data['target']
X = data.drop(columns=['target'])
(X.head(), y.head())

In [None]:
# Hold out 20% of the rows as a test set; the seed is fixed so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize the features. The scaler is fitted on the training split only and the
# same fitted transformation is then applied to both splits.
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

std_scaler = StandardScaler().fit(X_train)
X_train, X_test = std_scaler.transform(X_train), std_scaler.transform(X_test)

In [None]:
# Logistic regression on the standardized features.
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(random_state=42).fit(X_train, y_train)
y_predicted = log_reg.predict(X_test)
# Assuming the target is coded 0/1, the MAE is simply the fraction of misclassified samples.
print('MAE:', mean_absolute_error(y_test, y_predicted))
# Mean accuracy on the held-out split (displayed as the cell output).
log_reg.score(X_test, y_test)

In [None]:
# Confusion matrix for the most recent predictions held in y_predicted
# (rows: true class, columns: predicted class).
from sklearn.metrics import confusion_matrix, accuracy_score

log_Reg_cm = confusion_matrix(y_true=y_test, y_pred=y_predicted)
log_Reg_cm

In [None]:
# Random Forest Classifier: a small forest (7 shallow trees) on the standardized features.
from sklearn.ensemble import RandomForestClassifier

# class_weight='balanced' weights each class inversely to its frequency. Weights that are
# proportional to the class counts would do the opposite and up-weight the majority class.
# random_state pins the bootstrap / feature sampling so the reported score is reproducible.
rand_forest = RandomForestClassifier(
    n_estimators=7,
    max_depth=4,
    class_weight='balanced',
    random_state=42,
)
rand_forest.fit(X_train, y_train)
y_predicted = rand_forest.predict(X_test)
print('MAE:', mean_absolute_error(y_test, y_predicted))
# Mean accuracy on the held-out split (displayed as the cell output).
rand_forest.score(X_test, y_test)

In [None]:
# Gradient Boosting Classifier with default hyper-parameters on the standardized features.
from sklearn.ensemble import GradientBoostingClassifier

# Fix the seed: the underlying trees permute features at each split, so without a
# random_state the fitted model (and its score) can differ between runs.
grad_boost_clf = GradientBoostingClassifier(random_state=42)
grad_boost_clf.fit(X_train, y_train)
y_predicted = grad_boost_clf.predict(X_test)
print('MAE:', mean_absolute_error(y_test, y_predicted))
# Mean accuracy on the held-out split (displayed as the cell output).
grad_boost_clf.score(X_test, y_test)

In [None]:
# Support Vector Classifier with a linear kernel on the standardized features.
from sklearn.svm import SVC

svc = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
y_predicted = svc.predict(X_test)
print('MAE:', mean_absolute_error(y_test, y_predicted))
# Mean accuracy on the held-out split (displayed as the cell output).
svc.score(X_test, y_test)

In [None]:
# Second experiment: models trained on dummy (one-hot) variables.
# Start again from the raw frame: inputs are every column except 'target', the label is 'target'.
y = data['target']
X = data.drop(columns=['target'])
(X.head(), y.head())

In [None]:
# One-hot encode the categorical columns of the feature matrix.
# Encode X (features only), not `data`: `data` still contains the 'target' column, and
# keeping it among the inputs leaks the label into training and produces meaningless,
# near-perfect scores.
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
X_encoded = pd.get_dummies(X, columns=categorical_columns)
assert 'target' not in X_encoded.columns, "label column must not be part of the features"
X_encoded.head()

In [None]:
# Split the one-hot encoded features: 20% held out for testing, fixed seed for repeatability.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize the encoded features: fit the scaler on the training split only,
# then apply that same transformation to both splits.
std_scaler = StandardScaler().fit(X_train)
X_train, X_test = std_scaler.transform(X_train), std_scaler.transform(X_test)

In [None]:
# Logistic regression on the one-hot encoded, standardized features.
log_reg = LogisticRegression(random_state=42).fit(X_train, y_train)
y_predicted = log_reg.predict(X_test)
print('MAE:', mean_absolute_error(y_test, y_predicted))
# Mean accuracy on the held-out split (displayed as the cell output).
log_reg.score(X_test, y_test)

In [None]:
# Random forest (default hyper-parameters, fixed seed) on the one-hot encoded features.
from sklearn.ensemble import RandomForestClassifier

rand_forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_predicted = rand_forest.predict(X_test)
print('MAE:', mean_absolute_error(y_test, y_predicted))
# Mean accuracy on the held-out split (displayed as the cell output).
rand_forest.score(X_test, y_test)

In [None]:
# Linear-kernel support vector classifier on the one-hot encoded features.
svc = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
y_predicted = svc.predict(X_test)
print('MAE:', mean_absolute_error(y_test, y_predicted))
# Mean accuracy on the held-out split (displayed as the cell output).
svc.score(X_test, y_test)

In [None]:
# Gradient boosting (default hyper-parameters) on the one-hot encoded features.
# Fix the seed: the underlying trees permute features at each split, so without a
# random_state the fitted model (and its score) can differ between runs.
grad_boost_clf = GradientBoostingClassifier(random_state=42)
grad_boost_clf.fit(X_train, y_train)
y_predicted = grad_boost_clf.predict(X_test)
print('MAE:', mean_absolute_error(y_test, y_predicted))
# Mean accuracy on the held-out split (displayed as the cell output).
grad_boost_clf.score(X_test, y_test)

In [None]:
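# As we can see, after using dummy variables almost every model appears to be overfitted.
# We therefore conclude that the models work fine without dummy variables:
# we get an accuracy of 86.88% with the Random Forest Classifier and the SVC.
# (These figures come from the recorded run; re-check them whenever the features or seeds change.)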