In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input mount,
# then print the paths one per line in walk order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the heart-disease CSV on the Kaggle input mount.
heart_csv_path = '/kaggle/input/heart-disease-uci/heart.csv'

# Read the CSV into a DataFrame and preview its first rows as the cell output.
df = pd.read_csv(heart_csv_path)
df.head()

In [None]:
# Print pandas' structural summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Show pandas' descriptive statistics for the frame's columns as the cell output.
df.describe()

## Column Information
* Age (age) is the age of candidate
* Sex (sex) has numeric values. 1 denotes male and 0 denotes female
* Chest Pain (cp) takes integer values from 0 to 3, corresponding to the types of angina described in the research paper. The higher the number, the lower the odds of a heart attack
* Resting Blood Pressure (trtbps) is normal pressure with no exercise
* Cholesterol (chol) is the serum cholesterol level; high cholesterol can cause blockages that restrict the blood supply in the blood vessels
* Fasting Blood Sugar (fbs) is the blood sugar measured after a long gap between a meal and the test. Typically, it is taken in the morning before any meal
* Resting ECG (restecg) is the ECG result recorded while the person is at rest, i.e. with no exercise and the heart functioning normally
* Maximum Heart Rate (thalachh) achieved
* Exercise Induced Angina (exng) is chest pain while exercising or doing any physical activity
* ST Depression (oldpeak) is the difference between value of ECG at rest and after exercise
* ST Slope (slp) is the tangent to the depression value
* Number of Major Blood Vessels (caa) supplying blood to heart blocked
* Types of Thalassemia (thall)
* Heart Attack (target), where 1 denotes that a heart attack was suffered and 0 denotes that it was not

## EDA

In [None]:
# Take an independent copy of `df` for the exploratory plots that follow.
df_eda = df.copy()
# Preview the first rows of the copy as the cell output.
df_eda.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Apply seaborn's default theme together with a larger default figure size.
sns.set(rc={'figure.figsize': (15, 10)})

# Number of rows per sex code, first as text ...
sex_counts = df_eda['sex'].value_counts()
print(sex_counts)

# ... and then as a count plot.
sns.catplot(data=df_eda, x='sex', kind='count')
plt.show()

In [None]:
# Age distribution coloured by sex, with a KDE curve drawn over the bars.
sns.histplot(data=df_eda, x='age', hue='sex', kde=True)
plt.show()
# Release the figure once it has been rendered.
plt.close()

In [None]:
# df_eda.target.value_counts()

In [None]:
from plotly.offline import init_notebook_mode, iplot, plot

# Sex breakdown among the rows where target == 1 (computed once, then split
# into the label and value arrays the pie trace needs).
positive_sex_counts = df_eda.loc[df_eda['target'] == 1, 'sex'].value_counts()
labels = positive_sex_counts.index
pie1 = positive_sex_counts.values

# Pie trace with a small hole, confined to the left half of the figure.
pie_trace = dict(
    values=pie1,
    labels=labels,
    domain=dict(x=[0, .5]),
    name="",
    hoverinfo="label+percent+name+value",
    hole=.2,
    type="pie",
)

# Figure title plus a single arrowed annotation.
pie_layout = dict(
    title="Distribution of Target by Gender",
    annotations=[
        dict(font=dict(size=25), showarrow=True, text="DEATH", x=1, y=1),
    ],
)

fig = dict(data=[pie_trace], layout=pie_layout)
iplot(fig)

In [None]:
# Draw pandas' grid of per-column histograms for df_eda on a 12x9 figure.
df_eda.hist(figsize=(12,9))
plt.show()

## Data Preprocessing

In [None]:
# Separate the label column from the predictor columns.
y = df['target']
X = df.drop(columns=['target'])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every feature column, fitting and transforming in one step.
# NOTE(review): the scaler is fit on all rows of X, including the rows that
# are later held out for validation and testing.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Display the scaled feature matrix returned by the MinMaxScaler.
X_scaled

## Splitting Data

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Hold out 25% of the rows, then halve the hold-out into validation and test
# sets (75% train / 12.5% validation / 12.5% test overall). The *raw* features
# are split, so scaling statistics can be learned from the training rows only.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.25, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=1
)

# Fit the min-max scaler on the training split alone and reuse its statistics
# for validation and test. Reusing X_scaled here would leak information,
# because that array was scaled with minima/maxima taken over every row,
# including the held-out ones.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)

## Model Prediction

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

### SVC

In [None]:
# Linear-kernel support vector classifier with C=100.
svc = SVC(C=100, kernel='linear')
svc.fit(X_train, y_train)
print("Score of Train : ", svc.score(X_train, y_train))
print("Score of Validation : ", svc.score(X_valid, y_valid))

# Evaluate on the held-out test split. scikit-learn metrics take
# (y_true, y_pred); passing them the other way round swaps precision with
# recall (and the support counts) in the classification report.
y_pred = svc.predict(X_test)
svc_accuracy = accuracy_score(y_test, y_pred)
print("Score of Test : ", svc_accuracy)
print(classification_report(y_test, y_pred))

### Random Forest Classifier

In [None]:
# Random forest with default hyperparameters; the seed is fixed so that a
# fresh "Restart & Run All" reproduces the same scores.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

print("Score of Trained Model : ", rf.score(X_train, y_train))
print("Score of Validation Model : ", rf.score(X_valid, y_valid))

# Evaluate on the held-out test split. scikit-learn metrics take
# (y_true, y_pred); the reversed order swaps precision with recall in the
# classification report.
y_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print("Score of Test : ", rf_accuracy)
print(classification_report(y_test, y_pred))

### Gradient Boosting

In [None]:
# Gradient boosting with default hyperparameters; the seed is fixed so that a
# fresh "Restart & Run All" reproduces the same scores.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)

print("Score of Trained Model : ", gbc.score(X_train, y_train))
# This score is computed on the validation split, so it is labelled as such.
print("Score of Validation Model : ", gbc.score(X_valid, y_valid))

# Evaluate on the held-out test split. scikit-learn metrics take
# (y_true, y_pred); the reversed order swaps precision with recall in the
# classification report.
y_pred = gbc.predict(X_test)
gbc_accuracy = accuracy_score(y_test, y_pred)
print("Score of Test : ", gbc_accuracy)
print(classification_report(y_test, y_pred))

### Decision Tree

In [None]:
# Decision tree with default hyperparameters; the seed is fixed so that a
# fresh "Restart & Run All" reproduces the same scores.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

print("Score of Trained Model : ", dtc.score(X_train, y_train))
# Report the validation score here, like the other model cells do; the test
# score is reported separately below.
print("Score of Validation Model : ", dtc.score(X_valid, y_valid))

# Evaluate on the held-out test split. scikit-learn metrics take
# (y_true, y_pred); the reversed order swaps precision with recall in the
# classification report.
y_pred = dtc.predict(X_test)
dtc_accuracy = accuracy_score(y_test, y_pred)
print("Score of Test : ", dtc_accuracy)
print(classification_report(y_test, y_pred))

### KNN

In [None]:
# k-nearest-neighbours classifier using 10 neighbours.
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

print("Score of Trained Model : ", knn.score(X_train, y_train))
# This score is computed on the validation split, so it is labelled as such.
print("Score of Validation Model : ", knn.score(X_valid, y_valid))

# Evaluate on the held-out test split. scikit-learn metrics take
# (y_true, y_pred); the reversed order swaps precision with recall in the
# classification report.
y_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred)
print("Score of Test : ", knn_accuracy)
print(classification_report(y_test, y_pred))

In [None]:
# Display names and test accuracies of the fitted models, kept in matching order.
label_model = ['SVC', 'Random Forest', 'Gradient Boosting', 'Decision Tree', 'KNN']
accuracy = [svc_accuracy, rf_accuracy, gbc_accuracy, dtc_accuracy, knn_accuracy]

# One summary line per model.
for model_name, model_accuracy in zip(label_model, accuracy):
    print("{} accuracy : {}".format(model_name, model_accuracy))

In [None]:
# Bar chart of the per-model accuracies, drawn through the explicit Axes API.
acc_fig, acc_ax = plt.subplots(figsize=(12, 9))
acc_ax.bar(label_model, accuracy)
plt.show()

## SVM VS KNN 

In [None]:
# Sweep the neighbour count for KNN and record the accuracy of each setting.
# - Scores are taken on the validation split: choosing a hyperparameter by its
#   test-set score would make the test set part of model selection.
# - Each score is appended directly, without binding it to the name
#   `accuracy`, so the list of per-model accuracies used by the comparison bar
#   chart is not overwritten by a scalar.
knn_neighbors = range(10, 15)
accuracy_val = []
for n_neighbors in knn_neighbors:
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(X_train, y_train)
    model_pred = model.predict(X_valid)
    accuracy_val.append(accuracy_score(y_valid, model_pred))

plt.figure(figsize=(12,9))
plt.plot(knn_neighbors, accuracy_val)
plt.xlabel('n_neighbors')
plt.ylabel('Validation accuracy')
plt.title('KNN: validation accuracy vs. number of neighbours')
plt.show()

In [None]:
# Sweep the regularization parameter C for a linear SVC and record the
# accuracy of each setting.
# - Scores are taken on the validation split: choosing a hyperparameter by its
#   test-set score would make the test set part of model selection.
# - Each score is appended directly, without binding it to the name
#   `accuracy`, so the list of per-model accuracies used by the comparison bar
#   chart is not overwritten by a scalar.
svc_c_values = [100, 200, 300, 400, 500]
accuracy_val = []
for c_value in svc_c_values:
    model = SVC(C=c_value, kernel='linear')
    model.fit(X_train, y_train)
    model_pred = model.predict(X_valid)
    accuracy_val.append(accuracy_score(y_valid, model_pred))

plt.figure(figsize=(12,9))
plt.plot(svc_c_values, accuracy_val)
plt.xlabel('C')
plt.ylabel('Validation accuracy')
plt.title('Linear SVC: validation accuracy vs. C')
plt.show()