In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#additional imports
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.svm  import SVC


In [None]:
# Location of the heart-disease CSV that the next cell reads with pd.read_csv.
path = Path('/kaggle/input/heart-disease-uci/heart.csv')

Load the dataframe and check what types of data we have.

In [None]:
# Read the CSV at `path` into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

It looks like every column is numerical; we can double-check this with the pandas `.info()` method.

In [None]:
# Print the frame summary (column dtypes and non-null counts) to confirm what head() suggested.
df.info()

We also need to check for null/NaN values.

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Per-column missing-value count via isna(). pandas documents isnull() as an alias of isna(),
# so this is expected to show the same numbers as the isnull() check.
df.isna().sum()

We don't need to preprocess text data or fill empty cells. Now we can visualize the dataframe to get an idea of how the features relate to each other.

**Data visualization**

In [None]:
# Annotated heatmap of the pairwise column correlations, drawn on an 11x11 inch figure.
fig = plt.figure(num=1, figsize=(11, 11))
correlation = df.corr()
sns.heatmap(data=correlation, annot=True, cmap="YlGnBu", ax=fig.gca())

In [None]:
# Gender distribution: one bar per value of the `sex` column.
# Encoding noted by the notebook author: 1 = male, 0 = female.
sns.countplot(data=df, x='sex', palette="Set3")

In [None]:
# Age distribution: 20-bin histogram with a kernel density estimate overlaid.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot is its
# replacement. stat="density" and cut=3 keep the y-axis scale and KDE extent that
# distplot used by default.
sns.histplot(df['age'], bins=20, kde=True, stat="density", kde_kws={"cut": 3})

The age distribution rises sharply after about 40 years old.

`cp` is the chest pain type, and according to our correlation matrix it is strongly related to heart disease. Let's plot it.

In [None]:
# Scatter of chest pain (`cp`, x-axis) against age (y-axis); each point is coloured
# by its `target` value and the colorbar shows that colour scale.
fig, ax = plt.subplots()
scatter = ax.scatter(df['cp'], df['age'], c=df['target'])
ax.set(
    title="age and chest pain distribution",
    xlabel='chest pain',
    ylabel='age',
)
fig.colorbar(scatter, ax=ax)

Patients aged 40 and older tend to have chest pain without actual heart disease, and it is almost impossible to have heart disease without pain.

Maximum heart rate (`thalach`) and the ST segment slope (`slope`) are highly related to the disease.

let's plot

In [None]:
# Pairwise plots of age, thalach and slope, coloured by the target label.
# `height` is the current name of the per-facet size argument; the old `size`
# keyword is no longer accepted by seaborn's pairplot and raises a TypeError.
sns.pairplot(df[['age', 'thalach', 'slope', 'target']], hue='target', height=5)

It's tempting to use a decision tree or random forest classifier, but since this is a binary classification problem, most models should perform similarly. I've decided to stick with SVC this time.

**feature scaling**

In [None]:
# Scaler instance used by the following cells to standardize the feature columns.
scaler = StandardScaler()
# Show the frame's (rows, columns) as the cell output.
df.shape

In [None]:
# Features: every column except the label, standardized by `scaler`. Label: `target`.
# NOTE(review): the scaler is fit on all rows here, i.e. before any train/test split.
features = df.drop(columns=['target'])
X = scaler.fit_transform(features)
y = df['target']
(X.shape, y.shape)

Train/test split

In [None]:
# Split the *unscaled* features, then fit the scaler on the training rows only.
# Standardizing the whole frame before splitting lets the test rows influence the
# mean/std used for training (data leakage) and makes the test score optimistic.
# The split itself is unchanged: same test_size and random_state, same row count.
X_raw = df.drop(columns=['target'])
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=0
)
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std from the training rows
X_test = scaler.transform(X_test_raw)        # apply the training statistics to the test rows

We should try Linear SVC first

In [None]:
# Baseline: linear SVM fit on the training split and scored (accuracy) on the test split.
# random_state pins the solver's random number generator so the baseline score is
# reproducible across runs, matching the fixed seed already used for the split.
Lsvcmodel = LinearSVC(random_state=0)
Lsvcmodel.fit(X_train, y_train)
Lsvcmodel.score(X_test, y_test)

Before trying other models, I will run a hyperparameter search.

In [None]:
# Hyperparameter grid handed to GridSearchCV: every combination of
# regularization strength C, kernel coefficient gamma, and kernel type.
c_values = [0.01, 0.1, 0.25, 0.5, 0.75, 1, 10, 100]
gamma_values = [1, 0.75, 0.5, 0.25, 0.1, 0.01, 0.001]
kernel_names = ['rbf', 'poly', 'linear']
params = dict(C=c_values, gamma=gamma_values, kernel=kernel_names)

In [None]:
# Cross-validated search over `params` for an SVC; refit=True retrains the best
# combination on the full training split once the search finishes.
gridsearch = GridSearchCV(estimator=SVC(), param_grid=params, refit=True)
gridsearch.fit(X_train, y_train)

In [None]:
# Show the hyperparameter combination selected by the grid search.
gridsearch.best_params_

**creating final model**

In [None]:
# Final model: an SVC built from whatever the grid search selected, instead of values
# copied by hand from an earlier run. Hard-coded values silently go stale if the grid,
# the data, or the preprocessing changes.
Lmodel = SVC(**gridsearch.best_params_)
Lmodel.fit(X_train, y_train)
Lmodel.score(X_test, y_test)  # accuracy on the held-out test split

confusion matrix and classification report

In [None]:
# Predict the test split with the final model and tabulate true vs. predicted labels.
test_pred = Lmodel.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=test_pred)
cm

In [None]:
# Text report of the per-class metrics for the test-set predictions.
report = classification_report(y_true=y_test, y_pred=test_pred)
print(report)