In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn import svm
from sklearn import neighbors
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import scale

# Apply seaborn's default theme to every figure drawn below.
sns.set_theme()


In [None]:
# Location of the heart-disease CSV, relative to the notebook's working directory.
heart_csv_path = '../input/heart-disease-uci/heart.csv'
df = pd.read_csv(heart_csv_path)

# Preview the first rows to confirm the file parsed as expected.
df.head()

In [None]:
#0 age
#1 sex
#2 cp: chest pain type (4 values)
#3 trestbps: resting blood pressure
#4 chol: serum cholesterol in mg/dl
#5 fbs: fasting blood sugar > 120 mg/dl
#6 restecg: resting electrocardiographic results (values 0,1,2)
#7 thalach: maximum heart rate achieved
#8 exang: exercise induced angina
#9 oldpeak = ST depression induced by exercise relative to rest
#10 slope: the slope of the peak exercise ST segment
#11 ca: number of major vessels (0-3) colored by fluoroscopy
#12 thal: 3 = normal; 6 = fixed defect; 7 = reversible defect
#    NOTE: the EDA below refers to thal values 2 and 3, so the codes in this
#    CSV may differ from the 3/6/7 listed here - verify with df['thal'].unique()

In [None]:
# Column dtypes and non-null counts, to spot missing values or mis-typed columns.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# A little bit of EDA

# Class counts of the target, split by sex.
sns.countplot(
    data=df,
    x='target',
    hue='sex',
)

# observation: patients coded sex=0 are more likely to have the disease

In [None]:
# Age histogram, coloured by target class.
sns.histplot(
    data=df,
    x='age',
    hue='target',
)

# observation: age looks roughly normally distributed

In [None]:
# Age distribution per target class.
sns.boxplot(
    data=df,
    x='target',
    y='age',
)

# observation: there are fewer older people among those with the disease

In [None]:
# Class counts of the target, split by chest pain type.
sns.countplot(
    data=df,
    x='target',
    hue='cp',
)

# observation: chest pain type 0 is less likely to be associated with the disease,
# whereas type 2 is more likely to be

In [None]:
# Age against resting blood pressure, coloured by target class.
sns.jointplot(
    data=df,
    x='age',
    y='trestbps',
    hue='target',
)

# observation: the older one gets, the higher one's blood pressure

In [None]:
# Age against serum cholesterol, coloured by target class.
sns.jointplot(
    data=df,
    x='age',
    y='chol',
    hue='target',
)

# observation: cholesterol tends to increase with age

In [None]:
# Class counts of the target, split by the fasting blood sugar flag.
sns.countplot(
    data=df,
    x='target',
    hue='fbs',
)

# observation (unconfirmed): blood sugar doesn't seem to affect the target significantly

In [None]:
# Class counts of the target, split by resting ECG result.
sns.countplot(
    data=df,
    x='target',
    hue='restecg',
)

# observation: restecg value 1 is much more frequent in patients with the disease

In [None]:
# Age against maximum heart rate achieved, coloured by target class.
sns.scatterplot(
    data=df,
    x='age',
    y='thalach',
    hue='target',
)

# observation: the younger the patient, the higher the maximum heart rate achieved
# patients with the disease tend to have higher maximum heart rates

In [None]:
# Maximum heart rate achieved (thalach) per target class.
sns.boxplot(
    data=df,
    x='target',
    y='thalach',
)

In [None]:
# Class counts of the target, split by the exercise-induced angina flag.
sns.countplot(
    data=df,
    x='target',
    hue='exang',
)

# observation: the no-angina rate is higher in patients with the disease

In [None]:
# Exercise-induced ST depression (oldpeak) per target class.
sns.boxplot(
    data=df,
    x='target',
    y='oldpeak',
)

# observation: exercise-induced depression is lower in patients with the disease

In [None]:
# Class counts of the target, split by ST-segment slope.
sns.countplot(
    data=df,
    x='target',
    hue='slope',
)

# observation: slope value 2 is more frequent in patients with the disease

In [None]:
# Class counts of the target, split by number of major vessels.
sns.countplot(
    data=df,
    x='target',
    hue='ca',
)

# observation: the number of major vessels is lower in patients with the disease

In [None]:
# Class counts of the target, split by thal value.
sns.countplot(
    data=df,
    x='target',
    hue='thal',
)

# observation: thal type 2 is more frequent and type 3 less frequent
# in patients with the disease


In [None]:
# Data preprocessing


In [None]:
# Mean age per target class. The column is selected before aggregating so that
# only 'age' is averaged, rather than averaging every column and discarding the rest.
df.groupby('target')['age'].mean()

In [None]:
# Mean resting blood pressure per target class. The column is selected before
# aggregating so that only 'trestbps' is averaged, not every column in the frame.
df.groupby('target')['trestbps'].mean()

In [None]:
# Mean cholesterol per target class. The column is selected before aggregating
# so that only 'chol' is averaged, not every column in the frame.
df.groupby('target')['chol'].mean()

In [None]:
# Mean maximum heart rate per target class. The column is selected before
# aggregating so that only 'thalach' is averaged, not every column in the frame.
df.groupby('target')['thalach'].mean()

In [None]:
# Look at the raw columns once more before encoding and scaling them.
df.head()

In [None]:
# Continuous features; these are standardised in a later cell rather than encoded.
numerical = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Every remaining column except the label is treated as categorical.
non_categorical = {*numerical, 'target'}
categorical = [name for name in df.columns if name not in non_categorical]

# One-hot encode the categorical columns, dropping the first level of each.
# The numeric columns and 'target' pass through unchanged at this point.
dummies = pd.get_dummies(df, columns=categorical, drop_first=True)

In [None]:
# Keep only the one-hot columns by removing the pass-through numeric columns
# and the label. errors='ignore' makes the cell safe to re-run: `dummies` is
# reassigned from itself, so on a second run these columns are already gone
# and a plain drop would raise KeyError.
dummies = dummies.drop(columns=[*numerical, 'target'], errors='ignore')
dummies.head()

In [None]:
# Standardise the numeric columns, keeping the original row index and column names.
# NOTE(review): scale() derives its statistics from every row of df, including
# rows that later land in the test split, so the test data slightly influences
# the training features. Fitting a scaler on the training split only would
# avoid this.
standardized = scale(df[numerical])
scaled = pd.DataFrame(standardized, index=df.index, columns=numerical)
scaled.head()

In [None]:
# Assemble the model-ready frame: scaled numerics, one-hot columns, then the label.
frames = [scaled, dummies, df['target']]
preprocessed = pd.concat(frames, axis='columns')
preprocessed.head()

In [None]:
# Separate the predictors from the label column.
features = preprocessed.drop(columns='target')
labels = preprocessed['target']

# 90% of the rows go to training; the seed is fixed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    labels,
    train_size=0.9,
    random_state=42,
)
(X_train.shape, X_test.shape)

In [None]:
# Support-vector classifier with default hyperparameters, fitted on the training split.
# fit() returns the estimator itself, so `model` is the fitted classifier.
model = svm.SVC().fit(X_train, Y_train)
model

In [None]:
# Mean accuracy on the same data the model was fitted on (an optimistic estimate).
model.score(X_train, Y_train)

In [None]:
# cross_val_score clones the estimator and refits it on each fold, so running
# it on the held-out split retrains on small test folds and never evaluates
# the model fitted above. Cross-validate on the training split instead, then
# score the already-fitted model once on the untouched test split.
svc_cv_accuracy = cross_val_score(model, X_train, Y_train, cv=5).mean()
svc_test_accuracy = model.score(X_test, Y_test)
svc_cv_accuracy, svc_test_accuracy

In [None]:
# k-nearest-neighbours classifier with default hyperparameters, fitted on the
# training split. This rebinds `model`, replacing the SVC fitted earlier.
model = neighbors.KNeighborsClassifier().fit(X_train, Y_train)
model

In [None]:
# cross_val_score clones the estimator and refits it on each fold, so running
# it on the held-out split retrains on small test folds and never evaluates
# the model fitted above. Cross-validate on the training split instead, then
# score the already-fitted model once on the untouched test split.
knn_cv_accuracy = cross_val_score(model, X_train, Y_train, cv=5).mean()
knn_test_accuracy = model.score(X_test, Y_test)
knn_cv_accuracy, knn_test_accuracy

In [None]:
# Per-class precision, recall and F1 of the most recently fitted model
# (the k-nearest-neighbours classifier) on the test split.
predictions = model.predict(X_test)
report = metrics.classification_report(y_true=Y_test, y_pred=predictions)
print(report)