In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statistics as stat
from scipy import stats

## Import Dataset

In [None]:
# Read the heart-attack dataset from the Kaggle input directory and preview
# its first five rows.
df = pd.read_csv(
    filepath_or_buffer='../input/heart-attack-analysis-prediction-dataset/heart.csv'
)
df.head(n=5)

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes) to check
# for missing values and unexpected types before any cleaning.
df.info()

In [None]:
# Keep only the first occurrence of each fully identical row (original index
# labels are retained), then show the resulting (rows, columns) shape.
df = df[~df.duplicated()]
df.shape

## Outlier Check and Handling

In [None]:
def z_score_method(df, variable_name, threshold=3):
    """Return the row positions whose absolute z-score in one column exceeds a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to inspect.
    variable_name : str
        Name of the column to inspect.
    threshold : float, default 3
        A value is reported when its absolute z-score is strictly greater
        than this number.

    Returns
    -------
    list of int
        Zero-based row positions (not index labels) of the flagged values,
        in ascending order. Empty when nothing exceeds the threshold.

    Raises
    ------
    KeyError
        If ``variable_name`` is not a column of ``df``.
    """
    if variable_name not in df.columns:
        # Fail loudly instead of silently scoring the first column.
        raise KeyError(f"column {variable_name!r} not found in dataframe")

    # Score only the requested column, as a plain NumPy array. This avoids
    # re-scoring the whole frame on every call and does not depend on whether
    # stats.zscore returns an ndarray or a DataFrame for DataFrame input
    # (that differs between SciPy versions).
    column_values = df[variable_name].to_numpy(dtype=float)
    z = np.abs(stats.zscore(column_values))

    # NaN z-scores (missing values, constant column) compare False and are
    # therefore never reported.
    return np.flatnonzero(z > threshold).tolist()

In [None]:
# For every column, collect the row positions flagged by the z-score rule.
# `outlier` stays aligned with df.columns; `col` lists the columns that have
# at least one flagged value.
outlier = [z_score_method(df, name) for name in df.columns]
col = [name for name, rows in zip(df.columns, outlier) if rows]

#handle outlier
# Replace flagged values with the column median.
# Work on an explicit copy and assign through .iloc: writing via
# `df[c].values[j] = ...` is not guaranteed to reach the frame and is rejected
# as read-only when pandas copy-on-write is enabled.
# NOTE(review): every column is scanned, including the discrete-coded ones and
# the `output` target - confirm that is intended.
df = df.copy()
for position, rows in enumerate(outlier):
    if not rows:
        continue
    column = df.iloc[:, position]
    # Median is taken once, before any replacement, so it cannot drift while
    # values are overwritten. Casting to the column dtype keeps integer columns
    # integer (a fractional median is truncated).
    replacement = column.dtype.type(column.median())
    # Positions are zero-based row offsets, so they remain valid even though
    # the index has gaps after drop_duplicates().
    df.iloc[rows, position] = replacement

## Check for Imbalanced Data

In [None]:
# Bar chart of the number of rows in each class of the `output` column,
# used to judge how balanced the target is.
sns.countplot(x = "output",data = df)

So, before training the model, we should first balance the data.

## Check Correlation

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(15, 10))
sns.heatmap(
    data=df.corr(),
    cmap='coolwarm',
    annot=True,
)

## Visualize Categorical Data

In [None]:
# One count plot per categorical feature. A single loop replaces eight
# copy-pasted calls, so the feature list is the only thing to edit.
for categorical_feature in ["sex", "cp", "fbs", "restecg", "exng", "slp", "caa", "thall"]:
    sns.catplot(x=categorical_feature, data=df, kind="count")

## Visualize Numerical Data

In [None]:
# One box plot per numerical feature, split by the `output` class. A single
# loop replaces five copy-pasted calls.
for numerical_feature in ["age", "trtbps", "chol", "thalachh", "oldpeak"]:
    sns.catplot(x="output", y=numerical_feature, data=df, kind="box")

## Distribution Output Based on Ages

In [None]:
# Age histogram with one colour per `output` class, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(data=df, x='age', hue='output', ax=ax)
ax.set_title('Distribution Output Based on Ages')
plt.show()

## Classification Process

In [None]:
# Independent copy of the deduplicated, outlier-handled frame; the feature
# matrix and target below are sliced from this copy.
df1 = df.copy()

In [None]:
# Every column except the last is a predictor; the last column is the target.
x = df1.iloc[:, :-1].to_numpy()
y = df1.iloc[:, -1].to_numpy()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb

#### Splitting Data and Standardizing Data

In [None]:
# Splitting the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

#### Balancing Data

Handle the imbalanced training data using SMOTE-NC (the SMOTE variant for datasets that mix numerical and categorical features).

In [None]:
from imblearn.over_sampling import SMOTENC

# `categorical_features` expects the positions of the categorical columns in
# the feature matrix. The previous value was built from the min/max of the
# target (i.e. [0, 1]), which marked `age` and `sex` as categorical.
# The names below are the features plotted under "Visualize Categorical Data".
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']
# x was sliced from df1 without its last (target) column, so positions in this
# list match the column positions of x_train.
feature_names = list(df1.columns[:-1])
categorical_idx = [feature_names.index(name) for name in categorical_columns]

smote_nc = SMOTENC(categorical_features=categorical_idx, random_state=0)
# Oversample the training partition only; the test partition stays untouched.
x_train, y_train = smote_nc.fit_resample(x_train, y_train)

#### Train Model

In [None]:
# Candidate classifiers keyed by display name. The tree-based and ensemble
# estimators are seeded (random_state=0, matching the SVC and the train/test
# split) so the accuracy comparison is reproducible across runs.
models = {
    'LogisticRegression': LogisticRegression(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': SVC(C=.5, gamma=0.1, kernel='rbf', random_state=0),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=0),
    'RandomForestClassifier': RandomForestClassifier(random_state=0),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=0),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=0),
    'XGBClassifier': xgb.XGBClassifier(),
}
# `key` and `value` are kept as separate lists, in dictionary order, because
# later cells use `key` for plot labels.
key = list(models)
value = list(models.values())
print(models)

In [None]:
# Fit each candidate on the training partition and record its accuracy on the
# held-out test partition. `predicted` collects the accuracy scores in the
# same order as the entries of `models`.
predicted = []
for name, model in models.items():
    model.fit(x_train, y_train)
    acc = accuracy_score(y_test, model.predict(x_test))
    predicted.append(acc)
    print(name, acc)

In [None]:
# Horizontal bar chart: one bar per model, bar length = test accuracy.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=predicted, y=key, ax=ax)

The highest accuracy score is 93.4 % using SVM