In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to every figure drawn in the cells below.
sns.set()
# sns.set_style('whitegrid')

In [None]:
# Location of the CSV inside the Kaggle input mount.
data_path = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'

# Raw frame: later cells work on copies of it.
data_orig = pd.read_csv(filepath_or_buffer = data_path)
print(data_orig.shape)
data_orig.head(n = 5)

In [None]:
## Work on a deep copy of the raw frame, then count missing values per column
data = data_orig.copy(deep = True)
data.isna().sum()

In [None]:
## Drop exact duplicate rows (if any) and renumber the index
dup_mask = data.duplicated()
if dup_mask.any():
    print('removing duplicates , count : ',dup_mask.sum())
    data = data.drop_duplicates().reset_index(drop = True)

In [None]:
# Every column present in the working frame.
all_cols = data.columns.tolist()

# Columns treated as continuous in the plots and outlier handling below.
cont_cols = [
    'age',
    'trtbps',
    'chol',
    'thalachh',
    'oldpeak',
]

# Columns treated as categorical in the count plots below.
cate_cols = [
    'sex',
    'fbs',
    'restecg',
    'exng',
    'slp',
    'caa',
    'thall',
]

# Name of the target column.
output_col = 'output'

In [None]:
# One histogram per continuous column, with the column's skewness in the legend.
# The number of rows follows cont_cols instead of a hard-coded 5.
fig,ax1 = plt.subplots(nrows = len(cont_cols),ncols = 1,figsize = (20,20))
skew_vals = data[cont_cols].skew()
for axis,col in zip(ax1,cont_cols):
    # skew_vals is indexed by column name; an integer key on a label-indexed
    # Series relies on a deprecated positional fallback, so look it up by label.
    data[[col]].hist(ax = axis,label = 'Skew Value : {:.4f}'.format(skew_vals[col]))
    axis.legend()
plt.show()

In [None]:
%%capture
'''
oldpeak is the only visibly skewed column
chol also has a high skew value, probably due to outliers
'''

In [None]:
### checking for outliers
# Box plot per continuous column with the individual observations overlaid as a
# swarm. The number of rows follows cont_cols instead of a hard-coded 5.
fig,ax11 = plt.subplots(len(cont_cols),1,figsize = (20,20))
for axis,col in zip(ax11,cont_cols):
    sns.boxplot(x = col,data = data,ax = axis)
    sns.swarmplot(x = col,data = data,ax = axis,color = 'black')
plt.show()

In [None]:
%%capture
'''
columns ['trtbps', 'chol', 'oldpeak'] have a few extreme outliers;
these values will probably be removed before fitting the model.
'''

In [None]:
# For every categorical column: overall counts on the left, counts split by the
# target on the right. The grid height follows the column list instead of a
# hard-coded 7, and the hue uses output_col rather than a repeated literal.
temp_cols = cate_cols
fig,ax2 = plt.subplots(len(temp_cols),2,figsize = (20,40))
for (ax_total,ax_by_output),col in zip(ax2,temp_cols):
    sns.countplot(x = col,data = data,ax = ax_total)
    sns.countplot(x = col,data = data,ax = ax_by_output,hue = output_col)
plt.show()

In [None]:
%%capture
'''
sex : risk of a heart problem is much higher for sex 0, which also has fewer samples (the data may not be representative of the population,
        i.e. possible bias in data collection)
fbs : does not appear to make much difference to the output
restecg : value 1 indicates higher risk (to be expected?)
exng : class 0 has much higher risk
'''

In [None]:
# Reset to seaborn's default theme, then draw all pairwise relationships
# coloured by the target column.
sns.set()
sns.pairplot(data = data,hue = 'output')

In [None]:
# Pairwise correlation matrix of the working frame, drawn as an annotated heatmap.
corr = data.corr()
fig_corr,ax_corr = plt.subplots(figsize = (16,8))
sns.heatmap(corr,annot = True,cmap = 'Greens',ax = ax_corr)
plt.show()

In [None]:
## Capping outliers in the continuous columns at the 1.5 * IQR fences ##
# Restart from the raw frame so the cell is safe to re-run, but re-apply the
# duplicate removal done earlier: a plain copy of data_orig would silently
# bring the duplicated rows back.
data = data_orig.drop_duplicates().reset_index(drop = True)

cont_data = data[cont_cols]

q1 = cont_data.quantile(0.25)
q3 = cont_data.quantile(0.75)
inter_quartile = (q3 - q1)
upper_lim = dict(q3 + 1.5 * inter_quartile)
lower_lim = dict(q1 - 1.5 * inter_quartile)
for col in cont_cols:
    upper_cross = (data[col] > upper_lim[col])
    lower_cross = (data[col] < lower_lim[col])
    out_count = int(upper_cross.sum() + lower_cross.sum())
    out_ratio = (out_count / len(data))*100
    # Printed per column: name, number of outliers, share of rows in percent.
    print('{}\t{}\t{:.4f}%'.format(col,out_count,out_ratio))

    # The previous `data[mask][col] = limit` was chained assignment: it wrote
    # into a temporary copy and left `data` unchanged. clip() returns the capped
    # column, which is assigned back explicitly.
    data[col] = data[col].clip(lower = lower_lim[col],upper = upper_lim[col])

In [None]:
##### training ml models ######
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

## Boosting algorithms
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

## Metrics
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
# Features fed to the models.
# NOTE(review): 'slp', 'caa' and 'thall' are listed in cate_cols but are not
# used as features here - confirm that this is intentional.
feat_cols = ['age', 'sex', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
                 'exng', 'oldpeak']

X = data[feat_cols].values
Y = data[output_col].values

# Fix the seed so the split, and every metric reported below, is reproducible
# across kernel restarts (42 matches the seed used for the models).
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,random_state = 42)

# scaling data: fit the scaler on the training split only, then apply the same
# transform to the test split so no test statistics leak into training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
pd.DataFrame(X_train_scaled,columns = feat_cols).head()

In [None]:
def train_model(model_name,model,data_dict):
    """Fit ``model`` on the training split and print a test-set report.

    Parameters
    ----------
    model_name : label printed in the report header.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    data_dict : mapping with the keys 'X_train', 'Y_train', 'X_test', 'Y_test'.

    Returns
    -------
    The same ``model`` object after it has been fitted.
    """
    train_features = data_dict['X_train']
    train_labels = data_dict['Y_train']
    model.fit(train_features,train_labels)

    predicted = model.predict(data_dict['X_test'])

    print('---------------------- Report ------------------------')
    print(f'Model : {model_name}')
    print(classification_report(data_dict['Y_test'],predicted))
    return model

In [None]:
# Scaled train/test split in the layout expected by train_model.
data_dict = {'X_train' : X_train_scaled,
             'X_test' : X_test_scaled,'Y_train' : Y_train,'Y_test' : Y_test} 

# Candidate classifiers, keyed by the name printed in each report.
models_dict = {
    'Logistic Regression' : LogisticRegression(random_state=42),
    'Naive Bayes' : GaussianNB(),
    'Linear SVM' : SVC(kernel = 'linear',random_state = 42),
    'RBF SVM' : SVC(kernel = 'rbf',random_state=42),
    'Random Forest' : RandomForestClassifier(n_estimators=100,random_state=42),
    'XGBoost' : XGBClassifier(use_label_encoder=False),
    'AdaBoost' : AdaBoostClassifier(learning_rate=0.15,n_estimators=25,random_state=42),
    # loss='deviance' was deprecated in scikit-learn 1.1 and removed in 1.3,
    # where it raises. It was the default loss (now named 'log_loss'), so
    # omitting the argument gives the same model on every version.
    'Gradient Boosting' : GradientBoostingClassifier(random_state=42,n_estimators=20,
                                                     learning_rate=0.20),
    'LGBM' : LGBMClassifier(random_state=42)
}

In [None]:
# Fit every candidate on the shared split and print its classification report.
for model_name,model in models_dict.items():
    train_model(model_name = model_name,model = model,data_dict = data_dict)

In [None]:
### Applying grid search on SVM
# `degree` is only read by the polynomial kernel, so with kernel='rbf' every
# degree value refitted an identical model (5x the work for no extra
# information). Tune `gamma`, the RBF kernel width, together with C instead.
param_grid = {
    'C' : [0.1,1,10,100,1000],
    'gamma' : ['scale',0.001,0.01,0.1,1]
}

svm_model = SVC(kernel = 'rbf')
# refit=True retrains the best parameter combination on the full training
# split, so the report printed by train_model is for that best estimator.
grid = GridSearchCV(svm_model,param_grid,refit = True,verbose = 5)
model = train_model('Grid Search SVM',grid,data_dict)