In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))



In [None]:
# Load the heart-attack CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv('/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv')
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Summary statistics, transposed so that each column becomes a row.
df.describe().T

In [None]:
# Number of rows that exactly duplicate an earlier row.
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows (the first occurrence is kept) and report the
# frame shape before and after.
print('before drop shape:',df.shape)
df = df.drop_duplicates()
print('after drop shape:',df.shape)

In [None]:
# Number of distinct values per column.
df.nunique()

# EDA

In [None]:
# Class balance of the target. The column is passed by keyword because
# current seaborn releases treat the first positional argument as `data`.
sns.countplot(x='output', data=df)

In [None]:
# Feature groups used by the plotting cells: columns plotted as categories
# (count plots) and columns plotted as numeric distributions (histograms).
category_cols = ['sex','cp','fbs','restecg','exng','slp','caa','thall']
num_cols = ['age','trtbps','chol','thalachh','oldpeak']

In [None]:
# Count plots for every categorical feature, one subplot row per feature:
# left panel is output == 0, right panel is output == 1.
fig = plt.figure(figsize=(15,50))
n_rows = len(category_cols)  # grid height follows the feature list
for i,col in enumerate(category_cols):
    # Fix the bar order so both panels show the same levels in the same
    # positions, even when a level is absent from one of the classes.
    levels = sorted(df[col].unique())
    ax1 = fig.add_subplot(n_rows,2,2*i+1)
    sns.countplot(x=col,data=df[df['output']==0],order=levels,ax=ax1)
    ax1.set_title(f'{col} plot(heart 0)',fontsize=16)
    ax2 = fig.add_subplot(n_rows,2,2*i+2)
    sns.countplot(x=col,data=df[df['output']==1],order=levels,ax=ax2)
    ax2.set_title(f'{col} plot(heart 1)',fontsize=16)
    # Align the y-axes on the larger upper limit so bar heights are comparable.
    shared_top = max(ax1.get_ylim()[1],ax2.get_ylim()[1])
    ax1.set_ylim(top=shared_top)
    ax2.set_ylim(top=shared_top)
plt.show()

In [None]:
# Density histograms (with KDE overlay) of every numeric feature, with the
# two target classes drawn on the same axes.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is its
# documented replacement.
fig = plt.figure(figsize=(15,25))
n_rows = (len(num_cols) + 1) // 2  # two plots per row, rounded up
for i,col in enumerate(num_cols):
    ax1 = fig.add_subplot(n_rows,2,i+1)
    for label,color in ((0,'green'),(1,'blue')):
        sns.histplot(data=df[df['output']==label],x=col,label=f'heart {label}',
                     color=color,kde=True,stat='density',ax=ax1)
    ax1.set_title(f'{col} Histogram',fontsize=20)
    ax1.legend()  # attach the legend to this subplot explicitly
plt.show()
    

In [None]:
# Pairwise plots of the columns, coloured by the target.
sns.pairplot(df,hue='output')

In [None]:
# Pairwise correlation matrix of the columns.
df.corr()

In [None]:
# Heatmap of the correlation matrix.
plt.figure(figsize=(12,8))
sns.heatmap(df.corr())

In [None]:
# Features whose correlation with the target exceeds 0.4 in absolute value
# (the target itself is excluded).
output_corr = df.corr()['output']
strong_cols = output_corr[output_corr.abs() > 0.4].index
corr_high = [name for name in strong_cols if name != 'output']
corr_high

### Catplot

In [None]:
# Box plot of thalachh for each cp level, split by output.
sns.catplot(x='cp',y='thalachh',data=df,hue='output',kind='box')

In [None]:
# Box plot of oldpeak for each cp level, split by output.
sns.catplot(x='cp',y='oldpeak',data=df,hue='output',kind='box')

In [None]:
# Box plot of thalachh for each exng level, split by output.
sns.catplot(x='exng',y='thalachh',data=df,kind='box',hue='output')

In [None]:
# Box plot of oldpeak for each exng level, split by output.
sns.catplot(x='exng',y='oldpeak',data=df,kind='box',hue='output')

In [None]:
# Box plot of thalachh for each caa level, split by output.
sns.catplot(x='caa',y='thalachh',data=df,hue='output',kind='box')

In [None]:
# Box plot of oldpeak for each caa level, split by output.
sns.catplot(x='caa',y='oldpeak',data=df,hue='output',kind='box')

### FacetGrid

In [None]:
# Joint plot of age against thalachh, coloured by output.
sns.jointplot(x='age',y='thalachh',data=df,hue='output')

In [None]:
# Scatter of age vs. thalachh, one panel per cp level, coloured by output.
# `height` is the current name of FacetGrid's former `size` argument.
g = sns.FacetGrid(df,hue='output',col='cp',height=4)
g.map(plt.scatter,'age','thalachh')
g.add_legend()
plt.show()

In [None]:
# thalachh for each slp level, coloured by output (catplot's default plot kind).
sns.catplot(x='slp',y='thalachh',data=df,hue='output')

In [None]:
# Scatter of age vs. thalachh, one panel per slp level, coloured by output.
# `height` is the current name of FacetGrid's former `size` argument.
g = sns.FacetGrid(df,hue='output',col='slp',height=4)
g.map(plt.scatter,'age','thalachh')
g.add_legend()
plt.show()

In [None]:
# Scatter of thalachh vs. oldpeak on a grid of cp (rows) by sex (columns),
# coloured by output.
# `height` is the current name of FacetGrid's former `size` argument.
g = sns.FacetGrid(df,hue='output',col='sex',row='cp',height=4)
g.map(plt.scatter,'thalachh','oldpeak')
g.add_legend()
plt.show()

In [None]:
# Joint plot of the two features noted as weakly correlated with output
# ('trtbps' and 'chol'), coloured by output.
# x and y are passed by keyword: current seaborn only accepts them that way.
sns.jointplot(x='trtbps',y='chol',data=df,hue='output')

# Model Building

In [None]:
from sklearn.model_selection import train_test_split,KFold,GridSearchCV
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve,confusion_matrix,f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgb

In [None]:
# Separate the target from the features, then carve out the test split and,
# from what remains, the validation split (20% each time, fixed seed).
target = df['output']
data = df.drop(columns='output')
data_cols = data.columns

train_data, test_data, train_target, test_target = train_test_split(
    data, target, test_size=0.2, random_state=1
)
train_data, val_data, train_target, val_target = train_test_split(
    train_data, train_target, test_size=0.2, random_state=1
)
print(f'train data shape:{train_data.shape}\nvalidation data shape:{val_data.shape}\ntest data shape:{test_data.shape}')

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied to all three splits. Each result is wrapped back into a
# DataFrame carrying the original feature names.
scaler = StandardScaler()
scaler.fit(train_data)

train_data_scaled, val_data_scaled, test_data_scaled = (
    pd.DataFrame(scaler.transform(split), columns=data_cols)
    for split in (train_data, val_data, test_data)
)

train_data_scaled.head()

In [None]:
def train_and_val_score(x,y,xval,yval,model,model_name):
    """Fit `model` on the training data and report validation metrics.

    Prints accuracy, F1 and ROC AUC for the validation data, draws the
    confusion matrix as a heatmap and then plots the ROC curve.

    Parameters
    ----------
    x, y
        Training features and labels, passed to ``model.fit``.
    xval, yval
        Validation features and labels used for scoring.
    model
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    model_name : str
        Name shown (upper-cased) in the printed header.
    """
    model.fit(x,y)
    preds = model.predict(xval)
    # ROC AUC measures ranking quality, so it is scored on the probability of
    # class 1 rather than on thresholded labels (which would collapse the
    # curve to a single operating point).
    pos_proba = model.predict_proba(xval)[:,1]
    acc = accuracy_score(yval,preds)
    f1 = f1_score(yval,preds)
    auc_score = roc_auc_score(yval,pos_proba)
    mat = confusion_matrix(yval,preds)
    print(f'{model_name.upper()} Validation Score\nAccuracy:{acc:.3f} F1 Score:{f1:.3f} Auc:{auc_score:.3f}')
    sns.heatmap(mat,annot=True,cmap='Blues')
    plot_roc_curve(xval,yval,model)
    
def plot_roc_curve(data,target,model):
    """Plot the ROC curve of a fitted classifier.

    Parameters
    ----------
    data
        Features passed to ``model.predict_proba``.
    target
        True labels for `data`.
    model
        Fitted estimator exposing ``predict_proba``; column 1 of its output
        is used as the score.
    """
    pos_proba = model.predict_proba(data)[:,1]
    fpr, tpr, _ = roc_curve(target,pos_proba)  # thresholds are not needed
    plt.figure(figsize=(5,5))
    plt.plot(fpr,tpr,label='ROC curve')
    # Diagonal reference line.
    plt.plot([0,1],[0,1],linestyle='--',label='chance')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.legend()
    plt.show()

In [None]:
# Baseline: logistic regression with default settings, scored on the validation split.
logistic = LogisticRegression()
train_and_val_score(train_data_scaled,train_target,val_data_scaled,val_target,logistic,'logistic')

In [None]:
# Baseline: random forest with default settings, scored on the validation split.
# The seed is fixed so the scores (and the feature importances read from this
# model later) are the same on every run.
randomforest = RandomForestClassifier(random_state=1)
train_and_val_score(train_data_scaled,train_target,val_data_scaled,val_target,randomforest,'randomforest')

In [None]:
# Baseline: k-nearest neighbours with k=5, scored on the validation split.
kne = KNeighborsClassifier(n_neighbors=5)
train_and_val_score(train_data_scaled,train_target,val_data_scaled,val_target,kne,'kneighbors')

In [None]:
# Baseline: XGBoost classifier with default settings, scored on the validation split.
xgb_model = xgb.XGBClassifier()
train_and_val_score(train_data_scaled,train_target,val_data_scaled,val_target,xgb_model,'xgb')

In [None]:
# Baseline: LightGBM classifier with default settings, scored on the validation split.
lgb_model = lgb.LGBMClassifier()
train_and_val_score(train_data_scaled,train_target,val_data_scaled,val_target,lgb_model,'lgb')

In [None]:
# Feature importances of the fitted random forest, sorted descending and
# drawn as a bar chart.
feature_importance = randomforest.feature_importances_
randomforest_importance = pd.DataFrame({'feature_name':data_cols,'importances':feature_importance})
randomforest_importance.index = randomforest_importance['feature_name'].values
# `drop` returns a new frame; keep the result so the now-redundant name
# column is actually removed (the names live on in the index).
randomforest_importance = randomforest_importance.drop('feature_name',axis=1)
randomforest_importance = randomforest_importance.sort_values('importances',ascending=False)
randomforest_importance.plot(kind='bar')
plt.title('RandomForest Feature Importance')

In [None]:
# Feature importances of the fitted XGBoost model, sorted descending and
# drawn as a bar chart.
xgb_feature_importance = xgb_model.feature_importances_
# Build the frame from the XGBoost importances themselves, not from the
# random forest's `feature_importance` array.
xgb_importance = pd.DataFrame({'feature_name':data_cols,'importances':xgb_feature_importance})
xgb_importance.index = xgb_importance['feature_name'].values
# Keep the result of `drop`; the feature names remain available as the index.
xgb_importance = xgb_importance.drop('feature_name',axis=1)
xgb_importance = xgb_importance.sort_values('importances',ascending=False)
xgb_importance.plot(kind='bar')
plt.title('XGB Model Feature Importance')

In [None]:
# Feature importances of the fitted LightGBM model, sorted descending and
# drawn as a bar chart.
lgb_feature_importance = lgb_model.feature_importances_
# Build the frame from the LightGBM importances themselves, not from the
# random forest's `feature_importance` array.
lgb_importance = pd.DataFrame({'feature_name':data_cols,'importances':lgb_feature_importance})
lgb_importance.index = lgb_importance['feature_name'].values
# Keep the result of `drop`; the feature names remain available as the index.
lgb_importance = lgb_importance.drop('feature_name',axis=1)
lgb_importance = lgb_importance.sort_values('importances',ascending=False)
lgb_importance.plot(kind='bar')
plt.title('LGBM Model Feature Importance')

### Parameter search

In [None]:
# Hyper-parameter grids, one per model family.
#
# Logistic regression: the default lbfgs solver cannot fit an l1 penalty, and
# 'elasticnet' additionally needs an l1_ratio, so those candidates could never
# be fitted as configured. The grid now searches l1/l2 with the liblinear
# solver, which supports both. The solver is part of the grid so it ends up in
# best_params_ and the tuned model can be rebuilt straight from that dict.
logistic_params = {'solver':['liblinear'],
                   'penalty':['l1','l2'],
                   'C':[0.01,0.1,1,10,20,30,50]}
randomforest_params = {'max_depth':[3,5,10],
                       'n_estimators':[100,200,250,300],
                       'min_samples_leaf':[2,3,4,5]}
kne_params = {'n_neighbors':[3,4,5,6,7,8],
              'weights':['uniform','distance']}
xgb_params = {'max_depth':[None,3,5,7,10],
              'gamma':[0.1,0,1,2,3]}
# LightGBM requires num_leaves > 1, so the grid starts at 2.
lgb_params = {'num_leaves':[2,3,4,5],
              'max_depth':[-1,1,2,3,4,],
              'n_estimators':[10,30,50,100,120]}

params_list = [logistic_params,randomforest_params,kne_params,xgb_params,lgb_params]
logistic = LogisticRegression()
randomforest = RandomForestClassifier(random_state=1)  # seeded so the search is repeatable
kne =KNeighborsClassifier()
xgb_model = xgb.XGBClassifier(eval_metric='logloss')
lgb_model = lgb.LGBMClassifier()
models = [logistic,randomforest,kne,xgb_model,lgb_model]
model_name = ['logistic','randomforest','kneighbors','xgb','lgb']

# 3-fold grid search on the scaled training split; the best parameters are
# stored per model name.
best_param_dict = {}
for name,estimator,param_grid in zip(model_name,models,params_list):
    grid = GridSearchCV(estimator,param_grid,cv=3,scoring='accuracy')
    grid.fit(train_data_scaled,train_target)
    best_param_dict[name] = grid.best_params_
    print(f'{name} best CV accuracy: {grid.best_score_:.3f}')

In [None]:
# Best hyper-parameters found by the grid search, keyed by model name.
best_param_dict

In [None]:
# Logistic regression rebuilt with its grid-search parameters, then fitted and scored on the validation split.
best_logistic = LogisticRegression(**best_param_dict['logistic'])
train_and_val_score(train_data_scaled,train_target,val_data_scaled,val_target,best_logistic,'logistic')

In [None]:
# Random forest rebuilt with its grid-search parameters, then fitted and scored on the validation split.
best_randomforest = RandomForestClassifier(**best_param_dict['randomforest'])
train_and_val_score(train_data_scaled,train_target,val_data_scaled,val_target,best_randomforest,'randomforest')

In [None]:
# k-nearest neighbours rebuilt with its grid-search parameters, then fitted and scored on the validation split.
best_kne =KNeighborsClassifier(**best_param_dict['kneighbors'])
train_and_val_score(train_data_scaled,train_target,val_data_scaled,val_target,best_kne,'kneighbors')

In [None]:
# XGBoost rebuilt with its grid-search parameters (eval_metric set as in the search), then fitted and scored on the validation split.
best_xgb = xgb.XGBClassifier(**best_param_dict['xgb'],eval_metric='logloss')
train_and_val_score(train_data_scaled,train_target,val_data_scaled,val_target,best_xgb,'xgb')

In [None]:
# LightGBM rebuilt with its grid-search parameters, then fitted and scored on the validation split.
best_lgb = lgb.LGBMClassifier(**best_param_dict['lgb'])
train_and_val_score(train_data_scaled,train_target,val_data_scaled,val_target,best_lgb,'lgb')

In [None]:
# Accuracy of every tuned model on the held-out test split.
best_model_list = [best_logistic,best_randomforest,best_kne,best_xgb,best_lgb]
for position,fitted_model in enumerate(best_model_list):
    acc = accuracy_score(test_target,fitted_model.predict(test_data_scaled))
    print(f'{model_name[position]} TEST Accuracy:{acc:.3f}')

#### Next step: feature engineering