In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree top-down and print the full path of every file found.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the heart-attack dataset from the Kaggle read-only input directory.
df = pd.read_csv('/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv')
# Display options: cap the number of rows shown at 10, but allow as many
# columns as df has so that no column is hidden when a frame is displayed.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', len(df.columns))

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# For every column print: its name, how many distinct values it holds,
# and the array of those distinct values.
for column_name in df.columns:
    column_values = df[column_name]
    summary = '{}:- {}\n{}\n\n'.format(
        column_name, column_values.nunique(), column_values.unique()
    )
    print(summary)

In [None]:
sns.set(style='whitegrid')

In [None]:
def distribution(df, cols=4, display_num=None, figsize=(12,10)):
    """Draw a grid of histograms, one per column, for the leading columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, in column order.
    cols : int, default 4
        Number of subplot columns in the grid.
    display_num : int or None, default None
        How many leading columns of ``df`` to plot. ``None`` means every
        column except the last one, computed from the frame that is passed in
        (not from whatever global frame existed when the function was defined).
        Values larger than the number of columns are clamped.
    figsize : tuple, default (12, 10)
        Figure size forwarded to ``plt.subplots``.

    Returns
    -------
    None
        The figure is created as a side effect; nothing is drawn when there
        are no columns to plot.
    """
    if display_num is None:
        display_num = len(df.columns) - 1
    display_num = min(display_num, len(df.columns))
    if display_num <= 0:
        return

    rows = (display_num - 1) // cols + 1

    # squeeze=False keeps `axes` two-dimensional even for a single row or column,
    # so the grid can always be flattened and walked uniformly.
    fig, axes = plt.subplots(rows, cols, figsize=figsize, squeeze=False)
    flat_axes = axes.ravel()

    for ax, column in zip(flat_axes, df.columns[:display_num]):
        sns.histplot(x=column, data=df, alpha=0.8, ax=ax)
        ax.set_title(column)

    # Hide the grid cells that received no histogram instead of showing empty frames.
    for unused_ax in flat_axes[display_num:]:
        unused_ax.set_visible(False)

    fig.tight_layout()
    

In [None]:
distribution(df)

In [None]:
class Outliers(object):
    """Summary statistics of one column, shared by the outlier detectors below.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column; stored by reference (not copied).
    col : str
        Name of the column to analyse.

    All statistics are computed once, at construction time, from ``df[col]``.
    They are not refreshed if ``df`` is changed afterwards.
    """

    def __init__(self, df, col):
        self.df = df
        self.col = col

        values = df[col]
        self.mean = values.mean()
        self.median = values.median()
        self.min = values.min()
        self.max = values.max()
        self.std = values.std()
        self.quantile_25 = values.quantile(0.25)
        self.quantile_75 = values.quantile(0.75)

    def info(self):
        """Print the stored statistics and return a 20-character ``*`` separator."""
        print('Mean: {}'.format(self.mean))
        print('Median: {}'.format(self.median))
        print('Standard Deviation: {}'.format(self.std))
        print('Minimum value: {}'.format(self.min))
        print('Maximum value: {}'.format(self.max))
        print('25th quantile: {}'.format(self.quantile_25))
        print('75th quantile: {}'.format(self.quantile_75))

        des = '*' * 20
        return des


class IQR(Outliers):
    """Inter-quartile-range detector: a value is an outlier when it lies
    below ``Q1 - whisker * IQR`` or above ``Q3 + whisker * IQR``."""

    # Multiplier applied to the inter-quartile range to obtain the fences.
    whisker = 1.5

    def iqr_calc(self):
        """Return ``(lower_bound, upper_bound)`` of the IQR fences."""
        iqr = self.quantile_75 - self.quantile_25

        lower_bound = self.quantile_25 - (self.whisker * iqr)
        upper_bound = self.quantile_75 + (self.whisker * iqr)

        return lower_bound, upper_bound

    def _outlier_mask(self):
        """Boolean Series, True where the column value lies strictly outside the fences."""
        lower_bound, upper_bound = self.iqr_calc()
        values = self.df[self.col]
        return (values < lower_bound) | (values > upper_bound)

    def iqr_outliers(self):
        """Return the column values (as a Series) that lie outside the fences."""
        return self.df.loc[self._outlier_mask(), self.col]

    def iqr_remove(self):
        """Return the rows of ``df`` that are not outliers.

        This is the exact complement of ``iqr_outliers``: values sitting on a
        fence, and missing values, are not flagged and are therefore kept.
        """
        return self.df.loc[~self._outlier_mask()]


class Z_score(Outliers):
    """Z-score detector: a value is an outlier when ``|value - mean| / std``
    exceeds ``threshold``."""

    # Absolute z-score above which a value is treated as an outlier.
    threshold = 3

    def _outlier_mask(self):
        """Boolean Series, True where the absolute z-score exceeds the threshold."""
        z = (self.df[self.col] - self.mean) / self.std
        # A NaN z-score (missing value, or zero/undefined std) compares False,
        # so such rows are never flagged.
        return z.abs() > self.threshold

    def z_score_outliers(self):
        """Return the outlying column values as a plain list, in row order."""
        return self.df.loc[self._outlier_mask(), self.col].tolist()

    def z_score_remove(self):
        """Return a new frame holding the rows of ``df`` that are not outliers."""
        # One vectorised filter instead of re-filtering the frame per outlier value.
        return self.df.loc[~self._outlier_mask()]


class StandardDeviation(Outliers):
    """Standard-deviation detector: a value is an outlier when it lies
    outside ``mean ± n_std * std``."""

    # Number of standard deviations on either side of the mean.
    n_std = 3

    def std_calc(self):
        """Return ``(lower_std, upper_std)``, the bounds of the accepted band."""
        lower_std = self.mean - (self.n_std * self.std)
        upper_std = self.mean + (self.n_std * self.std)

        return lower_std, upper_std

    def _outlier_mask(self):
        """Boolean Series, True where the column value lies strictly outside the band."""
        lower_std, upper_std = self.std_calc()
        values = self.df[self.col]
        return (values < lower_std) | (values > upper_std)

    def std_outliers(self):
        """Return the column values (as a Series) that lie outside the band."""
        return self.df.loc[self._outlier_mask(), self.col]

    def std_remove(self):
        """Return the rows of ``df`` that are not outliers.

        This is the exact complement of ``std_outliers``: values sitting on a
        bound, and missing values, are not flagged and are therefore kept.
        """
        return self.df.loc[~self._outlier_mask()]


In [None]:
# Numeric columns to screen for outliers.
columns = ['chol', 'trtbps', 'thalachh', 'oldpeak']

# For each column: print its summary statistics, then, for each detector,
# the outliers it finds and the frame shape that would remain after removal.
# Nothing is removed from df here; this cell only reports.
for i in columns:
    out = Outliers(df, i)
    iqr = IQR(df, i)
    z_score = Z_score(df, i)
    std = StandardDeviation(df, i)

    print('Outliers:- {}\n'.format(i))

    # Header only; the statistics themselves are printed by out.info().
    print('Info:-\n')
    out.info()
    print('*' * 40)
    print('\n')

    print('IQR Outliers:- \n{}\n'.format(iqr.iqr_outliers()))
    print('Shape of df if removed outliers with IQR:- {}'.format(iqr.iqr_remove().shape))
    print('*' * 20)

    print('Z-score Outliers:- \n{}\n'.format(z_score.z_score_outliers()))
    print('Shape of df if removed outliers with Z-score:- {}'.format(z_score.z_score_remove().shape))
    print('*' * 20)

    print('StandardDeviation Outliers:- \n{}\n'.format(std.std_outliers()))
    print('Shape of df if removed outliers with StandardDeviation:- {}'.format(std.std_remove().shape))

    print('\n', '*'*100, '\n', '*'*100, '\n')

In [None]:
# Treating outliers of chol: drop the rows whose chol value the Z-score
# detector flags (absolute z-score above 3).
# NOTE: this rebinds df, so the cell is not idempotent - re-running it filters
# again, with mean/std recomputed from the already-filtered frame.
z_score_chol = Z_score(df, 'chol')
df = z_score_chol.z_score_remove()
df.shape

In [None]:
# Treating outliers of trtbps: keep only the rows that StandardDeviation
# places inside the mean +/- 3 * std band of trtbps.
# NOTE: the statistics come from df as left by the previous treatment cell,
# and df is rebound again here (re-running filters a second time).
std_trtbps = StandardDeviation(df, 'trtbps')
df = std_trtbps.std_remove()
df.shape

In [None]:
# Treating outliers of thalachh: keep only the rows that IQR places inside
# the Q1 - 1.5 * IQR .. Q3 + 1.5 * IQR fences of thalachh.
# NOTE: the quartiles come from df as left by the previous treatment cells,
# and df is rebound again here (re-running filters a second time).
iqr_thalachh = IQR(df, 'thalachh')
df = iqr_thalachh.iqr_remove()
df.shape

In [None]:
# Treating outliers of oldpeak: drop the rows whose oldpeak value the Z-score
# detector flags (absolute z-score above 3).
# NOTE: mean/std come from df as left by the previous treatment cells,
# and df is rebound again here (re-running filters a second time).
z_score_oldpeak = Z_score(df, 'oldpeak')
df = z_score_oldpeak.z_score_remove()
df.shape

In [None]:
# Distributions of columns after treating outliers
distribution(df)

### EDA

In [None]:
# Row counts per (sex, output) pair, and the mean of output per sex
# (output is 0/1 in the bars below, so its mean is the share of rows with output == 1).
sex_out = df.groupby(['sex', 'output'])[['output']].count()
sex_atk = df.groupby('sex')[['output']].mean()

# Unstack once: rows are sex codes, columns are output values.
sex_counts = sex_out.unstack().output
# Same code -> label mapping for both panels.
sex_labels = {0: 'Female', 1: 'Male'}

fig, axes = plt.subplots(2,1, figsize=(8,10))

indx = np.arange(len(sex_counts.index))
width = 0.25

# Panel 1: grouped bars, one pair per sex, split by output value.
axes[0].bar(indx - width/2, sex_counts[0].values,
       width = width, alpha = 0.7, label = 'less chance of heart attack')
axes[0].bar(indx + width/2, sex_counts[1].values,
       width=width, alpha=0.7, label= 'more chance of heart attack')

axes[0].set_title('Frequency of people with chance of heart attack \n(gender wise)\n', fontsize=25)
axes[0].set_xlabel('Sex', fontsize=20)
axes[0].set_ylabel('Frequency', fontsize=20)

# Ticks and labels follow the grouped index instead of being hard-coded.
axes[0].set_xticks(indx)
axes[0].set_xticklabels(sex_counts.index.map(sex_labels))
axes[0].legend()

# Panel 2: share of output == 1 per sex, scaled to percent to match the axis label.
axes[1].bar(sex_atk.index.map(sex_labels), sex_atk.output * 100, color = 'red',
        alpha = 0.5, edgecolor = 'black', width = 0.5)

axes[1].set_title('Chance of getting heart attack of each gender type\n', fontsize = 25)
# Bars are vertical: genders run along x, the percentage along y.
axes[1].set_xlabel('\nGenders', fontsize = 20)
axes[1].set_ylabel('Chance of getting heart attack (in %)\n', fontsize = 20)

fig.tight_layout()
plt.show()


In [None]:
# Mean of output per sex, mean of four numeric columns per sex, and
# mean of output per (age quintile, sex).
sex_hrt = df.groupby('sex')[['output']].mean()
# .mean() instead of aggregate(np.mean): same result, without the deprecated callable form.
sex_info = df.groupby('sex')[['chol', 'trtbps', 'thalachh', 'oldpeak']].mean()
age_dis = pd.qcut(df.age, 5)
age_hrt = df.pivot_table('output', index=age_dis, columns='sex')

fig, axes = plt.subplots(1, 3, figsize=(15,9))

# Panel 1: pie of the two per-sex means. The printed percentages are each
# sex's share of the SUM of the two means, not the means themselves.
axes[0].pie(sex_hrt.output, labels=sex_hrt.index.map({0: 'Female', 1: 'Male'}),
           wedgeprops={'edgecolor': 'black'},
            shadow=True, textprops={'fontsize': 15}, autopct='%1.2f%%')
axes[0].set_title('Chance of getting heart attack\n(if compared according to genders)', fontsize=20)

indx_1 = np.arange(len(sex_info.columns))
width_1 = 0.2

# Panel 2: per-sex averages side by side; log scale because the columns
# differ by orders of magnitude.
axes[1].bar(indx_1 - width_1/2, sex_info.loc[0,:], width=width_1, alpha=0.5,
           color='pink', label='Female', edgecolor='black')
axes[1].bar(indx_1 + width_1/2, sex_info.loc[1,:], width=width_1, alpha=0.5,
           color='blue', label='Male', edgecolor='black')

axes[1].set_title('Comparing genders on avg. \nvalues of different categories\n', fontsize=20)
axes[1].set_xlabel('\nCategories', fontsize=15)
axes[1].set_ylabel('Average values', fontsize=15)
axes[1].set_yscale('log')
# Ticks and labels follow the aggregated columns instead of being hard-coded.
axes[1].set_xticks(indx_1)
axes[1].set_xticklabels(sex_info.columns)
axes[1].legend()

indx_2 = np.arange(len(age_hrt.index))
width_2 = 0.4

# Panel 3: mean of output per age bin and sex, scaled to percent to match the axis label.
axes[2].bar(indx_2 - width_2/2, age_hrt[0] * 100, width=width_2, alpha=0.7, label='Female')
axes[2].bar(indx_2 + width_2/2, age_hrt[1] * 100, width=width_2, alpha=0.7, label='Male')

axes[2].set_title('Chances of having heart attack\n as per each age group\n', fontsize=20)
axes[2].set_xlabel('\nAge Group', fontsize=15)
axes[2].set_ylabel('Chances of heart attack (in %)', fontsize=15)

axes[2].set_xticks(indx_2)
# The index holds interval bins; convert to text explicitly for the tick labels.
axes[2].set_xticklabels(age_hrt.index.astype(str), rotation=90)
axes[2].legend()

fig.tight_layout()
plt.show()

### Creating Model

In [None]:
df.head()

In [None]:
# Print each column name next to its number of distinct (non-missing) values.
for column_name, distinct_count in df.nunique().items():
    print(column_name, ':- ', distinct_count)

In [None]:
# Feature matrix: every column of df except the target column 'output'.
X = df.drop('output', axis=1).copy()
# Target vector: the 'output' column.
y = df['output']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [None]:
# Column groups consumed by the column transformer built in the next cell.
# cat1 is ordinal-encoded, cat2 is one-hot encoded, num is standardised.
# NOTE(review): any feature column not listed in one of these groups reaches the
# models unchanged via remainder='passthrough' - confirm none is missing here.
cat1 = ['sex', 'fbs', 'exng']
cat2 = ['cp', 'restecg', 'caa', 'thall']
num = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

In [None]:
# Preprocessing shared by all three pipelines:
#   cat2 -> one-hot encoding (categories unseen at fit time are ignored, not an error)
#   cat1 -> ordinal encoding
#   num  -> standard scaling
# Columns outside these groups are passed through untouched.
trans_col = make_column_transformer((OneHotEncoder(handle_unknown='ignore'), cat2),
                                   (OrdinalEncoder(), cat1),
                                   (StandardScaler(), num),
                                   remainder='passthrough')

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [None]:
# One pipeline per model, each starting with the shared column transformer.
# The estimators that use randomness (liblinear logistic regression, random
# forest) are seeded with the same value as the train/test split so that a
# Restart & Run All reproduces the reported scores.
pipe_lr = make_pipeline(trans_col, LogisticRegression(solver = 'liblinear', random_state=0))
pipe_svm = make_pipeline(trans_col, SVC())
pipe_rf = make_pipeline(trans_col, RandomForestClassifier(random_state=0))

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the logistic-regression step of pipe_lr
# (keys are '<pipeline step name>__<parameter>').
params_lr = {
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__tol': [0.01, 0.001, 0.0001, 0.00001]
}

# 10-fold cross-validated grid search. verbose=1 prints a summary instead of
# one log line per fit; n_jobs=-1 uses all cores, as the random-forest search does.
grid_lr = GridSearchCV(estimator=pipe_lr, param_grid=params_lr, cv=10, verbose=1, n_jobs=-1)


In [None]:
# Search space for the SVC step of pipe_svm
# (keys are '<pipeline step name>__<parameter>').
params_svm = {
    'svc__kernel': ['linear', 'rbf'],
    'svc__C': [1, 10, 100, 1000, 10000]
}

# 10-fold cross-validated grid search. verbose=1 prints a summary instead of
# one log line per fit; n_jobs=-1 uses all cores, as the random-forest search does.
grid_svm = GridSearchCV(estimator=pipe_svm, param_grid=params_svm, cv=10, verbose=1, n_jobs=-1)

In [None]:
# Search space for the random-forest step of pipe_rf
# (keys are '<pipeline step name>__<parameter>'): 2 * 5 * 5 * 4 = 200 combinations.
params_rf = {
    'randomforestclassifier__criterion': ['entropy', 'gini'],
    'randomforestclassifier__max_depth': [2, 4, 6, 8, 10],
    'randomforestclassifier__min_samples_split': [2, 4, 6, 8, 10],
    'randomforestclassifier__min_samples_leaf': [2, 3, 4, 5],
}

# 10-fold cross-validated grid search on all cores. With 200 combinations this
# is 2000 fits, so verbose=1 (summary only) keeps the cell output readable.
grid_rf = GridSearchCV(estimator = pipe_rf, param_grid = params_rf, cv=10, verbose=1, n_jobs=-1)

In [None]:
grid_lr.fit(X_train, y_train)

In [None]:
grid_svm.fit(X_train, y_train)

In [None]:
grid_rf.fit(X_train, y_train)

In [None]:
# Logistic regression: best parameters found, then the search's score on the
# training split followed by the test split.
print(grid_lr.best_params_)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(grid_lr.score(features, labels))


In [None]:
# SVM: best parameters found, then the search's score on the training split
# followed by the test split.
print(grid_svm.best_params_)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(grid_svm.score(features, labels))

In [None]:
# Random forest: best parameters found, then the search's score on the
# training split followed by the test split.
print(grid_rf.best_params_)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(grid_rf.score(features, labels))

In [None]:
from sklearn.metrics import f1_score
# F1 score on the test split for each fitted search, in the order
# logistic regression, SVM, random forest.
for fitted_search in (grid_lr, grid_svm, grid_rf):
    test_predictions = fitted_search.predict(X_test)
    print(f1_score(y_test, test_predictions))