In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Apply matplotlib's 'ggplot' style sheet to all figures created from here on.
plt.style.use('ggplot')

In [None]:
# Load the heart-disease CSV into a DataFrame.
# NOTE(review): the relative path presumably matches the Kaggle input layout — adjust when running elsewhere.
data = pd.read_csv('../input/heart-disease-uci/heart.csv')

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
data.duplicated().sum()

In [None]:
# Keep only the first occurrence of each duplicated row (original index labels are preserved).
data = data.drop_duplicates()

In [None]:
# Re-count duplicated rows; expected to be 0 now that duplicates have been dropped.
data.duplicated().sum()

In [None]:
# Separate the features from the label.
# The label is removed by name rather than by position, so the split stays
# correct even if 'target' is not the last column of the CSV.
X = data.drop(columns='target')
y = data['target']

In [None]:
# Bar chart of the number of rows in each target class.
fig, ax = plt.subplots(figsize=(9, 5), dpi=80)

# Pass the series by keyword: positional data vectors were deprecated in
# seaborn 0.11, and from 0.12 the first positional parameter is `data`,
# so `sns.countplot(y)` no longer plots the class counts.
sns.countplot(x=y, ax=ax)
ax.set_title('Count Target')

*Classes are balanced*

# **Distributions of quantitative variables**

In [None]:
import scipy as sc

In [None]:
# Treat a column as quantitative when it has more than 5 distinct values
# (NaN counts as a value, matching len(unique())).
col_qua = [col for col in data.columns if data[col].nunique(dropna=False) > 5]

In [None]:
# One figure per quantitative column: its distribution with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — consider
# migrating to histplot/displot once the target seaborn version is confirmed.
for col in col_qua:
    fig, ax = plt.subplots(figsize=(9, 5), dpi=70)
    sns.distplot(X[col], fit=sc.stats.norm, ax=ax)
    ax.set_title(col)

***The variables appear to be roughly normally distributed; we will verify this with the Shapiro–Wilk test.***

In [None]:
# One normal probability (Q-Q) plot per quantitative column, titled with the
# column's Shapiro-Wilk statistic and p-value.
for col in col_qua:
    w_stat, p_val = sc.stats.shapiro(X[col])

    fig, ax = plt.subplots(figsize=(9, 5), dpi=70)
    sc.stats.probplot(X[col], dist='norm', plot=ax)
    ax.set_title(f'{col}   Stats: {w_stat:.3f}   P_value: {p_val:.5f}')

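***The null hypothesis of the Shapiro–Wilk test is that the variable is normally distributed. If the p_value is less than 0.05 we reject normality (the variable does not appear to follow a normal distribution); if the p_value is greater than 0.05 we cannot reject normality.***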

In [None]:
!pip install pingouin

***We will install Pingouin to verify the significance of the correlations.***

In [None]:
import pingouin as pg

In [None]:
# Spearman rank correlation between every pair of feature columns,
# drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(12, 8), dpi=80)

matrix_corr = X.corr(method='spearman')
sns.heatmap(matrix_corr, ax=ax, cmap='bwr', annot=True, cbar=True)
ax.set_title('Matrix - Corr')

In [None]:
# Pairwise Spearman correlations between features, keeping the variable pair,
# the coefficient and the uncorrected p-value.
sig_corr = pg.pairwise_corr(X, method='spearman')[['X', 'Y', 'r', 'p-unc']]

# Show only the pairs significant at the 5% level, most significant first.
is_significant = sig_corr['p-unc'] < 0.05
sig_corr.loc[is_significant].sort_values(by='p-unc')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out part of the data for evaluation (scikit-learn's default test size).
# random_state makes the split, and every metric computed below, reproducible
# under Restart & Run All; stratify keeps the class proportions the same in
# both splits, which matters on a dataset this small.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [None]:
# Number of samples of each class in the held-out split.
y_test.value_counts()

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted classifier with 300 trees and both L1 (reg_alpha) and
# L2 (reg_lambda) regularisation, fitted on the training split.
xgb_params = dict(n_estimators=300, reg_lambda=0.7, reg_alpha=0.7)
xgb = XGBClassifier(**xgb_params).fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out features.
pred = xgb.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Text report comparing the held-out labels with the model's predictions.
print(classification_report(y_test, pred))

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 4-fold cross-validated F1 score.
# cross_val_score refits a clone of the estimator on each fold, so it is run
# on the training split: cross-validating on the small hold-out set would
# train on it and give high-variance estimates.
cross_val_score(xgb, X_train, y_train, cv=4, scoring='f1')

In [None]:
# 4-fold cross-validated accuracy.
# cross_val_score refits a clone of the estimator on each fold, so it is run
# on the training split: cross-validating on the small hold-out set would
# train on it and give high-variance estimates.
cross_val_score(xgb, X_train, y_train, cv=4, scoring='accuracy')

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of held-out labels vs. predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, pred)

fig, ax = plt.subplots(figsize=(12, 8), dpi=80)
sns.heatmap(cm, ax=ax, annot=True, cbar=True, cmap='bwr')
ax.set_title('confusion Matrix')