In [None]:
import pandas as pd
import numpy as np

###Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Let pandas print up to 4000 rows before truncating the output.
pd.set_option("display.max_rows", 4000)


## Feature engineering
from sklearn.preprocessing import MinMaxScaler


### data split
from sklearn.model_selection import train_test_split

##Correlation
from scipy.stats import pearsonr


### Feature selection
from sklearn.ensemble import ExtraTreesClassifier

###
import copy

### models
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import xgboost as xgb
from lightgbm import LGBMClassifier


###model evaluation
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
import os

# Walk the Kaggle input tree and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# Read the heart-disease CSV from the Kaggle input directory.
heart_csv = '/kaggle/input/heart-disease-uci/heart.csv'
dataframe = pd.read_csv(heart_csv)

In [None]:
# Work on an independent deep copy so `dataframe` keeps the data as loaded.
df = dataframe.copy(deep=True)

In [None]:
# Dimensions of the working copy as (rows, columns).
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Preview the first rows of the working copy.
df.head()

In [None]:
# Descriptive statistics for the columns of df.
df.describe()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Preview the first rows again (same output as the earlier head() cell,
# since df has not been modified in between).
df.head()

In [None]:
# Histogram of the age column in ten-year bins from 20 to 100.
bins = list(range(20, 101, 10))
fig, ax = plt.subplots()
ax.hist(df['age'], bins=bins, label='Age')
# Tick every bin's left edge (20 .. 90).
ax.set_xticks(bins[:-1])
ax.legend()
plt.show()

In [None]:
# age
# sex
# chest pain type (4 values)
# resting blood pressure
# serum cholesterol in mg/dl
# fasting blood sugar > 120 mg/dl
# resting electrocardiographic results (values 0,1,2)
# maximum heart rate achieved
# exercise induced angina
# oldpeak = ST depression induced by exercise relative to rest
# the slope of the peak exercise ST segment
# number of major vessels (0-3) colored by fluoroscopy
# thal: 3 = normal; 6 = fixed defect; 7 = reversible defect

In [None]:

# Count of each target value per age, reshaped so every target value is a column.
age = (
    df.groupby(['age'])['target']
    .value_counts()
    .unstack()
)

# One group of bars per age, one bar per target value.
age.plot.bar(width=0.8, figsize=(15, 5), label='Target')
plt.ylabel('count')
plt.grid(True)
plt.legend()
plt.tight_layout();

In [None]:
# Target counts within each sex group.
# NOTE(review): the original note here stopped mid-sentence ("From this we can
# say that more men are ..."); finish the observation after checking the output.

df.groupby(['sex'])['target'].value_counts()

In [None]:

# Same breakdown as relative frequencies: share of each target value within each sex group.
df.groupby(['sex'])['target'].value_counts(normalize=True)

In [None]:
# Bar chart of row counts per sex, split by target.
sns.countplot(x='sex',hue='target',data=df)

In [None]:
# List the column names of the working copy.
df.columns

In [None]:
# Table of target counts per cp value (rows: cp, columns: target).
df.groupby(['cp'])['target'].value_counts().unstack()

In [None]:
# Bar chart of row counts per cp value, split by target.
sns.countplot(x='cp',hue='target',data=df)

In [None]:
# Row count per cp value, ordered by the cp value itself.
# value_counts() orders by frequency, so a hard-coded [0, 1, 2, 3] label list
# does not necessarily line up with the counts; sort by index and take the
# labels from that index so every pie slice carries its own cp value.
weights = df['cp'].value_counts().sort_index()
weights_labels = weights.index.tolist()

In [None]:
# Pie chart of the cp distribution.
# "%.2f %%" prints each share with two decimals; the earlier "%2f" set a
# minimum field width of 2 and therefore printed six decimals.
plt.pie(weights, labels=weights_labels, autopct="%.2f %%")
plt.show()


In [None]:
# Cholesterol distribution, one box per target value.
# Range noted by the author for chol: min 126, max 564.
# Both variables are passed by keyword as column names of `data`; handing a
# Series positionally while also passing data=df depends on which parameter
# seaborn puts first, and hue alone does not produce a per-target split.
sns.boxplot(x='target', y='chol', data=df)

In [None]:
# Number of rows in each cholesterol band. Upper edges are inclusive so the
# bands match the right-inclusive pd.cut bins used later (130 / 200 / 300);
# previously a value of exactly 300 was counted in no band at all.
# Must run while df['chol'] is still numeric (before it is replaced by labels).
chol_values = df['chol']
low = df.loc[chol_values <= 130, 'age'].count()
medium = df.loc[(chol_values > 130) & (chol_values <= 200), 'age'].count()
high = df.loc[(chol_values > 200) & (chol_values <= 300), 'age'].count()
extremely_high = df.loc[chol_values > 300, 'age'].count()

In [None]:
# Re-check column dtypes and non-null counts before the columns are binned.
df.info()

In [None]:
# def change_value(x):
    
#     if x <=130:
#         return 'low'
#     elif (x >131 and x <=200):
#         return 'medium'
#     elif (x > 201  and x <=300):
#         return 'high'
#     elif (x>301):
#         return 'ehigh'
#     return df

In [None]:
# Bin cholesterol into four labelled bands with right-inclusive edges:
# (100, 130] low, (130, 200] medium, (200, 300] high, (300, 600] ehigh.
# The bands are computed from the untouched `dataframe` rather than from
# df['chol'], which this cell overwrites; that keeps the cell re-runnable.
chol_bands = pd.cut(
    dataframe['chol'],
    bins=[100, 130, 200, 300, 600],
    labels=['low', 'medium', 'high', 'ehigh'],
)
df['chol'] = chol_bands
# Integer code of each band (0..3); values outside the bins get -1.
df['chols'] = chol_bands.cat.codes


In [None]:
# Box plot of trestbps. The variable is passed as x= explicitly so the plot
# does not depend on which parameter seaborn accepts first positionally.
sns.boxplot(x=df['trestbps'])

In [None]:
# Histogram of trestbps with matplotlib's default binning.
fig, ax = plt.subplots()
ax.hist(df['trestbps'])
plt.show()

In [None]:
# Replace trestbps with the integer code of its band:
# (50, 100] -> 0, (100, 150] -> 1, (150, 200] -> 2, (200, 250] -> 3
# (labels low / medium / high / vhigh); values outside the bins get -1.
# The bands are cut from the untouched `dataframe`: cutting df['trestbps']
# itself would, on a re-run, bin the codes 0..3 and turn every row into -1.
df['trestbps'] = pd.cut(
    dataframe['trestbps'],
    bins=[50, 100, 150, 200, 250],
    labels=['low', 'medium', 'high', 'vhigh'],
).cat.codes

In [None]:
# Column names after the binning steps (now includes the added `chols` column).
df.columns

In [None]:
# Target counts for every combination of trestbps band code, fbs and chols band code.
df.groupby(['trestbps','fbs','chols'])['target'].value_counts()

In [None]:
# Summary statistics of thalach before it is binned.
df['thalach'].describe()

In [None]:
# Replace thalach with the integer code of its band:
# (50, 100] -> 0, (100, 150] -> 1, (150, 200] -> 2, (200, 250] -> 3
# (labels low / medium / high / vhigh); values outside the bins get -1.
# The bands are cut from the untouched `dataframe`: cutting df['thalach']
# itself would, on a re-run, bin the codes 0..3 and turn every row into -1.
df['thalach'] = pd.cut(
    dataframe['thalach'],
    bins=[50, 100, 150, 200, 250],
    labels=['low', 'medium', 'high', 'vhigh'],
).cat.codes

In [None]:
# Preview the frame after chol, trestbps and thalach have been binned.
df.head()

In [None]:
# Min-max scaler with default settings; used below to rescale oldpeak.
scale=MinMaxScaler()

In [None]:
# Pull the oldpeak column out as a NumPy array.
old_np=df['oldpeak'].to_numpy()

In [None]:
# Reshape to a single-column 2-D array, the layout passed to fit_transform below.
old_np=old_np.reshape(-1,1)

In [None]:
# Fit the scaler on oldpeak and transform it in one step.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed in a later cell.
old_np_scaled=scale.fit_transform(old_np)

In [None]:
# Write the scaled values back positionally. Wrapping them in a new DataFrame
# would assign by index label and only works while df keeps a default 0..n-1
# index; a flat array has no such dependency.
df['oldpeak'] = old_np_scaled.ravel()

In [None]:
# Drop the labelled chol column; its integer codes live on in `chols`.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['chol'], errors='ignore')

In [None]:
# Preview the frame after scaling oldpeak and dropping chol.
df.head()

In [None]:
# Pairwise correlation matrix of the columns of df (target still included here).
cor=df.corr()

In [None]:
# Annotated heatmap of the correlation matrix on a 15x10 inch canvas.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(cor, annot=True, cmap='Blues', ax=ax)

In [None]:
# Extra-trees classifier with 500 trees.
# NOTE(review): it is never fitted or used in the cells visible here.
features_sel=ExtraTreesClassifier(n_estimators=500)

In [None]:
# Label vector: the target column, taken before it is dropped from df.
y=df['target']

In [None]:
# Remove the label column so df holds features only.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['target'], errors='ignore')

In [None]:
# Hold out 30% of the rows for testing.
# random_state fixes the shuffle so reported scores are reproducible between
# runs; stratify=y keeps the target class proportions equal in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    df, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Candidate classifiers, keyed by the name printed in the evaluation report.
# random_state is set on each model so repeated runs give the same scores.
# The former scoring='roc_auc' argument was removed: it is not an estimator
# parameter of the XGBoost scikit-learn wrapper.
# NOTE(review): XGBRFClassifier is XGBoost's random-forest variant; confirm
# whether plain XGBClassifier was intended for the 'XGboost' entry.
base_models = {
    'Random Forest': RandomForestClassifier(
        n_estimators=1500,
        max_depth=8,
        max_features=10,
        random_state=42,
    ),
    # 'Logistic': LogisticRegression(),
    'XGboost': xgb.XGBRFClassifier(
        learning_rate=0.3,
        n_estimators=1000,
        max_depth=15,
        nthread=4,
        scale_pos_weight=1,
        reg_alpha=1e-5,  # alternative value tried: 0.005
        subsample=0.85,
        colsample_bytree=0.8,
        gamma=0.4,
        min_child_weight=1,
        objective='binary:logistic',
        random_state=42,
    ),
    # 'LightGBM': LGBMClassifier(),
}


In [None]:
# Fit every candidate model and report train/test accuracy, the confusion
# matrix and the per-class report on the held-out set.
for model_name, estimator in base_models.items():
    model = estimator.fit(x_train, y_train)
    y_train_predict = model.predict(x_train)
    y_predicted = model.predict(x_test)
    # Comparing train and test accuracy side by side exposes overfitting.
    print(f'Training accuracy,{model_name}:{accuracy_score(y_train, y_train_predict)}')
    print(f'Testing accuracy,{model_name}:{accuracy_score(y_test, y_predicted)}')
    print(confusion_matrix(y_test, y_predicted))
    print(classification_report(y_test, y_predicted))
    