In [220]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

In [221]:
heart_dis = pd.read_csv('heart_disease_uci.csv')
heart_dis.head()


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [222]:
heart_dis.size


14720

In [223]:
heart_dis.shape


(920, 16)

In [224]:
heart_dis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [225]:
null_cols=heart_dis.isnull().sum()
null_cols[null_cols>0]


trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
dtype: int64

In [226]:
num_map = {0:'non-existent', 1:'mild', 2:'moderate', 3: 'severe', 4:'critical'}
heart_dis['num_levels'] = heart_dis['num'].map(num_map)
heart_dis['num_levels'].head(10)


0    non-existent
1        moderate
2            mild
3    non-existent
4    non-existent
5    non-existent
6          severe
7    non-existent
8        moderate
9            mild
Name: num_levels, dtype: object

In [227]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [228]:
heart_dis.columns


Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num',
       'num_levels'],
      dtype='object')

In [229]:
X = heart_dis[['age', 'sex','cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak'
       ]]
y = heart_dis['num']

In [230]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 60)

In [231]:
X_train.info()


<class 'pandas.core.frame.DataFrame'>
Index: 736 entries, 81 to 205
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       736 non-null    int64  
 1   sex       736 non-null    object 
 2   cp        736 non-null    object 
 3   trestbps  691 non-null    float64
 4   chol      715 non-null    float64
 5   fbs       658 non-null    object 
 6   restecg   735 non-null    object 
 7   thalch    695 non-null    float64
 8   exang     695 non-null    object 
 9   oldpeak   689 non-null    float64
dtypes: float64(4), int64(1), object(5)
memory usage: 63.2+ KB


In [232]:
X_train.select_dtypes(include='O').columns.tolist()

['sex', 'cp', 'fbs', 'restecg', 'exang']

In [233]:
cat_features=['sex','fbs', 'exang']

In [234]:
X_train.select_dtypes(include='number').columns.tolist()

['age', 'trestbps', 'chol', 'thalch', 'oldpeak']

In [235]:
num_features = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']

In [236]:
X_train['sex'].unique() #hierarchical patttern at the output, hence use OneHotEncoder()

array(['Female', 'Male'], dtype=object)

In [237]:
X_train['restecg'].unique()

array(['lv hypertrophy', 'normal', 'st-t abnormality', nan], dtype=object)

In [238]:
one_hot= OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
scaler = StandardScaler()

In [239]:
ordinal_category = [  
    ['normal', 'lv hypertrophy', 'st-t abnormality'],
    ['asymptomatic',  'non-anginal','atypical angina', 'typical angina']
]

In [240]:
ordinal_features = ['restecg', 'cp']
ordinal_enc = OrdinalEncoder(categories = ordinal_category ) 


In [241]:
preprocessor = ColumnTransformer(transformers = [
('cat', Pipeline(steps=[
    ('impute', SimpleImputer(strategy= 'most_frequent')),
    ('one_hot',one_hot )
]),cat_features),
('num', Pipeline(steps=[
    ('impute', SimpleImputer(strategy= 'mean')),
    ('scaler',scaler )
]), num_features),    
    ('ord', Pipeline(steps=[
    ('impute', SimpleImputer(strategy= 'most_frequent')),
    ('ordinal',ordinal_enc )
]), ordinal_features)
])
preprocessor


In [242]:
preprocessor.fit_transform(X_train,X_test)


array([[ 1.        ,  0.        ,  1.        , ..., -0.45534841,
         1.        ,  0.        ],
       [ 0.        ,  1.        ,  1.        , ...,  2.9496748 ,
         0.        ,  0.        ],
       [ 0.        ,  1.        ,  1.        , ...,  1.81466706,
         1.        ,  0.        ],
       ...,
       [ 0.        ,  1.        ,  1.        , ...,  2.00383502,
         0.        ,  0.        ],
       [ 0.        ,  1.        ,  1.        , ...,  1.81466706,
         1.        ,  0.        ],
       [ 0.        ,  1.        ,  1.        , ..., -0.83368432,
         1.        ,  0.        ]])

In [243]:
pipeline_rf = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier())
    
])
pipeline_rf

In [244]:
RandomForestClassifier().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [245]:
#pipeline_rf.fit(X_train,y_train)

In [246]:
param_grid_rf = {
    'model__min_samples_split':[2,3,4],
    'model__n_estimators':[100,300,500],
    'model__criterion':['gini','entropy']
}

In [247]:
 #y_pred =pipeline_rf.predict(X_test)

In [248]:
grid_search_rf = GridSearchCV(estimator = pipeline_rf,param_grid = param_grid_rf, scoring = 'accuracy', cv = 5 )
grid_search_rf.fit(X_train,y_train)

In [249]:
y_pred_rf = grid_search_rf.best_estimator_.predict(X_test)

In [250]:
print(accuracy_score(y_pred_rf,y_test)) 

0.6032608695652174


In [251]:
print(confusion_matrix(y_pred_rf,y_test))

[[80 22  1  1  0]
 [11 27  8 13  2]
 [ 1  3  4  5  0]
 [ 0  2  3  0  1]
 [ 0  0  0  0  0]]


In [252]:
#cm = confusion_matrix(y_pred,y_test)
#sn.heatmap(cm, annot = True)

In [253]:
pipeline_dt = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier(min_samples_split = 10, min_samples_leaf = 10))
    
])
pipeline_dt

In [254]:
DecisionTreeClassifier().get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

In [255]:
pipeline_dt.fit(X_train,y_train)

In [256]:
y_pred_dt = pipeline_dt.predict(X_test)

In [257]:
print(accuracy_score(y_pred_dt,y_test))

0.5489130434782609


In [258]:
print(confusion_matrix(y_pred_dt,y_test))

[[75 20  3  3  0]
 [14 22  7 11  2]
 [ 1  2  1  2  1]
 [ 2  9  5  3  0]
 [ 0  1  0  0  0]]


In [259]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [260]:
log_clf = LogisticRegression(warm_start = False)
rf_clf =RandomForestClassifier(min_samples_split = 5)
dt_clf = DecisionTreeClassifier(min_samples_leaf = 6)
svm_clf = SVC(probability = True)

In [261]:
voting_clf = VotingClassifier(estimators= [
    ('log',log_clf),
    ('rf',rf_clf),
    ('dt',dt_clf),
    ('svm',svm_clf )
],voting= 'soft')

In [262]:
voting_pipeline = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', voting_clf)
    
])
voting_pipeline

In [263]:
voting_pipeline.fit(X_train,y_train)

In [264]:
y_pred_voting= voting_pipeline.predict(X_test)

In [265]:
print(accuracy_score(y_pred_voting,y_test))

0.625


In [266]:
LogisticRegression().get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}