Before moving on to this advanced analysis, please refer to my descriptive analysis notebook to get an idea of the dataset.

**Link:** https://www.kaggle.com/migdev/descriptive-analysis-of-cardiovascular-dataset

BMI is a good indicator of a person's health. Therefore, BMI was calculated from weight and height, and the weight and height columns were then removed from the dataset. We now build a predictive model for the presence of CVD. First we apply machine learning techniques, and then we move on to classical statistical methods.

In [None]:
#import necessary libraries 
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split,cross_val_score,KFold,cross_val_predict,GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the prepared dataset from the Kaggle input directory.
df = pd.read_csv('../input/all-cleaned-and-partitioned-datasets/cardio_final.csv')

# Response: the 'cardio' column.
y = df['cardio']

# Predictors: every column except the response.
x = df.drop(columns=['cardio'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Decision Tree model: randomized hyperparameter search.
# The search space covers tree depth, split criterion, the number of features
# considered at each split, and the minimum samples required to split a node.
parameters = {'max_depth' : (3,5,7,9,10,15,20,25)
              , 'criterion' : ('gini', 'entropy')
              # 'auto' (an alias of 'sqrt' for classifiers) was deprecated in
              # scikit-learn 1.1 and removed in 1.3, so it is not listed here.
              , 'max_features' : ('sqrt', 'log2')
              , 'min_samples_split' : (2,4,6)
             }
# Seed the base estimator as well: with max_features < n_features the tree
# picks candidate features at random, so an unseeded tree makes the search
# results vary from run to run.
DT_grid = RandomizedSearchCV(DecisionTreeClassifier(random_state=42),
                             param_distributions=parameters,
                             cv=3, verbose=True, random_state=42)
DT_grid.fit(x_train, y_train)
# Best estimator among the sampled candidates (refit on the whole training set).
DT_grid.best_estimator_

In [None]:
# Rebuild the decision tree with fixed hyperparameters.
# NOTE(review): these values are hard-coded — presumably copied from an earlier
# search run; compare against DT_grid.best_params_ before relying on them.
# max_features='sqrt' replaces 'auto': for classifiers 'auto' meant sqrt(n_features),
# and the 'auto' spelling is rejected by scikit-learn >= 1.3.
DT_Model = DecisionTreeClassifier(criterion='entropy', max_depth=7, max_features='sqrt',
                                  min_samples_leaf=1, min_samples_split=4, random_state=42)

DT_Model.fit(x_train,y_train)

In [None]:
# Mean accuracy of the fitted decision tree on the training and test partitions.
dt_train_accuracy = DT_Model.score(x_train, y_train)
print (f'Train Accuracy - : {dt_train_accuracy:.3f}')

dt_test_accuracy = DT_Model.score(x_test, y_test)
print (f'Test Accuracy - : {dt_test_accuracy:.3f}')

In [None]:
# Confusion matrix of the decision tree on the test set.
y_pred = DT_Model.predict(x_test)
cm1 = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap (rows: true labels, columns: predictions).
fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm1, annot=True, fmt='d', ax=ax)
ax.set_xlabel('predicted')
ax.set_ylabel('Truth')

In [None]:
# Random Forest model: search space for randomized hyperparameter tuning.

# Number of trees in the forest: 100, 200, ..., 500.
n_estimators = [int(value) for value in np.linspace(start = 100, stop = 500, num = 5)]
# Number of features to consider at every split.
# 'auto' is not listed: for classifiers it was an alias of 'sqrt' (so the old
# grid held a duplicate) and it is rejected by scikit-learn >= 1.3.
max_features = ['sqrt', 'log2']
# Maximum number of levels in each tree: 10, 20, ..., 50, plus None (unlimited depth).
max_depth = [int(value) for value in np.linspace(10, 50, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample or on the full training set.
bootstrap = [True, False]
# Assemble the grid that the randomized search samples from.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
# Base model to tune. It is seeded so that bootstrap sampling and feature
# subsampling inside the forest are reproducible; without a seed the selected
# best_params_ can differ between runs even though the search itself is seeded.
rf = RandomForestClassifier(random_state=42)
# Randomized search over random_grid using 3-fold cross validation.
# random_state here fixes which parameter combinations are sampled.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, cv = 3, verbose=2, random_state=42)
# Fit the random search model on the training partition.
rf_random.fit(x_train, y_train)

# Best parameter combination among the sampled candidates.
rf_random.best_params_

In [None]:
# Fit the random forest classifier with fixed hyperparameters.
# NOTE(review): the variable is named `regr` but holds a classifier; the name
# is kept because later cells refer to it.
rf_params = dict(
    bootstrap=True,
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=4,
    min_samples_split=10,
    n_estimators=300,
    random_state=42,
)
regr = RandomForestClassifier(**rf_params)
regr.fit(x_train, y_train)

# Mean accuracy on the training and test partitions.
rf_train_accuracy = regr.score(x_train, y_train)
print (f'Train Accuracy - : {rf_train_accuracy:.3f}')
rf_test_accuracy = regr.score(x_test, y_test)
print (f'Test Accuracy - : {rf_test_accuracy:.3f}')

In [None]:
# Confusion matrix of the random forest on the test set.
y_pred = regr.predict(x_test)
cm2 = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap (rows: true labels, columns: predictions).
fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm2, annot=True, fmt='d', ax=ax)
ax.set_xlabel('predicted')
ax.set_ylabel('Truth')