In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the semicolon-separated cardio_train.csv; no column is used as the index.
df = pd.read_csv(
    "../input/cardiovascular-disease-dataset/cardio_train.csv",
    sep=";",
    index_col=None,
)

In [None]:
# Preview the first rows to confirm the file was parsed into separate columns.
df.head()

The data used for this assignment is the Cardiovascular Disease dataset, which contains 13
attributes for 70,000 patients. The outcome is binary: 1
indicates cardiovascular disease and 0 indicates none.

In [None]:
# "id" carries no predictive information, so it is removed.
df = df.drop(columns="id")
# Age is stored in days; divide by 365 to express it in years.
df["age"] = df["age"] / 365

We can see that id is irrelevant and will not improve the accuracy of our models. Therefore,
it has to be dropped. Besides that, age is given in days and has to be converted to years for
easier interpretation. After that we can consider merging weight and height to form the
Body Mass Index (BMI), but first we have to remove all outliers.

In [None]:
# Number of fully duplicated rows, followed by the missing-value count per column.
duplicate_count = df.duplicated().sum()
print(f"duplicate {duplicate_count}")
missing_per_column = df.isnull().sum()
print(missing_per_column)

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr = df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(16, 6))
sns.heatmap(corr, annot=True, ax=heatmap_ax)

In [None]:
# Remove exact duplicate rows, then confirm that none remain.
df = df.drop_duplicates()
remaining_duplicates = df.duplicated().sum()
print(f"duplicate {remaining_duplicates}")

In [None]:
# Summary statistics per column; used to spot implausible minimum/maximum values.
df.describe()

Age: Age looks normal, with the youngest being 29 and the oldest 64.   
Height: Heights of 55 and 250 are not plausible.  
Weight: A weight of 10 kg already tells us there are outliers.  
Systolic: A negative blood pressure value is impossible.  
Diastolic: A negative blood pressure value is impossible.  

In [None]:
# Rows whose height lies outside 140-200 are treated as outliers and removed.
height = df["height"]
outlier_height = height.gt(200) | height.lt(140)
df = df.loc[~outlier_height]
# Same rule for weight (kept range 40-150), evaluated on the height-filtered frame.
weight = df["weight"]
outlier_weight = weight.gt(150) | weight.lt(40)
df = df.loc[~outlier_weight]

For height, we keep values between 140 and 200, since restricted growth could lead to a
shorter height, while for weight a range of 40 to 150 is kept.

In [None]:
# Scatter of weight against height, coloured by gender, without a regression line.
sns.lmplot(data=df, x="weight", y="height", hue="gender", fit_reg=False, height=6)
# BMI = weight / height^2, with height divided by 100 first.
height_m = df["height"] / 100
df["bmi"] = df["weight"] / height_m ** 2
# The derived BMI column replaces the two columns it was built from.
df = df.drop(columns=["height", "weight"])

We can observe that weight and height are correlated; therefore we can remove
weight and height from the dataframe and replace them with the body mass index (BMI).

In [None]:
# Both blood-pressure columns, melted to long form so they share one boxplot.
blood_pressure = df[["ap_lo", "ap_hi"]]
sns.boxplot(data=blood_pressure.melt(), x="variable", y="value")
# Count rows where ap_lo (diastolic) exceeds ap_hi (systolic).
inverted_count = df[df["ap_lo"] > df["ap_hi"]].shape[0]
print("Diastolic pressure is higher than systolic one in {0} cases".format(inverted_count))

In [None]:
# Flag implausible readings: too high (ap_hi > 250 or ap_lo > 160)
# or too low (ap_hi < 80 or ap_lo < 30).
outlier_bp1 = (df["ap_hi"] > 250) | (df["ap_lo"] > 160)
outlier_bp2 = (df["ap_hi"] < 80) | (df["ap_lo"] < 30)
# Both masks were built on the same frame, so they are applied in a single step.
# Filtering twice would index the already-shrunk frame with a mask whose index
# no longer matches, which makes pandas reindex the key and emit a warning.
df = df[~(outlier_bp1 | outlier_bp2)]

For diastolic pressure we drop all values bigger than 160 and smaller than 30, and for
systolic blood pressure we drop all values bigger than 250 and smaller than 80. A diastolic
pressure of 120 and a systolic pressure of 180 are already considered a hypertensive
crisis; to be conservative, we set the upper limits at 160 and 250.


In [None]:
# Re-count rows where ap_lo (diastolic) exceeds ap_hi (systolic) after the range filter.
# Message spelling matches the identical report printed by the neighbouring cells.
print("Diastolic pressure is higher than systolic one in {0} cases".format(df[df['ap_lo']> df['ap_hi']].shape[0]))

In [None]:
# Keep only ap_hi values between its 2.5% and 97.5% quantiles.
hi_upper = df['ap_hi'].quantile(0.975)
hi_lower = df['ap_hi'].quantile(0.025)
df = df[~((df['ap_hi'] > hi_upper) | (df['ap_hi'] < hi_lower))]
# The ap_lo quantiles are computed on the frame that remains after the ap_hi trim.
lo_upper = df['ap_lo'].quantile(0.975)
lo_lower = df['ap_lo'].quantile(0.025)
df = df[~((df['ap_lo'] > lo_upper) | (df['ap_lo'] < lo_lower))]

In [None]:
# Count rows where ap_lo still exceeds ap_hi after the quantile trim.
inverted_count = df[df['ap_lo'] > df['ap_hi']].shape[0]
print("Diastolic pressure is higher than systolic one in {0} cases".format(inverted_count))

In [None]:
# Boxplot of both blood-pressure columns after outlier removal.
blood_pressure = df[["ap_lo", "ap_hi"]]
sns.boxplot(data=blood_pressure.melt(), x="variable", y="value")

In [None]:
# Non-null count per column, i.e. how many rows survived the cleaning steps.
df.count()

In [None]:
# Correlation heatmap recomputed on the cleaned frame.
corr = df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(16, 6))
sns.heatmap(corr, annot=True, ax=heatmap_ax)

In [None]:
# Two panels: cholesterol-level counts split by alcohol intake (left) and smoking (right).
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
cholesterol_labels = ["Low Cholesterol", "High Cholesterol", "Very High Cholesterol"]
panels = [("alco", "Alcoholic"), ("smoke", "Smoker")]
for axis, (column, label) in zip(ax, panels):
    sns.countplot(x=df[column], hue=df["cholesterol"], ax=axis)
    axis.set_title(f"{label} vs Cholesterol")
    axis.set_xlabel(label)
    # The plotted column is a 0/1 flag; show it as No/Yes.
    axis.set_xticklabels(["No", "Yes"])
    axis.legend(cholesterol_labels, loc="center right")
# plt.show() renders the figure on any backend; Figure.show() needs a GUI backend.
plt.show()

In [None]:
# Two panels: counts of the cardio outcome split by alcohol intake (left) and smoking (right).
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
# The hue is the binary cardio target (0 = no disease, 1 = disease), so the legend
# needs two outcome labels rather than the three cholesterol levels.
cardio_labels = ["No cardiovascular disease", "Cardiovascular disease"]
panels = [("alco", "Alcoholic"), ("smoke", "Smoker")]
for axis, (column, label) in zip(ax, panels):
    sns.countplot(x=df[column], hue=df["cardio"], ax=axis)
    axis.set_title(f"{label} vs Cardio")
    axis.set_xlabel(label)
    # The plotted column is a 0/1 flag; show it as No/Yes.
    axis.set_xticklabels(["No", "Yes"])
    axis.legend(cardio_labels, loc="center right")
# plt.show() renders the figure on any backend; Figure.show() needs a GUI backend.
plt.show()

In [None]:
# Distribution of BMI within each cholesterol level.
fig, ax = plt.subplots(figsize=(10, 6))
ax = sns.boxplot(x=df["cholesterol"], y=df["bmi"], ax=ax)
ax.set_title("Boxplot of Cholesterol Level against bmi")
ax.set_xlabel("Cholesterol")
ax.set_xticklabels(["Low Cholesterol", "High Cholesterol", "Very High Cholesterol"])
ax.set_ylabel("bmi")

In [None]:
# Class balance of the target: number of rows per cardio value.
df['cardio'].value_counts()

In [None]:
# Separate the target column from the predictors.
target_column = "cardio"
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# test_size is the right keyword here: train_size=0.2 would train on only 20% of the data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
from sklearn.preprocessing import StandardScaler, normalize

# Standardise every feature column to zero mean and unit variance.
# sklearn.preprocessing.normalize rescales each *row* to unit norm by default, which
# mixes unrelated features (age, pressures, 0/1 flags) instead of putting the columns
# on a comparable scale.
# The scaler is fitted on the training split only, so no test-set statistics leak in;
# the same fitted transform is then applied to the full matrix and both splits.
scaler = StandardScaler().fit(X_train)
X = scaler.transform(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Before we start anything, we have to determine whether the data is well balanced. Using the count
method, we know that 33362 (51%) patients do not have cardiovascular disease and 31972
(49%) have cardiovascular disease. Since it is a balanced dataset, we can normalize the dataset and
begin to decide what type of classification to use. The purpose of normalizing the dataset is
to bring the observations onto a common scale; an initial transformation applied
here is helpful in improving classification accuracy.
We also split the features and outcome into train and test sets. With a splitting factor of 0.2, 80
percent will be training data and 20 percent will be testing data.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# Baseline classifiers with default hyperparameters.
# The tree and the forest are seeded like the SVC, so the comparison table
# is reproducible across runs.
dec = DecisionTreeClassifier(random_state=0)
ran = RandomForestClassifier(random_state=0)
knn = KNeighborsClassifier()
svm = SVC(random_state=0)
naive = GaussianNB()
log = LogisticRegression()

models = {"Decision tree" : dec,"Random forest" : ran,"KNN" : knn,"SVM" : svm,"Naive bayes" : naive,"Logistic regression": log}
scores = {}

# Fit each model on the training split and record its accuracy on the test split.
for key, model in models.items():
    model.fit(X_train, y_train)
    scores[key] = model.score(X_test, y_test)

In [None]:
# One row per model, sorted from the lowest to the highest test accuracy.
scores_frame = pd.DataFrame.from_dict(scores, orient="index", columns=["Accuracy Score"])
scores_frame = scores_frame.sort_values(by=["Accuracy Score"], axis=0)
scores_frame

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy on the training split for four of the models.
acc_random_forest = cross_val_score(estimator=ran,X= X_train,y= y_train, cv=10)
acc_decission_tree=cross_val_score(estimator=dec, X=X_train, y=y_train, cv=10)
acc_knn = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=10)
acc_svm =cross_val_score(estimator=svm ,X=X_train, y=y_train, cv=10)
print("Random Forest Accuracy: ", acc_random_forest.mean())
print("Random Forest Standard Deviation: ", acc_random_forest.std())
print("Decission Tree Accuracy: ",acc_decission_tree.mean())
print("Decission Tree Standard Deviation: ", acc_decission_tree.std())
print("KNN Average Accuracy: ", acc_knn.mean())
print("KNN Standard Deviation: ", acc_knn.std())
# The SVM scores are computed above as well, so report them alongside the others.
print("SVM Average Accuracy: ", acc_svm.mean())
print("SVM Standard Deviation: ", acc_svm.std())

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Untuned decision tree (all hyperparameters at their defaults) as the baseline.
DT_pred = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = DT_pred.predict(X_test)
y_pred

In [None]:
# Confusion matrix of the current y_pred against the test labels.
print("Confusion Matrix \n",confusion_matrix(y_test,y_pred))

In [None]:
# Per-class precision/recall/F1, followed by the overall accuracy rounded to two decimals.
report = classification_report(y_test, y_pred)
print("Clasification Accuracies\n", report)
DT_acc = round(accuracy_score(y_test, y_pred), 2)
print(f"Overall accuracy score: {DT_acc} ")

Sensitivity is the proportion of actual positive cases that are predicted as positive. A
sensitivity of 62 percent tells us that, from a population consisting only of people with cardiovascular
disease, this model is able to correctly identify 62% of them. This is actually
undesirable, since the model fails to detect the remaining 38% of the patients. A
higher sensitivity is desirable, especially in the healthcare industry.   
  
Meanwhile, specificity is the ability to predict healthy people as healthy. The
higher the specificity, the higher the number of true negatives and the lower the number of
false positives, while a lower specificity means fewer true
negatives and more false positives. In this case we would prefer a high rate of true
negatives and few false positives. From specificity we can derive the false positive rate, which is 1-0.636 = 0.364, i.e. a 36 percent false positive rate. To be able to predict the patient correctly,
we must lower the false positive rate as it can cause unnecessary panic among patients.
Therefore, 63 percent is still acceptable but an improvement would be desired.  
  
Since the dataset is balanced, an F1 score is unnecessary

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC points built from the hard class predictions, then the area under that curve.
roc_points = roc_curve(y_test, y_pred)
false_positive_rate, true_positive_rate, thresholds = roc_points
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

In [None]:
# Sweep max_depth from 1 to 10 and record train/test AUC for each depth.
# The depths must be integers: recent scikit-learn releases validate parameters and
# reject float depths such as the 1.0, 2.0, ... values np.linspace would produce.
max_depths = np.arange(1, 11)
train_results = []
test_results = []
for max_depth in max_depths:
    dt = DecisionTreeClassifier(max_depth=max_depth)
    dt.fit(X_train, y_train)
    # AUC is computed from hard class predictions on each split.
    train_pred = dt.predict(X_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    train_results.append(roc_auc)
    y_pred = dt.predict(X_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    test_results.append(roc_auc)
from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(max_depths, train_results,"b", label="Train AUC")
line2, = plt.plot(max_depths, test_results, "r", label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AUC score")
plt.xlabel("Tree depth")
plt.grid(True)

Depth is the depth of the tree; it is something we need to control so that the nodes do not
expand too much and become complicated and difficult to analyse. The deeper the tree, the
more splits there are, and the tree may end up too deep. Therefore, it is important to select the
depth carefully. By looking at the plot, we can see there is underfitting when the depth is
lower than 3 and overfitting when the depth is bigger than 4. An ideal situation would be for
the tree to have a depth between these two conditions, which is 4. At a depth of 4, there is neither
underfitting nor overfitting. Therefore, the first hyperparameter is determined.

In [None]:
# Sweep min_samples_split over the float values 0.1 ... 1.0 and record train/test AUC.
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
train_results = []
test_results = []
for min_samples_split in min_samples_splits:
    dt = DecisionTreeClassifier(min_samples_split=min_samples_split)
    dt.fit(X_train, y_train)
    # AUC is computed from hard class predictions on each split.
    train_pred = dt.predict(X_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    train_results.append(roc_auc)
    y_pred = dt.predict(X_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    test_results.append(roc_auc)

line1, = plt.plot(min_samples_splits, train_results,"b", label="Train AUC")
line2, = plt.plot(min_samples_splits, test_results, "r", label="Test AUC")
# A single legend call is enough; repeating it only rebuilds the same legend.
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AUC score")
plt.xlabel("Min split")
plt.xticks(np.linspace(0.1, 1.0, 10))
plt.grid(True)

The third parameter we are tuning is min_samples_leaf, the minimum number of
samples required at a leaf node, i.e. at the base of the tree. Judging from this AUC
curve, the best score is obtained with a min_samples_leaf of 0.1; the score stays constant
from 0.2 to 0.4 and then decreases as min_samples_leaf increases. Train AUC and test
AUC intersect when approaching 0.45, and underfitting starts right when the line passes
a min_samples_leaf of 0.45. Therefore, the third desired parameter would be 0.1 or, if
that seems too close to overfitting, a min_samples_leaf between 0.2 and 0.4 can be chosen.

In [None]:
# Sweep min_samples_leaf over the float values 0.1 ... 0.5 and record train/test AUC.
min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)
train_results, test_results = [], []
for min_samples_leaf in min_samples_leafs:
    dt = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf).fit(X_train, y_train)
    # roc_curve returns (fpr, tpr, thresholds); only the first two feed auc().
    train_pred = dt.predict(X_train)
    train_results.append(auc(*roc_curve(y_train, train_pred)[:2]))
    y_pred = dt.predict(X_test)
    test_results.append(auc(*roc_curve(y_test, y_pred)[:2]))

from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(min_samples_leafs, train_results, "b", label="Train AUC")
line2, = plt.plot(min_samples_leafs, test_results, "r", label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.xlabel("Min leaf")
plt.ylabel("AUC score")
plt.grid(True)

In [None]:
# Sweep max_features from 1 up to and including the total number of feature columns.
# range() excludes its end point, hence the +1; without it the "all features"
# setting would never be evaluated.
max_features = list(range(1, X.shape[1] + 1))
train_results = []
test_results = []
for max_feature in max_features:
    dt = DecisionTreeClassifier(max_features=max_feature)
    dt.fit(X_train, y_train)
    # AUC is computed from hard class predictions on each split.
    train_pred = dt.predict(X_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    train_results.append(roc_auc)
    y_pred = dt.predict(X_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    test_results.append(roc_auc)
from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(max_features, train_results,"b", label="Train AUC")
line2, = plt.plot(max_features, test_results, "r", label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AUC score")
plt.xlabel("Max feature")
plt.grid(True)

Max features is the number of features considered at each split. As we can see, the
model overfits for every value. In this case we cannot decide which value to use and therefore
leave it at the default.

In [None]:
from sklearn import tree

# Feature names in the column order of X, used to label the plotted split nodes.
feature=["age","gender","ap_hi","ap_lo","cholesterol","gluc","smoke","alco","active","bmi"]
fig, ax=plt.subplots(figsize=(16,10))
clf = DecisionTreeClassifier(criterion="gini", min_samples_split=0.1,max_depth=4,min_samples_leaf= 0.1)
# Fit once, on the training split. A preliminary fit on the full data would be
# discarded by this fit anyway and would only cost time.
pred = clf.fit(X_train, y_train)
ax = tree.plot_tree(pred, feature_names=feature)

In [None]:
clf = DecisionTreeClassifier(criterion="gini", max_depth=4,min_samples_split=0.1,min_samples_leaf= 0.1)
# Train on the training split only. Fitting on the full X, y would include the
# test rows scored below and inflate every metric (data leakage).
pred = clf.fit(X_train, y_train)
y_pred = pred.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(cm,fmt=".0f", annot=True,linewidths=0.2, linecolor="purple", ax=ax)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
# confusion_matrix layout: rows are actual classes, columns are predicted classes.
TN = cm[0,0]
TP = cm[1,1]
FN = cm[1,0]
FP = cm[0,1]
Accuracy=(TP+TN)/(TP+TN+FN+FP)
Error=(FP+FN)/(TP+TN+FN+FP)
Precision = TP/(TP+FP)
Recall = TP/(TP+FN)
F1_Score = 2*(Recall * Precision) / (Recall + Precision)
pd.DataFrame([[Accuracy,Error,Precision, Recall, F1_Score]],columns=["Accuracy","Error","Precision", "Recall", "F1 Score"], index=["Results"])

From the confusion matrix, we obtained a higher accuracy, lower error rate, higher precision
and better recall. This indicates that fine tuning is indeed important and that it brings a significant
improvement. Therefore, we would use max_depth=4, min_samples_split=0.1,
min_samples_leaf=0.1 and the default for max_features. Refer to appendix 5 for the plotted
decision tree. We can tell that there is a significant increase in accuracy, from 63 percent to
71 percent; precision increased from 64 percent to 71 percent, sensitivity increased from 62 percent to
67 percent, and the error rate is lower. Generally, the performance of the model after
hyperparameter tuning is satisfying.

In [None]:
# Decision tree with the selected hyperparameters, trained on the training split.
tuned_tree_params = {
    "max_depth": 4,
    "min_samples_split": 0.1,
    "min_samples_leaf": 0.1,
    "random_state": 0,
}
DT_pred = DecisionTreeClassifier(**tuned_tree_params).fit(X_train, y_train)
y_pred = DT_pred.predict(X_test)
# Per-class report followed by the overall accuracy rounded to two decimals.
print("Clasification Accuracies\n", classification_report(y_test, y_pred))
DT_acc = round(accuracy_score(y_test, y_pred), 2)
print(f"Overall accuracy score: {DT_acc} ")

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Untuned k-nearest-neighbours baseline: fit on the training split, predict the test split.
KNN_pred = KNeighborsClassifier()
KNN=KNN_pred.fit(X_train, y_train)
y_pred_knn = KNN.predict(X_test)
y_pred_knn

The next classifier we are tuning is K Nearest Neighbours. According to previous accuracy
testing, it has an accuracy of 0.631, which is 0.003 lower compared to decision tree. The first
thing we would be referring is the Confusion Matrix.

In [None]:
# Confusion matrix of the baseline KNN predictions against the test labels.
print("Confusion Matrix \n",confusion_matrix(y_test,y_pred_knn))

Compared to the decision tree, KNN has a higher sensitivity before tuning. Accuracy and
precision are not interchangeable. A model that predicts a patient's health condition
without consistency is not useful. This model is able to identify 67 percent of the people with
cardiovascular disease out of all patients that have cardiovascular disease. This model also has
an average result for specificity, its ability to predict healthy people as healthy: the
model predicted 63 percent of healthy people as healthy and 36 percent of healthy people as
having cardiovascular disease. Through model tuning, we wish to improve all of these
percentages, giving fewer errors and higher accuracy and precision.

In [None]:
# Sweep n_neighbors over the integers 1 ... 30 and record train/test AUC.
n_neighbors = np.arange(1, 31)
train_results, test_results = [], []
for n_neighbor in n_neighbors:
    dt = KNeighborsClassifier(n_neighbors=n_neighbor).fit(X_train, y_train)
    # roc_curve returns (fpr, tpr, thresholds); only the first two feed auc().
    train_pred = dt.predict(X_train)
    train_results.append(auc(*roc_curve(y_train, train_pred)[:2]))
    y_pred = dt.predict(X_test)
    test_results.append(auc(*roc_curve(y_test, y_pred)[:2]))

from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(n_neighbors, train_results, "b", label="Train AUC")
line2, = plt.plot(n_neighbors, test_results, "r", label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.xlabel("N neighbor")
plt.ylabel("AUC score")
plt.grid(True)

From this curve we can observe that the higher the number of neighbours, the less overfitting there is.
There is very obvious overfitting at the start. However, the test score reaches its maximum
after a neighbour value of 10, while the train score continues to decrease over the range.

In [None]:
# Sweep the Minkowski power parameter p over the integers 1 ... 5 and record train/test AUC.
distances = np.arange(1, 6)
train_results, test_results = [], []
fig = plt.figure()
ax = fig.add_subplot(111)
for distance in distances:
    dt = KNeighborsClassifier(p=distance).fit(X_train, y_train)
    # roc_curve returns (fpr, tpr, thresholds); only the first two feed auc().
    train_pred = dt.predict(X_train)
    train_results.append(auc(*roc_curve(y_train, train_pred)[:2]))
    y_pred = dt.predict(X_test)
    test_results.append(auc(*roc_curve(y_test, y_pred)[:2]))

from matplotlib.legend_handler import HandlerLine2D
line1, = ax.plot(distances, train_results, "b", label="Train AUC")
line2, = ax.plot(distances, test_results, "r", label="Test AUC")
# Location of the best training AUC (computed for reference; not drawn on the plot).
ymax_train = max(train_results)
xpos_train = train_results.index(ymax_train)
xmax_train = distances[xpos_train]
ax.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
ax.set_xlabel("Distance")
ax.set_ylabel("AUC score")
ax.grid(True)

The next parameter we look at is the p value, which normally takes only two values. When p
is equal to 1, the model uses manhattan_distance; when p equals 2, the
model uses euclidean_distance. So, on this plot we will only compare distance 1 and
distance 2. According to the plot, distance 1 completely outperforms distance 2. Although both
show overfitting, we will decide by choosing the best AUC score.  
  
Therefore, we will be choosing a value within the range 5 to 30 for n_neighbors, and
1 for p, which is manhattan_distance.

In [None]:
# Candidate values for the exhaustive search: n_neighbors 1 ... 10, both weighting
# schemes, three neighbour-search algorithms, and p in {1.0, 2.0}.
num = np.arange(1, 11)
param_dist = {
    "n_neighbors": num,
    "weights": ["uniform", "distance"],
    "algorithm": ["ball_tree", "kd_tree", "brute"],
    "p": np.linspace(1, 2, 2),
}
KNN = KNeighborsClassifier()
# 5-fold cross-validated grid search over every combination above.
KNN_cv = GridSearchCV(KNN, param_dist, cv=5)
KNN_cv.fit(X_train, y_train)

Through GridSearchCV we can find the best parameters for the KNN classifier. We
do not have a specific value for K yet; we have only narrowed it down to a range of 5 to 30,
and we need a specific value to tune our model. By adding each hyperparameter to a
dictionary, the method will return the best parameter values for tuning. For this search, I
chose four parameters: algorithm, n_neighbors, p and weights.

In [None]:
# Show the winning parameter combination and the score it achieved in cross-validation.
best_params, best_score = KNN_cv.best_params_, KNN_cv.best_score_
print(f"Tuned Parameter: {best_params}")
print(f"Best Score: {best_score}")

The results returned are indeed what we desired. The best algorithm is ball_tree, n_neighbors
falls within the range of 5 to 30, the p value is 1 and the weights are uniform. We further evaluate the
model performance with a confusion matrix.

In [None]:
clf = KNeighborsClassifier(algorithm="ball_tree", n_neighbors= 9, p= 1.0, weights= 'uniform')
# Train on the training split only. Fitting on the full X, y would put the test rows
# themselves among the stored neighbours and inflate every metric (data leakage).
pred = clf.fit(X_train, y_train)
y_pred = pred.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(cm,fmt=".0f", annot=True,linewidths=0.2, linecolor="purple", ax=ax)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
# confusion_matrix layout: rows are actual classes, columns are predicted classes.
TN = cm[0,0]
TP = cm[1,1]
FN = cm[1,0]
FP = cm[0,1]
Accuracy=(TP+TN)/(TP+TN+FN+FP)
Error=(FP+FN)/(TP+TN+FN+FP)
Precision = TP/(TP+FP)
Recall = TP/(TP+FN)
F1_Score = 2*(Recall * Precision) / (Recall + Precision)
pd.DataFrame([[Accuracy,Error,Precision, Recall, F1_Score]],columns=["Accuracy","Error","Precision", "Recall", "F1 Score"], index=["Results"])

We can observe that from 63 percent accuracy, the model has improved to 74.8 percent
accuracy, which is an increase of about 11 percentage points. The error rate has dropped from 36.4 percent to 25
percent. Precision increased from 64.1 percent to 75.79 percent, an increase of 11.69
percentage points. Also, sensitivity increased from 67 percent to 71 percent. These increases in
accuracy tell us that through fine tuning the model can perform better and quicker.

In [None]:
# Grid-search the number of trees with 5-fold cross-validation.
# The search is fitted once; repeating .fit() would only redo the same search
# and double the run time.
random_forest = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid={'n_estimators': [100, 300]},
    cv=5,
).fit(X_train, y_train)
# Accuracy (in percent, two decimals) on the training and on the test split.
acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)
print(acc_random_forest,random_forest.best_params_)
acc_test_random_forest = round(random_forest.score(X_test, y_test) * 100, 2)
acc_test_random_forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Untuned random forest baseline, fitted on the training split.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [None]:
# Class predictions of the baseline forest for the test split.
y_pred = rf.predict(X_test)
y_pred

In [None]:
# Confusion matrix of the current y_pred against the test labels.
print("Confusion Matrix \n",confusion_matrix(y_test,y_pred))

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC points built from the hard class predictions, then the area under that curve.
roc_points = roc_curve(y_test, y_pred)
false_positive_rate, true_positive_rate, thresholds = roc_points
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

In [None]:
# Sweep the number of trees and record train/test AUC for each forest size.
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
train_results, test_results = [], []
for estimator in n_estimators:
    rf = RandomForestClassifier(n_estimators=estimator, n_jobs=-1).fit(X_train, y_train)
    # roc_curve returns (fpr, tpr, thresholds); only the first two feed auc().
    train_pred = rf.predict(X_train)
    train_results.append(auc(*roc_curve(y_train, train_pred)[:2]))
    y_pred = rf.predict(X_test)
    test_results.append(auc(*roc_curve(y_test, y_pred)[:2]))

from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(n_estimators, train_results, "b", label="Train AUC")
line2, = plt.plot(n_estimators, test_results, "r", label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.xlabel("n_estimators")
plt.ylabel("AUC score")
plt.show()

In [None]:
# Sweep max_depth from 1 to 32 and record train/test AUC for each depth.
# The depths must be integers: recent scikit-learn releases validate parameters and
# reject float depths such as the 1.0, 2.0, ... values np.linspace would produce.
max_depths = np.arange(1, 33)
train_results = []
test_results = []
for max_depth in max_depths:
   rf = RandomForestClassifier(max_depth=max_depth, n_jobs=-1)
   rf.fit(X_train, y_train)
   # AUC is computed from hard class predictions on each split.
   train_pred = rf.predict(X_train)
   false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
   roc_auc = auc(false_positive_rate, true_positive_rate)
   train_results.append(roc_auc)
   y_pred = rf.predict(X_test)
   false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
   roc_auc = auc(false_positive_rate, true_positive_rate)
   test_results.append(roc_auc)
from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(max_depths, train_results, "b", label="Train AUC")
line2, = plt.plot(max_depths, test_results, "r", label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AUC score")
plt.xlabel("Tree depth")
plt.show()

In [None]:
# Sweep min_samples_split over the float values 0.1 ... 1.0 and record train/test AUC.
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
train_results, test_results = [], []
for min_samples_split in min_samples_splits:
    rf = RandomForestClassifier(min_samples_split=min_samples_split).fit(X_train, y_train)
    # roc_curve returns (fpr, tpr, thresholds); only the first two feed auc().
    train_pred = rf.predict(X_train)
    train_results.append(auc(*roc_curve(y_train, train_pred)[:2]))
    y_pred = rf.predict(X_test)
    test_results.append(auc(*roc_curve(y_test, y_pred)[:2]))

from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(min_samples_splits, train_results, "b", label="Train AUC")
line2, = plt.plot(min_samples_splits, test_results, "r", label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.xlabel("min samples split")
plt.ylabel("AUC score")
plt.show()

In [None]:
# Sweep max_features from 1 up to and including the total number of feature columns.
# range() excludes its end point, hence the +1; without it the "all features"
# setting would never be evaluated.
max_features = list(range(1, X_train.shape[1] + 1))
train_results = []
test_results = []
for max_feature in max_features:
   rf = RandomForestClassifier(max_features=max_feature)
   rf.fit(X_train, y_train)
   # AUC is computed from hard class predictions on each split.
   train_pred = rf.predict(X_train)
   false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
   roc_auc = auc(false_positive_rate, true_positive_rate)
   train_results.append(roc_auc)
   y_pred = rf.predict(X_test)
   false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
   roc_auc = auc(false_positive_rate, true_positive_rate)
   test_results.append(roc_auc)
from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(max_features, train_results, "b", label="Train AUC")
line2, = plt.plot(max_features, test_results, "r", label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AUC score")
plt.xlabel("max features")
plt.show()

In [None]:
# Confusion matrix of the current y_pred against the test labels.
# NOTE(review): y_pred is not assigned in this cell; when the notebook runs top to bottom
# it holds the predictions from the last iteration of the preceding max_features loop —
# confirm that this is the model meant to be reported here.
print("Confusion Matrix \n",confusion_matrix(y_test,y_pred))

In [None]:
clf = RandomForestClassifier( bootstrap = True,max_leaf_nodes=33,n_estimators= 188,max_depth=12,min_samples_split=0.6,max_features="sqrt")

# Train on the training split only. Fitting on the full X, y would include the
# test rows scored below and inflate every metric (data leakage).
pred = clf.fit(X_train, y_train)
y_pred = pred.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(cm,fmt=".0f", annot=True,linewidths=0.2, linecolor="purple", ax=ax)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
# confusion_matrix layout: rows are actual classes, columns are predicted classes.
TN = cm[0,0]
TP = cm[1,1]
FN = cm[1,0]
FP = cm[0,1]
Accuracy=(TP+TN)/(TP+TN+FN+FP)
Error=(FP+FN)/(TP+TN+FN+FP)
Precision = TP/(TP+FP)
Recall = TP/(TP+FN)
F1_Score = 2*(Recall * Precision) / (Recall + Precision)
pd.DataFrame([[Accuracy,Error,Precision, Recall, F1_Score]],columns=["Accuracy","Error","Precision", "Recall", "F1 Score"], index=["Results"])

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter candidates sampled by the random search.
# 'auto' is intentionally absent from max_features: scikit-learn 1.3 removed it, and
# for classifiers it was an alias of 'sqrt', which is still listed.
param_grid = {
    'n_estimators': np.linspace(10, 200).astype(int),
    'max_depth': [None] + list(np.linspace(3, 20).astype(int)),
    'max_features': ['sqrt', None] + list(np.arange(0.5, 1, 0.1)),
    'max_leaf_nodes': [None] + list(np.linspace(10, 50, 500).astype(int)),
    'min_samples_split': [2, 5, 10],
    'bootstrap': [True, False]
}

# Estimator for use in random search
estimator = RandomForestClassifier(random_state = 0)

# Random search: 10 sampled combinations, 3-fold CV, scored by ROC AUC.
rs = RandomizedSearchCV(estimator, param_grid, n_jobs = -1,
                        scoring = 'roc_auc', cv = 3,
                        n_iter = 10, verbose = 1, random_state=0)

# Fit
rs.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the random search.
rs.best_params_

In [None]:
# Keep the best estimator found by the random search for inspection below.
best_model = rs.best_estimator_

In [None]:
# Node count and depth of every individual tree in the best forest.
fitted_trees = best_model.estimators_
n_nodes = [fitted.tree_.node_count for fitted in fitted_trees]
max_depths = [fitted.tree_.max_depth for fitted in fitted_trees]

# Averages are truncated to whole numbers for display.
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

In [None]:
from sklearn.metrics import roc_auc_score

sample_leaf_options = [1,2,3,4,5,10,20]
# Fit one 200-tree forest per leaf size and report its AUC.
for leaf_size in sample_leaf_options :
    model = RandomForestClassifier(n_estimators = 200, n_jobs = -1,random_state =0, min_samples_leaf = leaf_size)
    model.fit(X_train,y_train)
    print("\n Leaf size :", leaf_size)
    print ("AUC - ROC : ", roc_auc_score(y_train,model.predict(X_train)))
    # The line above scores the data the forest was trained on, which favours the
    # smallest leaf size by construction; the held-out score is the one to compare.
    print ("AUC - ROC (test) : ", roc_auc_score(y_test,model.predict(X_test)))

In [None]:
# Large seeded forest (1000 trees) whose feature importances are inspected below.
clf=RandomForestClassifier(n_estimators = 1000, n_jobs = -1,random_state =0)

In [None]:
# Fit the forest on the training split.
clf.fit(X_train,y_train)

In [None]:
# Feature names in the column order of X; "ap_lo" matches the dataset's column name.
feature_lbl=["age","gender","ap_hi" ,"ap_lo","cholesterol","gluc","smoke","alco","active","bmi"]
# Print each (name, importance) pair of the fitted forest.
for feature in zip(feature_lbl, clf.feature_importances_):
    print(feature)

In [None]:
from sklearn.feature_selection import SelectFromModel
# Selector built on the forest with an importance threshold of 0.1.
sfm = SelectFromModel(clf, threshold=0.1)
sfm.fit(X_train, y_train)

In [None]:
# Print the name of every feature the selector kept, using its boolean support mask.
for feature_name, is_selected in zip(feature_lbl, sfm.get_support()):
    if is_selected:
        print(feature_name)

In [None]:
# Reduce both splits to the columns kept by the selector.
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

In [None]:
# Same forest configuration as clf, trained on the selected features only.
clf_important = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
clf_important.fit(X_important_train, y_train)

In [None]:
# Test accuracy of the forest that uses all features.
y_pred = clf.predict(X_test)

accuracy_score(y_test, y_pred)

In [None]:
# Test accuracy of the forest that uses only the selected features.
y_important_pred = clf_important.predict(X_important_test)

accuracy_score(y_test, y_important_pred)

In [None]:
# Seeded forest with explicit size and depth limits, trained on all features.
forest_params = dict(
    n_estimators=188,
    min_samples_leaf=1,
    max_leaf_nodes=33,
    max_features=4,
    max_depth=12,
    random_state=0,
    n_jobs=-1,
)
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Test accuracy of this configuration.
accuracy_score(y_test, y_pred)

In [None]:
# Seeded forest with explicit size and depth limits, trained on the selected features only.
important_forest_params = dict(
    n_estimators=188,
    max_leaf_nodes=33,
    max_features=0.799,
    max_depth=12,
    random_state=0,
    n_jobs=-1,
)
clf_important = RandomForestClassifier(**important_forest_params)
clf_important.fit(X_important_train, y_train)
y_important_pred = clf_important.predict(X_important_test)
# Test accuracy of this configuration.
accuracy_score(y_test, y_important_pred)

In [None]:
# Confusion matrix of the selected-feature forest, drawn as a heatmap.
cm = confusion_matrix(y_test, y_important_pred)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, fmt=".0f", linewidths=0.2, linecolor="purple", ax=ax)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Unpack the 2x2 matrix row by row: [[TN, FP], [FN, TP]].
(TN, FP), (FN, TP) = cm
total = TP + TN + FN + FP
Accuracy = (TP + TN) / total
Error = (FP + FN) / total
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
F1_Score = 2 * (Recall * Precision) / (Recall + Precision)
# One-row summary table of the derived metrics.
pd.DataFrame(
    {
        "Accuracy": Accuracy,
        "Error": Error,
        "Precision": Precision,
        "Recall": Recall,
        "F1 Score": F1_Score,
    },
    index=["Results"],
)

In [None]:
# Variant of the selected-feature forest: max_features="sqrt" (instead of 0.799)
# combined with min_samples_split=0.6.
important_forest_params = dict(
    n_estimators=188,
    max_leaf_nodes=33,
    max_depth=12,
    min_samples_split=0.6,
    max_features="sqrt",
    random_state=0,
    n_jobs=-1,
)
clf_important = RandomForestClassifier(**important_forest_params)
clf_important.fit(X_important_train, y_train)
y_important_pred = clf_important.predict(X_important_test)
# Test accuracy of this configuration.
accuracy_score(y_test, y_important_pred)

In [None]:
# Confusion matrix of the latest selected-feature forest, drawn as a heatmap.
cm = confusion_matrix(y_test, y_important_pred)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, fmt=".0f", linewidths=0.2, linecolor="purple", ax=ax)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Flattened 2x2 matrix reads TN, FP, FN, TP (rows = actual, columns = predicted).
TN, FP, FN, TP = cm.ravel()
total = TP + TN + FN + FP
Accuracy = (TP + TN) / total
Error = (FP + FN) / total
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
F1_Score = 2 * (Recall * Precision) / (Recall + Precision)
# One-row summary table of the derived metrics.
metric_names = ["Accuracy", "Error", "Precision", "Recall", "F1 Score"]
metric_values = [Accuracy, Error, Precision, Recall, F1_Score]
pd.DataFrame([metric_values], columns=metric_names, index=["Results"])