In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# First look at the raw file, parsed with pandas' default comma delimiter.
dfCardio = pd.read_csv(
    '/kaggle/input/cardiovascular-disease-dataset/cardio_train.csv',
    sep=',',
)

dfCardio

In [None]:
#Semicolon separator: the file is ';'-delimited, so read it again with an explicit separator.

In [None]:
# The file is semicolon-delimited. Passing the literal ';' (instead of the
# regex '[;]') lets pandas keep its default C parser rather than falling back
# to the slower Python engine; the parsed columns should be unchanged.
dfCardio = pd.read_csv(
    '/kaggle/input/cardiovascular-disease-dataset/cardio_train.csv',
    sep=';',
)
dfCardio

In [None]:
#Remove duplicate rows

In [None]:
# Count the rows that repeat an earlier row across all columns.
dfCardio.duplicated().sum()

In [None]:
# Drop repeated rows in place, keeping the first occurrence of each.
dfCardio.drop_duplicates(keep="first", inplace=True)

In [None]:
#Age calculation

In [None]:
# Divide the raw age values by 365 and round down to a whole number.
# The column is still float at this point; the next cell casts it to int.
dfCardio['age'] = np.floor(dfCardio['age'] / 365)
dfCardio

In [None]:
# Store the age column as integers and show it.
dfCardio['age'] = dfCardio['age'].astype(int)
dfCardio['age']

In [None]:
# Print the column names, dtypes and non-null counts of the cleaned frame.
dfCardio.info()

In [None]:
#Detecting Outliers 

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max),
# used here to look for implausible extreme values.
dfCardio.describe()

In [None]:
#Calculating the lower and upper IQR bounds for ap_hi and ap_lo

In [None]:
# Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) for each of the two ap_* columns,
# collected into a frame with one column per variable.
ap_list = ["ap_hi", "ap_lo"]
bounds_by_column = {}
for each in ap_list:
    Q1, Q3 = (dfCardio[each].quantile(q) for q in (0.25, 0.75))
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    bounds_by_column[each] = [lower_bound, upper_bound]
boundary = pd.DataFrame(bounds_by_column, index=["lower_bound", "upper_bound"])
boundary

In [None]:
#We can select the outlier rows by using the boundaries we calculated.
#Here we look only at the upper outliers.

In [None]:
# Flag rows whose ap_hi or ap_lo exceeds its IQR upper bound.
# The bounds are looked up by label rather than by position: `boundary` is
# indexed by the strings "lower_bound"/"upper_bound", and integer keys on a
# string-indexed Series are a deprecated positional fallback in recent pandas.
ap_hi_filter = dfCardio["ap_hi"] > boundary.loc["upper_bound", "ap_hi"]
ap_lo_filter = dfCardio["ap_lo"] > boundary.loc["upper_bound", "ap_lo"]
outlier_filter = ap_hi_filter | ap_lo_filter
x_outliers = dfCardio[outlier_filter]
# How the flagged rows split across the `cardio` target.
x_outliers["cardio"].value_counts()

In [None]:
# Hard thresholds: a row is treated as an outlier when ap_hi is above 250
# or ap_lo is above 200.
ap_hi_too_high = dfCardio["ap_hi"].gt(250)
ap_lo_too_high = dfCardio["ap_lo"].gt(200)
out_filter = ap_hi_too_high | ap_lo_too_high
print("There is {} outlier".format(dfCardio.loc[out_filter, "cardio"].count()))


In [None]:
# Drop outliers

In [None]:
# Keep only the rows that were not flagged by out_filter.
rows_to_keep = ~out_filter
dfCardio = dfCardio[rows_to_keep]

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr = dfCardio.corr()
f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(data=corr, ax=ax, annot=True, fmt=".3f", linewidths=0.5)

In [None]:
#Model Testing

In [None]:
# Target vector: the `cardio` column.
y = dfCardio.cardio
y.shape

In [None]:
# Feature matrix: every column except the row id and the target.
X = dfCardio.drop(columns=['id', 'cardio'])
X

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
(x_train, x_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Candidate classifiers. The tree-based learners are randomised, so they get a
# fixed seed (the same one used for the train/test split); without it the
# accuracy table below, and any later tuning of `ran`, changes on every run.
dec = DecisionTreeClassifier(random_state=42)
ran = RandomForestClassifier(n_estimators=100, random_state=42)
knn = KNeighborsClassifier(n_neighbors=100)
svm = SVC(random_state=1)
naive = GaussianNB()

models = {"Decision tree" : dec,
          "Random forest" : ran,
          "KNN" : knn,
          "SVM" : svm,
          "Naive bayes" : naive}

# Fit every model on the training split and record its accuracy on the test split.
scores = {}
for key, model in models.items():
    model.fit(x_train, y_train)
    scores[key] = model.score(x_test, y_test)

In [None]:
# One row per model, best accuracy first.
scores_frame = (
    pd.DataFrame(scores, index=["Accuracy Score"])
    .T
    .sort_values(by="Accuracy Score", ascending=False)
)
scores_frame

In [None]:
# Bar chart of test accuracy per model, with slanted tick labels for readability.
plt.figure(figsize=(5, 5))
sns.barplot(y=scores_frame["Accuracy Score"], x=scores_frame.index)
plt.xticks(rotation=45)

In [None]:
#It seems that KNN and SVM algorithms are far ahead of the others.


In [None]:
# Grid search: 3-fold cross-validated search over a single random-forest
# hyperparameter (the number of trees).
from sklearn.model_selection import GridSearchCV

grid = {"n_estimators": np.arange(10, 150, 10)}  # 10, 20, ..., 140

ran_cv = GridSearchCV(estimator=ran, param_grid=grid, cv=3)
ran_cv.fit(x_train, y_train)

# Report the winning setting and its mean cross-validation score.
print("Tuned hyperparameter n_estimators: {}".format(ran_cv.best_params_))
print("Best score: {}".format(ran_cv.best_score_))



In [None]:
from sklearn.linear_model import LogisticRegression

# 3-fold grid search over the regularisation type and strength of a
# liblinear logistic regression.
log_reg = LogisticRegression(solver="liblinear", max_iter=200)
grid = {"penalty" : ["l1", "l2"],
         "C" : np.arange(60,80,2)} # (60,62,64 ... 78)
log_reg_cv = GridSearchCV(log_reg, grid, cv=3)
log_reg_cv.fit(x_train, y_train)

# Report the winning setting; the label names the parameters actually tuned
# here (penalty and C), not the random forest's n_estimators.
print("Tuned hyperparameters (penalty, C): {}".format(log_reg_cv.best_params_))
print("Best score: {}".format(log_reg_cv.best_score_))


In [None]:
# Take the final model straight from the logistic-regression grid search
# instead of hard-coding its parameters, so the two cannot drift apart and the
# search's max_iter setting is kept. GridSearchCV refits the best configuration
# on the whole training split by default, so no additional fit is needed.
logreg_best = log_reg_cv.best_estimator_
print("Test accuracy: ",logreg_best.score(x_test, y_test))


In [None]:
#Confusion Matrix


In [None]:
# Confusion matrix of the tuned logistic regression on the test split.
# Rows of `cm` are the true classes and columns the predicted classes, which
# is how the heatmap axes are labelled below.
y_true = y_test
y_pred = logreg_best.predict(x_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(cm,fmt=".0f", annot=True,linewidths=0.2, linecolor="purple", ax=ax)
plt.xlabel("Predicted")
plt.ylabel("Ground Truth")
plt.show()

In [None]:
# Precision, recall and F1 computed by hand from the 2x2 confusion matrix.
# Row 0 of `cm` holds (TN, FP) and row 1 holds (FN, TP).
(TN, FP), (FN, TP) = cm
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
F1_Score = 2 * (Recall * Precision) / (Recall + Precision)
pd.DataFrame(
    {"Precision": [Precision], "Recall": [Recall], "F1 Score": [F1_Score]},
    index=["Results"],
)
# High precision corresponds to a low false-positive rate.
# High recall corresponds to a low false-negative rate.
