In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# The file is semicolon-separated, so the delimiter must be given explicitly.
csv_path = r"/kaggle/input/cardiovascular-disease-dataset/cardio_train.csv"
data = pd.read_csv(csv_path, sep=";")
data.head(5)

In [None]:
# Show column dtypes and non-null counts for the loaded frame.
data.info()

In [None]:
# Summary statistics per column; used to spot implausible minima/maxima.
data.describe()

In [None]:
# Count missing values in each column.
data.isna().sum()

### There are no null values present

### Some corrections need to be made

### Age is given in days, so we convert it to years for easier understanding

In [None]:
# Raw age is in days: divide by 365 and truncate to whole years.
data["age"] = (data["age"] / 365).astype("int")

In [None]:
# Discard the "id" column so it is not part of the features built later.
data = data.drop("id", axis=1)

In [None]:
# One box per column on a shared axis to make extreme values visible.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=data, ax=ax, width=0.5, fliersize=3)
ax.set_title("Visualization of outliers")

#### Here we can see that some features contain outliers
#### Some of these values are abnormal and not physically possible
#### The systolic and diastolic pressure columns contain negative values and other abnormal readings
#### I researched these features and tried to reduce the outliers


In [None]:
# Rows whose blood-pressure readings fall outside the accepted window.
bad_pressure = (
    (data["ap_hi"] > 200) | (data["ap_hi"] <= 80)
    | (data["ap_lo"] > 180) | (data["ap_lo"] < 50)
)
# Rows whose height or weight is at or below the accepted minimum.
bad_body = (data["height"] <= 100) | (data["weight"] <= 28)

outlier = bad_pressure | bad_body
print("There is {} outlier".format(data.loc[outlier, "cardio"].count()))

### We have 69301 records, and these 1434 outliers are only about 2% of them.
### So we still have enough data to train the model even if we remove these outliers.
### Instead of replacing the outlier values, we can simply remove those rows.

In [None]:
# Keep only the rows that were not flagged by the outlier mask.
data = data.loc[~outlier]

In [None]:
# Target is the 'cardio' column; every remaining column is a feature.
y = data['cardio']
X = data.drop('cardio', axis=1)

In [None]:
# Strip plot of every feature against the target, laid out on a 4x4 grid.
plt.figure(figsize=(20, 25), facecolor='white')

# enumerate keeps the subplot index in step with the column, and the slice
# caps the number of plots at the 16 slots the grid provides.
for plotnumber, column in enumerate(X.columns[:16], start=1):
    ax = plt.subplot(4, 4, plotnumber)
    # x/y must be passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally (the first positional argument is `data`).
    sns.stripplot(x=y, y=X[column], ax=ax)

plt.tight_layout()

### From the graph we can see that if ap_lo is more than 120 there is a high chance of cardio
### If the age is less than 38 there is very little or no chance of cardio
### If the weight is more than 175 there is a chance of cardio

## Heat map to check the multicollinearity

In [None]:
# Pairwise correlation between the features, drawn as an annotated heat map.
fig, ax = plt.subplots(figsize=(15, 15))
corr = X.corr()
sns.heatmap(corr, ax=ax, annot=True, fmt=".3f", linewidths=0.5, cmap="Blues_r")

### We can see that there is not much collinearity between the features

## Scaling of data

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Learn each feature's min/max from X, then rescale X with them.
scalar = MinMaxScaler()
x_scaled = scalar.fit(X).transform(X)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,roc_curve, roc_auc_score

In [None]:
# Split the *unscaled* features first, then fit the scaler on the training
# rows only. Fitting MinMaxScaler on the full data set before splitting lets
# the test rows' min/max leak into the training features.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=420
)

scalar = MinMaxScaler().fit(x_train_raw)
x_train = scalar.transform(x_train_raw)
# Test rows are transformed with the training statistics, so a few values may
# fall slightly outside [0, 1]; that is expected.
x_test = scalar.transform(x_test_raw)

In [None]:
# Candidate classifiers. The tree-based models are randomised, so they get a
# fixed seed to make the reported scores reproducible across runs.
dtc = DecisionTreeClassifier(random_state=420)
ran = RandomForestClassifier(n_estimators=90, random_state=420)
knn = KNeighborsClassifier(n_neighbors=79)
svm = SVC(random_state=6)

In [None]:
# Display name -> estimator, in the order the models will be trained.
models = dict(zip(
    ("Decision tree", "Random forest", "KNN", "SVM"),
    (dtc, ran, knn, svm),
))
# Filled in by the training loop: display name -> test-set score.
scores = dict()

In [None]:
# Fit every candidate on the training split and record its test-set score.
for name, estimator in models.items():
    estimator.fit(x_train, y_train)
    scores[name] = estimator.score(x_test, y_test)

In [None]:
# One row per model, best accuracy first.
scores_frame = (
    pd.DataFrame(scores, index=["Accuracy Score"])
    .T
    .sort_values(by=["Accuracy Score"], axis=0, ascending=False)
)
scores_frame

In [None]:
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Fall back to its replacement, which accepts the same
# (estimator, X, y, ax=...) call, so the plotting cell works on both versions.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

In [None]:
# The first call creates the figure; the remaining models are drawn onto the
# same axes so all ROC curves can be compared in one plot.
disp = plot_roc_curve(dtc, x_test, y_test)

for fitted_model in (ran, knn, svm):
    plot_roc_curve(fitted_model, x_test, y_test, ax=disp.ax_)


#### From the table and graph we can see that SVM and KNN perform better than the other models

### Let's evaluate with other metrics

In [None]:
# Test-set class predictions from the SVC fitted in the training loop.
predicted_svc=svm.predict(x_test)

In [None]:
# Test-set class predictions from the KNN model fitted in the training loop.
predicted_knn=knn.predict(x_test)

### Evaluation of SVC

In [None]:
# Fraction of test rows the SVC labelled correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=predicted_svc)
print("The accuracy of svc model is : ", accuracy)

In [None]:
# Rows are the true classes, columns the classes predicted by the SVC.
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted_svc)
print("The Confusion Matrix for SVC in this dataset is : \n", conf_mat)

In [None]:
# scikit-learn's confusion_matrix has true labels on the rows and predictions
# on the columns, with classes sorted ascending: [[TN, FP], [FN, TP]].
# Assumes the target is coded 0/1 with 1 meaning cardio present, so the
# positive class sits at [1][1], not [0][0].
true_negative = conf_mat[0][0]
false_positive = conf_mat[0][1]
false_negative = conf_mat[1][0]
true_positive = conf_mat[1][1]

In [None]:
# Precision = TP / (TP + FP)
predicted_positive = true_positive + false_positive
Precision = true_positive / predicted_positive
print("The precision of this svc model is : ", Precision)

In [None]:
# Recall = TP / (TP + FN)
actual_positive = true_positive + false_negative
Recall = true_positive / actual_positive
print("The Recall score of svc model is : ", Recall)

In [None]:
# F1 is the harmonic mean of precision and recall.
F1_Score = (2 * Precision * Recall) / (Precision + Recall)
print("The F1_Score for this dataset is : ", F1_Score)

### Evaluation of KNN

In [None]:
# Fraction of test rows the KNN model labelled correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=predicted_knn)
print("The accuracy of knn model is : ", accuracy)

In [None]:
# Rows are the true classes, columns the classes predicted by KNN.
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted_knn)
print("The Confusion Matrix for KNN in this dataset is : \n", conf_mat)

In [None]:
# scikit-learn's confusion_matrix has true labels on the rows and predictions
# on the columns, with classes sorted ascending: [[TN, FP], [FN, TP]].
# Assumes the target is coded 0/1 with 1 meaning cardio present, so the
# positive class sits at [1][1], not [0][0].
true_negative = conf_mat[0][0]
false_positive = conf_mat[0][1]
false_negative = conf_mat[1][0]
true_positive = conf_mat[1][1]

In [None]:
# Precision = TP / (TP + FP)
predicted_positive = true_positive + false_positive
Precision = true_positive / predicted_positive
print("The precision of this knn model is : ", Precision)

In [None]:
# Recall = TP / (TP + FN)
actual_positive = true_positive + false_negative
Recall = true_positive / actual_positive
print("The Recall score of knn model is : ", Recall)

In [None]:
# F1 is the harmonic mean of precision and recall.
F1_Score = (2 * Precision * Recall) / (Precision + Recall)
print("The F1_Score for this dataset is : ", F1_Score)

# Conclusion

### SVC gives better results than the other models in terms of accuracy, AUC, and F1 score, so we can use SVC to predict whether a person has cardio or not with a good accuracy of about 73%.