## Importing Libraries and Dataset.

In [None]:
# Importing the required libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import os
# List every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the semicolon-separated CSV into a DataFrame and preview the first rows.
data = pd.read_csv(
    r"/kaggle/input/cardiovascular-disease-dataset/cardio_train.csv",
    sep=";",
)
data.head()

In [None]:
# Print column names, dtypes and non-null counts to check for missing values.
data.info()

In [None]:
# Summary statistics per column; useful for spotting implausible min/max values.
data.describe()

In [None]:
# Age is recorded in days: divide by 365 and truncate to whole years.
# NOTE: this cell is not idempotent; re-running it divides the age again.
age_in_years = data["age"] / 365
data["age"] = age_in_years.astype("int")

In [None]:
# The id column is only a row identifier and carries no predictive signal, so drop it.
data = data.drop("id", axis=1)

## Exploratory Data Analysis.

In [None]:
# Bar chart of the number of rows per value of the target column `cardio`.
sns.countplot(data=data, x="cardio")

The dataset is well balanced between the two target classes.

In [None]:
# One box per column, drawn on a shared axis, to reveal extreme values.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=data, ax=ax, width=0.5, fliersize=3)
ax.set_title("Visualization of outliers")

Here we can see outliers in several features (ap_hi, ap_lo, height and weight).

In [None]:
# Rows treated as outliers (any one condition is enough):
#   ap_hi  greater than 200, or less than or equal to 80
#   ap_lo  greater than 180, or less than 50
#   height less than or equal to 100
#   weight less than or equal to 28
ap_hi_out_of_range = (data["ap_hi"] > 200) | (data["ap_hi"] <= 80)
ap_lo_out_of_range = (data["ap_lo"] > 180) | (data["ap_lo"] < 50)
body_out_of_range = (data["height"] <= 100) | (data["weight"] <= 28)
outlier = ap_hi_out_of_range | ap_lo_out_of_range | body_out_of_range
print("There is {} outlier".format(data[outlier]["cardio"].count()))

The dataset has 70,000 rows, and these 1,434 outliers are only about 2% of it.
So we have enough data to train the model even if we remove these outliers.

In [None]:
# Keep only the rows that are not flagged by the outlier mask.
data = data.loc[~outlier]

In [None]:
# Display the filtered frame (the notebook shows a truncated view plus its shape).
data

In [None]:
# Same per-column box plot as before, now on the data with outliers removed.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=data, ax=ax, width=0.5, fliersize=3)
ax.set_title("Visualization of outliers")

In [None]:
# Strip plot of every feature against the target, laid out on a 4x4 grid.
X = data.drop(columns=['cardio'])
y = data['cardio']

fig = plt.figure(figsize=(20, 25), facecolor='white')

for plotnumber, column in enumerate(X.columns, start=1):
    if plotnumber > 16:
        # The 4x4 grid holds at most 16 panels; skip any further columns.
        break
    ax = fig.add_subplot(4, 4, plotnumber)
    # x/y must be passed by keyword: seaborn 0.12+ no longer accepts them
    # positionally. Target on the x axis, feature on the y axis, as before.
    sns.stripplot(x=y, y=X[column], ax=ax)

fig.tight_layout()

In [None]:
# Pairwise correlation of all columns, rendered as an annotated heatmap.
f, ax = plt.subplots(figsize=(15, 15))
corr = data.corr()
sns.heatmap(corr, ax=ax, annot=True, fmt=".3f", linewidths=0.5)

As you can see in the above **heatmap**, there are **correlations** between **gender and height**, **ap_lo and ap_hi**, **gluc and cholesterol**, and a small correlation between **smoke and alco**.

**Body Mass Index (BMI)**
Height and weight seem uncorrelated with the cardio feature, but **Body Mass Index (BMI)**, which combines the two, could be helpful for training our model.

In [None]:
# BMI = weight / height^2; height is divided by 100 first
# (presumably cm -> m and weight in kg; confirm against the dataset description).
height_scaled = data["height"] / 100
data["bmi"] = data["weight"] / height_scaled ** 2

In [None]:
# Preview the first rows to confirm the new bmi column was added.
data.head()

In [None]:
# Infer which gender code is which from average height:
# the code with the taller mean height is labelled "male".
mean_height_code1 = data.loc[data["gender"] == 1, "height"].mean()
mean_height_code2 = data.loc[data["gender"] == 2, "height"].mean()
gender, gender2 = (
    ("male", "female") if mean_height_code1 > mean_height_code2 else ("female", "male")
)
print("Gender:1 is "+ gender +" & Gender:2 is " + gender2)

* Women share many of the same risk factors for heart disease as men, such as smoking, high blood pressure, and high cholesterol, especially after 65.
* Thus we shouldn't encode them as 1 and 2: because 2 is always numerically bigger than 1, the model could take that into account and assign a higher disease risk to men.

In [None]:
# Re-encode gender as 0/1: code 2 becomes 0, code 1 stays 1.
data["gender"] = data["gender"].mod(2)

In [None]:
# Display the frame (truncated view) to check the re-encoded gender column.
data

In [None]:
# Separate the target column from the feature matrix (which now includes bmi).
y = data['cardio']
X = data.drop('cardio', axis=1)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature with min-max scaling; min and max are taken over all rows of X.
scalar = MinMaxScaler()
x_scaled = scalar.fit(X).transform(X)

## Preparing the Training and Test set.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,roc_curve, roc_auc_score

In [None]:
# Split the raw features first, then fit the scaler on the training rows only.
# Fitting MinMaxScaler on the full data set before splitting lets the test
# rows' min/max influence the training features (data leakage).
# The same random_state on the same rows yields the same 70/30 partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=9
)
scalar = MinMaxScaler().fit(X_train_raw)
X_train = scalar.transform(X_train_raw)
X_test = scalar.transform(X_test_raw)

In [None]:
# Candidate classifiers. The tree-based models are randomized, so they get a
# fixed random_state to make scores reproducible across Restart & Run All.
dtc = DecisionTreeClassifier(random_state=6)
ran = RandomForestClassifier(n_estimators=90, random_state=6)
knn = KNeighborsClassifier(n_neighbors=79)
svm = SVC(random_state=6)

In [None]:
# Display name -> estimator, in the order the models are trained;
# `scores` is filled with each model's test accuracy by the training loop.
model_names = ["Decision tree", "Random forest", "KNN", "SVM"]
models = dict(zip(model_names, [dtc, ran, knn, svm]))
scores = {}

In [None]:
# Train each model on the training split and record its accuracy on the test split.
for name, estimator in models.items():
    fitted = estimator.fit(X_train, y_train)  # fit() returns the estimator itself
    scores[name] = fitted.score(X_test, y_test)

In [None]:
# One row per model, best accuracy first.
scores_frame = (
    pd.DataFrame(scores, index=["Accuracy Score"])
    .T
    .sort_values(by=["Accuracy Score"], axis=0, ascending=False)
)
scores_frame

In [None]:
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement, RocCurveDisplay.from_estimator, and bind it to the
# old name so existing calls keep working; on older scikit-learn versions
# that lack it, fall back to the original function.
try:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_roc_curve

In [None]:
# Draw the decision tree's ROC curve first, then overlay the other three
# models on the same axes so the curves can be compared directly.
disp = plot_roc_curve(dtc, X_test, y_test)
shared_axes = disp.ax_
plot_roc_curve(ran, X_test, y_test, ax=shared_axes)
plot_roc_curve(knn, X_test, y_test, ax=shared_axes)
plot_roc_curve(svm, X_test, y_test, ax=shared_axes)

From the table and the graph we can see that SVM and KNN perform better than the other models.

## Evaluation of SVC

In [None]:
# Class predictions of the fitted SVC on the held-out test split.
predicted_svc=svm.predict(X_test)

In [None]:
# Fraction of test rows the SVC classified correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=predicted_svc)
print("The accuracy of svc model is : ", accuracy)

In [None]:
# Confusion matrix for the SVC: rows are true labels, columns are predicted labels.
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted_svc)
print("The Confusion Matrix for SVC in this dataset is : \n", conf_mat)

In [None]:
# scikit-learn's confusion_matrix puts true labels on rows and predicted
# labels on columns, so for the binary labels 0/1 the layout is
# [[TN, FP], [FN, TP]]. Label 1 is treated as the positive class.
true_negative, false_positive, false_negative, true_positive = conf_mat.ravel()

In [None]:
# Precision: share of predicted positives that are correct.
predicted_positive = true_positive + false_positive
Precision = true_positive / predicted_positive
print("The precision of this svc model is : ", Precision)

# Recall: share of actual positives that were found.
actual_positive = true_positive + false_negative
Recall = true_positive / actual_positive
print("The Recall score of svc model is : ", Recall)

# F1 score: harmonic mean of precision and recall.
F1_Score = 2 * (Recall * Precision) / (Recall + Precision)
print("The F1_Score for this dataset is : ", F1_Score)

## Evaluation of KNN.

In [None]:
# Class predictions of the fitted KNN model on the held-out test split.
predicted_knn=knn.predict(X_test)

In [None]:
# Fraction of test rows the KNN model classified correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=predicted_knn)
print("The accuracy of knn model is : ", accuracy)

In [None]:
# Confusion matrix for KNN: rows are true labels, columns are predicted labels.
conf_mat = confusion_matrix(y_true=y_test, y_pred=predicted_knn)
print("The Confusion Matrix for KNN in this dataset is : \n", conf_mat)

In [None]:
# scikit-learn's confusion_matrix puts true labels on rows and predicted
# labels on columns, so for the binary labels 0/1 the layout is
# [[TN, FP], [FN, TP]]. Label 1 is treated as the positive class.
true_negative, false_positive, false_negative, true_positive = conf_mat.ravel()

In [None]:
# Precision: share of predicted positives that are correct.
predicted_positive = true_positive + false_positive
Precision = true_positive / predicted_positive
print("The precision of this knn model is : ", Precision)

# Recall: share of actual positives that were found.
actual_positive = true_positive + false_negative
Recall = true_positive / actual_positive
print("The Recall score of knn model is : ", Recall)

# F1 score: harmonic mean of precision and recall.
F1_Score = 2 * (Recall * Precision) / (Recall + Precision)
print("The F1_Score for this dataset is : ", F1_Score)

## Conclusion

#### SVC gives better results than the other models in terms of accuracy, AUC and F1 score, so we can use SVC to predict whether a person has cardiovascular disease with an accuracy of about 73%.