# **Objective:** Develop a Multilayer Perceptron (MLP) to predict cardiovascular disease (`cardio`) from the other variables. The data and the model are kept simple to help others understand the approach and develop their own models.

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# Load the dataset from a semicolon-separated CSV file

Dados = pd.read_csv(
    r'../input/cardiovascular-disease-dataset/cardio_train.csv',
    encoding="ISO-8859-1",  # sometimes encoding="utf-8" is needed instead
    delimiter=';',
)
Dados

# **Data Preparation**

In [None]:
# Basic statistics: per-column summary of the loaded data, used as a quick
# sanity check before any feature engineering.

Dados.describe()

In [None]:
# Derived variables
# NOTE(review): the formulas assume height in cm, weight in kg and age in
# days - confirm against the dataset description.

# Body mass index: weight divided by squared height (height / 100 -> metres)
Dados['bmi'] = Dados['weight'].div((Dados['height'] / 100) ** 2).round(2)

# Age in years
Dados['age_y'] = Dados['age'].div(365).round(2)

# 0/1 flag marking rows whose BMI is 30 or above
Dados['bmi_high'] = Dados['bmi'].ge(30).astype(int)

In [None]:
# Remove the raw 'age' column (replaced by 'age_y') and the 'id' column
Dados.drop(columns=['age', 'id'], inplace=True)

In [None]:
# Preview the first 10 rows to check the derived columns and the removal of
# 'age' and 'id'.
Dados.head(10)

# **Variable Analysis**

In [None]:
# Visualizing discrete variables and their effect on cardio:
# one bar plot per variable showing the mean of 'cardio' for each category.

Colunas = ('gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'bmi_high')
fig = plt.figure(figsize=(20, 20))
for j, i in enumerate(Colunas, start=1):
    # Panels are placed on an 8x8 grid of the single shared figure. No extra
    # plt.figure() call here: it would only create an empty figure per
    # variable, since every bar plot is drawn on `ax`.
    ax = fig.add_subplot(8, 8, j)
    # NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of
    # `errorbar=None`; kept as-is for compatibility - confirm the version.
    sns.barplot(x=i, y='cardio', data=Dados, ci=None, ax=ax)
# Adjust the spacing once, after all panels exist.
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heat map on a
# fixed -1..1 colour scale

plt.figure(figsize=(18, 8))
sns.heatmap(
    Dados.corr(),
    annot=True,
    cmap='vlag',  # alternative palette: cmap='BrBG'
    vmin=-1,
    vmax=1,
)
plt.title('Correlation Map', pad=12, fontdict={'fontsize':12});

Some variables show a higher correlation with `cardio`, and some are also correlated with each other.

In [None]:
# Weight against age in years, with points coloured by the cardio outcome
plt.figure(figsize=(10, 6))
sns.scatterplot(data=Dados, x='age_y', y='weight', hue='cardio')

In [None]:
# Continuous variables versus cardio: weight distribution split by outcome,
# using bin edges from 30 up to (not including) 150 in steps of 5

sns.histplot(data=Dados, x='weight', hue='cardio', bins=range(30, 150, 5))

In [None]:
# Age (in years) distribution split by outcome, using bin edges from 30 up to
# (not including) 70 in steps of 2
sns.histplot(data=Dados, x='age_y', hue='cardio', bins=range(30, 70, 2))

# Splitting data for training and testing

In [None]:
# Select predictors and target by column name rather than by position, so the
# split stays correct if columns are added, removed or reordered upstream.
# NOTE(review): assumes position 10 in the original positional selection was
# the 'cardio' column and the listed positions were all remaining columns -
# confirm.
DadosX = Dados.drop(columns='cardio')        # independent variables

DadosY = Dados['cardio']                     # dependent variable

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible

X_train, X_test, Y_train, Y_test = train_test_split(
    DadosX,
    DadosY,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Summary statistics of the training predictors, for comparison with the test
# split summarised in the next cell.
X_train.describe()

In [None]:
# Summary statistics of the test predictors, for comparison with the training
# split summarised in the previous cell.
X_test.describe()

# Multi-model generation comparing training and test fit

In [None]:
# Multi MLP generator: train one network per combination of first/second
# hidden-layer size and record the accuracy on the training and test splits.
# NOTE(review): the best configuration is chosen by test-set accuracy, so the
# reported best score is optimistic; consider a separate validation split.
training_accuracy = []
testing_accuracy = []
Layer1 = range(10, 90, 20)
Layer2 = range(10, 90, 20)
LayersComb = len(Layer1) * len(Layer2)
Step = 0
# Start below any achievable accuracy so the first model always becomes the
# current best and the best_* names are always bound before being printed.
score = -1.0
best_Layer1 = best_Layer2 = best_Step = None

for i in Layer1:
    for j in Layer2:
        mlp = MLPRegressor(hidden_layer_sizes=(i, j), activation="logistic", random_state=1, max_iter=2000).fit(X_train, Y_train)

        # The regressor output is unbounded: clip it to [0, 1] before rounding
        # so that every prediction is a valid 0/1 class label.
        Y_pred_train = np.clip(mlp.predict(X_train), 0, 1).round().astype(int)
        training_accuracy.append(accuracy_score(Y_train, Y_pred_train))

        Y_pred_test = np.clip(mlp.predict(X_test), 0, 1).round().astype(int)
        acc_score = accuracy_score(Y_test, Y_pred_test)
        testing_accuracy.append(acc_score)

        Step = Step + 1
        if score < acc_score:
            score = acc_score
            best_Layer1 = i
            best_Layer2 = j
            best_Step = Step

        print('Step ', Step, ' of ', LayersComb, ' Layer1: ', i, ' Layer2: ', j)
        # Built-in round() works for both NumPy scalars and plain Python
        # floats, whereas the .round() method exists only on the former.
        print('Best Accuracy Score', round(score, 4), ' Best Layer1: ', best_Layer1, ' Best Layer2: ', best_Layer2, ' do Step:', best_Step)

In [None]:
# Plot training accuracy (first series) and testing accuracy (second series)
# for every layer combination, in the order the models were evaluated.
sns.lineplot(data=[training_accuracy,testing_accuracy])

In [None]:
# Selected model: two hidden layers of 70 units each, logistic activation
mlp_selected = MLPRegressor(hidden_layer_sizes=(70, 70), activation="logistic", random_state=1, max_iter=2000)
# Fit on the training split only. Fitting on the test split would make the
# test-set evaluation that follows an evaluation on data the model has seen.
mlp_selected.fit(X_train, Y_train)

In [None]:
# Evaluate the selected model on both splits.
# The regressor output is unbounded, so it is clipped to [0, 1] before
# rounding; this guarantees that every prediction is a 0/1 class label, which
# the 2x2 confusion matrix built from Y_pred_test relies on.
Y_pred_train = np.clip(mlp_selected.predict(X_train), 0, 1).round().astype(int)
training_accuracy.append(accuracy_score(Y_train, Y_pred_train))

Y_pred_test = np.clip(mlp_selected.predict(X_test), 0, 1).round().astype(int)
acc_score = accuracy_score(Y_test, Y_pred_test)
testing_accuracy.append(acc_score)

In [None]:
# Confusion matrix of the selected model on the test split, shown as an
# annotated heat map (rows: actual class, columns: predicted class)
cm = confusion_matrix(Y_test, Y_pred_test)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, cmap="YlGnBu", fmt='d', annot=True);

In [None]:
# Print the classification report of the selected model's test-set
# predictions against the true labels.
print(classification_report(Y_test,Y_pred_test))