In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import pandas_profiling as pdp
import gc
gc.enable()
from scipy import stats
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Semicolon-delimited cardiovascular disease training data.
csv_path = "../input/cardiovascular-disease-dataset/cardio_train.csv"
data = pd.read_csv(csv_path, sep=";")

In [None]:
# Print the frame's column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics, transposed so that every column becomes one row.
data.describe().transpose()

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Preview the last rows of the raw frame.
data.tail()

# EDA

In [None]:
# Derive a whole-number 'years' column from 'age' (age / 365, rounded).
age_in_years = data['age'].div(365)
data['years'] = age_in_years.round().astype(int)

# Record counts per 'years' value, split by the cardio label.
fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(data=data, y='years', hue='cardio', palette="Set1", ax=ax)

### Gender encoding: 1 = women, 2 = men

In [None]:
# catplot is a figure-level function: it builds its own figure, so a preceding
# plt.figure(...) call only leaves an extra, empty figure behind. The facet
# size is therefore set through height/aspect (each facet is 20 x 10 inches,
# i.e. roughly 20 x 20 overall when cardio has two classes).
sns.catplot(
    data=data,
    y="years",
    hue="gender",
    col="cardio",
    kind="count",
    palette="Set1",
    height=20,
    aspect=0.5,
)

In [None]:
# One histogram (with KDE) per numeric feature, coloured by the cardio label.
# The five features fill the 3 x 2 grid row by row; the sixth panel stays empty.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(18, 14))
numeric_columns = ["years", "height", "weight", "ap_hi", "ap_lo"]
for column, panel in zip(numeric_columns, axes.flat):
    sns.histplot(data=data, x=column, hue='cardio', palette="Set1", kde=True, ax=panel)

In [None]:
# Count plots for the categorical features, split by the cardio label, followed
# by the plain class balance of cardio itself. Panels are filled row by row;
# the eighth panel of the 4 x 2 grid stays empty.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(20, 15))
panels = axes.flat
for column in ["gender", "cholesterol", "gluc", "smoke", "alco", "active"]:
    sns.countplot(data=data, x=column, hue='cardio', palette="Set1", ax=next(panels))
sns.countplot(data=data, x="cardio", palette="Set1", ax=next(panels))

In [None]:
# Build the profiling report in minimal mode and display it as the cell output.
report = pdp.ProfileReport(
    data,
    title="Cardiovascular disease Report",
    minimal=True,
)
report

In [None]:
# Annotated heatmap of the pairwise column correlations.
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(corr_matrix, annot=True, ax=ax)

# Preprocessing

### Identify columns that hold a single unique value

In [None]:
# Print every column's position together with its number of distinct values.
for position in range(len(data.columns)):
    distinct_values = pd.unique(data.iloc[:, position])
    print(position, len(distinct_values))

### Identify columns with few distinct values

In [None]:
# For each column print: position, distinct-value count, and that count as a
# percentage of the number of rows.
n_rows = data.shape[0]
for position in range(len(data.columns)):
    n_distinct = len(pd.unique(data.iloc[:, position]))
    share = float(n_distinct) / n_rows * 100
    print("%d, %d, %.1f%%" % (position, n_distinct, share))

### Duplicate detection.

In [None]:
# Boolean mask flagging rows that repeat an earlier row.
duplicado = data.duplicated()
has_duplicates = duplicado.any()
print(has_duplicates)
# Show the duplicated rows themselves (an empty frame when there are none).
print(data.loc[duplicado])

#### Thanks to @sulianova for this observation: "In addition, in some cases diastolic pressure is higher than systolic, which is also incorrect. How many records are inaccurate in terms of blood pressure?"

In [None]:
# Count the records whose diastolic reading (ap_lo) exceeds the systolic one (ap_hi).
n_inverted = data[data['ap_lo'] > data['ap_hi']].shape[0]
print("Diastolic pressure is higher than systolic one in {0} cases".format(n_inverted))

In [None]:
# Box plot of ap_lo to make extreme readings visible.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=data, x="ap_lo", color='red', ax=ax)

In [None]:
# Box plot of ap_hi to make extreme readings visible.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=data, x="ap_hi", color='red', ax=ax)

In [None]:
# Drop rows outside the 2.5%-97.5% quantile band, first for ap_hi and then for
# ap_lo. The ap_lo quantiles are computed on the frame already trimmed by ap_hi.
for column in ('ap_hi', 'ap_lo'):
    lower_bound = data[column].quantile(0.025)
    upper_bound = data[column].quantile(0.975)
    out_of_band = (data[column] > upper_bound) | (data[column] < lower_bound)
    data.drop(data[out_of_band].index, inplace=True)

In [None]:
# Side-by-side box plots of both pressure columns after trimming.
blood_pressure = data.loc[:, ['ap_lo', 'ap_hi']]
sns.boxplot(x='variable', y='value', data=blood_pressure.melt())

# Re-count the records whose diastolic reading (ap_lo) still exceeds the systolic one (ap_hi).
n_inverted = data[data['ap_lo'] > data['ap_hi']].shape[0]
print("Diastolic pressure is higher than systolic one in {0} cases".format(n_inverted))

### Outliers

In [None]:
# Box plot of height before outlier removal.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=data, x="height", color='lightblue', ax=ax)

In [None]:
# Box plot of weight before outlier removal.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=data, x="weight", color='lightblue', ax=ax)

In [None]:
# Keep only rows whose height, and then weight, lies within 3 standard
# deviations of the mean. The weight z-scores are computed on the frame that
# was already filtered by height.
for column in ("height", "weight"):
    within_3_sigma = np.abs(stats.zscore(data[column])) < 3
    data = data[within_3_sigma]

# Materialise the filtered rows as an independent frame so that later column
# assignments on `data` do not act on a slice of the previous frame (which can
# raise SettingWithCopyWarning on pandas versions without copy-on-write).
data = data.copy()

In [None]:
# Box plot of height after the z-score filter.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=data, x="height", color='lightblue', ax=ax)

In [None]:
# Box plot of weight after the z-score filter.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=data, x="weight", color='lightblue', ax=ax)

### Combine weight and height

In [None]:
# bmi = weight / (height / 100) ** 2, rounded to one decimal place.
height_scaled = data["height"] / 100
data["bmi"] = (data["weight"] / height_scaled ** 2).round(1)

In [None]:
# Box plot of bmi before outlier removal.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=data, x="bmi", color='lightblue', ax=ax)

In [None]:
# Keep only rows whose bmi lies within 3 standard deviations of the mean.
# .copy() makes the result an independent frame so that later column
# assignments on `data` do not act on a slice of the previous frame (which can
# raise SettingWithCopyWarning on pandas versions without copy-on-write).
bmi_within_3_sigma = np.abs(stats.zscore(data["bmi"])) < 3
data = data[bmi_within_3_sigma].copy()

In [None]:
# Box plot of bmi after the z-score filter.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=data, x="bmi", color='lightblue', ax=ax)

### Delete columns that are no longer needed (weight, height, id, age)

In [None]:
# Remove weight, height, id and age from the frame (in place).
data.drop(columns=['weight', 'height', "id", "age"], inplace=True)

In [None]:
# Preview the first rows after the column drop.
data.head()

In [None]:
# Summary statistics of the cleaned frame, one row per column.
data.describe().transpose()

### Encoding

In [None]:
# cholesterol and gluc share the same 1/2/3 coding; replace the codes with labels.
level_names = {1: 'normal', 2: 'above normal', 3: 'well above normal'}
for column in ('cholesterol', 'gluc'):
    data[column] = data[column].map(level_names)

# One-hot encode both columns (dropping the first level of each) and swap the
# label columns for the resulting indicator columns.
dummies = pd.get_dummies(data[['cholesterol', 'gluc']], drop_first=True)
data = pd.concat([data.drop(['cholesterol', 'gluc'], axis=1), dummies], axis=1)
data.head()

In [None]:
# List the column labels of the current frame.
data.columns

### Scaling

In [None]:
# The three candidate scalers that the comparison cell iterates over.
robust_sc, standard_sc, minmax_sc = (
    preprocessing.RobustScaler(),
    preprocessing.StandardScaler(),
    preprocessing.MinMaxScaler(),
)

In [None]:
# Columns that get rescaled before modelling.
scaled_features = [
    'years',
    'ap_hi',
    'ap_lo',
    'bmi',
]

In [None]:
# Feature matrix limited to the columns to be scaled, plus the cardio target.
X = data.loc[:, scaled_features]
y = data.loc[:, "cardio"]

In [None]:
# Collects one RMSE per scaler, in the order the scalers are tried.
resultado = []

# For each scaler: fit it on X, transform X, fit a depth-10 regression tree on
# the transformed features and measure the RMSE of its predictions on that
# same data.
# NOTE(review): the score is computed on the data the tree was trained on, and
# tree splits are generally insensitive to monotonic feature rescaling, so this
# comparison may not discriminate between scalers - confirm the intent.
for x in (robust_sc, standard_sc, minmax_sc):
    scaler = x.fit(X)
    X_new = scaler.transform(X)
    tree = DecisionTreeRegressor(max_depth=10, random_state=42)
    tree.fit(X_new, y)
    y_pred = tree.predict(X_new)
    RMSE = np.sqrt(mean_squared_error(y, y_pred))
    resultado.append(RMSE)
    print("El escalado Utilizado--->", x)
    print("RMSE segun el tipo de estrategia:", RMSE)
    print("----------------------------------------")

In [None]:
# Full feature matrix (every column except the target) and the cardio target.
X = data.drop(columns="cardio")
y = data.loc[:, "cardio"]

In [None]:
# 70/30 train/test split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Work on independent copies so the column assignments below do not act on
# slices produced by the split.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the StandardScaler on the training split only, then apply that same
# fitted scaler to both splits. Transforming with standard_sc itself (not with
# the `scaler` name left over from the scaler-comparison loop) ensures the
# scaler used is the one fitted here, and that no test-set statistics leak
# into the preprocessing.
standard_sc.fit(X_train[scaled_features])
X_train[scaled_features] = standard_sc.transform(X_train[scaled_features])
X_test[scaled_features] = standard_sc.transform(X_test[scaled_features])

# Machine Learning

### XGBoost with gridsearch

In [None]:
# Base XGBoost classifier passed to the grid search; verbosity=1 is the only option set here.
gbm = XGBClassifier(verbosity=1)
# Hyper-parameter grid for the XGBoost search. Only n_estimators and
# learning_rate vary (3 x 2 = 6 candidates); every other entry is a
# single-value list that pins one setting for all candidates.
# Each key appears exactly once: a repeated key in a dict literal silently
# overwrites the earlier entry.
params_xgb = {
    "n_estimators": [150, 250, 500],
    "learning_rate": [0.01, 0.1],
    "gpu_id": [0],
    "predictor": ["gpu_predictor"],
    "tree_method": ["gpu_hist"],
    "updater": ["grow_gpu_hist"],
    "sampling_method": ["gradient_based"],
}

In [None]:
# 5-fold grid search over params_xgb, using all available cores.
model_xgb = GridSearchCV(
    estimator=gbm,
    param_grid=params_xgb,
    cv=5,
    n_jobs=-1,
)
model_xgb.fit(X_train, y_train)

In [None]:
# Best parameter combination and its mean cross-validated score.
print(f"Best params: {model_xgb.best_params_!s}")
print(f"Best Score: {model_xgb.best_score_!s}\n")

In [None]:
# Full cross-validation results, best-ranked candidate first.
xgb_cv_results = model_xgb.cv_results_
scores = pd.DataFrame(xgb_cv_results)
scores.sort_values("rank_test_score")

In [None]:
# Predictions of the refitted best XGBoost model on the train and test splits.
y_train_pred_xgb, y_test_pred_xgb = (
    model_xgb.predict(X_train),
    model_xgb.predict(X_test),
)

In [None]:
# Per-class precision/recall/F1 of the XGBoost model on the test split.
xgb_report = classification_report(y_test, y_test_pred_xgb)
print(xgb_report)

In [None]:
# Confusion matrix of the XGBoost model on the test split.
# NOTE(review): plot_confusion_matrix is not available in recent scikit-learn
# releases (ConfusionMatrixDisplay.from_estimator replaces it) - confirm the
# installed version before upgrading.
plot_confusion_matrix(estimator=model_xgb, X=X_test, y_true=y_test)

### MLPClassifier

In [None]:
# Base multi-layer perceptron passed to the grid search; random_state is fixed to 42.
clf = MLPClassifier(random_state=42)
# Hyper-parameter grid for the MLP search:
# 3 x 4 x 3 x 3 x 3 x 1 = 324 candidate combinations.
params_MLP = dict(
    hidden_layer_sizes=[64, 128, 256],
    activation=["identity", "logistic", "tanh", "relu"],
    solver=["lbfgs", "sgd", "adam"],
    learning_rate=["constant", "invscaling", "adaptive"],
    max_iter=[100, 200, 300],
    warm_start=[True],
)

In [None]:
# 3-fold grid search over params_MLP, using all available cores.
model_MLP = GridSearchCV(
    estimator=clf,
    param_grid=params_MLP,
    cv=3,
    n_jobs=-1,
)
model_MLP.fit(X_train, y_train)

In [None]:
# Best parameter combination and its mean cross-validated score.
print(f"Best params: {model_MLP.best_params_!s}")
print(f"Best Score: {model_MLP.best_score_!s}\n")

In [None]:
# Full cross-validation results, best-ranked candidate first.
mlp_cv_results = model_MLP.cv_results_
scores = pd.DataFrame(mlp_cv_results)
scores.sort_values("rank_test_score")

In [None]:
# Predictions of the refitted best MLP on the train and test splits.
y_train_pred_MLP, y_test_pred_MLP = (
    model_MLP.predict(X_train),
    model_MLP.predict(X_test),
)

In [None]:
# Per-class precision/recall/F1 of the MLP on the test split. This must use the
# MLP predictions (y_test_pred_MLP); passing y_test_pred_xgb here would just
# repeat the XGBoost report.
print(classification_report(y_test, y_test_pred_MLP))

In [None]:
# Confusion matrix of the MLP on the test split.
# NOTE(review): plot_confusion_matrix is not available in recent scikit-learn
# releases (ConfusionMatrixDisplay.from_estimator replaces it) - confirm the
# installed version before upgrading.
plot_confusion_matrix(estimator=model_MLP, X=X_test, y_true=y_test)