In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input mount and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the heart-risk CSV from the Kaggle input mount into a DataFrame.
csv_path = '/kaggle/input/ascvd-heart-risk/heartRisk.csv'
df = pd.read_csv(csv_path)

In [None]:
# Print a concise summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Preview the first rows of the frame (head() defaults to five rows).
df.head()

# Data is very clean and ready for EDA and training

In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

# Broad correlation searching

In [None]:
# Pairwise correlation of every column, drawn as an annotated heatmap on a 12x8 figure.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='Spectral', ax=ax)

# no strong correlations between features

# Features ranked by correlation with Risk, strongest first: age, systolic blood pressure, diabetic, smoker, high blood pressure (hypertensive), male sex

In [None]:
# smokers and non-smokers are split almost evenly
sns.countplot(data=df, x='isSmoker')

In [None]:
# reasonable representation of ages between 40 and 80 within the dataset
sns.displot(data=df, x='Age', bins=40)

In [None]:
# male and female subjects are split close to evenly
sns.countplot(data=df, x='isMale')

In [None]:
# data is slightly more skewed towards the black population
sns.countplot(data=df, x='isBlack')

In [None]:
# slightly more diabetics than non-diabetics in the dataset
sns.countplot(data=df, x='isDiabetic')

In [None]:
# even split between high and low blood pressures
sns.countplot(data=df, x='isHypertensive')

In [None]:
# systolic blood pressure covers an even range of values
sns.displot(data=df, x='Systolic')

In [None]:
# total cholesterol is slightly skewed towards lower values
sns.displot(data=df, x='Cholesterol')

In [None]:
# data slightly skewed towards lower HDL values
# NOTE(review): this note originally said "cholesterols" (copied from the previous cell) — confirm the skew against this plot
sns.displot(data=df, x='HDL')

# numerical data skew

In [None]:
# Per-column skewness of the frame.
# For the binary is* indicator columns (assumed to be coded 0/1 — confirm with df.info()),
# a positive skew means the 0/False class is the majority.
df.skew()

# Data is clean, evenly distributed and not wildly scaled. We're ready for regression prediction of the risk.

In [None]:
# Separate the prediction target ('Risk') from the feature matrix (every other column).

y = df['Risk']
X = df.drop(columns='Risk')

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

# training regression models

In [None]:
# Fit an ordinary least-squares baseline and predict on the held-out split.

from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting can be chained
lr = LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)

In [None]:
# Score the linear model on the held-out split: MAE, RMSE and R^2.

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

lr_mse = mean_squared_error(y_test, lr_pred)
lr_mae, lr_rmse, lr_r2 = (
    mean_absolute_error(y_test, lr_pred),
    np.sqrt(lr_mse),  # RMSE is the square root of the mean squared error
    r2_score(y_test, lr_pred),
)

print(lr_mae, lr_rmse, lr_r2)

In [None]:
# Decision Tree Regressor training

from sklearn.tree import DecisionTreeRegressor

# Seed the estimator: scikit-learn permutes the candidate features at every split, so
# without random_state ties between equally good splits are broken differently on each
# run and the fitted tree (and its test metrics) change between executions.
# The value 7 matches the seed already used for train_test_split.
dtr = DecisionTreeRegressor(random_state=7)
dtr.fit(X_train, y_train)
dtr_pred = dtr.predict(X_test)

In [None]:
# Score the decision tree on the held-out split: MAE, RMSE and R^2.

dtr_r2 = r2_score(y_test, dtr_pred)
dtr_mae = mean_absolute_error(y_test, dtr_pred)
dtr_mse = mean_squared_error(y_test, dtr_pred)
dtr_rmse = np.sqrt(dtr_mse)  # RMSE is the square root of the mean squared error

print(dtr_mae, dtr_rmse, dtr_r2)

In [None]:
# Random Forest training

from sklearn.ensemble import RandomForestRegressor

# Seed the forest: bootstrap resampling of the rows and the per-split feature sampling
# are both random, so without random_state every run trains a different ensemble and
# the model comparison further down is not reproducible.
# The value 7 matches the seed already used for train_test_split.
rfr = RandomForestRegressor(n_estimators=200, random_state=7)
rfr.fit(X_train, y_train)
rfr_pred = rfr.predict(X_test)

In [None]:
# Score the random forest on the held-out split: MAE, RMSE and R^2.

rfr_r2 = r2_score(y_test, rfr_pred)
rfr_mae = mean_absolute_error(y_test, rfr_pred)
rfr_mse = mean_squared_error(y_test, rfr_pred)
rfr_rmse = np.sqrt(rfr_mse)  # RMSE is the square root of the mean squared error

print(rfr_mae, rfr_rmse, rfr_r2)

In [None]:
# Fit a gradient-boosted tree regressor with default settings and predict on the held-out split.

from xgboost import XGBRegressor

# fit() returns the estimator itself, so construction and fitting can be chained
xgbr = XGBRegressor().fit(X_train, y_train)
xgbr_pred = xgbr.predict(X_test)

In [None]:
# Score the XGBoost model on the held-out split: MAE, RMSE and R^2.

xgbr_r2 = r2_score(y_test, xgbr_pred)
xgbr_mae = mean_absolute_error(y_test, xgbr_pred)
xgbr_mse = mean_squared_error(y_test, xgbr_pred)
xgbr_rmse = np.sqrt(xgbr_mse)  # RMSE is the square root of the mean squared error

print(xgbr_mae, xgbr_rmse, xgbr_r2)

In [None]:
# Gather every model's test-set scores into one comparison table (one row per model).
model_names = [
    'Linear Regression',
    'Decision Tree Regressor',
    'Random Forest Regressor',
    'XGBoost Regressor',
]
results = pd.DataFrame({
    'Model': model_names,
    'MAE': [lr_mae, dtr_mae, rfr_mae, xgbr_mae],
    'RMSE': [lr_rmse, dtr_rmse, rfr_rmse, xgbr_rmse],
    'r2': [lr_r2, dtr_r2, rfr_r2, xgbr_r2],
})

results

# XGBoost Regressor wins!