In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns 
from warnings import filterwarnings as filt 
from scipy.stats import skew, norm 
import plotly.express as px 

# Silence all warnings for the rest of the notebook, then set the global
# matplotlib style and default figure size used by every plot below.
filt('ignore')
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (12,8)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-risk CSV from the Kaggle input directory and display its (rows, columns) shape.
df = pd.read_csv('/kaggle/input/ascvd-heart-risk/heartRisk.csv')
df.shape

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().transpose()

In [None]:
# Total number of missing cells across the whole frame.
df.isna().to_numpy().sum()

In [None]:
# Draw one distribution plot per column, from 'Age' through the last column of df.
# Each column gets its own figure number so the plots do not overlap.
for ind, col in enumerate(df.loc[:, 'Age':].columns):
    plt.figure(ind)
    sns.distplot(df[col])

In [None]:
# Pairwise correlations; only the strictly-lower triangle is kept so that each
# pair is shown once and the diagonal is hidden.
corr = df.corr()
lower_corr = corr.where(np.tri(*corr.shape, k = -1, dtype = bool))
sns.heatmap(lower_corr, fmt = '.2f', annot = True, cmap = 'icefire')

In [None]:
# Regress Risk on Age, with a separate panel and colour for each isMale value.
sns.lmplot(data = df, x = 'Age', y = 'Risk', hue = 'isMale', col = 'isMale')

In [None]:
# Scatter Risk against Systolic, coloured by isDiabetic.
sns.scatterplot(data = df, x = 'Systolic', y = 'Risk', hue = 'isDiabetic')

In [None]:
# Count rows per isDiabetic value, split by isSmoker. Columns are passed by
# keyword (data/x/hue) like the other seaborn calls in this notebook, because a
# Series passed positionally is interpreted as `data` by newer seaborn releases.
sns.countplot(data = df, x = 'isDiabetic', hue = 'isSmoker')

In [None]:
# Systolic blood pressure below 120 mm Hg is considered normal; a reading in the
# 120 - 139 mm Hg range indicates elevated risk.
def sustolic(x):
    """Map a systolic reading to a blood-pressure band.

    Returns 'normal' when ``x`` is below 120, 'elevated' when ``x`` lies in
    the closed range 120 to 139, and 'high_pressure' for any other value.
    """
    if x < 120:
        return 'normal'
    if x <= 139:
        # Reaching here with x <= 139 means x is within 120..139.
        return 'elevated'
    return 'high_pressure'

In [None]:
# Label every row with its blood-pressure band and preview the result.
df['Systolic_levels'] = df.Systolic.apply(sustolic)
df.head()

In [None]:
# One-hot encode the blood-pressure band, then swap the text column for the
# resulting indicator columns.
dummy_sus = pd.get_dummies(df.Systolic_levels, prefix = 'Blood_pressure')
df = pd.concat([df.drop(columns = ['Systolic_levels']), dummy_sus], axis = 1)

In [None]:
# Preview the frame with the new Blood_pressure_* indicator columns.
df.head()

In [None]:
# Absolute correlations among the columns from 'Risk' to the end of the frame.
sns.heatmap(df.loc[:, 'Risk':].corr().abs(), fmt = '.2f', annot = True)

To avoid the dummy-variable trap, let's drop the `Blood_pressure_high_pressure` column.

In [None]:
# Same heatmap as before, with the Blood_pressure_high_pressure column left out.
sns.heatmap(df.loc[:, 'Risk':].drop(['Blood_pressure_high_pressure'], axis = 1).corr().abs(), fmt = '.2f', annot = True)

In [None]:
# Remove one indicator column so the remaining blood-pressure dummies are not redundant.
df = df.drop(columns = ['Blood_pressure_high_pressure'])

In [None]:
# Preview the frame after dropping Blood_pressure_high_pressure.
df.head()

In [None]:
from eli5 import show_weights
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestRegressor
from pdpbox.pdp import * 
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split

In [None]:
def permImp(x, y):
    """Render permutation importances for the columns of ``x``.

    Fits a ``RandomForestRegressor`` on ``(x, y)``, fits eli5's
    ``PermutationImportance`` on the same data, and returns the
    ``show_weights`` rendering labelled with ``x``'s column names.
    """
    forest = RandomForestRegressor()
    forest.fit(x, y)
    importance = PermutationImportance(forest)
    importance.fit(x, y)
    return show_weights(importance, feature_names = x.columns.tolist())

def plot_mi(score):
    """Draw a horizontal bar chart of ``score['mi_score']``.

    Rows are sorted in ascending order before plotting. Returns ``None``.
    """
    ordered = score.sort_values('mi_score', ascending = True)
    plt.barh(ordered.index, ordered['mi_score'])

def mi_score(x, y):
    """Compute, plot and return the mutual information of each feature with ``y``.

    Every column of ``x`` is treated as continuous (``discrete_features = False``)
    and the estimator is seeded with ``random_state = 123``. The result is a
    one-column DataFrame (``mi_score``) indexed by feature name and sorted in
    descending order; it is also passed to ``plot_mi`` before being returned.
    """
    mi_values = mutual_info_regression(x, y, discrete_features = False, random_state = 123)
    score = pd.DataFrame(mi_values, index = x.columns, columns = ['mi_score'])
    score = score.sort_values('mi_score', ascending = False)
    plot_mi(score)
    return score

def isolate(x, y, col):
    """Plot the partial dependence of a fitted random forest on one feature.

    Fits a ``RandomForestRegressor`` on ``(x, y)``, passes it to pdpbox's
    ``pdp_isolate`` for the feature ``col`` and returns the ``pdp_plot`` result.
    """
    forest = RandomForestRegressor()
    forest.fit(x, y)
    isolated = pdp_isolate(forest, model_features = x.columns, dataset = x, feature = col)
    return pdp_plot(isolated, feature_name = col)

In [None]:
# Absolute pairwise correlations after feature engineering; only the
# strictly-lower triangle is kept so each pair appears once.
corr = df.corr().abs()
lower_corr = corr.where(np.tri(*corr.shape, k = -1, dtype = bool))
sns.heatmap(lower_corr, fmt = '.2f', annot = True, cmap = 'icefire')

In [None]:
# Separate the features from the Risk target and hold out 30% of the rows for testing.
x = df.drop(columns = ['Risk'])
y = df['Risk']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)
x_train.shape, x_test.shape

### Feature importance

In [None]:
# Permutation importance of every feature, computed on the full (unsplit) data.
permImp(x, y)

In [None]:
# Mutual-information score per feature; the call also draws the bar chart.
mscore = mi_score(x, y)

In [None]:
# Partial-dependence plot for the 'Age' feature.
isolate(x, y, 'Age')

From this plot we can infer that the greater the age, the greater the predicted heart-risk rate, which is consistent with Age having a high positive correlation with Risk.

### Model building

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
def best_model(x, y, fold = 10):
    """Cross-validate every model/scaler combination and tabulate the RMSE.

    Parameters
    ----------
    x : features passed to ``cross_val_score``.
    y : target passed to ``cross_val_score``.
    fold : int, default 10
        Forwarded as ``cv`` to ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        One row per model and one column per scaler. Each cell is the square
        root of the mean cross-validated MSE for that combination.
    """
    models = [LinearRegression(), Lasso(), Ridge(), SVR(), DecisionTreeRegressor(), RandomForestRegressor(), LGBMRegressor()]
    mnames = ['linar regg', 'lasso', 'ridge', 'svm', 'decision tree', 'random forest', 'lgbm']
    scalers = [None, StandardScaler(), RobustScaler(), MinMaxScaler()]
    snames = ['none', 'std', 'robust', 'minmax']
    # One score list per scaler, sized from the scaler list itself.
    scores = [[] for _ in scalers]

    print(f'Total number of iterations : {len(mnames) * len(snames)}')
    for model in models:
        for ind, scaler in enumerate(scalers):
            # Build the estimator under a separate name: rebinding `model` here
            # would make the next scaler wrap the previous pipeline, so later
            # columns would be scored with several scalers stacked together.
            if scaler is not None:
                estimator = Pipeline(steps = [('scaler', scaler), ('model', model)])
            else:
                estimator = model
            score = cross_val_score(estimator, x, y, cv = fold, scoring = 'neg_mean_squared_error').mean()
            rmse = np.sqrt(-1 * score)
            scores[ind].append(rmse)

    return pd.DataFrame(scores, index = snames, columns = mnames).T

def report(xt, yt, xtest, ytest, pred, model):
    """Print train/test RMSE and MAE and return a preview of the predictions.

    ``model`` must already be fitted; it is used to predict on ``xt`` for the
    training metrics, while ``pred`` supplies the test-set predictions.
    Returns the first rows of a DataFrame pairing ``ytest`` with ``pred``.
    """
    print(' REPORT '.center(60, '='))
    print()

    train_pred = model.predict(xt)
    train_rmse = np.sqrt(mean_squared_error(yt, train_pred))
    test_rmse = np.sqrt(mean_squared_error(ytest, pred))
    print(f'Training RMSE :====> {train_rmse}')
    print(f'Testing  RMSE :====> {test_rmse}')
    print()

    train_mae = mean_absolute_error(yt, train_pred)
    test_mae = mean_absolute_error(ytest, pred)
    print(f"Training  MAE :====> {train_mae}")
    print(f'Testing   MAE :====> {test_mae}')
    print()

    comparison = pd.DataFrame({
        'actual value' : ytest.values,
        'predicted value' : pred
    })
    return comparison.head()

def get_score(xt, yt, xtest, ytest, model, scaler = None, predict = True):
    """Fit ``model`` on the training data and return ``report`` for the test data.

    When ``scaler`` is truthy the model is wrapped in a two-step Pipeline
    (scaler, then model) before fitting. ``predict`` is accepted but not used
    by the body.
    """
    estimator = Pipeline(steps = [('scaler', scaler), ('model', model)]) if scaler else model
    estimator.fit(xt, yt)
    test_pred = estimator.predict(xtest)
    return report(xt, yt, xtest, ytest, test_pred, estimator)
    
def gridcv(x, y, model, params, scaler = None, fold = 10):
    """Run a grid search and return it together with an RMSE summary table.

    When ``scaler`` is truthy the model is wrapped in a (scaler, model)
    Pipeline, so ``params`` keys must then carry the ``model__`` prefix.
    The search scores with negative MSE using ``fold`` as ``cv``.

    Returns
    -------
    (GridSearchCV, pandas.DataFrame)
        The fitted search object, and a frame with the columns
        ``mean_train_score``, ``mean_test_score`` (both converted to RMSE) and
        ``params``, sorted by ascending test RMSE.
    """
    estimator = model
    if scaler:
        estimator = Pipeline(steps = [('scaler', scaler), ('model', model)])

    clf = GridSearchCV(
        estimator,
        param_grid = params,
        cv = fold,
        scoring = 'neg_mean_squared_error',
        return_train_score = True,
        verbose = 2,
        n_jobs = -1,
    )
    clf.fit(x, y)

    score_cols = ['mean_train_score', 'mean_test_score']
    res = pd.DataFrame(clf.cv_results_)
    # Scores are negative MSE; flip the sign and take the root to get RMSE.
    res[score_cols] = np.sqrt(-1 * res[score_cols])
    res = res[score_cols + ['params']]
    return clf, res.sort_values('mean_test_score', ascending = True)

def plot_cv(res):
    """Plot the train and test RMSE columns of a ``gridcv`` result frame.

    Both ``mean_train_score`` and ``mean_test_score`` are drawn as lines
    against a fresh 0..n-1 index obtained from ``res.reset_index()``.
    """
    positions = res.reset_index().index
    for column in ('mean_train_score', 'mean_test_score'):
        sns.lineplot(x = positions, y = res[column])
    plt.title('RMSE comparision')
    plt.legend(['train', 'test'])
    

In [None]:
# Compare the 10-fold cross-validated RMSE of every model/scaler pair on the training split.
best_model(x_train, y_train)

In [None]:
# Fit LightGBM behind a StandardScaler and report train/test RMSE and MAE.
get_score(x_train, y_train, x_test, y_test, LGBMRegressor(), StandardScaler())

In [None]:
# Distribution of the Risk target with a fitted normal curve overlaid.
sns.distplot(df.Risk, fit = norm)

In [None]:
# Keep a handle on the untransformed target, then replace Risk with log1p(Risk)
# rounded to 2 decimals.
# NOTE(review): this overwrites df['Risk'] in place, so re-running the cell
# applies log1p to the already-transformed values. Restart and run all if the
# cell has been executed before.
og_risk = df.Risk
df['Risk'] = np.round(np.log1p(df.Risk), 2)

In [None]:
# Check the first transformed target values.
df.Risk.head()

In [None]:
# Re-plot the target distribution after the log transform, again with a fitted normal curve.
sns.distplot(df.Risk, fit = norm)

In [None]:
# Rebuild the features/target from the frame whose Risk column is now
# log-transformed, and make a fresh 70/30 split.
new_x = df.drop(['Risk'], axis = 1)
new_y = df.Risk
new_x_train, new_x_test, new_y_train, new_y_test = train_test_split(new_x, new_y, test_size = 0.3)
# Show the shapes of the NEW split (x_test belongs to the earlier split).
new_x_train.shape, new_x_test.shape

In [None]:
# Repeat the model/scaler comparison on the log-transformed target.
best_model(new_x_train, new_y_train)

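After log-transforming the target variable, every model's RMSE dropped sharply. Note that these RMSE values are measured on the log1p scale, so they are not directly comparable with the earlier RMSE values computed on the original Risk scale.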

In [None]:
# Fit an SVR behind a StandardScaler on the log-transformed target and report the errors.
get_score(new_x_train, new_y_train, new_x_test, new_y_test, SVR(), StandardScaler())

It's already good enough, but let's try to reduce the error even further.

In [None]:
# Candidate SVR hyper-parameters for the grid search.
params = dict(
    C = [1, 50, 100, 500],
    kernel = ['rbf', 'sigmoid'],
    gamma = ['scale', 'auto'],
    epsilon = [0.1, 0.01, 1, 0.5],
)

# Inside a Pipeline the SVR step is called 'model', so every key needs the
# 'model__' prefix to reach it.
pip_params = {'model__' + name : grid for name, grid in params.items()}
pip_params

In [None]:
# 5-fold grid search over the SVR parameters inside a StandardScaler pipeline.
clf, results = gridcv(new_x_train, new_y_train, SVR(), pip_params, StandardScaler(), 5)

In [None]:
# Plot the train and test RMSE of the grid-search results.
plot_cv(results)

In [None]:
# Parameter combinations with the lowest cross-validated test RMSE (results is sorted ascending).
results.head()

In [None]:
# Best estimator found by the grid search.
clf.best_estimator_

In [None]:
# Evaluate the tuned estimator on the held-out split (get_score refits it on the training split first).
get_score(new_x_train, new_y_train, new_x_test, new_y_test, clf.best_estimator_)

Finally, we were able to reduce the RMSE to 0.082 (measured on the log-transformed target).

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Project the feature matrix to 2-D with t-SNE for visualisation.
# NOTE(review): no random_state is set, so the embedding differs between runs.
# NOTE(review): recent scikit-learn releases appear to have renamed `n_iter` to
# `max_iter` - confirm against the installed version.
tsne = TSNE(n_components = 2, n_iter=5000)
tsne_x = tsne.fit_transform(new_x)
tsne_x

In [None]:
# Wrap the embedding in a DataFrame and attach three indicator columns from new_x for colouring.
# NOTE(review): pd.concat(axis = 1) aligns rows on the index - this assumes new_x
# still has the default 0..n-1 index; confirm.
# NOTE(review): tsne_x is rebound here, so this cell cannot be re-run without
# re-running the t-SNE cell first.
tsne_x = pd.DataFrame(tsne_x, columns = ['x', 'y'])
tsne_x = pd.concat([tsne_x, new_x[['isMale', 'isSmoker', 'Blood_pressure_normal']]], axis = 1)
tsne_x.head()

In [None]:
# Scatter the 2-D embedding, coloured by the Blood_pressure_normal indicator.
sns.scatterplot(data = tsne_x, x = 'x', y = 'y', hue = 'Blood_pressure_normal')