In [None]:
# import libraries
import collections

# star import kept ahead of the explicit imports so that the names imported
# explicitly below take precedence over anything it exports
from rfpimp import * # might not be installed already, otherwise install using pip install

import numpy as np
import pandas as pd
from numpy import mean, std
from scipy import stats
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.preprocessing import MinMaxScaler

In [None]:
# load data: read the Excel sheet, then drop the 'Unnamed: 0' and
# 'abs_error_estimate' columns before anything else is derived from it
data = pd.read_excel('model_data_S_not_zero_absERROR_NotVent.xlsx')
data = data.drop(columns = ['Unnamed: 0', 'abs_error_estimate'])
data

In [None]:
# prep data into X and Y (X also scaled for the linear model)

# column subsets used by the two models
RF_FEATURES = ['lh_parstriangularis_volume', 'lh_postcentral_volume', 'Left-Pallidum',
       'rh_postcentral_volume', 'rh_paracentral_volume',
       'lh_superiortemporal_volume', 'lh_parsorbitalis_volume', 'Putamen',
       'lh_caudalmiddlefrontal_volume', 'Right-Amygdala',
       'lh_entorhinal_volume', 'lh_fusiform_volume', 'Right-Pallidum',
       '-VentralDC', 'rh_supramarginal_volume', 'rh_parsopercularis_volume',
       'Left-Accumbens-area', 'CC_Posterior', 'lh_lateralorbitofrontal_volume',
       'rh_lateralorbitofrontal_volume', 'lh_inferiortemporal_volume']
LASSO_FEATURES = ['lh_caudalmiddlefrontal_volume', 'lh_entorhinal_volume',
       'lh_fusiform_volume', 'lh_parsorbitalis_volume',
       'lh_parstriangularis_volume', 'lh_postcentral_volume',
       'lh_superiorparietal_volume', 'lh_superiortemporal_volume',
       'rh_entorhinal_volume', 'rh_postcentral_volume',
       'rh_rostralanteriorcingulate_volume', 'rh_transversetemporal_volume',
       'Left-Accumbens-area', 'gender_F']

# all predictors: everything except the target column S
X = data.drop(columns = ['S'])

# random-forest inputs; .copy() makes X_RF an independent frame, so the RANDOM
# column that the random-forest LOOCV cell assigns to it does not trigger
# pandas' SettingWithCopyWarning
X_RF = X[RF_FEATURES].copy()

# Lasso inputs, min-max scaled to [0, 1] (returned as a NumPy array)
X_lasso = X[LASSO_FEATURES]
# NOTE(review): the scaler is fitted on all rows before any cross-validation
# split, so held-out samples contribute to the scaling range — consider
# fitting it inside each fold if strict separation is required.
X_scaled = MinMaxScaler().fit_transform(X_lasso)

# target
Y = data.S
print( X.shape, Y.shape)

In [None]:
# put the scaled array back into a DataFrame (re-using the Lasso column names)
# and append a column of uniform(0, 1) noise called RANDOM
X_sc_df = pd.DataFrame(X_scaled, columns = X_lasso.columns)
X_sc_df['RANDOM'] = np.random.uniform(low = 0, high = 1, size = X_sc_df.shape[0])

In [None]:
# show the scaled Lasso feature frame together with its RANDOM column
X_sc_df

**LOOCV LINEAR REGRESSION**

In [None]:
# Leave-one-out cross-validation of a Lasso model. For every left-out sample an
# inner 5-fold grid search picks alpha on the remaining samples; the tuned model
# and a mean-of-training-targets baseline are then scored on the left-out sample.

# storage for the per-fold results (one entry per left-out sample)
MSE_lasso = list()                     # MSE of the tuned Lasso on the left-out sample
MSE_base = list()                      # MSE of the baseline on the left-out sample
Y_preds = list()                       # Lasso predictions (length-1 arrays)
Y_trues = list()                       # left-out targets (length-1 Series)
Y_means = list()                       # baseline predictions (length-1 lists)
coef = collections.defaultdict(list)   # feature name -> coefficient in each fold
alphas = list()                        # alpha chosen by the grid search in each fold

# seeded generator so the RANDOM noise feature, and with it the whole run,
# is reproducible after a kernel restart
rng = np.random.default_rng(42)

# outer loop: leave-one-out
cv = LeaveOneOut()

# the inner CV splitter and the alpha grid are the same for every fold,
# so they are built once instead of inside the loop
cv_inner = KFold(n_splits = 5, shuffle = True, random_state = 10)
grid = {"alpha": np.arange(0.001, 0.999, 0.001)}

# execute the loocv
for train_ix, test_ix in cv.split(X_sc_df):

    # draw new RANDOM values for this fold on a per-fold frame, so the shared
    # X_sc_df is not modified while iterating
    X_fold = X_sc_df.assign(RANDOM = rng.uniform(0, 1, size = len(X_sc_df)))
    feature_list = X_fold.columns.tolist()

    # split data
    X_train, X_test = X_fold.iloc[train_ix, :], X_fold.iloc[test_ix, :]
    y_train, y_test = Y.iloc[train_ix], Y.iloc[test_ix]

    # inner loop: grid search over alpha, scored by (negative) MSE
    model_s = linear_model.Lasso()
    search = GridSearchCV(model_s, grid, scoring = 'neg_mean_squared_error', cv = cv_inner, n_jobs = 2)
    search.fit(X_train, y_train)

    # best model from the grid search (refitted on the full training fold)
    model = search.best_estimator_
    alphas.append(model.alpha) # store alpha value resulting from the grid search

    # evaluate best Lasso model on the left-out sample
    y_pred = model.predict(X_test)
    Y_preds.append(y_pred) # store predicted Y value
    MSE_lasso.append(mean_squared_error(y_test, y_pred)) # store Lasso model score

    # store true Y value
    Y_trues.append(y_test)

    # evaluate baseline: predict the mean of the training targets
    y_pred_base = [mean(y_train)]
    Y_means.append(y_pred_base) # store base model Y value
    MSE_base.append(mean_squared_error(y_test, y_pred_base)) # store base model score

    print(model)

    # store coefficients of the features
    for feature, coeff in zip(feature_list, model.coef_):
        coef[feature].append(coeff)

# print mean MSE and its standard deviation, followed by the standard error of the mean
print('MSE Lasso: %.3f (%.3f)' % (mean(MSE_lasso), std(MSE_lasso)))
print(stats.sem(MSE_lasso))
print('MSE Baseline: %.3f (%.3f)' % (mean(MSE_base), std(MSE_base)))
print(stats.sem(MSE_base))

In [None]:
# report the LOOCV error of both models: mean (standard deviation) on one line,
# standard error of the mean on the next
print(f'MSE Lasso: {mean(MSE_lasso):.4f} ({std(MSE_lasso):.4f})')
print(stats.sem(MSE_lasso))
print(f'MSE Baseline: {mean(MSE_base):.3f} ({std(MSE_base):.3f})')
print(stats.sem(MSE_base))

In [None]:
# one row per feature, one column per LOOCV fold
coef_lasso = pd.DataFrame.from_dict(coef, orient = 'index')

# append the across-fold mean ('coefficient') and standard deviation ('std');
# both are computed from the per-fold columns only
coef_lasso_tot = coef_lasso.assign(
    coefficient = coef_lasso.mean(axis = 1),
    std = coef_lasso.std(axis = 1),
)

# save to excel and show the table
coef_lasso_tot.to_excel("coef_lasso_loocv_nonzero_bestmodel.xlsx")
coef_lasso_tot

In [None]:
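# Optional: reload previously saved coefficients instead of re-running the LOOCV.
# NOTE: the file name below differs from the one written by the cell above
# ("coef_lasso_loocv_nonzero_bestmodel.xlsx") - adjust it before uncommenting.
# coef_lasso_tot = pd.read_excel("coef_lasso_loocv_nonzero.xlsx").set_index('Unnamed: 0')
# coef_lasso_tot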

In [None]:
# rank the features by the magnitude of their mean coefficient, largest first
coef_lasso_tot['abs'] = coef_lasso_tot['coefficient'].abs()
coef_lasso_tot = coef_lasso_tot.sort_values(by = 'abs', ascending = False)

In [None]:
# feature names in the current row order of the coefficient table
features_list = list(coef_lasso_tot.index)

# everything except the 'coefficient' and 'std' summary columns
per_fold = coef_lasso_tot.drop(columns = ['coefficient', 'std'])

# reshape so that every feature becomes a column holding its remaining values
vis_coef = pd.DataFrame.from_dict({name: per_fold.loc[name] for name in features_list})
vis_coef

In [None]:
# per-fold MSE of the Lasso and the baseline model, one row per model
MSE = pd.DataFrame.from_dict({'lasso': MSE_lasso, 'base': MSE_base}, orient = 'index')

# add the across-fold mean and standard deviation (computed from the
# per-fold columns only) and save the table to excel
MSE_tot = MSE.assign(mean = MSE.mean(axis = 1), std = MSE.std(axis = 1))
MSE_tot.to_excel("MSE_lasso_loocv_bestmodel_nonzero.xlsx")

In [None]:
# combine y_pred, y_mean (base prediction), y_true and alpha and save to excel
# (one row per quantity, one column per LOOCV fold)
S_values = {
    # each stored prediction is a length-1 container; keep its single value
    'y_pred': [item[0] for item in Y_preds],
    'y_mean': [item[0] for item in Y_means],
    # each stored target is a length-1 Series; take the element positionally,
    # because calling float() on a Series directly is deprecated in pandas
    'y_true': [float(item.iloc[0]) for item in Y_trues],
    'alpha': alphas,
}
S_value = pd.DataFrame.from_dict(S_values, orient = 'index')
S_value.to_excel("S_values_lasso_loocv_bestmodel.xlsx")

**LOOCV RANDOM FOREST REGRESSION**

In [None]:
# Leave-one-out cross-validation of a random forest regressor. Every fold fits
# a 1000-tree forest on all samples but one, scores it and a
# mean-of-training-targets baseline on the left-out sample, and records the
# permutation importances of the fitted forest.

# storage for the per-fold results (one entry per left-out sample)
rf_feature_importance_perm = collections.defaultdict(list)     # feature -> mean permutation importance per fold
rf_feature_importance_perm_std = collections.defaultdict(list) # feature -> std of permutation importance per fold
Y_preds_rf = list()    # forest predictions (length-1 arrays)
Y_trues_rf = list()    # left-out targets (length-1 Series)
Y_means_rf = list()    # baseline predictions (length-1 lists)
MSE_rf = list()        # MSE of the forest on the left-out sample
MSE_base_rf = list()   # MSE of the baseline on the left-out sample
count = 0

# seeded generator so the RANDOM noise feature is reproducible after a kernel restart
rng = np.random.default_rng(42)

# prepare loocv
cv_outer = LeaveOneOut()

# execute the loocv
for train_ix, test_ix in cv_outer.split(X_RF):

    # draw new RANDOM values for this fold on a per-fold frame; assigning to
    # X_RF itself (a column slice of X) raises pandas' SettingWithCopyWarning
    X_fold = X_RF.assign(RANDOM = rng.uniform(0, 1, size = len(X_RF)))

    # split data
    X_train, X_test = X_fold.iloc[train_ix, :], X_fold.iloc[test_ix, :]
    y_train, y_test = Y.iloc[train_ix], Y.iloc[test_ix]

    # initiate and fit the model
    rf = RandomForestRegressor(n_estimators = 1000, random_state = 42, n_jobs = 2)
    rf.fit(X_train, y_train)
    print("fitted model:", count)

    # evaluate the RF model on the left-out sample
    y_pred = rf.predict(X_test)
    Y_preds_rf.append(y_pred) # store predicted Y value
    MSE_rf.append(mean_squared_error(y_test, y_pred)) # store RF model score

    # evaluate base model: predict the mean of the training targets
    y_base = [mean(y_train)]
    Y_means_rf.append(y_base) # store base model Y value
    MSE_base_rf.append(mean_squared_error(y_test, y_base)) # store base model score

    # store true Y value
    Y_trues_rf.append(y_test)

    # list of all features, in the column order the model was fitted on
    feature_list = X_train.keys().tolist()

    # permutation importances (mean and standard deviation over the repeats),
    # computed on the training fold
    imp = permutation_importance(rf, X_train, y_train, scoring = 'neg_mean_squared_error', random_state = 0,  n_jobs = 2)
    for feature, importance, importance_std in zip(feature_list, imp.importances_mean, imp.importances_std):
        rf_feature_importance_perm[feature].append(importance)
        rf_feature_importance_perm_std[feature].append(importance_std)

    # increase count
    count = count + 1

# print mean MSE and its standard deviation, followed by the standard error of the mean
print('MSE RF: %.4f (%.4f)' % (mean(MSE_rf), std(MSE_rf)))
print(stats.sem(MSE_rf))
print('MSE Baseline: %.4f (%.4f)' % (mean(MSE_base_rf), std(MSE_base_rf)))
print(stats.sem(MSE_base_rf))

In [None]:
# one row per feature, one column per LOOCV fold
rf_fi_perm = pd.DataFrame.from_dict(rf_feature_importance_perm, orient = 'index')

# append the across-fold mean ('importance') and standard deviation ('std'),
# both computed from the per-fold columns only, and save to excel
RF_perm = rf_fi_perm.assign(
    importance = rf_fi_perm.mean(axis = 1),
    std = rf_fi_perm.std(axis = 1),
)
RF_perm.to_excel("RF_LOOCV_perm_importance_bestmodel_nonzero.xlsx")

In [None]:
# per-fold MSE of the random forest and the baseline model, one row per model
MSE = pd.DataFrame.from_dict({'RF': MSE_rf, 'base': MSE_base_rf}, orient = 'index')

# add the across-fold mean and standard deviation (computed from the
# per-fold columns only) and save the table to excel
MSE_tot = MSE.assign(mean = MSE.mean(axis = 1), std = MSE.std(axis = 1))
MSE_tot.to_excel("MSE_RF_LOOCV_bestmodel_nonzero.xlsx")

In [None]:
# combine y_pred, y_mean (base prediction) and y_true and save to excel
# (one row per quantity, one column per LOOCV fold)
S_values = {
    # each stored prediction is a length-1 container; keep its single value
    'y_pred': [item[0] for item in Y_preds_rf],
    'y_mean': [item[0] for item in Y_means_rf],
    # each stored target is a length-1 Series; take the element positionally,
    # because calling float() on a Series directly is deprecated in pandas
    'y_true': [float(item.iloc[0]) for item in Y_trues_rf],
}
S_value = pd.DataFrame.from_dict(S_values, orient = 'index')
S_value.to_excel("S_values_RF_loocv_bestmodel.xlsx")