In [None]:
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import KFold
import statsmodels.api as sm
import statsmodels

import pandas as pd
from os.path import join, expanduser
from glob import glob

from PyQt5.QtWidgets import QFileDialog

In [None]:
home = expanduser('~')
# dataset_path = join(home, 'Work', 'ADNI_Project', 'Data_revision')  # Modify this to match your dataset location
# Ask for the dataset root interactively. loocv() later reads
# <dataset_path>/<AD|MCI|CN>/<radioisotope>/stats/*.csv below this directory.
# NOTE(review): QFileDialog normally needs a live QApplication (e.g. via `%gui qt`) -- confirm the kernel provides one.
dataset_path = str(QFileDialog.getExistingDirectory(None, 'Select directory'))
if not dataset_path:
    # Cancelling the dialog yields an empty string; without this check every
    # later join(dataset_path, ...) would silently become a relative path.
    raise ValueError('No dataset directory selected; re-run this cell and choose the dataset root.')

# Lasso 
alpha float, default=1.0

    Constant that multiplies the L1 term. Defaults to 1.0. alpha = 0 is equivalent to ordinary least squares, solved by the LinearRegression object. For numerical reasons, using alpha = 0 with the Lasso object is not advised; use the LinearRegression object instead.

I've used `LassoCV`, which runs the cross-validation over the candidate alphas internally, so we don't have to write the loop ourselves. See the documentation [here](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html#sklearn.linear_model.LassoCV).


In [None]:
def lasso(X, y, alphas, cv=None):
    '''
    Fit a cross-validated Lasso (L1-penalised) regression and return the fitted model.

    X: features handed to LassoCV.fit
    y: targets handed to LassoCV.fit
    alphas: array-like; the alpha values to be tested
    cv: int or None; if None, then LOOCV, if int then KFold with cv number of splits.
        Any other value is passed to LassoCV unchanged.

    returns: the fitted linear_model.LassoCV estimator
    '''
    if cv is None:
        # LassoCV's own default for cv=None is 5-fold CV, not leave-one-out, so
        # LOOCV has to be requested explicitly to honour the documented contract
        # (and to behave like ridge(), where cv=None already means leave-one-out).
        from sklearn.model_selection import LeaveOneOut
        cv = LeaveOneOut()
    clf_lasso = linear_model.LassoCV(alphas=alphas, cv=cv).fit(X, y)
    return clf_lasso

# Ridge
alpha {float, ndarray of shape (n_targets,)}, default=1.0

    Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. Alpha corresponds to 1 / (2C) in other linear models such as LogisticRegression or LinearSVC. If an array is passed, penalties are assumed to be specific to the targets. Hence they must correspond in number.

I've used `RidgeCV`, which runs the cross-validation over the candidate alphas internally, so we don't have to write the loop ourselves. See the documentation [here](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html#sklearn.linear_model.RidgeCV).


In [None]:
def ridge(X, y, alphas, cv=None):
    '''
    Fit a cross-validated Ridge (L2-penalised) regression and return the fitted model.

    X, y: features and targets handed to RidgeCV.fit
    alphas: array-like; candidate regularisation strengths to compare
    cv: int or None; None selects leave-one-out CV, an int selects KFold with that many splits

    returns: the fitted linear_model.RidgeCV estimator
    '''
    model = linear_model.RidgeCV(alphas=alphas, cv=cv)
    model.fit(X, y)
    return model

In [None]:
def drop_nans(X, y):
    '''
    Remove every sample whose target value is missing.

    X: DataFrame (or array) of features, one row per sample, in the same row order as y
    y: Series (or array) of targets; entries that are NaN mark samples to discard

    returns: (X, y) restricted to the rows where y is not NaN. The inputs are
             left untouched; new objects are returned.
    '''
    # Positional boolean mask of the rows to keep. Selecting by position (rather
    # than feeding positions to a label-based drop) stays correct when X or y
    # carry a non-default index.
    keep = ~np.asarray(pd.isna(y))

    def _take(obj):
        # pandas objects are sliced positionally via .iloc, anything else as an array
        return obj.iloc[keep] if hasattr(obj, 'iloc') else np.asarray(obj)[keep]

    return (_take(X), _take(y))

# LOOCV

In [None]:
def loocv(radioisotope, psych_test='MMSE'):
    '''
    Load the regression inputs for one radioisotope and return them as numpy arrays.

    Despite the name, no cross-validation is run here: the function only reads
    the CSV files and prepares (X, y) for the caller.

    radioisotope: str; directory name looked up under each of the AD, MCI and CN folders
    psych_test: str; column of summary.csv used as the target. Its lower-cased
                form selects the feature file 'output_<psych_test>.csv'.

    returns: (X_num, y_num) numpy copies of the features and targets, with the
             samples whose target is NaN removed
    '''
    def stacked_csv(filename):
        # Read <group>/<radioisotope>/stats/<filename> for AD, MCI, CN (in that
        # order) and stack the rows under a fresh 0..n-1 index.
        frames = [
            pd.read_csv(join(dataset_path, group, radioisotope, 'stats', filename))
            for group in ('AD', 'MCI', 'CN')
        ]
        return pd.concat(frames, ignore_index=True)

    features = stacked_csv('output_' + psych_test.lower() + '.csv')
    # The first two columns are not used as predictors.
    leading_cols = [features.columns[pos] for pos in (0, 1)]
    features.drop(leading_cols, axis=1, inplace=True)

    scores = stacked_csv('summary.csv')[psych_test]

    features, scores = drop_nans(features, scores)
    return (features.to_numpy(copy=True), scores.to_numpy(copy=True))

    # model = statsmodels.regression.linear_model.OLS(y, X, missing='drop').fit_regularized(alpha=2., L1_wt=0, refit=True)
    # results_summary = model.summary()
    # print(radioisotope)
    # print(results_summary.tables[0])

    # alphas = np.logspace(-6, -1, 30)

    # clf_ridge = ridge(X, y, alphas)
    # print(radioisotope, ' R-squared: ', clf_ridge.score(X, y), '\talpha: ', clf_ridge.alpha_)
    # # clf_lasso = lasso(X, y, alphas)
    # # print(radioisotope, ' R-squared: ', clf_lasso.score(X, y))

In [None]:
# Radioisotope names; loocv() uses each one as a directory name under the AD, MCI and CN folders.
radioisotopes = ['AV45', 'PiB']

df = None  # not read anywhere else in this cell
for radioisotope in radioisotopes:
    # Loads the data with the default psych_test ('MMSE'); the returned
    # (X, y) arrays are discarded here, so this only checks that loading works.
    loocv(radioisotope)
    # loocv(radioisotope, psych_test='NPIQ')

# Pranav's Code

In [None]:
# clf = linear_model.Lasso(alpha=0.1)
def loocv_loop(X, y, alpha=2):
    '''
    Run a hand-written leave-one-out cross-validation of a Ridge regression.

    Each sample is held out in turn, the model is fitted on the remaining
    samples, and the held-out sample is predicted.

    X: array-like of features, one row per sample
    y: 1-D array-like of targets, same length as X
    alpha: Ridge regularisation strength (default 2, the value previously hard-coded)

    returns: (rmse_val, stdev) as floats -- the RMSE of the held-out predictions
             and the standard deviation of those predictions. Both are also printed.
    raises: ValueError if X and y differ in length or hold fewer than two samples
    '''
    X = np.asarray(X)
    y = np.asarray(y)
    N = len(X)
    if N != len(y):
        raise ValueError('X and y must contain the same number of samples')
    if N < 2:
        # With a single sample the training split would be empty.
        raise ValueError('leave-one-out CV needs at least two samples')

    clf = linear_model.Ridge(alpha=alpha) # choose one of lasso (L1) or ridge (L2), vary alpha, and check rmse

    pred = np.empty(N, dtype=float)
    for i in range(N):
        # Train on everything except sample i ...
        X_train = np.delete(X, i, axis=0)
        y_train = np.delete(y, i)
        clf.fit(X_train, y_train)
        # ... and predict the held-out sample (X[i:i + 1] keeps the 2-D shape predict expects).
        pred[i] = clf.predict(X[i:i + 1])[0]

    # Scalars rather than 1-element arrays, so they print and compare cleanly.
    rmse_val = float(np.sqrt(np.mean((pred - y) ** 2)))
    stdev = float(np.std(pred))
    print('rmse_val: ', rmse_val) # original author's note: currently the dataset is random, so wouldn't make much sense
    print('stdev: ', stdev)
    return rmse_val, stdev

In [None]:
# Radioisotope names; loocv() uses each one as a directory name under the AD, MCI and CN folders.
radioisotopes = ['AV45', 'PiB']

df = None  # not read anywhere else in this cell

for radioisotope in radioisotopes:
    # Load the (features, targets) arrays for this radioisotope, then run the
    # manual leave-one-out loop on them. X and y are notebook-level names, so
    # after the loop they hold the data of the last radioisotope in the list.
    X, y = loocv(radioisotope)
    loocv_loop(X, y)
    # loocv(radioisotope, psych_test='NPIQ')

# k-fold CV

In [None]:
# clf = linear_model.Lasso(alpha=0.1)
clf = linear_model.Ridge(alpha=1) # choose one, vary alpha, and check rmse


# X and y are not defined in this cell: they are whatever an earlier cell left
# bound at notebook level, so run the data-loading cell first.
# NOTE(review): KFold without shuffle=True makes contiguous folds; if the rows
# are ordered by group, each fold may be dominated by one group -- confirm this is intended.
kf = KFold(n_splits=3) # 3 fold CV
for train_index, val_index in kf.split(X):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    clf.fit(X_train,y_train)
    pred_y_val = clf.predict(X_val)
    
    sq_error = (pred_y_val - y_val)**2
    sum_sq_errors = np.sum(sq_error)
    # Average over this fold's own validation samples. `N` is not defined at
    # notebook level (it is a local of loocv_loop), and the total sample count
    # would be the wrong denominator for a per-fold RMSE anyway.
    rmse_fold = np.sqrt(sum_sq_errors / len(y_val)) # rmse for each fold
    print(rmse_fold)