# Prostate Cancer Dataset

In [None]:
# %matplotlib inline
# %matplotlib nbagg
import numpy as np
import numpy.linalg as la
import numpy.random as rd
from numpy.linalg import svd

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd

from sklearn import linear_model, preprocessing


from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV

In [None]:
# Read the tab-separated prostate cancer table: the first row supplies the
# column names and the first column is used as the row index.
df = pd.read_csv(
    './data/prostate/data.txt',
    sep='\t',
    header=0,
    index_col=0,
)

# Preview the first six rows
df.head(6)

In [None]:
# Boolean row mask: True where the `train` column equals 'T' (training rows);
# every other row goes to the test split.
id_train = (df['train'] == 'T').values

# Features are the first eight columns and the target is `lpsa`.
# Both are standardized (mean=0, std=1) over all rows, before splitting.
X_all = preprocessing.scale(df.iloc[:, :8].values)
y_all = preprocessing.scale(df['lpsa'].values)

# Training split
X_train, y_train = X_all[id_train, :].copy(), y_all[id_train].copy()

# Test split (rows not flagged as training)
X_test, y_test = X_all[~id_train, :].copy(), y_all[~id_train].copy()

# Names of the eight feature columns
fea_names = df.columns[:8]

In [None]:
# Pairs plot to check correlations between variables.
# Only the first nine columns of the table are plotted.
sns.set(style='whitegrid', context='notebook')
cols = df.columns
# `height` is the current name of the per-facet size keyword; the old `size`
# keyword was renamed in seaborn 0.9 and is rejected by later releases.
sns.pairplot(df[cols[:9]], height=2)
plt.tight_layout()
plt.show()

# Ridge Regression

In [None]:
# Fit a ridge regression with a very small penalty (alpha=0.001)
model_ridge = Ridge(alpha=0.001)
model_ridge.fit(X_train, y_train)

# Fitted coefficients, labelled by feature name
print(pd.Series(model_ridge.coef_, index=fea_names))

# Predictions on the training and test splits
y_train_pred = model_ridge.predict(X_train)
y_test_pred = model_ridge.predict(X_test)

# Shared axis limits: range of all observed targets padded by 0.2 per side
y_observed = np.concatenate([y_train, y_test])
axis_lo = y_observed.min() - 0.2
axis_hi = y_observed.max() + 0.2

# Observed-vs-predicted scatter on a square figure with identical x/y limits
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(y_train, y_train_pred, color='black', s=30, alpha=0.3, label='train')
ax.scatter(y_test, y_test_pred, color='red', s=30, alpha=0.8, label='test')
ax.set_xlim(axis_lo, axis_hi)
ax.set_ylim(axis_lo, axis_hi)
ax.set_xlabel('Observed', fontsize=14)
ax.set_ylabel('Predicted', fontsize=14)
ax.grid()
ax.legend()
plt.show()

# Regularization path of ridge regression

In [None]:
# Define model
model = Ridge()

# Points on the path: n_alphas penalties, log-spaced from 1e-5 to 1e5
n_alphas = 100
alphas = np.logspace(-5, 5, n_alphas)

# Singular values of the training design matrix, used for the effective
# degrees of freedom  df(alpha) = sum_j s_j^2 / (s_j^2 + alpha).
# Only the singular values are needed, so U and V are not computed.
s = svd(X_train, compute_uv=False)

# Compute the path.
# The degrees of freedom are stored in `dof`, not `df`: `df` is the DataFrame
# loaded at the top of the notebook, and overwriting it with a list would
# break every earlier cell on re-run.
coefs = []
dof = []
for a in alphas:
    model.set_params(alpha=a)
    model.fit(X_train, y_train)  # training
    coefs.append(model.coef_)
    dof.append(np.sum(s**2 / (s**2 + a)))
coefs = np.array(coefs)  # one row per alpha, one column per feature
dof = np.array(dof)

# 3-fold cross-validation over the same grid to choose the penalty
model_cv = RidgeCV(alphas=alphas, cv=3)
model_cv.fit(X_train, y_train)
# Grid index of the selected alpha (nearest match instead of exact float
# equality, so an index is always found)
best_idx = int(np.argmin(np.abs(alphas - model_cv.alpha_)))

# Display the regularization path; the vertical line marks the CV choice
plt.figure(figsize=(5, 5))
for i, name in enumerate(fea_names):
    plt.plot(dof, coefs[:, i], '.-', label=name)
plt.axvline(dof[best_idx], color='black')
plt.xlabel('df(alpha)')
plt.ylabel('Coefficient')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.grid()
plt.legend()
plt.show()

# LASSO

In [None]:
# Fit a lasso regression with penalty alpha=0.01
model_lasso = Lasso(alpha=0.01)
model_lasso.fit(X_train, y_train)

# Fitted coefficients, labelled by feature name
print(pd.Series(model_lasso.coef_, index=fea_names))

# Predictions on the training and test splits
y_train_pred = model_lasso.predict(X_train)
y_test_pred = model_lasso.predict(X_test)

# Shared axis limits: range of all observed targets padded by 0.2 per side
y_observed = np.concatenate([y_train, y_test])
axis_lo = y_observed.min() - 0.2
axis_hi = y_observed.max() + 0.2

# Observed-vs-predicted scatter on a square figure with identical x/y limits
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(y_train, y_train_pred, color='black', s=30, alpha=0.3, label='train')
ax.scatter(y_test, y_test_pred, color='red', s=30, alpha=0.8, label='test')
ax.set_xlim(axis_lo, axis_hi)
ax.set_ylim(axis_lo, axis_hi)
ax.set_xlabel('Observed', fontsize=14)
ax.set_ylabel('Predicted', fontsize=14)
ax.grid()
ax.legend()
plt.show()

# Regularization Path by Lasso

In [None]:
# Define model
model = Lasso()

# Points on the path: n_alphas penalties, log-spaced from 1e-2 to 1e2
n_alphas = 100
alphas = np.logspace(-2, 2, n_alphas)

# Compute the path: refit the same estimator at every penalty value
coefs = []
for a in alphas:
    model.set_params(alpha=a)
    model.fit(X_train, y_train)
    coefs.append(model.coef_)
coefs = np.array(coefs)  # one row per alpha, one column per feature

# Shrinkage factor: L1 norm of each coefficient vector divided by the L1 norm
# at the first grid point. `alphas` is ascending, so index 0 is the smallest
# penalty on the grid.
xx = np.sum(np.abs(coefs), axis=1)
xx /= xx[0]

# Display the result
plt.figure(figsize=(5, 5))
for i, name in enumerate(fea_names):
    plt.plot(xx, coefs[:, i], '.-', label=name)
plt.xlabel('Shrinkage Factor s')
plt.ylabel('Coefficients')
plt.title('Lasso path')
plt.axis('tight')
plt.grid()
plt.legend()
plt.show()

# LARS

In [None]:
# Compute the lasso path with LARS. `coefs` has one row per feature and one
# column per point on the path. The second return value is not used here; it
# is bound to `_active` rather than `_` so IPython's last-output variable is
# not shadowed.
alphas, _active, coefs = linear_model.lars_path(X_train, y_train, method='lasso')

# 3-fold cross-validation over the alphas found by LARS
model_cv = LassoCV(alphas=alphas, cv=3)
model_cv.fit(X_train, y_train)
# Position of the selected alpha on the LARS path, as a plain integer
# (nearest match instead of exact float equality, so an index is always found)
best_idx = int(np.argmin(np.abs(alphas - model_cv.alpha_)))
# print(model_cv.alpha_)

# Shrinkage factor: L1 norm of the coefficients at each path point, divided
# by the L1 norm at the last point of the path
xx = np.sum(np.abs(coefs), axis=0)
xx /= xx[-1]

# Display results; the vertical line marks the cross-validated choice
plt.figure(figsize=(5, 5))
for i, name in enumerate(fea_names):
    plt.plot(xx, coefs[i, :], '.-', label=name)
plt.axvline(xx[best_idx], color='black')
plt.xlabel('Shrinkage Factor s')
plt.ylabel('Coefficients')
plt.title('LASSO Path by LARS')
plt.axis('tight')
plt.grid()
plt.legend()
plt.show()

# Display CV error (mean over folds) against the shrinkage factor.
# NOTE(review): assumes the rows of mse_path_ follow the same alpha order as
# the LARS path used to build `xx` - confirm against the LassoCV docs.
plt.figure(figsize=(5, 5))
plt.plot(xx, model_cv.mse_path_.mean(axis=1), '*-')
plt.axvline(xx[best_idx], color='black')
plt.xlabel('Shrinkage Factor s')
plt.ylabel('Averaged MSE')
plt.grid()
plt.show()