In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

# Training and Test R² with default parameters of diabetes dataset
* This dataset has 10 features.
* Those features are age, sex, bmi, bp, s1, s2, s3, s4, s5, s6.

In [2]:
# Load the diabetes table. The first line of the file (presumably a column
# header -- confirm against the data file) is skipped while reading, instead
# of being parsed into a row and sliced off afterwards with [1:].
diabetes = np.genfromtxt('diabetes.data', skip_header=1)

# Columns 0-9 are the features; everything from column 10 onwards is the
# target. The 10: slice keeps y two-dimensional, as in the original cell.
x = diabetes[:, :10]
y = diabetes[:, 10:]

# Fixed random_state so the train/test split is reproducible.
xtrain,xtest,ytrain,ytest = train_test_split(x,y,random_state=311)

# Baseline: Lasso with its default hyper-parameters on the unscaled features.
lasso = Lasso().fit(xtrain,ytrain)

print('Training R² with default parameters of diabetes dataset :',lasso.score(xtrain,ytrain))
print('Test R² with original parameters of diabetes dataset :',lasso.score(xtest,ytest))

Training R² with default parameters of diabetes dataset : 0.5137701693173082
Test R² with original parameters of diabetes dataset : 0.49252165153314775


# Training and Test R² with scaled parameters of diabetes dataset
Standardization of the features is better suited for Lasso because it allows the features to be penalized uniformly; when normalization is used instead, the differing ranges of the features can cause their coefficients to be penalized unevenly.

In [3]:
# StandardScaler is already imported in the notebook's first cell, so the
# duplicate in-cell import has been removed.

# Learn the scaling statistics from the training split only, then apply the
# same transformation to the test split.
scaler = StandardScaler()
scaled_xtrain = scaler.fit_transform(xtrain)
scaled_xtest = scaler.transform(xtest)

# Default-parameter Lasso again, this time on the standardized features.
lasso_scaled = Lasso().fit(scaled_xtrain,ytrain)

print('Training R² with scaled parameters :',lasso_scaled.score(scaled_xtrain,ytrain))
print('Test R² with scaled parameters :',lasso_scaled.score(scaled_xtest,ytest))

Training R² with scaled parameters : 0.5166974081783697
Test R² with scaled parameters : 0.49783142364534294


# Determining the best α for lasso by using grid search and cross validation
When the model is fitted over a range of values of α, the mean 5-fold cross-validation R² is highest at α=0.1, where the model uses all 10 features.

In [4]:
# Grid of regularisation strengths to try.
alpha = [0.00001,0.0001,0.001,0.01,0.1,1,10]

# R² can be negative, so start from -inf / None rather than 0; otherwise a grid
# whose scores are all <= 0 would silently report alpha = 0 as the "best" value.
best_alpha = None
best_score = -np.inf
non_zero_features = []

for i in alpha:
    # Fitted on the full (scaled) training split only to inspect coef_ below.
    lasso_cv = Lasso(alpha=i).fit(scaled_xtrain,ytrain)
    # Mean R² over 5 cross-validation folds of the scaled training split.
    scores = cross_val_score(lasso_cv,scaled_xtrain,ytrain,cv=5)
    score = np.mean(scores)
    # Strict '>' keeps the first (smallest) alpha when scores tie.
    if score > best_score:
        best_score = score
        best_alpha = i
    # Number of coefficients that Lasso did not shrink to exactly zero.
    non_zero_features.append(int(np.count_nonzero(lasso_cv.coef_)))

print('Best score for cross validation is',best_score)
print('Best alpha based on cross validation is',best_alpha)
print('alpha values tried :-',alpha)
print('non zero features for each alpha :-',non_zero_features)

Best score for cross validation is 0.4815742066905676
Best alpha based on cross validation is 0.1
alpha values tried :- [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10]
non zero features for each alpha :- [10, 10, 10, 10, 10, 9, 4]


# Training and Test R² after using best α based on cross validation

In [5]:
# Refit with the alpha selected by the cross-validated grid search (best_alpha)
# rather than a hard-coded copy of its value, so this cell stays consistent if
# the grid or the data change. The labels are built from the same value.
lasso = Lasso(alpha=best_alpha).fit(scaled_xtrain,ytrain)
print('Training R² with scaled train set with alpha={} :'.format(best_alpha),lasso.score(scaled_xtrain,ytrain))
print('Test R² with scaled test set with alpha={} :'.format(best_alpha),lasso.score(scaled_xtest,ytest))

Training R² with scaled train set with alpha=0.1 : 0.5198615788686103
Test R² with scaled test set with alpha=0.1 : 0.4910363431094602


# Inductive Conformal Prediction

In [6]:
# Split the training data into a proper training set (75%) and a calibration
# set (25%); the calibration set is used only to compute nonconformity scores.
x_train,x_calib,y_train,y_calib = train_test_split(xtrain,ytrain,test_size = 0.25,random_state=311)

# Scaling statistics come from the proper training set only and are then
# applied unchanged to the calibration and test sets.
scaler = StandardScaler()
scaled_x_train = scaler.fit_transform(x_train)
scaled_x_calib = scaler.transform(x_calib)
scaled_x_test = scaler.transform(xtest)

# Underlying model: Lasso with the alpha selected by the grid search
# (best_alpha) instead of a hard-coded copy of its value.
lasso_icp = Lasso(alpha=best_alpha).fit(scaled_x_train,y_train)
y_pred = lasso_icp.predict(scaled_x_calib)

# Nonconformity score = absolute residual on each calibration sample.
# Both arrays are flattened so the subtraction is element-wise whether the
# targets are stored as a column vector or as a 1-D array.
non_conf_score = np.abs(np.ravel(y_pred) - np.ravel(y_calib)).tolist()

# Ascending order, so the k-th smallest score can be looked up by index.
non_conf_scores = sorted(non_conf_score)

y_pred_test = lasso_icp.predict(scaled_x_test)

def icp(sig_level):
    """Build inductive conformal prediction intervals for the test set.

    Relies on module-level state prepared by the preceding cell:
    ``non_conf_scores`` (calibration nonconformity scores in ascending order),
    ``y_pred_test`` (point predictions for the test set) and ``ytest`` (true
    test targets).

    Parameters
    ----------
    sig_level : float
        Significance level, strictly between 0 and 1 (e.g. 0.05 for 5%).

    Returns
    -------
    tuple
        ``(length, intervals, test_error_rate)`` where ``length`` is the common
        width of every interval, ``intervals`` is a list of ``[lower, upper]``
        pairs (one per test sample) and ``test_error_rate`` is the fraction of
        test targets that fall outside their interval.

    Raises
    ------
    ValueError
        If ``sig_level`` is not strictly between 0 and 1.
    """
    if not 0 < sig_level < 1:
        raise ValueError(
            'sig_level must be strictly between 0 and 1, got {!r}'.format(sig_level)
        )

    n_calib = len(non_conf_scores)
    # Rank of the calibration score used as the interval half-width.
    k = int(np.ceil((1 - sig_level) * (n_calib + 1)))

    if k > n_calib:
        # Too few calibration samples for this significance level: no finite
        # score qualifies, so the interval is unbounded. Indexing
        # non_conf_scores[k - 1] here would raise IndexError.
        non_conf_k = float('inf')
    else:
        non_conf_k = non_conf_scores[k - 1]

    # Symmetric interval of half-width non_conf_k around each point prediction.
    icp_lasso = [[pred - non_conf_k, pred + non_conf_k] for pred in y_pred_test]

    # Flatten so each target is compared as a scalar, whether ytest is stored
    # as a column vector or as a 1-D array.
    y_true = np.ravel(ytest)
    correct_pred = sum(
        1 for (low, high), truth in zip(icp_lasso, y_true) if low <= truth <= high
    )
    test_error_rate = 1 - (correct_pred / len(ytest))
    return non_conf_k * 2, icp_lasso, test_error_rate

# Inductive conformal prediction for 5% significance level

In [7]:
# Run the conformal predictor at the 5% significance level and report the
# interval width, the intervals themselves and the empirical test error rate.
length, prediction_sets, test_error_rate = icp(0.05)

for label, value in (
    ('\nlength of prediction sets :', length),
    ('\nPrediction sets at 5% significance level :\n', prediction_sets),
    ('\nTest error rate at 5% significance level :\n', test_error_rate),
):
    print(label, value)


length of prediction sets : 215.4001417047757

Prediction sets at 5% significance level :
 [[144.20687291059204, 359.60701461536775], [36.582955450629555, 251.98309715540526], [133.23165354271924, 348.63179524749495], [12.900967065401531, 228.30110877017722], [107.99795327797577, 323.39809498275145], [96.216891380144, 311.61703308491974], [74.10605283348812, 289.5061945382638], [-17.61846432361523, 197.78167738116048], [-8.800395034141133, 206.59974667063457], [65.53717657314317, 280.93731827791885], [140.9407230199177, 356.3408647246934], [70.00309829917785, 285.40324000395356], [17.204280676004146, 232.60442238077985], [103.15009589723925, 318.55023760201493], [95.69130310205375, 311.0914448068295], [47.124755936748755, 262.5248976415245], [-11.947846414414613, 203.4522952903611], [29.65881329403564, 245.05895499881134], [122.61979960385685, 338.01994130863255], [36.95270707651005, 252.35284878128576], [63.7404415497617, 279.14058325453743], [77.02836452346412, 292.4285062282398], [

# Inductive conformal prediction for 20% significance level

In [8]:
# Run the conformal predictor at the 20% significance level and report the
# interval width, the intervals themselves and the empirical test error rate.
length, prediction_sets, test_error_rate = icp(0.2)

for label, value in (
    ('length of prediction sets :', length),
    ('\nPrediction sets at 20% significance level :\n', prediction_sets),
    ('\nTest error rate at 20% significance level :\n', test_error_rate),
):
    print(label, value)

length of prediction sets : 146.38764651714217

Prediction sets at 20% significance level :
 [[178.7131205044088, 325.10076702155095], [71.08920304444632, 217.4768495615885], [167.737901136536, 314.12554765367815], [47.4072146592183, 193.79486117636048], [142.50420087179253, 288.8918473889347], [130.72313897396077, 277.11078549110294], [108.61230042730489, 254.99994694444706], [16.88778327020154, 163.2754297873437], [25.705852559675634, 172.0934990768178], [100.04342416695994, 246.4310706841021], [175.44697061373446, 321.8346171308766], [104.50934589299462, 250.8969924101368], [51.71052826982091, 198.09817478696309], [137.65634349105602, 284.0439900081982], [130.19755069587052, 276.5851972130127], [81.63100353056552, 228.0186500477077], [22.558401179402154, 168.94604769654433], [64.1650608878524, 210.55270740499458], [157.1260471976736, 303.51369371481576], [71.45895467032682, 217.846601187469], [98.24668914357846, 244.63433566072064], [111.53461211728089, 257.9222586344231], [-25.2282