In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso

In [None]:
# Load the feature table and the target from local pickle files.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# that come from a trusted source.
X = pd.read_pickle('data/features.p')
y = pd.read_pickle('data/target.p')

In [None]:
# List the column names of the loaded feature table.
X.columns

In [None]:
# Show five randomly drawn rows (unseeded, so the rows differ between runs).
X.sample(5)

In [None]:
# Summary statistics for every numeric feature column.
X.describe()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Fix the seed so the split -- and every score derived from it -- is
# identical after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Box-Cox is only defined for strictly positive input, so every column is
# shifted by a tiny constant before being transformed.
BOXCOX_OFFSET = .000001

# Collect the transformed columns first and build each frame once at the end;
# inserting columns one by one into an empty DataFrame fragments it and
# triggers a PerformanceWarning on wide tables.
train_bc_columns = {}
test_bc_columns = {}
for col in X_train.columns:
    # lambda is estimated on the training column only ...
    box_cox_trans_tr, lmbda = boxcox(X_train[col] + BOXCOX_OFFSET)
    # ... and reused unchanged for the matching test column.
    box_cox_trans_ts = boxcox(X_test[col] + BOXCOX_OFFSET, lmbda)
    train_bc_columns[col] = box_cox_trans_tr
    test_bc_columns[col] = box_cox_trans_ts

# Both frames get a fresh 0..n-1 index; rows stay in the same order as
# X_train / X_test.
X_train_bc = pd.DataFrame(train_bc_columns)
X_test_bc = pd.DataFrame(test_bc_columns)

In [None]:
# Scaler used below to standardise the Box-Cox-transformed features.
sc = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc.fit(X_train_bc)
X_train_bc_sc, X_test_bc_sc = sc.transform(X_train_bc), sc.transform(X_test_bc)

In [None]:
# Baseline: ordinary least squares on the transformed, standardised features.
lr = LinearRegression().fit(X_train_bc_sc, y_train)

In [None]:
# Score the baseline on the data it was fitted to and on the held-out split.
lr_train_score = lr.score(X_train_bc_sc, y_train)
lr_test_score = lr.score(X_test_bc_sc, y_test)
print("Training set score: {:.2f}".format(lr_train_score))
print("Test set score: {:.2f}".format(lr_test_score))

In [None]:
from sklearn.linear_model import Lasso

def fit_score_lasso(alpha, verbose=True):
    """Fit a Lasso model for one regularisation strength and score it.

    The model is fitted on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on ``X_test`` / ``y_test``.

    NOTE(review): these are the raw splits, not the Box-Cox-transformed and
    standardised ``X_train_bc_sc`` / ``X_test_bc_sc`` prepared earlier in the
    notebook. Lasso penalties are scale-sensitive -- confirm this is intended.

    Parameters
    ----------
    alpha : float
        Regularisation strength passed to ``Lasso``.
    verbose : bool, default True
        When true, print the train score, test score and the number of
        non-zero coefficients.

    Returns
    -------
    tuple
        ``(fitted_lasso, test_score)``.
    """
    # max_iter has to be an integer: recent scikit-learn releases validate
    # estimator parameters and reject the float 1E5.
    lasso = Lasso(alpha=alpha, max_iter=100000).fit(X_train, y_train)

    # Score the held-out split once and reuse the value for printing and return.
    test_score = lasso.score(X_test, y_test)

    if verbose:
        print("Training set score: {:.4f}".format(lasso.score(X_train, y_train)))
        print("Test set score: {:.4f}".format(test_score))
        print("Number of features used: {}".format(np.sum(lasso.coef_ != 0)))
    return lasso, test_score

In [None]:
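# This sweep takes about 20 minutes, so it is left commented out; the results
# are presumably the ones read back from disk in the next cell.
# NOTE(review): before re-enabling, reconcile the following:
#   * the loop sweeps np.logspace(-1,4,60) but the plot call uses
#     np.logspace(-2,4,6) (different range and only 6 points);
#   * fit_score_lasso returns (model, test_score), so `scores` holds tuples,
#     not bare scores, and cannot be passed to plt.plot as-is.
# scores = list()
# for alpha in tqdm(np.logspace(-1,4,60)):
#     scores.append(fit_score_lasso(alpha, False))
# plt.plot(np.logspace(-2,4,6), scores)
# plt.xscale('log')
# results = pd.DataFrame(np.array(scores))
# results.columns = ['model', 'test_score']
# results['alpha'] = results.model.apply(lambda x: x.alpha)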

In [None]:
# Load previously saved Lasso results from a local pickle file.
# NOTE(review): presumably the output of the commented-out sweep -- confirm.
# Only unpickle files from a trusted source.
results = pd.read_pickle('data/ames_lasso_results.p')

In [None]:
# Select the best-scoring row by label instead of hard-coding row 37, so the
# cell stays correct if the cached results are regenerated.
# idxmax() returns the index label, which is what .loc expects.
peak = results.loc[results.test_score.idxmax()]
peak

In [None]:
# Plot each score against the alpha stored in the same row of `results`.
# A separately regenerated np.logspace grid can drift from the alphas that
# were actually fitted, and breaks if the number of rows is not 60.
plt.plot(results.alpha, results.test_score)
plt.xlabel('alpha')
plt.ylabel('test score')
plt.xscale('log')

In [None]:
# Pull the fitted models stored in rows 36, 37 and 38 of the results table.
lasso_1, lasso_2, lasso_3 = [
    results.loc[row_label, 'model'] for row_label in (36, 37, 38)
]

In [None]:
# Compare the coefficients of the three selected models.
# Legend labels are built from each model's own alpha; the previous fixed
# labels (1, 0.1, 0.004) did not describe the models being plotted.
for lasso_model, marker in ((lasso_1, 's'), (lasso_2, '^'), (lasso_3, 'v')):
    plt.plot(
        lasso_model.coef_,
        marker,
        label="Lasso alpha={:.4g}".format(lasso_model.alpha),
    )

plt.legend(ncol=2, loc=(0, 1.05))
# Clip the y axis so a few large coefficients do not flatten the rest.
plt.ylim(-25, 25)
plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")

In [None]:
# Keep only the runs whose test score is above 0.9.
scored_above_threshold = results.test_score > 0.9
best_results = results[scored_above_threshold]
best_results

In [None]:
# One row of coefficients per retained model.
coefs = np.array([fitted.coef_ for fitted in best_results['model']])
coefs.shape

In [None]:
# Label the coefficient matrix: one row per model (named by its alpha) and
# one column per feature of X.
model_labels = ['Lasso({:0.2f})'.format(alpha) for alpha in best_results.alpha.values]
coefs = pd.DataFrame(np.asarray(coefs), index=model_labels, columns=X.columns)

In [None]:
# Per-feature summary of the coefficients across the retained models.
coefs.describe()

In [None]:
# A feature whose largest and smallest coefficient are both zero was zeroed
# out by every retained model.
col_max_is_zero = coefs.max() == 0
col_min_is_zero = coefs.min() == 0
zeros = coefs.T[col_max_is_zero & col_min_is_zero]
zeros.index.values

In [None]:
# Drop every feature that all retained models zeroed out. One drop call
# returns a new frame and leaves X untouched, so no copy-then-mutate loop
# with inplace=True is needed.
X_red = X.drop(zeros.index.values, axis=1)
X_red.shape
