In [1]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LinearRegression, LassoCV, Lasso, RidgeCV, ElasticNetCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, mean_squared_error
from modelling_functions import *

## Processing for Models and Stacking

In [2]:
#### Read data files
# NOTE(review): `dict_dictonary` is not referenced by any cell shown in this
# notebook; it is kept only in case something outside this view relies on it.
dict_dictonary = {}
# `read_and_clean` is not defined or imported by name in the visible cells —
# presumably it arrives via `from modelling_functions import *` (confirm).
# Its three return values are unpacked as (first, feature frame, third); only
# `housing.saleprice`, `housing_features`, `htest_features` are used below.
housing, housing_features, feat_labels = read_and_clean(filepath = "../data/clean_train.csv")
htest_id, htest_features, htest_labels = read_and_clean(filepath = "../data/clean_test.csv", test = True)
training = housing_features
testing = htest_features

#### Align the train and test feature columns
test_col = testing.columns
train_col = training.columns

# Columns present in only one of the two frames are removed from that frame so
# that both end up with the same set of features.
missing = [x for x in train_col if x not in test_col]   # train-only columns
needed = [x for x in test_col if x not in train_col]    # test-only columns

training = training.drop(missing, axis=1)
testing = testing.drop(needed, axis=1)

# Dropping leaves each frame in its own original column order. Reorder the
# test frame to follow the training frame so a model fit on `training` sees
# the features in the same positions when it is applied to `testing`.
testing = testing[training.columns]

#### Hold-out split (80/20, fixed seed so the split is reproducible)
X_train, X_test, Y_train, Y_test = train_test_split(training, housing.saleprice, test_size = 0.2, random_state=42)

In [5]:
# Build the Lasso / ElasticNet model set and grid-search the Lasso settings.
from sklearn import linear_model as lm
import sklearn.model_selection as ms
from sklearn.feature_selection import SelectFromModel
import warnings
# NOTE(review): this silences every warning raised after this point, not just
# the ones produced by the grid search below.
warnings.simplefilter('ignore')

## Split Training / Testing Data
# Same arguments and seed as the split in the data-loading cell; repeated here
# so this cell can be re-run on its own.
X_train, X_test, Y_train, Y_test = train_test_split(training, housing.saleprice, test_size = 0.2, random_state=42)

## Build the Model Set
alpha_steps = (1e-5,1e-2,200)   # not used anywhere in the cells shown
steps = np.linspace(1,0.8,11)   # 1.0, 0.98, ..., 0.8
lasso = lm.Lasso()
ridge = lm.Ridge()              # created but not added to modelList
# One ElasticNet per l1_ratio below 1.0; the first step (1.0) is skipped
# because the plain Lasso already occupies that slot.
elasticnets = [lm.ElasticNet(l1_ratio = i) for i in steps[1:]]
names = ['Lasso'] + [str(round(i,2)) for i in steps[1:]]
modelList = pd.Series([lasso] + elasticnets, index = names)

## Hyperparameter Tuning
print('Tuning Hyperparameters...')
# 3 max_iter values x 100 alpha values = 300 candidates, each scored with
# 10-fold cross-validation on R^2.
# NOTE(review): the recorded run selected alpha = 9e-07, the upper edge of
# this range, so the best value may lie above it — consider widening.
param_grid = {'max_iter': [10,100,1000],
            'alpha':np.linspace(1e-7,9e-7,100)}
# return_train_score=True is requested explicitly: the results cell reads
# cv_results_['mean_train_score'], which is only populated when this is set.
grid = ms.GridSearchCV(modelList['Lasso'], param_grid, scoring='r2', cv=10,
                       return_train_score=True)
grid.fit(X_train, Y_train)
best_alpha = grid.best_params_['alpha']
best_iter = grid.best_params_['max_iter']
print('Best Alpha Found: {}\n'.format(best_alpha))
#modelList.apply(lambda x: x.set_params(alpha = best_alpha, max_iter = best_iter))


## Feature Selection
#print("Running Lasso Regression for Feature Selection...")
#modelList.Lasso.fit(X_train, Y_train)
#drop_col = list(training.columns[np.where(modelList.Lasso.coef_ == 0)[0]])
#print("Dropping {} columns\n".format(len(drop_col)))
#training = training.drop(drop_col, axis = 1)
#testing = testing.drop(drop_col, axis = 1)

#results = modelStack(training, testing, housing.saleprice, produce_submission = True, n_splits = 3)

Tuning Hyperparameters...
Best Alpha Found: 9e-07



In [40]:
# Collect the grid-search results into a tidy frame and a plottable 2-D grid.
cv_results = grid.cv_results_
if 'mean_train_score' not in cv_results:
    # Fail with an actionable message instead of a bare key lookup error.
    raise KeyError("'mean_train_score' is missing from grid.cv_results_; "
                   "construct the GridSearchCV with return_train_score=True")

# One entry per candidate (every alpha / max_iter combination), flattened.
scores = cv_results['mean_train_score']
# The param_* entries are masked arrays; convert them to plain numeric arrays
# so they can be used as pivot labels and mesh axes.
alphas = np.asarray(cv_results['param_alpha'], dtype=float)
iters = np.asarray(cv_results['param_max_iter'], dtype=int)
df = pd.DataFrame({'Score':scores,'Alpha':alphas,'Iters':iters})

# Reshape the flat candidate list into a (max_iter x alpha) score matrix and
# build the mesh from its axes. Calling meshgrid on the flat per-candidate
# arrays would instead give an (n_candidates x n_candidates) grid that does
# not line up with the scores.
score_grid = df.pivot(index='Iters', columns='Alpha', values='Score')
X, Y = np.meshgrid(score_grid.columns.values, score_grid.index.values)
Z = score_grid.values   # same shape as X and Y


In [12]:
# Disabled: in the cells shown, `results` is only assigned by the
# commented-out `modelStack(...)` call in the tuning cell, so this line cannot
# run until that call is restored as well.
#Submission(htest_id, results, "submission_stacked.csv")