# Interactions 111001001 all features

In this notebook, we work specifically with the dataset formed by dropping the observations (31, 496, 524, 917, 1299) while keeping all features.

We want to explore whether we can improve our linear models by including interaction terms.

In [1]:
import itertools
import numpy as np
import pandas as pd
import scipy
from scipy import optimize

pd.set_option('display.precision',20)
pd.set_option('display.max_colwidth',100)

from sklearn import linear_model, svm, tree
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_predict, KFold, cross_val_score, \
                                    GridSearchCV, RandomizedSearchCV, ShuffleSplit, StratifiedShuffleSplit
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import PolynomialFeatures
    
    
    
import xgboost as xgb

from time import time
from scipy.stats import randint as sp_randint

import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
from matplotlib import pyplot
rcParams['figure.figsize'] = 12, 4
%matplotlib inline

In [2]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Computed as the square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [3]:
def report(results, n_top=3):
    """Print the best-ranked candidates of a parameter search.

    ``results`` is indexed by the keys 'rank_test_score', 'mean_test_score',
    'std_test_score' and 'params'. For each rank from 1 to ``n_top`` every
    candidate holding that rank is printed (rank, mean score with standard
    deviation, parameters) followed by an empty line.
    """
    for rank in range(1, n_top + 1):
        # Several candidates can share a rank; list all of them.
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

In [4]:
def random_search(regr, param_dist, n_iter_search):
    """Run a randomized hyper-parameter search and print the top candidates.

    Draws ``n_iter_search`` settings from ``param_dist`` and scores each with
    negative mean squared error, using the notebook-level ``kfold`` splitter
    on the notebook-level ``x_train`` / ``y_train``. The elapsed wall time is
    printed and ``cv_results_`` is handed to ``report``. Nothing is returned.
    """
    search = RandomizedSearchCV(
        regr,
        param_distributions=param_dist,
        n_iter=n_iter_search,
        scoring='neg_mean_squared_error',
        cv=kfold,
        n_jobs=-1,
    )
    started_at = time()
    search.fit(x_train, y_train)
    elapsed = time() - started_at
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % (elapsed, n_iter_search))
    report(search.cv_results_)

In [5]:
def single_search(regr, params):
    """Cross-validate ``regr`` over one parameter value at a time (ridge/lasso style).

    For every ``(name, values)`` pair in ``params`` and every value, the
    parameter is set on ``regr`` in place via ``set_params`` and the model is
    scored with ``cross_val_score`` (negative MSE) using the notebook-level
    ``x_train``, ``y_train`` and ``kfold``.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated value, with the parameter column(s) plus:
        'RMSE'    - square root of the mean fold MSE;
        'std dev' - standard deviation of the per-fold RMSE values.
    """
    regr_results_df = pd.DataFrame(dtype='float64')
    row = 0
    for name, values in params.items():
        for value in values:
            regr.set_params(**{name: value})
            neg_mse = cross_val_score(regr, x_train, y_train, cv=kfold,
                                      scoring='neg_mean_squared_error')
            # Scores are negated MSEs; flip the sign before taking roots.
            fold_rmse = np.sqrt(-neg_mse)
            regr_results_df.loc[row, name] = value
            regr_results_df.loc[row, 'RMSE'] = np.sqrt(-neg_mse.mean())
            # Spread is measured on the RMSE scale. sqrt(std(MSE)) is not a
            # standard deviation of RMSE and is not comparable to 'RMSE'.
            regr_results_df.loc[row, 'std dev'] = fold_rmse.std()
            row += 1
    return regr_results_df

In [6]:
def validate(regr):
    """Fit ``regr`` on the training split and score it on the validation split.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_validation`` and
    ``y_validation``; ``regr`` is fitted in place. Returns the RMSE of the
    validation predictions.
    """
    regr.fit(x_train, y_train)
    predictions = regr.predict(x_validation)
    validation_rmse = rmse(y_validation, predictions)
    return validation_rmse

In [7]:
# Cross-validation sets: 10 folds, shuffled with a fixed seed so they are reproducible.
# `random_state` only takes effect together with `shuffle=True`; with shuffle left
# at False it is ignored by old scikit-learn and rejected (ValueError) by recent releases.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

In [8]:
# Load the train and test CSVs of the 111001001 dataset variant.
train_csv = "./input/train_tidy_111001001.csv"
test_csv = "./input/test_tidy_111001001.csv"
df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

## Interactions

In [17]:
# One seeded shuffle split: 80% of the rows for training, 20% held out for validation.
ss = ShuffleSplit(n_splits=1, test_size=0.20, random_state=71)

X = df.values

# n_splits=1, so the splitter yields exactly one (train, validation) index pair.
train_idx, validation_idx = next(ss.split(X))
train_df = df.iloc[train_idx]
validation_df = df.iloc[validation_idx]

# Columns removed from the design matrices (identifier, target, and the two garage-age columns).
non_feature_cols = ['HouseId', 'SalePrice', 'GarageAge', 'GarageAgeLin']

x_train = train_df.drop(non_feature_cols, axis=1).values
y_train = train_df['SalePrice'].values
x_validation = validation_df.drop(non_feature_cols, axis=1).values
y_validation = validation_df['SalePrice'].values

In [18]:
# Baseline: fit LassoLarsCV on the current x_train and record its validation
# RMSE; the interaction experiment further down is compared against this value.
lassolarscv_regr = linear_model.LassoLarsCV()
baseline = validate(lassolarscv_regr)
baseline



0.09813879414399386

We have already engineered several features that linearize single features, so we will only focus on generating degree 2 interactions between our features and not powers.

In [10]:
# Degree-2 expansion restricted to interactions (interaction_only=True), i.e.
# products of distinct features and no squared terms.
pf = PolynomialFeatures(degree=2, interaction_only=True)

In [11]:
# Fit the expander on the training matrix; the same fitted object is then used
# to transform both the training and the validation matrices.
pf.fit(x_train, y_train)

PolynomialFeatures(degree=2, include_bias=True, interaction_only=True)

In [14]:
# Number of columns the fitted expander produces. Read from the fitted attribute
# instead of building every feature-name string just to count them (and
# get_feature_names() no longer exists in recent scikit-learn releases).
pf.n_output_features_

58997

In [15]:
# NOTE: overwrites x_train with its interaction-expanded version (validate() reads
# this global). Not idempotent: pf was fitted on the un-expanded matrix, so re-run
# the split cell before running this cell a second time.
x_train = pf.transform(x_train)

In [21]:
# NOTE: overwrites x_validation with its interaction-expanded version; must run
# before any validate()/predict call on the expanded model. Not idempotent:
# re-run the split cell before running this cell a second time.
x_validation = pf.transform(x_validation)

In [20]:
# Refit the same LassoLarsCV on the expanded training matrix and show
# (validation RMSE, change versus baseline). Both x_train and x_validation must
# already be transformed by pf; a shape-mismatch ValueError is raised at predict
# time if x_validation still has the original column count.
intresult = validate(lassolarscv_regr)
(intresult, intresult - baseline)



ValueError: shapes (291,343) and (58997,) not aligned: 343 (dim 1) != 58997 (dim 0)

In [23]:
# Score the already-fitted model on the expanded validation matrix without
# refitting, and show (validation RMSE, change versus baseline).
y_pred = lassolarscv_regr.predict(x_validation)
intresult = rmse(y_validation, y_pred)
(intresult, intresult - baseline)

(0.10227051672210778, 0.0041317225781139222)

In [30]:
# Peek at the last 20 generated feature names. get_feature_names() was removed
# from recent scikit-learn in favour of get_feature_names_out(), so use the
# newer accessor when it exists and fall back to the old one otherwise.
_names_fn = getattr(pf, "get_feature_names_out", None) or pf.get_feature_names
list(_names_fn())[-20:]

['x336 x338',
 'x336 x339',
 'x336 x340',
 'x336 x341',
 'x336 x342',
 'x337 x338',
 'x337 x339',
 'x337 x340',
 'x337 x341',
 'x337 x342',
 'x338 x339',
 'x338 x340',
 'x338 x341',
 'x338 x342',
 'x339 x340',
 'x339 x341',
 'x339 x342',
 'x340 x341',
 'x340 x342',
 'x341 x342']

In [31]:
# Coefficient vector of the fitted LassoLarsCV (paired with the generated
# feature names in the next cell).
lassolarscv_regr.coef_

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [26]:
# Pair every generated feature name with its fitted coefficient.
# Stored as a list rather than a bare zip(): on Python 3 zip() is a one-shot
# iterator, so a cell consuming it would see nothing when re-run.
# get_feature_names() is gone from recent scikit-learn; prefer the _out variant.
_names_fn = getattr(pf, "get_feature_names_out", None) or pf.get_feature_names
lassolarscv_features = list(zip(_names_fn(), lassolarscv_regr.coef_))

In [27]:
# Table of the features LassoLarsCV kept (non-zero coefficient), largest
# absolute coefficient first.
# list(...) accepts either a list or a zip iterator of (name, coefficient) pairs,
# and the plain constructor replaces DataFrame.from_dict, which is meant for dicts.
lassolars_features_df = pd.DataFrame(list(lassolarscv_features),
                                     columns=["Feature", "Coeff"])
lassolars_features_df = lassolars_features_df[lassolars_features_df["Coeff"] != 0]
# Order through an index lookup instead of a temporary 'sort_ind' column: no
# assignment on a filtered slice (SettingWithCopyWarning) and no positional-axis
# drop(), which newer pandas rejects.
by_magnitude = lassolars_features_df["Coeff"].abs().sort_values(ascending=False).index
lassolars_features_df = lassolars_features_df.loc[by_magnitude]

In [28]:
# Show the selected features with their coefficients.
lassolars_features_df

Unnamed: 0,Feature,Coeff
25581,x83 x338,0.5577533030480789
25840,x84 x339,0.5252872290329554
58497,x310 x339,0.1838980852977174
58741,x319 x340,0.1764837540299795
55137,x254 x311,0.1699131296370882
57867,x294 x341,0.1609299440331946
39199,x143 x246,-0.1535307223363843
58434,x308 x341,0.1500778350526902
25583,x83 x340,0.1330594273429093
25558,x83 x315,0.1073129908850054


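It appears that only interaction terms were selected in the regression: every feature listed above with a non-zero coefficient is a product of two original features. Note, however, that adding the interaction terms did not help on the validation set: the RMSE rose from about 0.0981 (baseline) to about 0.1023.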