In [154]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# Global configuration flags read by the cells below
use_log = True   # use log(SalePrice) rather than SalePrice as the target
scale_X = True   # standardise the feature columns before fitting

house = pd.read_csv('../derivedData/train_NotDum.csv', index_col=0)
house['logSalePrice'] = np.log(house['SalePrice'])

# Feature columns = everything except the two target columns.
# Dropping by name (instead of slicing off the last two positions) keeps the
# target out of the features even if SalePrice is not the last CSV column.
cols = house.columns.drop(['SalePrice', 'logSalePrice'])

In [155]:
# implement MV encoding: every level of an object-dtype column is replaced
# by the mean of the target over the rows that carry that level.
# NOTE(review): the means are taken over all rows of `house`, i.e. before the
# train/test split performed further down, so held-out rows contribute to
# their own encoding and out-of-sample scores may be optimistic.
target = 'logSalePrice' if use_log else 'SalePrice'
for col in cols[house[cols].dtypes == 'object']:
    gp = house.groupby(col)[target].mean()
    # Series.map does the lookup vectorised; levels absent from `gp`
    # (e.g. NaN, which groupby drops) become NaN instead of raising KeyError.
    house[col] = house[col].map(gp)

In [156]:
def key(s):
    """Return the first non-empty piece of `s` when split on capitalised words.

    The string is split around every run matching ``[A-Z][a-z]*`` (the runs
    themselves are kept), and the first non-empty fragment is returned, e.g.
    'OverallQual' -> 'Overall', '1stFlrSF' -> '1st'.
    Raises IndexError if `s` yields no non-empty fragment (empty string).
    """
    fragments = re.split(r'([A-Z][a-z]*)', s)
    non_empty = list(filter(None, fragments))
    return non_empty[0]

def equiv(s):
    """Return the list of columns in `cols` whose key() equals key(s).

    Members are returned in the order they appear in `cols`.
    """
    members = []
    for candidate in cols:
        if key(candidate) == key(s):
            members.append(candidate)
    return members

# Bucket the feature columns by their key() prefix in a single pass.
# Dicts preserve insertion order, so groups come out in order of first
# appearance in `cols`, and each group's members keep their `cols` order.
buckets = {}
for col in cols:
    buckets.setdefault(key(col), []).append(col)

groups = list(buckets.values())
seen = set(buckets)      # the distinct prefixes encountered
n_gps = len(groups)

In [157]:
# pick winners from each group using mv linear:
# for every group, fit a one-feature linear regression per member column and
# keep the member with the highest out-of-sample R2.
if scale_X:
    # standardise each feature column (zero mean, unit standard deviation)
    X = (house[cols] - house[cols].mean(axis=0)) / house[cols].std(axis=0)
else:
    X = house[cols]

if use_log:
    y = house['logSalePrice']
else:
    y = house['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

lin = LinearRegression()
records = []
for group in groups:
    R2s_in, R2s_out = [], []
    for c in group:
        # fit once per candidate and score that same fit on both splits
        lin.fit(X_train[[c]], y_train)
        R2s_in.append(lin.score(X_train[[c]], y_train))
        R2s_out.append(lin.score(X_test[[c]], y_test))
    best = R2s_out.index(max(R2s_out))  # first column with the top R2_out
    records.append({'col': group[best], 'R2_in': R2s_in[best], 'R2_out': R2s_out[best]})

# Build the frame in one go so the R2 columns get a numeric dtype
# (filling an all-NaN frame cell by cell leaves them as object).
results_lin = pd.DataFrame(records, columns=['col', 'R2_in', 'R2_out'])
results_lin = results_lin.sort_values('R2_out', ascending=False)

In [158]:
# pick winners from each group using tree:
# for every group, fit a one-feature depth-5 regression tree per member column
# and keep the member with the highest out-of-sample R2.
tree = DecisionTreeRegressor(max_depth=5)
records = []
for group in groups:
    R2s_in, R2s_out = [], []
    for c in group:
        # fit once per candidate so R2_in and R2_out describe the same tree
        tree.fit(X_train[[c]], y_train)
        R2s_in.append(tree.score(X_train[[c]], y_train))
        R2s_out.append(tree.score(X_test[[c]], y_test))
    best = R2s_out.index(max(R2s_out))  # first column with the top R2_out
    records.append({'col': group[best], 'R2_in': R2s_in[best], 'R2_out': R2s_out[best]})

# Build the frame in one go so the R2 columns get a numeric dtype
# (filling an all-NaN frame cell by cell leaves them as object).
results_tree = pd.DataFrame(records, columns=['col', 'R2_in', 'R2_out'])
results_tree = results_tree.sort_values('R2_out', ascending=False)

In [169]:
# Combine both rankings on the winning column name. The outer join keeps
# columns that won their group under only one of the two models; their
# scores for the other model are NaN.
outer = results_lin.merge(results_tree, how='outer', on='col', suffixes=('_lin', '_tree'))

# Average the two out-of-sample R2 values per row, selecting the columns by
# name (integer lookups such as x[2] on a label-indexed row are deprecated).
# mean() skips NaN, so a column scored by one model keeps that model's value.
outer['R2_avg'] = outer[['R2_out_lin', 'R2_out_tree']].astype(float).mean(axis=1)
outer = outer.sort_values('R2_avg', ascending=False)
outer

Unnamed: 0,col,R2_in_lin,R2_out_lin,R2_in_tree,R2_out_tree,R2_avg
0,OverallQual,0.66328,0.702248,0.670367,0.699902,0.701075
1,Neighborhood,0.56438,0.591823,0.566537,0.571988,0.581905
2,GrLivArea,0.5082,0.549108,0.575316,0.53151,0.540309
3,ExterQual,0.45507,0.496346,0.456868,0.486486,0.491416
46,GarageCars,,,0.494553,0.460282,0.460282
4,KitchenQual,0.442152,0.454006,0.444087,0.453264,0.453635
8,BsmtQual,0.374519,0.391126,0.44625,0.468499,0.429812
7,YearBuilt,0.334728,0.393324,0.460119,0.461632,0.427478
6,GarageFinish,0.355462,0.406687,,,0.406687
5,TotalBsmtSF,0.408132,0.423232,0.48549,0.372717,0.397975
