In [15]:
from sklearn.linear_model import Lasso
import numpy as np
import pandas as pd

In [2]:
# Show the hyperparameters of a default-constructed Lasso estimator.
Lasso().get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'normalize': False,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [4]:
# Load the housing data set; the path is relative to the notebook's directory.
import pandas as pd
df = pd.read_csv('../data/housing.csv')

In [5]:
# The target is the PRICE column; every remaining column is a feature.
y = df['PRICE']
X = df.drop(columns=['PRICE'])

In [6]:
# Lasso with its default hyperparameters (see the get_params() output above).
lasso = Lasso()

In [7]:
# Fit the lasso on all rows; the features are passed in unscaled.
lasso.fit(X, y)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [8]:
# Fitted coefficients, one per column of X.
lasso.coef_

array([-0.06343729,  0.04916467, -0.        ,  0.        , -0.        ,
        0.9498107 ,  0.02090951, -0.66879   ,  0.26420643, -0.01521159,
       -0.72296636,  0.00824703, -0.76111454])

In [11]:
# Predictions computed by hand: features times fitted coefficients plus the
# intercept. This is the linear formula that lasso.predict(X) evaluates.
X @ lasso.coef_ + lasso.intercept_

0      30.997539
1      25.776817
2      29.986014
3      29.517998
4      28.032490
5      27.516462
6      24.452932
7      19.853367
8      11.039311
9      20.526056
10     18.714799
11     23.714893
12     21.293438
13     23.002302
14     22.118866
15     22.751724
16     23.618037
17     18.785783
18     19.104862
19     21.207987
20     14.090727
21     19.765814
22     16.299936
23     15.195004
24     17.711655
25     16.275244
26     18.333023
27     16.197424
28     20.833179
29     21.544969
         ...    
476    17.977899
477    11.300180
478    17.867319
479    21.388264
480    22.383960
481    25.400710
482    26.259625
483    21.368430
484    19.378187
485    21.993171
486    19.243188
487    21.455981
488    12.170497
489     7.372226
490     2.457340
491    12.723109
492    15.889584
493    21.891581
494    20.651919
495    16.765937
496    14.750509
497    20.382800
498    21.709085
499    19.794621
500    20.885955
501    23.904173
502    24.232283
503    28.0418

In [13]:
# Load the Iowa housing train and test splits (paths relative to the notebook).
train = pd.read_csv('../data/iowa_housing/train.csv')
test  = pd.read_csv('../data/iowa_housing/test.csv')

In [16]:
# Model the log of the sale price; keep the test ids before they are dropped.
y = np.log(train['SalePrice'])
test_id = test['Id']

# Rebind instead of dropping with inplace=True: the result is the same frame
# contents, but the assignment makes the state change explicit.
train = train.drop(columns=['SalePrice', 'Id'])
test = test.drop(columns=['Id'])

In [17]:
# columns of the training set that contain at least one missing value
train_empty = train.loc[:, train.isnull().sum() > 0]
# grab the columns
cols = train_empty.columns.tolist()

# fill with the appropriate value  -- NA, Other, could also work
train[['GarageType', 'GarageFinish']] = train[['GarageType', 'GarageFinish']].fillna('None')
test[['GarageType', 'GarageFinish']]  = test[['GarageType', 'GarageFinish']].fillna('None')

# we'll use this for GarageYrBlt since it's a numeric column
# (assign the result back: `df[col].fillna(..., inplace=True)` operates on a
# temporary selection and does not reliably update the frame in recent pandas)
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(0)
test['GarageYrBlt']  = test['GarageYrBlt'].fillna(0)

# finding the values to use in the training set
ms_mode   = train['MSZoning'].mode()[0]
gcarsmean = train['GarageCars'].mean()

# and applying them to the test set, so no test-set statistics are used
test['MSZoning']   = test['MSZoning'].fillna(ms_mode)
test['GarageCars'] = test['GarageCars'].fillna(gcarsmean)

In [18]:
# Cast MSSubClass to string in both frames. Presumably this is so the encoder
# treats it as categorical: it appears among the one-hot encoded columns below.
for _frame in (train, test):
    _frame['MSSubClass'] = _frame['MSSubClass'].astype(str)

In [21]:
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

# GarageFinish levels mapped to increasing integers
garage_mapping = {
    'None': 0, # no garage
    'Unf' : 1, # unfinished garage
    'RFn' : 2, # partially finished garage
    'Fin' : 3  # finished garage
}

# mapping for the ordinal column, in the format OrdinalEncoder expects
mapping = {
    'col': 'GarageFinish',
    'mapping': garage_mapping
}

# initialize everything
ore   = OrdinalEncoder(cols=['GarageFinish'], mapping=[mapping])
ohe   = OneHotEncoder()          # no cols given; the encoder chooses them at fit time
sc    = StandardScaler()
ridge = Ridge(alpha=100)         # the only Ridge instance; alpha=100 is the penalty used here

# make the pipeline: ordinal encode -> one-hot encode -> scale -> ridge
pipe = make_pipeline(ore, ohe, sc, ridge)

In [22]:
# Show the pipeline's steps and their parameters.
pipe

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['GarageFinish'], drop_invariant=False,
                                handle_missing='value', handle_unknown='value',
                                mapping=[{'col': 'GarageFinish',
                                          'mapping': {'Fin': 3, 'None': 0,
                                                      'RFn': 2, 'Unf': 1}}],
                                return_df=True, verbose=0)),
                ('onehotencoder',
                 OneHotEncoder(cols=None, drop_invariant=False,
                               handle_missing='value', handle_unknown='value',
                               return_df=True, use_cat_names=False,
                               verbose=0)),
                ('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ridge',
                 Ridge(alpha=100, copy_X=True, fit_intercept=True,
               

In [33]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
import category_encoders as ce

# Score each fold with mean squared error ...
scorer = make_scorer(mean_squared_error)

# ... then take the square root per fold, giving a 10-fold RMSE on the log price.
scores = np.sqrt(
    cross_val_score(pipe, train, y, scoring=scorer, cv=10)
)

In [35]:
# Average of the per-fold RMSE values.
np.mean(scores)

0.14393025367074605

In [26]:
# IPython help lookup (not plain Python). This looks like an exploration
# leftover and is safe to delete from a finished notebook.
?cross_val_score

In [27]:
# Fit the ridge pipeline on the full training set and score it on the same
# rows. This is an in-sample score (R^2 for a regressor), so it is optimistic
# compared with the cross-validated error.
pipe.fit(train, y).score(train, y)

0.881486530074926

In [36]:
from sklearn.ensemble import RandomForestRegressor

# Default random forest. No random_state is set, so the fitted trees (and the
# importances derived from them) can differ from run to run.
rf = RandomForestRegressor()

In [37]:
# Reuse the `ore` and `ohe` encoder objects from the ridge pipeline. There is
# no scaling step in front of the random forest.
rf_pipe = make_pipeline(ore, ohe, rf)

In [38]:
# Fit the encoders and the random forest on the full training set.
rf_pipe.fit(train, y)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['GarageFinish'], drop_invariant=False,
                                handle_missing='value', handle_unknown='value',
                                mapping=[{'col': 'GarageFinish',
                                          'mapping': {'Fin': 3, 'None': 0,
                                                      'RFn': 2, 'Unf': 1}}],
                                return_df=True, verbose=0)),
                ('onehotencoder',
                 OneHotEncoder(cols=['MSSubClass', 'MSZoning', 'Neighborhood',
                                     'GarageType'],
                               drop_...
                 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                       criterion='mse', max_depth=None,
                                       max_features='auto', max_leaf_nodes=None,
                                       max_samples=None,
                       

In [54]:
# Inspect the one-hot encoding step (index 1) of the ridge pipeline. Both
# `pipe` and `rf_pipe` were built from the same `ohe` object.
pipe.steps[1][1]

OneHotEncoder(cols=['MSSubClass', 'MSZoning', 'Neighborhood', 'GarageType'],
              drop_invariant=False, handle_missing='value',
              handle_unknown='value', return_df=True, use_cat_names=False,
              verbose=0)

In [48]:
# Output column names of the fitted one-hot encoder, looked up by step name.
columns = rf_pipe.named_steps['onehotencoder'].get_feature_names()

In [43]:
# Feature importances of the fitted random forest (last pipeline step).
importances = rf_pipe.named_steps['randomforestregressor'].feature_importances_

In [50]:
# Pair each encoded column with its importance, most important first.
features = (
    pd.DataFrame({'Column': columns, 'Importance': importances})
      .sort_values('Importance', ascending=False)
)

In [61]:
# Combined importance of the 25 least important columns.
features['Importance'].tail(25).sum()

0.004589736194177761

In [63]:
# Show the random forest step (index 2) and its hyperparameters.
rf_pipe.steps[2][1]

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [66]:
import pickle

# Persist the fitted random-forest pipeline to disk.
# The file handle is deliberately NOT named `pipe`: that name is bound to the
# ridge pipeline, and `with ... as pipe` would replace it with a closed file.
with open('pipe.pkl', 'wb') as pkl_file:
    pickle.dump(rf_pipe, pkl_file)

In [75]:
# Display the function object to check its signature.
mean_squared_error

<function sklearn.metrics._regression.mean_squared_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average', squared=True)>

In [71]:
# Encoders only (no scaler or model), used to materialise the encoded frame.
encoding_pipe = make_pipeline(ore, ohe)

In [72]:
# NOTE: this rebinds `train` to the encoded frame, so every later cell sees
# encoded columns. Any cell that expects the raw columns presumably needs the
# data reloaded first.
train = encoding_pipe.fit_transform(train)

In [73]:
# Preview the first rows of the encoded training frame.
train.head()

Unnamed: 0,MSSubClass_1,MSSubClass_2,MSSubClass_3,MSSubClass_4,MSSubClass_5,MSSubClass_6,MSSubClass_7,MSSubClass_8,MSSubClass_9,MSSubClass_10,...,GarageType_1,GarageType_2,GarageType_3,GarageType_4,GarageType_5,GarageType_6,GarageType_7,GarageYrBlt,GarageFinish,GarageCars
0,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2003.0,2,2
1,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1976.0,2,2
2,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2001.0,2,2
3,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1998.0,1,3
4,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2000.0,2,3


In [80]:
def mse(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``.

    The two arguments are subtracted from one another, the differences are
    squared, and the squares are averaged with ``np.mean``. ``y_pred`` may be
    a single number, in which case it is compared against every element of
    ``y_true``.
    """
    residuals = y_true - y_pred
    return np.mean(residuals ** 2)

In [81]:
# Error before any split: MSE when the overall mean of y is predicted for every row.
root_error = mse(y, y.mean())

In [82]:
# Display the error before any split.
root_error

0.15945250615661058

In [86]:
# Candidate split on GarageYrBlt: rows with a value above 2003 go left,
# all remaining rows go right.
newer_garage = train.GarageYrBlt > 2003
left = train[newer_garage]
right = train[~newer_garage]

In [91]:
# Mean target (log sale price) of the rows in the left group.
y[left.index].mean()

12.359443710924218

In [94]:
# MSE within each group when that group's own mean is used as the prediction.
right_error = mse(y[right.index], y[right.index].mean())
left_error  = mse(y[left.index], y[left.index].mean())

In [95]:
# Display the MSE of the left group.
left_error

0.11204182589070334

In [96]:
# Display the MSE of the right group.
right_error

0.13780060493057458

In [103]:
# Weight the left-group error by the share of all rows that fall in the left group.
weighted_left_error = (len(left)/len(train)) * left_error

In [104]:
# Weight the right-group error by the share of all rows that fall in the right group.
weighted_right_error = (len(right)/len(train)) * right_error

In [105]:
# Reduction in MSE from the split: the error before splitting minus the
# size-weighted errors of the two groups.
root_error - weighted_left_error - weighted_right_error

0.02657429804255934