In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_log_error

In [2]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [3]:
quantitative = [f for f in train.columns if train.dtypes[f] != 'object']
quantitative.remove('SalePrice')
quantitative.remove('Id')
qualitative = [f for f in train.columns if train.dtypes[f] == 'object']

In [4]:
class TargetOrderEncode:
    def __init__(self):
        pass
    
    def fit_and_transform(self, train, feature):
        self.feature = feature
        self.ordering = pd.DataFrame()
        self.ordering['val'] = train[feature].unique()
        self.ordering.index = self.ordering.val
        self.ordering['spmean'] = train[[feature, 'SalePrice']].groupby(feature).mean()['SalePrice']
        self.ordering = self.ordering.sort_values('spmean')
        self.ordering['ordering'] = range(1, self.ordering.shape[0]+1)
        self.ordering = self.ordering['ordering'].to_dict()

        self.transform(train)

    def transform(self, test):
        for cat, o in self.ordering.items():
            test.loc[test[self.feature] == cat, self.feature+'_E'] = o

In [5]:
qual_encoded = []
for q in qualitative:
    encoder = TargetOrderEncode()
    encoder.fit_and_transform(train, q)
    encoder.transform(test)
    qual_encoded.append(q+'_E')
print(qual_encoded)

['MSZoning_E', 'Street_E', 'Alley_E', 'LotShape_E', 'LandContour_E', 'Utilities_E', 'LotConfig_E', 'LandSlope_E', 'Neighborhood_E', 'Condition1_E', 'Condition2_E', 'BldgType_E', 'HouseStyle_E', 'RoofStyle_E', 'RoofMatl_E', 'Exterior1st_E', 'Exterior2nd_E', 'MasVnrType_E', 'ExterQual_E', 'ExterCond_E', 'Foundation_E', 'BsmtQual_E', 'BsmtCond_E', 'BsmtExposure_E', 'BsmtFinType1_E', 'BsmtFinType2_E', 'Heating_E', 'HeatingQC_E', 'CentralAir_E', 'Electrical_E', 'KitchenQual_E', 'Functional_E', 'FireplaceQu_E', 'GarageType_E', 'GarageFinish_E', 'GarageQual_E', 'GarageCond_E', 'PavedDrive_E', 'PoolQC_E', 'Fence_E', 'MiscFeature_E', 'SaleType_E', 'SaleCondition_E']


In [6]:
log_features = [
    'GrLivArea',
    '1stFlrSF',
    '2ndFlrSF',
    'TotalBsmtSF',
    'LotArea',
    'LotFrontage',
    'KitchenAbvGr',
    'GarageArea'
]

quad_feats = [
    'OverallQual',
    'YearBuilt',
    'YearRemodAdd',
    'TotalBsmtSF',
    '2ndFlrSF',
    'Neighborhood_E',
    'RoofMatl_E',
    'GrLivArea'
]

qdr = [f+'2' for f in quad_feats]

In [7]:
def log_transform(df, feature):
    df[feature] = np.log1p(df[feature].values)

def quadratic(df, feature):
    df[feature+'2'] = df[feature]**2

def bool_encode(train):
    train['HasBasement'] = train['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
    train['HasGarage'] = train['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
    train['Has2ndFloor'] = train['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
    train['HasMasVnr'] = train['MasVnrArea'].apply(lambda x: 1 if x > 0 else 0)
    train['HasWoodDeck'] = train['WoodDeckSF'].apply(lambda x: 1 if x > 0 else 0)
    train['HasPorch'] = train['OpenPorchSF'].apply(lambda x: 1 if x > 0 else 0)
    train['HasPool'] = train['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
    train['IsNew'] = train['YearBuilt'].apply(lambda x: 1 if x > 2000 else 0)

In [8]:
for f in quad_feats:
    quadratic(train, f)
    quadratic(test, f)

for f in log_features:
    log_transform(train, f)
    log_transform(test, f)

bool_encode(train)
bool_encode(test)

boolean = ['HasBasement', 'HasGarage', 'Has2ndFloor', 'HasMasVnr', 'HasWoodDeck',
            'HasPorch', 'HasPool', 'IsNew']

features = quantitative + qual_encoded + boolean + qdr

In [9]:
def error(actual, predicted):
    return np.sqrt(mean_squared_log_error(actual, predicted))

In [10]:
kf = KFold(n_splits=5, shuffle=True, random_state=216)

In [12]:
X = train[features].fillna(0.).values
Y = train['SalePrice'].values

models = []
Ypred = np.zeros_like(Y)

for train_index, val_index in kf.split(X):
    X_tr = X[train_index]
    X_val = X[val_index]
    y_tr = Y[train_index]
    y_val = Y[val_index]
    
    ridge = Ridge()
    ridge.fit(X_tr, np.log(y_tr))
    
    y_pred = np.exp(ridge.predict(X_val))
    Ypred[val_index] = y_pred
    
    print(error(y_val, y_pred))
    
    models.append(ridge)

0.11205509234113975
0.13071949887035056
0.1169484041791938
0.12753968125947115
0.11915025549134804


  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,


In [13]:
error(Y, Ypred)

0.12147756793269066

In [18]:
X_test = test[features].fillna(0.).values

In [25]:
test_preds = []

for m in models:
    test_pred = np.exp(m.predict(X_test))
    test_preds.append(test_pred)

test_preds = np.vstack(test_preds).mean(axis=0)

In [31]:
submission = test[['Id']]

In [32]:
submission['SalePrice'] = test_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['SalePrice'] = test_preds


In [33]:
submission

Unnamed: 0,Id,SalePrice
0,1461,109456.581751
1,1462,167049.524142
2,1463,177811.294942
3,1464,190956.779346
4,1465,201882.252490
...,...,...
1454,2915,90137.933924
1455,2916,83494.887252
1456,2917,171981.050210
1457,2918,119754.129709


In [34]:
submission.to_csv('../output/base_line.csv', index=False)