In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import math

This Kaggle Competition will evaluate using RMSLE - "Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)"

The Root Mean Squared Log Error (RMSLE) can be defined using a slight modification on sklearn's mean_squared_log_error function, which itself a modification on the familiar Mean Squared Error (MSE) metric.

In [35]:
def root_mean_squared_log_error(y_validations, y_predicted):
    if len(y_predicted) != len(y_validations):
        return 'error: mismatch in number of data points between x and y'
    y_predict_modified = [math.log(i) for i in y_predicted]
    y_validations_modified = [math.log(i) for i in y_validations]

    return mean_squared_error(y_validations_modified, y_predict_modified, squared=False)


In [36]:
df_train = pd.read_csv('./input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('./input/house-prices-advanced-regression-techniques/test.csv')

In [37]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [38]:
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [39]:
df_train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [40]:
# Get the overall concise summary of the DataFrame
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [41]:
# Lets check what are the numerical features in the df_train dataset
numeric_features = df_train.select_dtypes(include=[np.number])
numeric_features.columns

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [42]:
# numeric features head
numeric_features.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000


In [43]:
# Now a general description of SalePrice
# Now, as we will be predicting 'SalePrice' lets see description of that column
df_train['SalePrice'].describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [44]:
top_correlations = df_train.corr()
top_feature_columns = top_correlations['SalePrice'][top_correlations['SalePrice'].values > 0.2].index.values
top_feature_columns

array(['LotFrontage', 'LotArea', 'OverallQual', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath',
       'FullBath', 'HalfBath', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'SalePrice'], dtype=object)

In [45]:
# Handling Missing Values for 19 features which have missing values mentioned above
df_train['GarageYrBlt'] = df_train['GarageYrBlt'].fillna(0)
# filling in missing GarageYrBuilt values with zeros.  
# But this may not be the most logical approach - refer to this discussion below for mor perspective
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/discussion/60143

# similary fillingup na valuse for couple of other features
df_train['LotFrontage'] = df_train.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
df_train['MasVnrArea'] = df_train['MasVnrArea'].fillna(0)

In [46]:
# Checking out missing data
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum() / df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
PoolQC,1453,0.995205
MiscFeature,1406,0.963014
Alley,1369,0.937671
Fence,1179,0.807534
FireplaceQu,690,0.472603
GarageType,81,0.055479
GarageCond,81,0.055479
GarageFinish,81,0.055479
GarageQual,81,0.055479
BsmtExposure,38,0.026027


## And we will use the famous LinearRegression() function of scikit-learn to implement this in a while.
First some data pre-procession.


In [47]:

train_target_label = df_train['SalePrice']
# labels are dependent variables whose values are to be predicted. 
top_feature_columns_modified = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'YearBuilt', 'FullBath', 'OpenPorchSF', 
        'WoodDeckSF', '2ndFlrSF', 'YearRemodAdd', 'MasVnrArea', 'LotFrontage', 'LotArea']

training_sample_df = df_train[top_feature_columns_modified]
test_sample_df = df_test[top_feature_columns_modified]

training_sample_df.head()

Unnamed: 0,OverallQual,GrLivArea,GarageCars,TotalBsmtSF,YearBuilt,FullBath,OpenPorchSF,WoodDeckSF,2ndFlrSF,YearRemodAdd,MasVnrArea,LotFrontage,LotArea
0,7,1710,2,856,2003,2,61,0,854,2003,196.0,65.0,8450
1,6,1262,2,1262,1976,2,0,298,0,1976,0.0,80.0,9600
2,7,1786,2,920,2001,2,42,0,866,2002,162.0,68.0,11250
3,7,1717,3,756,1915,1,35,0,756,1970,0.0,60.0,9550
4,8,2198,3,1145,2000,2,84,192,1053,2000,350.0,84.0,14260


In [48]:
test_sample_df.head()

Unnamed: 0,OverallQual,GrLivArea,GarageCars,TotalBsmtSF,YearBuilt,FullBath,OpenPorchSF,WoodDeckSF,2ndFlrSF,YearRemodAdd,MasVnrArea,LotFrontage,LotArea
0,5,896,1.0,882.0,1961,1,0,140,0,1961,0.0,80.0,11622
1,6,1329,1.0,1329.0,1958,1,36,393,0,1958,108.0,81.0,14267
2,5,1629,2.0,928.0,1997,2,34,212,701,1998,0.0,74.0,13830
3,6,1604,2.0,926.0,1998,2,36,360,678,1998,20.0,78.0,9978
4,8,1280,2.0,1280.0,1992,2,82,0,0,1992,0.0,43.0,5005


Do some data pre-processing by replacing all null values with median
using scikit-learn's Imputer class.

The strategy parameter is optional and takes a string, (default="mean")

If "mean", then replace missing values using the mean along the axis.
If "median", then replace missing values using the median along the axis.
If "most_frequent", then replace missing using the most frequent value along the axis.

In [49]:
imputer = SimpleImputer(strategy = 'median')

# During fit() the imputer learns about the mean, median etc of the data,
# which is then applied to the missing values during transform().
imputer.fit(training_sample_df)
imputer.fit(test_sample_df)

# Note - sklearn.preprocessing.Imputer.fit_transform returns a new array, it doesn't alter the argument array.
training_sample_df = imputer.transform(training_sample_df)
test_sample_df = imputer.transform(test_sample_df)

Previously we standardized the data with manual plain-python implement
#### Now we will use sklearn function

In [50]:
scaler = StandardScaler()

# Again, during fit() the imputer learns about the mean, median etc of the data,
# which is then applied to the missing values during transform().
scaler.fit(training_sample_df)
scaler.fit(test_sample_df)

training_sample_df = scaler.transform(training_sample_df)
test_sample_df = scaler.transform(test_sample_df)

In [51]:
# Syntax of train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(training_sample_df, train_target_label, random_state=42, train_size = 0.8 )

X_train.shape

(1168, 13)

In [52]:
X_test.shape

(292, 13)

In [53]:
training_sample_df.shape

(1460, 13)

In [54]:
test_sample_df.shape

(1459, 13)

### Now finally run a Linear Regression model.

In [55]:
model = LinearRegression()
# we have to fit this model to our data, in other words, we have to make it “learn” using our training data.
# The syntax of the fit function is just plain model.fit(X_train, Y_train)
model.fit(training_sample_df, train_target_label)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [56]:
print('Intercept is ', model.intercept_)

# For retrieving the slope (coefficient of x). This will be an array of values.
print("Slope i.e. coefficient of x is ", model.coef_)

Intercept is  177921.01851212673
Slope i.e. coefficient of x is  [28196.91947993 26349.76648831  9645.4413169   7658.2055073
  6637.47089276 -2965.55939821   768.02475758  4437.72171596
 -4227.6251946   6888.80707764  5992.62389513   579.10402749
  2935.94808718]


In [57]:
# regression = model.fit(training_sample_df, train_target_label)
print("Regression score is", model.score(training_sample_df, train_target_label))
print('train_target_label is ', train_target_label)

Regression score is 0.7858973705366121
train_target_label is  0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64


Now that my model is trained with the training data set, I can now start testing the model with the testing dataset to Predict Sales Price from Test data set.
The y_predict is a numpy array that contains all the predicted values for the input values in the X_test series.

The general syntax is for making predictions on the test data, I have to execute the following form of:

```python
regressor = LinearRegression()
y_predition_from_model = regressor.predict(X_test)
```

In [58]:
y_predict = model.predict(X_test)

In [59]:
rmse_simple_linear = np.sqrt(metrics.mean_squared_error(y_test, y_predict))
print('Root Mean Square Error is ', rmse_simple_linear)

# MSE_Log
rmse_log_simple_linear = np.sqrt(mean_squared_log_error(y_test, y_predict))
print('Root Mean Square Log Error is ', rmse_log_simple_linear)


Root Mean Square Error is  36992.39012928462
Root Mean Square Log Error is  0.26703294966342594


In [60]:
prediction_on_test_data = model.predict(test_sample_df)
# print("Regression score on test sample data is", model.score(test_sample_df, train_target_label ))
testID = df_test['Id']
predict_submission = pd.DataFrame()
predict_submission['ID'] = testID
predict_submission['SalePrice'] = prediction_on_test_data
predict_submission

Unnamed: 0,ID,SalePrice
0,1461,108125.218262
1,1462,171809.395377
2,1463,172711.837537
3,1464,195073.734546
4,1465,209815.191496
...,...,...
1454,2915,68025.514633
1455,2916,80707.019360
1456,2917,174307.976512
1457,2918,114170.511870


### Now I will do Ridge Regression

In [61]:
model_ridge = Ridge(alpha=0.5)
model_ridge.fit(training_sample_df, train_target_label)
y_predict_ridge = model_ridge.predict(X_test)

mse_linear_ridge = np.sqrt(metrics.mean_squared_error(y_test, y_predict_ridge))
print('MSE of Linear Ridge is ', mse_linear_ridge)

# Log Error
mse_log_linear_ridge = np.sqrt(mean_squared_log_error(y_test, y_predict_ridge))
print('MSE Log of Linear Ridge is ', mse_log_linear_ridge)

MSE of Linear Ridge is  36992.54832288
MSE Log of Linear Ridge is  0.26682113387130174


#### And now MSE Calculation for Lasso Regression

In [62]:
model_lasso = Lasso(alpha=33)
model_lasso.fit(training_sample_df, train_target_label)
y_predict_lasso = model_lasso.predict(X_test)

mse_linear_lasso = np.sqrt(metrics.mean_squared_error(y_test, y_predict_lasso))
print('MSE of Lasso Regression is ', mse_linear_lasso)

# Log Error
mse_log_linear_lasso = np.sqrt(mean_squared_log_error(y_test, y_predict_lasso))
print('MSE Log of Lasso Regression is ', mse_log_linear_lasso)

MSE of Lasso Regression is  37002.420402536394
MSE Log of Lasso Regression is  0.2665108348893785


##### Now we will be using Random Forest

In [63]:
RFR = RandomForestRegressor(max_depth=50)
RFR.fit(training_sample_df, train_target_label)

y_predict_random_forest = RFR.predict(X_test)

mse_random_forest = np.sqrt(metrics.mean_squared_error(y_test, y_predict_random_forest))
print('MSE Random Forest is ', mse_random_forest)

# Log Error
mse_log_random_forest = np.sqrt(mean_squared_log_error(y_test, y_predict_random_forest))
print('MSE Log Random Forest is ', mse_log_random_forest)

MSE Random Forest is  10597.67973713408
MSE Log Random Forest is  0.06849030791233557


## Now final predictions for submissions

In [64]:
# Y_test_predicted_for_submission = RFR.predict(df_test)
# Y_test_predicted_for_submission = RFR.predict(X_test)
Y_test_predicted_for_submission = RFR.predict(test_sample_df)

indexes = np.arange(df_test.shape[0]+2, 2*df_test.shape[0]+2)
print('Indexex ', indexes)

# output_for_submission = pd.DataFrame({'Id': test_sample_df.Id,
#                        'SalePrice': Y_test_predicted_for_submission})

# output_for_submission = pd.DataFrame({'Id': indexes,
#                        'SalePrice': Y_test_predicted_for_submission})

# output_for_submission = pd.DataFrame({'Id': df_test.Id,
#                        'SalePrice': Y_test_predicted_for_submission})

# print(output_for_submission)

# *****************
testID = df_test['Id']
output_for_submission = pd.DataFrame()
output_for_submission['ID'] = testID
output_for_submission['SalePrice'] = Y_test_predicted_for_submission
output_for_submission.to_csv('submission.csv', index=False)
# The above gave me a rank of 3000+ in the Leaderboard which is 68% from Top and a score of 0.15
# So thats not very encouraging, so lets get back to improving the model


Indexex  [1461 1462 1463 ... 2917 2918 2919]


In [65]:
### More Feature Engineering after the above relatively large MSE

In [66]:
all_features_except_target = [i for i in df_train.columns if i not in ['SalePrice'] ]
features_X = df_train[all_features_except_target]
y = df_train['SalePrice']


X_feature_engineering = features_X.copy()

# To measure how recently the House was re-modified - calculate a column by subtracting YearBuilt from YearRemodAdd.
# The notes on the data says - YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)
X_feature_engineering['years_since_update'] = X_feature_engineering['YearRemodAdd'] - X_feature_engineering['YearBuilt']
X_feature_engineering['garage_value'] = X_feature_engineering['YearBuilt'] * X_feature_engineering['GarageCars']
# X_feature_engineering['misc_value'] = X_feature_engineering['Fireplaces'] + X_feature_engineering['OverallQual']

X_feature_engineering = X_feature_engineering.drop(columns=['GarageCars'])

feature_numerical_cols = [col_name for col_name in X_feature_engineering.columns if
                X_feature_engineering[col_name].dtype in ['int64', 'float64']]

# Pandas dataframe.nunique() function - return Series with number of distinct observations over requested axis. If we set the value of axis to be 0, then it finds the total number of unique observations over the index axis. If we set the value of axis to be 1, then it find the total number of unique observations over the column axis. It also provides the feature to exclude the NaN values from the count of unique numbers.
feature_categorical_cols = [col_name for col_name in X_feature_engineering.columns if
                    X_feature_engineering[col_name].nunique() < 50 and
                    X_feature_engineering[col_name].dtype in ['object', 'bool']]


feature_numerical_transformer = SimpleImputer(strategy='constant')

feature_categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

feature_preprocessor = ColumnTransformer(
    transformers=[
        ('num', feature_numerical_transformer, feature_numerical_cols),
        ('cat', feature_categorical_transformer, feature_categorical_cols)
])

feature_model = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=0.0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.02, max_delta_step=0, max_depth=4,
             min_child_weight=0.0, monotone_constraints='()',
             n_estimators=1250, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
             tree_method='exact', validate_parameters=1, verbosity=None)

feature_clf = Pipeline(steps=[('feature_preprocessor', feature_preprocessor),
                      ('feature_model', feature_model)
                     ])

feature_X_train, feature_X_valid, feature_y_train, feature_y_valid = train_test_split(X_feature_engineering, y, random_state=0)

feature_clf.fit(feature_X_train, feature_y_train, feature_model__verbose=False)
feature_preds = feature_clf.predict(feature_X_valid)

print('RMSLE:', root_mean_squared_log_error(feature_y_valid, feature_preds))

RMSLE: 0.12255286983099915


ValueError: Number of features of the input must be equal to or greater than that of the fitted transformer. Transformer n_features is 81 and input n_features is 13.

In [None]:
X_test = pd.read_csv('./input/house-prices-advanced-regression-techniques/test.csv')

X_test['years_since_update'] = X_test['YearRemodAdd'] - X_feature_engineering['YearBuilt']
X_test['garage_value'] = X_test['YearBuilt'] * X_test['GarageCars']

X_test = X_test.drop(columns=['GarageCars'])

feature_clf.fit(X_feature_engineering, y, feature_model__verbose=False)

preds = feature_clf.predict(X_test)
output = pd.DataFrame({'Id': X_test.Id,
                       'SalePrice': preds})
output.to_csv('submission.csv', index=False)



