## House SalePrice Prediction 

### Read the dataset

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
import pandas as pd
df = pd.read_csv('training_set.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Perform basic Data quality checks

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
df.shape

(1460, 81)

In [5]:
s=df.isna().sum()
s[s>0]

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [6]:
df.duplicated().sum()

0

### There are no missing values and duplicated values in this dataset

## Define X and Y features

In [7]:
## Removing statistically insignificant columns for X. Y is our target feaure i.e SalePrice
X = df.drop(columns=['Id','SalePrice'])
Y = df[['SalePrice']]

In [8]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [9]:
Y.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


## Determine categorical and continuous features 

In [10]:
cat = list(X.columns[X.dtypes=='object'])
con = list(X.columns[X.dtypes!='object'])

In [11]:
cat[:5]

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour']

In [12]:
con[:5]

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond']

## Feature Selection Pipeline
Categorical: Ordinal Encoded

Continuous: Standard Scaler

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [14]:
num_pipe1 = Pipeline(steps=[('imputer',SimpleImputer(strategy='mean')),
                            ('scaler',StandardScaler())])

In [15]:
cat_pipe1 = Pipeline(steps=[('imputer',SimpleImputer(strategy='constant',fill_value='NotAvail')),
                            ('ordinal',OrdinalEncoder())])

In [16]:
pre1 = ColumnTransformer([('num',num_pipe1,con),
                          ('cat',cat_pipe1,cat)]).set_output(transform='pandas')

In [17]:
pre1

In [18]:
X_pre1 = pre1.fit_transform(X)
X_pre1.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,0.073375,-0.229372,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.511418,0.575425,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
1,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57441,1.171992,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
2,0.073375,-0.09311,0.07348,0.651479,-0.5172,0.984752,0.830215,0.32306,0.092907,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
3,0.309859,-0.456474,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57441,-0.499274,-0.288653,...,5.0,3.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,0.0
4,0.073375,0.633618,0.375148,1.374795,-0.5172,0.951632,0.733308,1.36457,0.463568,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0


## Checking which feature selector gives us best score : forward selection, backward elimination

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

In [64]:
lr = LinearRegression()
sel = SequentialFeatureSelector(lr,direction='backward',n_features_to_select='auto')

In [65]:
sel.fit_transform(X_pre1,Y)
sel_cols = sel.get_feature_names_out()
print(len(sel_cols),'\n',sel_cols)

40 
 ['num__MSSubClass' 'num__LotArea' 'num__OverallQual' 'num__OverallCond'
 'num__YearBuilt' 'num__MasVnrArea' 'num__BsmtUnfSF' 'num__1stFlrSF'
 'num__2ndFlrSF' 'num__LowQualFinSF' 'num__GrLivArea' 'num__BsmtFullBath'
 'num__TotRmsAbvGrd' 'num__Fireplaces' 'num__GarageCars' 'num__WoodDeckSF'
 'num__EnclosedPorch' 'num__ScreenPorch' 'num__PoolArea' 'cat__Alley'
 'cat__LandContour' 'cat__Neighborhood' 'cat__HouseStyle' 'cat__RoofMatl'
 'cat__Exterior1st' 'cat__MasVnrType' 'cat__ExterQual' 'cat__Foundation'
 'cat__BsmtQual' 'cat__BsmtCond' 'cat__BsmtExposure' 'cat__BsmtFinType2'
 'cat__HeatingQC' 'cat__KitchenQual' 'cat__Functional' 'cat__FireplaceQu'
 'cat__GarageCond' 'cat__PavedDrive' 'cat__Fence' 'cat__SaleCondition']


In [66]:
imp_cols = []
for i in sel_cols:
    s=i.split('__')[1]
    imp_cols.append(s)
imp_cols[:5]

['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt']

## Get the dataframe for features selected 

In [67]:
X_sel = X[imp_cols]
X_sel.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,BsmtUnfSF,1stFlrSF,2ndFlrSF,LowQualFinSF,...,BsmtExposure,BsmtFinType2,HeatingQC,KitchenQual,Functional,FireplaceQu,GarageCond,PavedDrive,Fence,SaleCondition
0,60,8450,7,5,2003,196.0,150,856,854,0,...,No,Unf,Ex,Gd,Typ,,TA,Y,,Normal
1,20,9600,6,8,1976,0.0,284,1262,0,0,...,Gd,Unf,Ex,TA,Typ,TA,TA,Y,,Normal
2,60,11250,7,5,2001,162.0,434,920,866,0,...,Mn,Unf,Ex,Gd,Typ,TA,TA,Y,,Normal
3,70,9550,7,5,1915,0.0,540,961,756,0,...,No,Unf,Gd,Gd,Typ,Gd,TA,Y,,Abnorml
4,60,14260,8,5,2000,350.0,490,1145,1053,0,...,Av,Unf,Ex,Gd,Typ,TA,TA,Y,,Normal


## Get categorical and continous features from this dataframe

In [68]:
cat_sel = list(X_sel.columns[X_sel.dtypes=='object'])
con_sel = list(X_sel.columns[X_sel.dtypes!='object'])
print(f"cat: {cat_sel} '\n' con: {con_sel}")

cat: ['Alley', 'LandContour', 'Neighborhood', 'HouseStyle', 'RoofMatl', 'Exterior1st', 'MasVnrType', 'ExterQual', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageCond', 'PavedDrive', 'Fence', 'SaleCondition'] '
' con: ['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'MasVnrArea', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'WoodDeckSF', 'EnclosedPorch', 'ScreenPorch', 'PoolArea']


## Create a final pipeline for model building

In [69]:
from sklearn.preprocessing import OneHotEncoder

In [70]:
num_pipe2 = Pipeline(steps=[('imputer',SimpleImputer(strategy='mean')),
                            ('scaler',StandardScaler())])

In [71]:
cat_pipe2 = Pipeline(steps=[('imputer',SimpleImputer(strategy='constant',fill_value='NotAvail')),
                            ('ohe',OneHotEncoder(handle_unknown='ignore',sparse_output=False))])

In [72]:
pre2 = ColumnTransformer([('num',num_pipe2,con_sel),
                          ('cat',cat_pipe2,cat_sel)]).set_output(transform='pandas')

In [73]:
pre2

In [74]:
X_sel_pre = pre2.fit_transform(X_sel)
X_sel_pre.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtUnfSF,num__1stFlrSF,num__2ndFlrSF,num__LowQualFinSF,...,cat__Fence_GdWo,cat__Fence_MnPrv,cat__Fence_MnWw,cat__Fence_NotAvail,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
0,0.073375,-0.207142,0.651479,-0.5172,1.050994,0.511418,-0.944591,-0.793434,1.161852,-0.120242,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.872563,-0.091886,-0.071836,2.179628,0.156734,-0.57441,-0.641228,0.25714,-0.795163,-0.120242,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.073375,0.07348,0.651479,-0.5172,0.984752,0.32306,-0.301643,-0.627826,1.189351,-0.120242,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.309859,-0.096897,0.651479,-0.5172,-1.863632,-0.57441,-0.06167,-0.521734,0.937276,-0.120242,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.073375,0.375148,1.374795,-0.5172,0.951632,1.36457,-0.174865,-0.045611,1.617877,-0.120242,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Apply train test split on this dataframe
training 80%

testing 20%

In [75]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X_sel_pre,Y,test_size=0.2,train_size=0.8,random_state=21)

In [76]:
xtrain.shape

(1168, 160)

In [77]:
xtest.shape

(292, 160)

## Build the model

In [78]:
model = LinearRegression()
model.fit(xtrain,ytrain)

In [79]:
model.score(xtrain,ytrain)

0.9213428725295385

In [80]:
model.score(xtest,ytest)

-3.39628265684305e+18

## Forward Selection results:
#### training score: 92.54%     testing score: 80.31%

In [81]:
## Checking adjusted R2 score

def adjr2(model, xtrain, ytrain):
    # r2 score
    r2 = model.score(xtrain,ytrain)
    # Get Number of Rows
    N = xtrain.shape[0]
    # Get number of columns
    p = xtrain.shape[1]
    # Get adjr2 
    num = (1-r2)*(N-1)
    den = N-p-1
    r2a = 1 - num/den
    return r2a 

In [82]:
adjr2(model,xtrain,ytrain)

0.9088452157318485

## Backward Selection results:
#### training score: 92.13%     testing score: -3.39

## Testing score is very less in Backward Elimination. This is an overfitting scenario. Hence, building Ridge and Lasso Regularisation models

In [83]:
def evaluate_model(model, x, y):
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    # Predict results for train and test
    ypred = model.predict(x)
    # Calulate the metrics
    mse = mean_squared_error(y, ypred)
    rmse = mse**(1/2)
    mae = mean_absolute_error(y, ypred)
    r2 = r2_score(y, ypred)
    # Print above results
    print(f'Mean Squared Error : {mse:.2f}')
    print(f'Root Mean Squared Error : {rmse:.2f}')
    print(f'Mean Absolute Error : {mae:.2f}')
    print(f'R2 Score : {r2:.4f}')

In [84]:
evaluate_model(model,xtrain,ytrain)

Mean Squared Error : 486803103.64
Root Mean Squared Error : 22063.61
Mean Absolute Error : 15029.98
R2 Score : 0.9213


In [85]:
evaluate_model(model,xtest,ytest)

Mean Squared Error : 23017667148173353777734090752.00
Root Mean Squared Error : 151715744562564.62
Mean Absolute Error : 8878492413402.60
R2 Score : -3396282656843049984.0000


## Ridge Regularisation

In [86]:
from sklearn.linear_model import Ridge
model2 = Ridge(alpha=2)
model2.fit(xtrain,ytrain)

In [87]:
model2.score(xtrain,ytrain)

0.9040146543618535

In [88]:
model2.score(xtest,ytest)

0.8358030778789745

## Hyperparameter tuning(alpha tuning)

In [90]:
import numpy as np
params = {'alpha': np.arange(start=0.1,stop=100,step=0.1)}

In [91]:
from sklearn.model_selection import GridSearchCV

rr =Ridge()
gscv = GridSearchCV(estimator=rr, param_grid=params,cv=5,scoring='neg_mean_squared_error')
gscv.fit(xtrain,ytrain)

In [92]:
gscv.best_params_

{'alpha': 22.900000000000002}

In [93]:
best_ridge = gscv.best_estimator_
best_ridge

## Evaluate tuned model

In [95]:
best_ridge.score(xtrain,ytrain)

0.8839687715401155

In [96]:
best_ridge.score(xtest,ytest)

0.8310546522328829

## Cross validate r2 for above model

In [97]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(best_ridge,xtrain,ytrain,cv=5,scoring='r2')
scores

array([0.68017036, 0.84636731, 0.90087311, 0.87887193, 0.92388514])

In [99]:
scores.mean()

0.8460335710099027

## Build Lasso model

In [100]:
from sklearn.linear_model import Lasso
ls = Lasso(alpha=5)
ls.fit(xtrain,ytrain)

In [101]:
ls.score(xtrain,ytrain)

0.9212005833863275

In [102]:
ls.score(xtest,ytest)

0.8215414469063469

## Hyperparameter tuning(alpha tuning)

In [109]:
params2 = {'alpha': np.arange(0.1,1000,1)}

In [110]:
ls1 = Lasso()

In [111]:
gscv1 = GridSearchCV(ls1,param_grid=params2,scoring='neg_mean_squared_error')
gscv1.fit(xtrain,ytrain)

In [112]:
gscv1.best_params_

{'alpha': 151.1}

In [113]:
best_lasso = gscv1.best_estimator_
best_lasso

## Evaluate Lasso model

In [114]:
best_lasso.score(xtrain,ytrain)

0.9069729228087539

In [115]:
best_lasso.score(xtest,ytest)

0.8297898226998163

## Get the cross validated r2 score on train

In [116]:
from sklearn.model_selection import cross_val_score
scores1 = cross_val_score(best_lasso,xtrain,ytrain,cv=5,scoring='r2')
scores1

array([0.68149579, 0.86499958, 0.90621675, 0.88262743, 0.93011996])

In [117]:
scores1.mean()

0.8530919036035323

## Evaluate the models with MSE,MAE,RMSE,r2 metrics

In [136]:
 def evaluate_model(model, x, y):
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    # Predict results for train and test
    ypred = model.predict(x)
    # Calulate the metrics
    mse = mean_squared_error(y, ypred)
    rmse = mse**(1/2)
    mae = mean_absolute_error(y, ypred)
    r2 = r2_score(y, ypred)
    # Print above results
    print(f'Mean Squared Error : {mse:.2f}')
    print(f'Root Mean Squared Error : {rmse:.2f}')
    print(f'Mean Absolute Error : {mae:.2f}')
    print(f'R2 Score : {r2:.4f}')

## Ridge model Metrics

In [137]:
evaluate_model(best_ridge,xtrain,ytrain)

Mean Squared Error : 718108631.09
Root Mean Squared Error : 26797.55
Mean Absolute Error : 16257.19
R2 Score : 0.8840


In [138]:
evaluate_model(best_ridge,xtest,ytest)

Mean Squared Error : 1144995329.91
Root Mean Squared Error : 33837.78
Mean Absolute Error : 17961.92
R2 Score : 0.8311


## Lasso model Metrics

In [139]:
evaluate_model(best_lasso,xtrain,ytrain)

Mean Squared Error : 575737652.21
Root Mean Squared Error : 23994.53
Mean Absolute Error : 15850.95
R2 Score : 0.9070


In [140]:
evaluate_model(best_lasso,xtest,ytest)

Mean Squared Error : 1153567474.26
Root Mean Squared Error : 33964.21
Mean Absolute Error : 17507.31
R2 Score : 0.8298


## Ridge and Lasso, both the models are performing good in Metrics 

# Lets perform out of sample predictions

In [119]:
xnew = pd.read_csv('sample_set.csv')
xnew.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [120]:
xnew_pre = pre2.transform(xnew)
xnew_pre.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtUnfSF,num__1stFlrSF,num__2ndFlrSF,num__LowQualFinSF,...,cat__Fence_GdWo,cat__Fence_MnPrv,cat__Fence_MnWw,cat__Fence_NotAvail,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
0,-0.872563,0.110763,-0.795151,0.381743,-0.340077,-0.57441,-0.672923,-0.689929,-0.795163,-0.120242,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.872563,0.37585,-0.071836,0.381743,-0.43944,0.023903,-0.365032,0.430511,-0.795163,-0.120242,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.073375,0.332053,-0.795151,-0.5172,0.852269,-0.57441,-0.974021,-0.607125,0.811239,-0.120242,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.073375,-0.054002,-0.071836,0.381743,0.88539,-0.463612,-0.550672,-0.6123,0.758532,-0.120242,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.492282,-0.552407,1.374795,-0.5172,0.686666,-0.57441,1.018211,0.303718,-0.795163,-0.120242,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Predicting Saleprice using Ridge model and Lasso model

In [129]:
ypreds_1 = best_ridge.predict(xnew_pre)
ypreds_1

array([[120218.11651374],
       [149637.97576739],
       [171483.63023587],
       ...,
       [174416.48251728],
       [109323.15927699],
       [226118.8662548 ]])

In [134]:
xnew['SalePrice_RidgeModel_Predicted'] = ypreds_1
xnew.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice_RidgeModel_Predicted
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,120218.116514
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,149637.975767
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2010,WD,Normal,171483.630236
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,181308.907825
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,0,,,,0,1,2010,WD,Normal,196030.106011


In [126]:
xnew1 = pd.read_csv('sample_set.csv')
xnew1_pre = pre2.transform(xnew1)

In [127]:
ypreds_2 = best_lasso.predict(xnew1_pre)
ypreds_2

array([115231.03381355, 152000.93223498, 173640.43587961, ...,
       166686.88694447, 108927.40198215, 218809.75531552])

In [128]:
xnew1['SalePrice_LassoModel_Predicted'] = ypreds_2
xnew1.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice_LassoModel_Predicted
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,115231.033814
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,152000.932235
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2010,WD,Normal,173640.43588
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,184622.345999
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,0,,,,0,1,2010,WD,Normal,199546.12092


## save the file to csv

In [135]:
xnew.to_csv('RidgePredicted_Results.csv',index=False)
xnew1.to_csv('LassoPredicted_Results.csv',index=False)