# Kaggle Submission

In [1]:
#imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations

from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the Kaggle test set and normalise the column names to snake_case
# (lower-case, spaces replaced by underscores).
kaggle = pd.read_csv('datasets/test.csv')
kaggle.columns = kaggle.columns.str.lower().str.replace(' ', '_')

In [3]:
# Keep only the features chosen for modelling. An explicit copy is taken so the
# in-place edits made to kaggle_choice in later cells never touch `kaggle`.
kaggle_choice = kaggle.loc[:, [
    # numeric features
    'lot_frontage', 'lot_area', 'mas_vnr_area',
    'total_bsmt_sf', 'gr_liv_area', 'bsmt_full_bath',
    'bsmt_half_bath', 'full_bath', 'half_bath', 'bedroom_abvgr',
    'kitchen_abvgr', 'totrms_abvgrd', 'fireplaces', 'garage_area',
    'wood_deck_sf', 'open_porch_sf', 'enclosed_porch', '3ssn_porch',
    'screen_porch', 'pool_area', 'misc_val',
    # nominal features
    'ms_subclass', 'ms_zoning', 'street', 'alley',
    'land_contour', 'lot_config', 'utilities',
    'bldg_type', 'house_style', 'roof_style',
    'roof_matl', 'exterior_1st', 'mas_vnr_type',
    'foundation', 'heating', 'central_air',
    'garage_type', 'misc_feature', 'sale_type',
    # ordinal features ('neighborhood' is nominal; its position is kept as-is)
    'lot_shape', 'land_slope', 'overall_qual', 'overall_cond',
    'exter_qual', 'exter_cond', 'bsmt_qual',
    'bsmt_cond', 'bsmt_exposure', 'heating_qc',
    'electrical', 'kitchen_qual', 'functional', 'fireplace_qu',
    'garage_finish', 'garage_qual', 'neighborhood',
    'garage_cond', 'paved_drive',
    'pool_qc', 'fence',
    # year columns, converted to ages in the next cell
    'year_built', 'year_remod/add',
]].copy()

In [4]:
# Derive the age of each house and the years since its last remodel/addition,
# then drop the raw year columns they were computed from.
# NOTE(review): the reference year is hard-coded; presumably the training
# features were built with the same value - confirm before changing it.
REFERENCE_YEAR = 2020
kaggle_choice['years_since_built'] = REFERENCE_YEAR - kaggle_choice['year_built']
kaggle_choice['years_since_remod'] = REFERENCE_YEAR - kaggle_choice['year_remod/add']
kaggle_choice.drop(columns=['year_built', 'year_remod/add'], inplace=True)

In [5]:
# Numeric features only, selected from the raw `kaggle` frame.
kaggle_numerical = kaggle.loc[:, [
    'lot_frontage', 'lot_area', 'mas_vnr_area',
    'total_bsmt_sf', 'gr_liv_area', 'bsmt_full_bath',
    'bsmt_half_bath', 'full_bath', 'half_bath', 'bedroom_abvgr',
    'kitchen_abvgr', 'totrms_abvgrd', 'fireplaces', 'garage_area',
    'wood_deck_sf', 'open_porch_sf', 'enclosed_porch', '3ssn_porch',
    'screen_porch', 'pool_area', 'misc_val',
]]

# Count the missing values in each numeric column.
kaggle_numerical.isna().sum()

lot_frontage      160
lot_area            0
mas_vnr_area        1
total_bsmt_sf       0
gr_liv_area         0
bsmt_full_bath      0
bsmt_half_bath      0
full_bath           0
half_bath           0
bedroom_abvgr       0
kitchen_abvgr       0
totrms_abvgrd       0
fireplaces          0
garage_area         0
wood_deck_sf        0
open_porch_sf       0
enclosed_porch      0
3ssn_porch          0
screen_porch        0
pool_area           0
misc_val            0
dtype: int64

In [6]:
# Impute missing 'lot_frontage' values in the main kaggle_choice df with the
# mean frontage of the rows that share the same 'ms_subclass'.
# NOTE(review): the group means are computed from this (test) frame itself;
# confirm that this matches how the training data was imputed.
fr_mean = kaggle_choice.groupby('ms_subclass')['lot_frontage'].mean()

# Look up each row's group mean by its 'ms_subclass' and use it only where
# 'lot_frontage' is NaN. This replaces the row-by-row .loc loop and does not
# depend on the index labels being unique.
kaggle_choice['lot_frontage'] = kaggle_choice['lot_frontage'].fillna(
    kaggle_choice['ms_subclass'].map(fr_mean)
)

In [7]:
# Sanity check: number of 'lot_frontage' values still missing after imputation.
kaggle_choice['lot_frontage'].isna().sum()

0

In [8]:
# Treat a missing masonry veneer area as "no veneer": replace NaN with 0.
kaggle_choice['mas_vnr_area'] = kaggle_choice['mas_vnr_area'].fillna(0)

In [9]:
# Sanity check: number of 'mas_vnr_area' values still missing after the fill.
kaggle_choice['mas_vnr_area'].isna().sum()

0

In [10]:
# Encode the ordinal (ranked) categorical features as integers.
# Each table maps a category label to its rank; any label that is not listed
# (including NaN) is encoded as 0.
# CAUTION: this cell is not idempotent - running it a second time on the
# already-encoded columns would turn every value into 0.
ordinal_scales = {
    'fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1},
    # NOTE(review): 'Y' and 'P' both map to 1, which makes this a binary flag
    # rather than a 3-level rank - confirm this matches the training pipeline.
    'paved_drive': {'Y': 1, 'P': 1},
    'garage_finish': {'Fin': 3, 'RFn': 2, 'Unf': 1},
    'electrical': {'SBrkr': 4, 'FuseA': 3, 'FuseF': 2, 'FuseP': 1},
    'bsmt_exposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1},
    'lot_shape': {'Reg': 4, 'IR1': 3, 'IR2': 2, 'IR3': 1},
    'land_slope': {'Gtl': 3, 'Mod': 2, 'Sev': 1},
    'functional': {
        'Typ': 8, 'Min1': 7, 'Min2': 6, 'Mod': 5,
        'Maj1': 4, 'Maj2': 3, 'Sev': 2, 'Sal': 1,
    },
}

# Features that all share the same Ex/Gd/TA/Fa/Po quality scale.
quality_scale = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
ord_feature = ['exter_qual', 'bsmt_qual', 'bsmt_cond', 'heating_qc',
               'kitchen_qual', 'fireplace_qu', 'garage_qual', 'garage_cond',
               'pool_qc', 'exter_cond'
              ]
for feature in ord_feature:
    ordinal_scales[feature] = quality_scale

for feature, scale in ordinal_scales.items():
    # Series.map yields NaN for labels missing from the table; those become 0.
    kaggle_choice[feature] = kaggle_choice[feature].map(scale).fillna(0).astype('int64')

In [11]:
# Sanity check: every ordinal feature should now be an integer column with no
# missing values.
kaggle_ordinal = kaggle_choice.loc[:, [
    'lot_shape', 'land_slope', 'overall_qual', 'overall_cond',
    'exter_qual', 'exter_cond', 'bsmt_qual',
    'bsmt_cond', 'bsmt_exposure', 'heating_qc',
    'electrical', 'kitchen_qual', 'functional', 'fireplace_qu',
    'garage_finish', 'garage_qual',
    'garage_cond', 'paved_drive',
    'pool_qc', 'fence',
]]

kaggle_choice[kaggle_ordinal.columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   lot_shape      878 non-null    int64
 1   land_slope     878 non-null    int64
 2   overall_qual   878 non-null    int64
 3   overall_cond   878 non-null    int64
 4   exter_qual     878 non-null    int64
 5   exter_cond     878 non-null    int64
 6   bsmt_qual      878 non-null    int64
 7   bsmt_cond      878 non-null    int64
 8   bsmt_exposure  878 non-null    int64
 9   heating_qc     878 non-null    int64
 10  electrical     878 non-null    int64
 11  kitchen_qual   878 non-null    int64
 12  functional     878 non-null    int64
 13  fireplace_qu   878 non-null    int64
 14  garage_finish  878 non-null    int64
 15  garage_qual    878 non-null    int64
 16  garage_cond    878 non-null    int64
 17  paved_drive    878 non-null    int64
 18  pool_qc        878 non-null    int64
 19  fence   

In [12]:
# Nominal (unordered) categorical features that need one-hot encoding.
feature_nominal = [
    'ms_subclass', 'ms_zoning', 'street', 'alley',
    'land_contour', 'lot_config', 'utilities',
    'bldg_type', 'house_style', 'roof_style',
    'roof_matl', 'exterior_1st', 'mas_vnr_type',
    'foundation', 'heating', 'central_air',
    'garage_type', 'misc_feature', 'sale_type',
    'neighborhood',
]

# Separate frame holding only the nominal columns, used for a trial encoding
# that leaves kaggle_choice untouched.
kaggle_nominal_test = kaggle_choice.loc[:, feature_nominal].copy()

In [13]:
def manual_dummies(DataFrame, feature_list):
    """One-hot encode nominal features in place, leaving out one level per feature.

    For every feature in ``feature_list`` a 0/1 column named
    ``<feature>_<value>`` is added for each distinct non-null value except the
    last entry of ``value_counts()`` (the least frequent value), which is left
    out as the reference level and reported with ``print``. The original
    feature column is then removed from the frame.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to encode. It is modified in place.
    feature_list : iterable
        Names of the columns to encode.

    Returns
    -------
    pandas.Index
        The columns of the modified frame (not the frame itself).
    """
    for feature in feature_list:
        # value_counts() orders values by descending frequency and skips NaN.
        nominal_values = list(DataFrame[feature].value_counts().index)
        if not nominal_values:
            # Every value is missing: there is no level to encode or to drop,
            # so just remove the column instead of failing on an empty list.
            print(f"'{feature}' has no non-null values; no dummy columns were created.")
            DataFrame.drop(columns=feature, inplace=True)
            continue
        for value in nominal_values[:-1]:
            # Rows whose value is NaN compare unequal and therefore get 0.
            DataFrame[f'{feature}_{value}'] = DataFrame[feature].eq(value).astype('int64')
        print(f"'{nominal_values[-1]}' was dropped from '{feature}'.")
        DataFrame.drop(columns=feature, inplace=True)
    return DataFrame.columns

In [14]:
# Trial encoding on the nominal-only frame to see which category is dropped
# from each feature in the test data.
manual_dummies(kaggle_nominal_test, feature_nominal)

'40' was dropped from 'ms_subclass'.
'I (all)' was dropped from 'ms_zoning'.
'Grvl' was dropped from 'street'.
'Pave' was dropped from 'alley'.
'Low' was dropped from 'land_contour'.
'FR3' was dropped from 'lot_config'.
'NoSewr' was dropped from 'utilities'.
'2fmCon' was dropped from 'bldg_type'.
'2.5Fin' was dropped from 'house_style'.
'Shed' was dropped from 'roof_style'.
'Roll' was dropped from 'roof_matl'.
'AsphShn' was dropped from 'exterior_1st'.
'CBlock' was dropped from 'mas_vnr_type'.
'Wood' was dropped from 'foundation'.
'Floor' was dropped from 'heating'.
'N' was dropped from 'central_air'.
'CarPort' was dropped from 'garage_type'.
'Gar2' was dropped from 'misc_feature'.
'VWD' was dropped from 'sale_type'.
'Blueste' was dropped from 'neighborhood'.


Index(['ms_subclass_20', 'ms_subclass_60', 'ms_subclass_50', 'ms_subclass_120',
       'ms_subclass_160', 'ms_subclass_30', 'ms_subclass_70', 'ms_subclass_90',
       'ms_subclass_80', 'ms_subclass_85',
       ...
       'neighborhood_NoRidge', 'neighborhood_ClearCr', 'neighborhood_SWISU',
       'neighborhood_MeadowV', 'neighborhood_StoneBr', 'neighborhood_BrDale',
       'neighborhood_Veenker', 'neighborhood_Blmngtn', 'neighborhood_NPkVill',
       'neighborhood_Greens'],
      dtype='object', length=116)

Comparing the categories dropped above with those dropped from the train dataset, we see that for several features a different category was dropped.


Columns previously dropped from Train set

'150' was dropped from 'ms_subclass'.
'I (all)' was dropped from 'ms_zoning'.
'Grvl' was dropped from 'street'.
'Pave' was dropped from 'alley'.
'Low' was dropped from 'land_contour'.
'FR3' was dropped from 'lot_config'.
'NoSewr' was dropped from 'utilities'.
'2fmCon' was dropped from 'bldg_type'.
'2.5Fin' was dropped from 'house_style'.
'Shed' was dropped from 'roof_style'.
'ClyTile' was dropped from 'roof_matl'.
'AsphShn' was dropped from 'exterior_1st'.
'BrkCmn' was dropped from 'mas_vnr_type'.
'Wood' was dropped from 'foundation'.
'OthW' was dropped from 'heating'.
'N' was dropped from 'central_air'.
'CarPort' was dropped from 'garage_type'.
'Elev' was dropped from 'misc_feature'.
'Oth' was dropped from 'sale_type'.
'Landmrk' was dropped from 'neighborhood'.

Categories dropped from the train set that differ from the ones dropped from the test set:

'150' was dropped from 'ms_subclass'.
'ClyTile' was dropped from 'roof_matl'.
'BrkCmn' was dropped from 'mas_vnr_type'.
'OthW' was dropped from 'heating'.
'Elev' was dropped from 'misc_feature'.
'Oth' was dropped from 'sale_type'.
'Landmrk' was dropped from 'neighborhood'.

In [15]:
#Check whether the categories dropped from the train dataset exist in the test dataset.
kaggle['roof_matl'].value_counts() #Checking for 'ClyTile': not present in the test set.

CompShg    861
Tar&Grv      8
WdShake      5
WdShngl      2
Metal        1
Roll         1
Name: roof_matl, dtype: int64

In [16]:
kaggle['ms_subclass'].value_counts() #Checking for '150': not present in the test set.

20     309
60     180
50      89
120     60
160     41
30      38
70      38
90      34
80      32
85      20
190     15
45       7
75       7
180      6
40       2
Name: ms_subclass, dtype: int64

In [17]:
kaggle['mas_vnr_type'].value_counts() #Checking for 'BrkCmn': present in the test set.

None       534
BrkFace    250
Stone       80
BrkCmn      12
CBlock       1
Name: mas_vnr_type, dtype: int64

In [18]:
kaggle['heating'].value_counts() #Checking for 'OthW': not present in the test set.

GasA     866
GasW       7
Grav       4
Floor      1
Name: heating, dtype: int64

In [19]:
kaggle['misc_feature'].value_counts() #Checking for 'Elev': not present in the test set.

Shed    39
Othr     1
Gar2     1
Name: misc_feature, dtype: int64

In [20]:
kaggle['sale_type'].value_counts() #Checking for 'Oth': present in the test set.

WD       755
New       78
COD       24
ConLD      9
Oth        3
ConLw      3
CWD        2
ConLI      2
Con        1
VWD        1
Name: sale_type, dtype: int64

In [21]:
kaggle['neighborhood'].value_counts() #Checking for 'Landmrk': not present in the test set.

NAmes      133
CollgCr     87
OldTown     76
Somerst     52
Edwards     50
Gilbert     49
NridgHt     44
NWAmes      44
Sawyer      40
SawyerW     38
Mitchel     32
Crawfor     32
BrkSide     32
Timber      24
IDOTRR      24
NoRidge     23
ClearCr     17
SWISU       16
MeadowV     13
StoneBr     13
BrDale      11
Veenker      7
Blmngtn      6
NPkVill      6
Greens       5
Blueste      4
Name: neighborhood, dtype: int64

In [22]:
# Features whose train-set reference category is absent from the test set get
# a dummy column for every category (nothing is dropped).
# Features whose train-set reference category is present in the test set are
# also fully encoded here; that reference dummy column is dropped afterwards.
def manual_dummies_no_drop(DataFrame, feature_list):
    """One-hot encode nominal features in place without dropping any level.

    For every feature in ``feature_list`` a 0/1 column named
    ``<feature>_<value>`` is added for each distinct non-null value, and the
    original feature column is then removed from the frame. Nothing is printed.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame to encode. It is modified in place.
    feature_list : iterable
        Names of the columns to encode.

    Returns
    -------
    pandas.Index
        The columns of the modified frame (not the frame itself).
    """
    for feature in feature_list:
        # value_counts() orders values by descending frequency and skips NaN.
        nominal_values = list(DataFrame[feature].value_counts().index)
        for value in nominal_values:
            # Rows whose value is NaN compare unequal and therefore get 0.
            DataFrame[f'{feature}_{value}'] = DataFrame[feature].eq(value).astype('int64')
        DataFrame.drop(columns=feature, inplace=True)
    return DataFrame.columns

In [23]:
# Nominal features for which train and test dropped the same category:
# these are encoded with one level dropped.
feature_nominal_w_drop = [
    'ms_zoning',
    'street',
    'alley',
    'land_contour',
    'lot_config',
    'utilities',
    'bldg_type',
    'house_style',
    'roof_style',
    'exterior_1st',
    'foundation',
    'central_air',
    'garage_type',
]

# Nominal features for which train and test dropped different categories:
# these are encoded without dropping a level.
feature_nominal_wo_drop = [
    'ms_subclass',
    'roof_matl',
    'mas_vnr_type',
    'heating',
    'misc_feature',
    'sale_type',
    'neighborhood',
]

In [24]:
# Encode (in place, on kaggle_choice) the features listed in feature_nominal_w_drop,
# dropping one category per feature.
manual_dummies(kaggle_choice, feature_nominal_w_drop)

'I (all)' was dropped from 'ms_zoning'.
'Grvl' was dropped from 'street'.
'Pave' was dropped from 'alley'.
'Low' was dropped from 'land_contour'.
'FR3' was dropped from 'lot_config'.
'NoSewr' was dropped from 'utilities'.
'2fmCon' was dropped from 'bldg_type'.
'2.5Fin' was dropped from 'house_style'.
'Shed' was dropped from 'roof_style'.
'AsphShn' was dropped from 'exterior_1st'.
'Wood' was dropped from 'foundation'.
'N' was dropped from 'central_air'.
'CarPort' was dropped from 'garage_type'.


Index(['lot_frontage', 'lot_area', 'mas_vnr_area', 'total_bsmt_sf',
       'gr_liv_area', 'bsmt_full_bath', 'bsmt_half_bath', 'full_bath',
       'half_bath', 'bedroom_abvgr',
       ...
       'foundation_CBlock', 'foundation_BrkTil', 'foundation_Slab',
       'foundation_Stone', 'central_air_Y', 'garage_type_Attchd',
       'garage_type_Detchd', 'garage_type_BuiltIn', 'garage_type_Basment',
       'garage_type_2Types'],
      dtype='object', length=104)

In [25]:
# Encode (in place, on kaggle_choice) the features listed in feature_nominal_wo_drop,
# keeping a dummy column for every category.
manual_dummies_no_drop(kaggle_choice, feature_nominal_wo_drop)

Index(['lot_frontage', 'lot_area', 'mas_vnr_area', 'total_bsmt_sf',
       'gr_liv_area', 'bsmt_full_bath', 'bsmt_half_bath', 'full_bath',
       'half_bath', 'bedroom_abvgr',
       ...
       'neighborhood_ClearCr', 'neighborhood_SWISU', 'neighborhood_MeadowV',
       'neighborhood_StoneBr', 'neighborhood_BrDale', 'neighborhood_Veenker',
       'neighborhood_Blmngtn', 'neighborhood_NPkVill', 'neighborhood_Greens',
       'neighborhood_Blueste'],
      dtype='object', length=166)

In [26]:
# 'BrkCmn' and 'Oth' were the categories dropped from the train set for
# 'mas_vnr_type' and 'sale_type'; both occur in the test set, so their dummy
# columns are removed here.
train_reference_dummies = ['mas_vnr_type_BrkCmn', 'sale_type_Oth']
kaggle_choice.drop(columns=train_reference_dummies, inplace=True)

In [27]:
# Drop garage_cond, pool_area, fireplaces and totrms_abvgrd together with the
# bldg_type and house_style dummy columns.
excluded_features = [
    'garage_cond', 'pool_area', 'fireplaces', 'totrms_abvgrd',
    'bldg_type_1Fam', 'bldg_type_TwnhsE', 'bldg_type_Duplex',
    'bldg_type_Twnhs', 'house_style_1Story', 'house_style_2Story',
    'house_style_1.5Fin', 'house_style_SLvl', 'house_style_SFoyer',
    'house_style_2.5Unf', 'house_style_1.5Unf',
]
kaggle_choice.drop(columns=excluded_features, inplace=True)

In [28]:
# Drop the features whose Lasso coefficient was zero.
# NOTE(review): 'sale_type_WD ' ends with a space - presumably it mirrors the
# raw category label, so it must not be stripped; confirm against the data.
lasso_zero_coef_features = [
    'lot_frontage', 'garage_type_BuiltIn', 'garage_type_Detchd', 'garage_type_Attchd', 'central_air_Y',
    'heating_Grav', 'heating_GasW', 'heating_GasA', 'garage_type_2Types', 'foundation_Stone', 'foundation_BrkTil',
    'foundation_CBlock', 'mas_vnr_type_Stone', 'mas_vnr_type_BrkFace', 'mas_vnr_type_None',
    'foundation_Slab', 'sale_type_ConLD',
    'neighborhood_Veenker', 'neighborhood_BrDale', 'neighborhood_Blmngtn', 'neighborhood_MeadowV', 'neighborhood_ClearCr',
    'neighborhood_SWISU', 'neighborhood_Timber', 'sale_type_WD ', 'neighborhood_IDOTRR', 'neighborhood_NWAmes',
    'neighborhood_Sawyer', 'neighborhood_Gilbert', 'neighborhood_CollgCr', 'sale_type_ConLw', 'sale_type_ConLI',
    'sale_type_CWD', 'neighborhood_SawyerW', 'neighborhood_Blueste', 'exterior_1st_BrkComm', 'exterior_1st_WdShing',
    'ms_subclass_45', 'ms_subclass_85', 'ms_subclass_190', 'ms_subclass_90', 'ms_subclass_80', 'ms_subclass_50',
    'ms_subclass_60', 'ms_subclass_40', 'fence', 'garage_qual', 'electrical', 'exter_cond', 'lot_shape',
    '3ssn_porch', 'enclosed_porch', 'bsmt_half_bath', 'paved_drive', 'exterior_1st_AsbShng', 'ms_zoning_RL',
    'ms_zoning_RH', 'exterior_1st_Plywood', 'exterior_1st_MetalSd', 'exterior_1st_VinylSd',
    'roof_matl_WdShake', 'roof_matl_Tar&Grv', 'roof_style_Flat', 'ms_zoning_FV', 'roof_style_Gable',
    'utilities_AllPub', 'lot_config_FR2', 'lot_config_Inside', 'land_contour_Lvl', 'alley_Grvl', 'street_Pave',
    'neighborhood_Greens',
]
kaggle_choice.drop(columns=lasso_zero_coef_features, inplace=True)

In [38]:
# Load the production (training) dataset and split it into the target
# ('saleprice') and the feature matrix (every other column).
data_prod = pd.read_csv('datasets/data_prod.csv')
y_prod = data_prod['saleprice']
X_prod = data_prod.drop(columns='saleprice')

In [40]:
#compare between Test columns and Train columns
# NOTE(review): zip() pairs the two column lists by position and stops at the
# shorter one, so this only lines the names up side by side; it does not by
# itself list which columns are missing from, or extra in, the test frame.
list(zip(kaggle_choice.columns,X_prod.columns))

[('lot_area', 'lot_area'),
 ('mas_vnr_area', 'mas_vnr_area'),
 ('total_bsmt_sf', 'total_bsmt_sf'),
 ('gr_liv_area', 'gr_liv_area'),
 ('bsmt_full_bath', 'bsmt_full_bath'),
 ('full_bath', 'full_bath'),
 ('half_bath', 'half_bath'),
 ('bedroom_abvgr', 'bedroom_abvgr'),
 ('kitchen_abvgr', 'kitchen_abvgr'),
 ('garage_area', 'garage_area'),
 ('wood_deck_sf', 'wood_deck_sf'),
 ('open_porch_sf', 'open_porch_sf'),
 ('screen_porch', 'screen_porch'),
 ('misc_val', 'misc_val'),
 ('land_slope', 'land_slope'),
 ('overall_qual', 'overall_qual'),
 ('overall_cond', 'overall_cond'),
 ('exter_qual', 'exter_qual'),
 ('bsmt_qual', 'bsmt_qual'),
 ('bsmt_cond', 'bsmt_cond'),
 ('bsmt_exposure', 'bsmt_exposure'),
 ('heating_qc', 'heating_qc'),
 ('kitchen_qual', 'kitchen_qual'),
 ('functional', 'functional'),
 ('fireplace_qu', 'fireplace_qu'),
 ('garage_finish', 'garage_finish'),
 ('pool_qc', 'pool_qc'),
 ('years_since_built', 'years_since_built'),
 ('years_since_remod', 'years_since_remod'),
 ('ms_zoning_RM', '

In [41]:
# Dummy columns that are missing from the test frame: add them as all-zero columns.
for absent_dummy in ('heating_Wall', 'neighborhood_GrnHill'):
    kaggle_choice[absent_dummy] = 0

# Dummy columns that are extra in the test frame: remove them.
extra_test_dummies = [
    'exterior_1st_PreCast',
    'roof_matl_Metal',
    'roof_matl_Roll',
    'mas_vnr_type_CBlock',
    'heating_Floor',
    'sale_type_VWD',
]
kaggle_choice.drop(columns=extra_test_dummies, inplace=True)

In [48]:
# Display the training feature names.
X_prod.columns

Index(['lot_area', 'mas_vnr_area', 'total_bsmt_sf', 'gr_liv_area',
       'bsmt_full_bath', 'full_bath', 'half_bath', 'bedroom_abvgr',
       'kitchen_abvgr', 'garage_area', 'wood_deck_sf', 'open_porch_sf',
       'screen_porch', 'misc_val', 'land_slope', 'overall_qual',
       'overall_cond', 'exter_qual', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure',
       'heating_qc', 'kitchen_qual', 'functional', 'fireplace_qu',
       'garage_finish', 'pool_qc', 'years_since_built', 'years_since_remod',
       'ms_subclass_20', 'ms_subclass_120', 'ms_subclass_30', 'ms_subclass_70',
       'ms_subclass_160', 'ms_subclass_75', 'ms_subclass_180', 'ms_zoning_RM',
       'ms_zoning_C (all)', 'land_contour_HLS', 'land_contour_Bnk',
       'lot_config_Corner', 'lot_config_CulDSac', 'roof_style_Hip',
       'roof_style_Gambrel', 'roof_style_Mansard', 'roof_matl_CompShg',
       'roof_matl_WdShngl', 'exterior_1st_HdBoard', 'exterior_1st_Wd Sdng',
       'exterior_1st_CemntBd', 'exterior_1st_BrkFace', 'exter

In [50]:
# Fit the final model on the whole training data (train + holdout) and build
# the matching test feature matrix.
X_prod = data_prod.drop(columns='saleprice')
y_prod = data_prod['saleprice']

# Select the test features using the training column index itself, so the two
# matrices always hold the same features in the same order (the scaler and the
# model align columns by position). This replaces a hand-copied list of names.
missing_features = X_prod.columns.difference(kaggle_choice.columns)
assert missing_features.empty, f"Test set is missing features: {list(missing_features)}"
X_test = kaggle_choice.loc[:, X_prod.columns]

#Standardize: fit the scaler on the training features only, then apply the
# same transformation to the test features.
ss_prod = StandardScaler()
Z_prod = ss_prod.fit_transform(X_prod)
Z_test = ss_prod.transform(X_test)

# NOTE(review): hard-coded regularisation strength; presumably the value chosen
# during model selection in the modelling notebook - confirm.
alpha_ridge = 335.1602650938841
ridge_prod = Ridge(alpha=alpha_ridge)

ridge_prod.fit(Z_prod, y_prod)

Ridge(alpha=335.1602650938841)

In [51]:
# Preview the predictions for the standardised test features.
ridge_prod.predict(Z_test)

array([130408.49185086, 162040.36719309, 227504.36962095, 108961.62970651,
       190428.16288363,  84966.96910722,  98874.90148221, 162930.55016227,
       180968.66063375, 158355.53415876, 145959.16814963, 118161.21021804,
       148252.25718914, 278621.19782042, 142571.81211217, 126770.43617396,
       176294.29033445, 122583.19062022, 185787.18296308, 191199.67311864,
       168090.01666065, 134863.68567411, 182111.73260969, 164532.00003185,
       187249.35749305, 123706.48238861, 136363.25949781, 128312.90648318,
       169424.7736463 ,  46512.08314275,  99782.47271914,  99640.77046442,
       188376.51415056, 150341.74187185, 229368.25510723, 184394.32837064,
       106132.73752293,  73382.06077265, 137051.77072007, 206373.3449153 ,
       182473.39536626, 205578.86947514, 149483.63744359, 164773.74155913,
       210281.27679862,  93472.51866605, 227517.43966071, 123158.89554654,
       136355.8180514 , 119086.47706969,  91689.31650883, 202953.94851669,
       245970.97103886, 1

In [53]:
# Build the submission table: one predicted 'SalePrice' per test row, indexed
# by the 'id' column of the test set.
sale_price_pred = ridge_prod.predict(Z_test)
submit = pd.DataFrame({'SalePrice': sale_price_pred}, index=kaggle['id'])

submit

Unnamed: 0_level_0,SalePrice
id,Unnamed: 1_level_1
2658,130408.491851
2718,162040.367193
2414,227504.369621
1989,108961.629707
625,190428.162884
...,...
1662,175185.226420
1234,220255.002845
1373,129771.588270
1672,100186.058659


In [54]:
# Write the submission next to the input data. The previous path
# ('../project_2/datasets/submit.csv') only resolved when the working directory
# was itself named 'project_2'; this one uses the same relative 'datasets/'
# folder the CSV files are read from.
submit.to_csv('datasets/submit.csv')

Kaggle Score

Score: 24052.28611
Public score: 33203.03021