# Ames Housing Data and Kaggle Challenge
### Part 4: Kaggle Submission



In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import matplotlib.pyplot as plt
from sklearn import metrics
import pickle

## First Submission: Model 4

Import and clean the Kaggle test data

In [2]:
kaggle = pd.read_csv('../datasets/test.csv')

In [3]:
kaggle.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [4]:
kaggle.shape

(878, 80)

In [5]:
kaggle.isnull().sum().sort_values(ascending=False).head(15)

Pool QC           874
Misc Feature      837
Alley             820
Fence             706
Fireplace Qu      422
Lot Frontage      160
Garage Cond        45
Garage Qual        45
Garage Yr Blt      45
Garage Finish      45
Garage Type        44
Bsmt Exposure      25
BsmtFin Type 1     25
Bsmt Qual          25
BsmtFin Type 2     25
dtype: int64

Rename the columns to be lowercase and underscores:

In [6]:
def rename_column(col):
    col = col.lower().replace(' ', '_')
    return col

In [7]:
kaggle.columns = kaggle.columns.map(rename_column)

In [8]:
kaggle.head(2)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD


In [9]:
to_drop = ['pool_qc', 'misc_feature', 'alley', 'fence', 'fireplace_qu', 'lot_frontage', 'garage_yr_blt', 'garage_finish', 'garage_qual', 'garage_cond', 'garage_type']

In [10]:
kaggle.drop(columns= to_drop, inplace=True)

In [11]:
kaggle.shape

(878, 69)

Saving the cleaned test set:

In [12]:
kaggle.to_csv('../datasets/kaggle_test_cleaned.csv', index = False)

Transform my data to include the features needed

Combining all the bathroom columns:

In [13]:
kaggle['total_bath'] = kaggle['bsmt_full_bath'] + (kaggle['bsmt_half_bath'] *.5) + kaggle['full_bath'] + (kaggle['half_bath'] *.5)


Copying over my dictionary of average price by city:

In [14]:
neighb_dict = {'Blmngtn': 2.0, 'Blueste': 1.45, 'BrDale': 1.03, 'BrkSide': 1.29, 'ClearCr': 2.21, 'CollgCr': 2.02, 'Crawfor': 2.06, 'Edwards': 1.32, 'Gilbert': 1.89, 'Greens': 1.89, 'GrnHill': 3.3, 'IDOTRR': 1.04, 'Landmrk': 1.37, 'MeadowV': 1.0, 'Mitchel': 1.71, 'NAmes': 1.48, 'NPkVill': 1.4, 'NWAmes': 1.95, 'NoRidge': 3.16, 'NridgHt': 3.22, 'OldTown': 1.26, 'SWISU': 1.35, 'Sawyer': 1.39, 'SawyerW': 1.92, 'Somerst': 2.27, 'StoneBr': 3.3, 'Timber': 2.4, 'Veenker': 2.54}

In [15]:
kaggle['neighborhood_avg'] = kaggle['neighborhood'].apply(lambda x: neighb_dict[x])

Creating a column that looks at the ratio of bed to bath:

In [16]:
kaggle['Bed_bath_ratio'] = kaggle['bedroom_abvgr'] / kaggle['total_bath']

Getting dummies:

In [17]:
house_style = pd.get_dummies(kaggle['house_style'], drop_first=True, prefix='style')
ext_qual = pd.get_dummies(kaggle['exter_qual'], drop_first=True, prefix='ext')
base_qual = pd.get_dummies(kaggle['bsmt_qual'], drop_first=True, prefix='base')
kitch_qual = pd.get_dummies(kaggle['kitchen_qual'], drop_first=True, prefix='kitch')
kitch_qual.drop(columns=['kitch_Po'], inplace=True)
ms_zone = pd.get_dummies(kaggle['ms_zoning'], drop_first=True, prefix='zone')
ms_zone.drop(columns=['zone_I (all)'], inplace=True)

In [18]:
ms_zone

Unnamed: 0,zone_FV,zone_RH,zone_RL,zone_RM
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,0,0,1
4,0,0,1,0
...,...,...,...,...
873,0,0,1,0
874,0,0,1,0
875,0,0,1,0
876,0,0,1,0


In [19]:
kaggle = pd.concat([kaggle, house_style, ext_qual, base_qual, kitch_qual, ms_zone], axis=1)

In [20]:
kaggle

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_area,street,lot_shape,land_contour,utilities,lot_config,...,base_Gd,base_Po,base_TA,kitch_Fa,kitch_Gd,kitch_TA,zone_FV,zone_RH,zone_RL,zone_RM
0,2658,902301120,190,RM,9142,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,1,0,0,0,0,0,1
1,2718,905108090,90,RL,9662,Pave,IR1,Lvl,AllPub,Inside,...,1,0,0,0,0,1,0,0,1,0
2,2414,528218130,60,RL,17104,Pave,IR1,Lvl,AllPub,Inside,...,1,0,0,0,1,0,0,0,1,0
3,1989,902207150,30,RM,8520,Pave,Reg,Lvl,AllPub,Inside,...,0,0,1,0,0,1,0,0,0,1
4,625,535105100,20,RL,9500,Pave,IR1,Lvl,AllPub,Inside,...,1,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,1662,527377110,60,RL,8000,Pave,Reg,Lvl,AllPub,Inside,...,0,0,1,0,0,1,0,0,1,0
874,1234,535126140,60,RL,14670,Pave,Reg,Lvl,AllPub,Inside,...,0,0,1,0,1,0,0,0,1,0
875,1373,904100040,20,RL,8250,Pave,Reg,Lvl,AllPub,Inside,...,0,0,1,0,0,1,0,0,1,0
876,1672,527425140,20,RL,9000,Pave,Reg,Lvl,AllPub,FR2,...,0,0,1,0,0,1,0,0,1,0


In [21]:
kaggle.isnull().sum().sort_values(ascending=False).head(10)

bsmtfin_type_2    25
bsmt_qual         25
bsmt_cond         25
bsmt_exposure     25
bsmtfin_type_1    25
mas_vnr_type       1
mas_vnr_area       1
electrical         1
bsmtfin_sf_1       0
exter_qual         0
dtype: int64

Copying over the features:

In [22]:
features_1 = ['overall_qual', 'gr_liv_area', 'garage_cars', 'garage_area', '1st_flr_sf', 'total_bsmt_sf', 'year_built', 'total_bath', 'year_remod/add', 'totrms_abvgrd', 'mas_vnr_area']
features_2 = features_1 + ['neighborhood_avg', 'Bed_bath_ratio']
features_3 = features_2 + list(house_style.columns) + list(ext_qual.columns) + list(base_qual.columns) + list(kitch_qual.columns) + list(ms_zone.columns) 

In [23]:
kaggle['overall_qual'] = kaggle['overall_qual'].map(np.log)

features_4 = features_2 + list(house_style.columns) + list(ext_qual.columns) + list(base_qual.columns) + list(kitch_qual.columns) + list(ms_zone.columns)

In [24]:
kaggle[features_4].isnull().sum()

overall_qual        0
gr_liv_area         0
garage_cars         0
garage_area         0
1st_flr_sf          0
total_bsmt_sf       0
year_built          0
total_bath          0
year_remod/add      0
totrms_abvgrd       0
mas_vnr_area        1
neighborhood_avg    0
Bed_bath_ratio      0
style_1.5Unf        0
style_1Story        0
style_2.5Fin        0
style_2.5Unf        0
style_2Story        0
style_SFoyer        0
style_SLvl          0
ext_Fa              0
ext_Gd              0
ext_TA              0
base_Fa             0
base_Gd             0
base_Po             0
base_TA             0
kitch_Fa            0
kitch_Gd            0
kitch_TA            0
zone_FV             0
zone_RH             0
zone_RL             0
zone_RM             0
dtype: int64

Looks like there's some missing values for the features I use so I'll have to impute those values:

In [25]:
kaggle[kaggle['mas_vnr_area'].isnull()]['neighborhood']

865    CollgCr
Name: neighborhood, dtype: object

In [26]:
kaggle.groupby(kaggle['neighborhood'])['mas_vnr_area'].mean()

neighborhood
Blmngtn     40.833333
Blueste      0.000000
BrDale     321.272727
BrkSide     22.281250
ClearCr     92.941176
CollgCr    115.011628
Crawfor    152.656250
Edwards     32.380000
Gilbert     40.734694
Greens       0.000000
IDOTRR       0.000000
MeadowV      0.000000
Mitchel     51.500000
NAmes       89.300752
NPkVill      0.000000
NWAmes     148.454545
NoRidge    533.913043
NridgHt    324.886364
OldTown     26.552632
SWISU        5.500000
Sawyer      46.500000
SawyerW     53.052632
Somerst    149.500000
StoneBr    174.461538
Timber     200.000000
Veenker    178.571429
Name: mas_vnr_area, dtype: float64

In [27]:
kaggle.loc[865, 'mas_vnr_area'] =  115

In [28]:
kaggle.loc[865, 'mas_vnr_area']

115.0

In [29]:
X = kaggle[features_4]

In [30]:
X

Unnamed: 0,overall_qual,gr_liv_area,garage_cars,garage_area,1st_flr_sf,total_bsmt_sf,year_built,total_bath,year_remod/add,totrms_abvgrd,...,base_Gd,base_Po,base_TA,kitch_Fa,kitch_Gd,kitch_TA,zone_FV,zone_RH,zone_RL,zone_RM
0,1.791759,1928,1,440,908,1020,1910,2.0,1950,9,...,0,0,0,1,0,0,0,0,0,1
1,1.609438,1967,2,580,1967,1967,1977,2.0,1977,10,...,1,0,0,0,0,1,0,0,1,0
2,1.945910,1496,2,426,664,654,2006,3.5,2006,7,...,1,0,0,0,1,0,0,0,1,0
3,1.609438,968,2,480,968,968,1923,1.0,2006,5,...,0,0,1,0,0,1,0,0,0,1
4,1.791759,1394,2,514,1394,1394,1963,2.5,1963,6,...,1,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,1.791759,1877,2,488,1084,1084,1974,3.5,1974,8,...,0,0,1,0,0,1,0,0,1,0
874,1.791759,1988,2,480,1104,1104,1966,2.5,1999,9,...,0,0,1,0,1,0,0,0,1,0
875,1.609438,1211,1,322,1211,952,1968,1.0,1968,5,...,0,0,1,0,0,1,0,0,1,0
876,1.386294,864,2,528,864,864,1971,1.0,1971,5,...,0,0,1,0,0,1,0,0,1,0


Import my model

In [31]:
model_4 = pickle.load(open('../model/linear_model_4.pkl', 'rb'))

Make predictions

In [32]:
logged_prediction = model_4.predict(X)

In [33]:
final_predictions = np.exp(logged_prediction)

Add predictions to dataframe:

In [34]:
kaggle['SalePrice'] = final_predictions

In [35]:
kaggle.tail()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_area,street,lot_shape,land_contour,utilities,lot_config,...,base_Po,base_TA,kitch_Fa,kitch_Gd,kitch_TA,zone_FV,zone_RH,zone_RL,zone_RM,SalePrice
873,1662,527377110,60,RL,8000,Pave,Reg,Lvl,AllPub,Inside,...,0,1,0,0,1,0,0,1,0,186854.743609
874,1234,535126140,60,RL,14670,Pave,Reg,Lvl,AllPub,Inside,...,0,1,0,1,0,0,0,1,0,184148.427483
875,1373,904100040,20,RL,8250,Pave,Reg,Lvl,AllPub,Inside,...,0,1,0,0,1,0,0,1,0,125828.274034
876,1672,527425140,20,RL,9000,Pave,Reg,Lvl,AllPub,FR2,...,0,1,0,0,1,0,0,1,0,112615.740774
877,1939,535327160,20,RL,8400,Pave,Reg,Lvl,AllPub,Corner,...,0,1,0,0,1,0,0,1,0,120934.216372


In [36]:
kaggle['SalePrice'].isnull().sum()

0

Save predictions as csv

In [37]:
kaggle.rename(columns={'id':'Id'}, inplace=True)

In [38]:
submission1 = pd.DataFrame(kaggle[['Id','SalePrice']])
submission1 = submission1.sort_values('Id')

In [39]:
submission1.shape

(878, 2)

In [40]:
submission1.to_csv('../datasets/submission1_caress.csv', index = False)

## Second Submission: Model 5

First, I'll need to add the features I engineered for this model:
- bed_bath
- gr_liv_area_log
- built/remodel
- only_full_bath
- lot_area_log

In [41]:
kaggle['bed_bath'] = kaggle['bedroom_abvgr'] * kaggle['total_bath']

In [42]:
kaggle['gr_liv_area_log'] = np.log(kaggle['gr_liv_area'])

In [43]:
kaggle['built/remodel'] = kaggle['year_built'] * kaggle['year_remod/add']**2

In [44]:
kaggle['only_full_bath'] = kaggle['bsmt_full_bath'] + kaggle['full_bath']


In [45]:
kaggle['lot_area_log'] = (np.log((kaggle['lot_area'])))*kaggle['gr_liv_area']

In [46]:
features_5 = ['overall_qual',
 'garage_area',
 'mas_vnr_area',
 'neighborhood_avg',
 'ext_Fa',
 'ext_Gd',
 'ext_TA',
 'base_Fa',
 'base_Gd',
 'base_TA',
 'kitch_Fa',
 'kitch_Gd',
 'kitch_TA',
 'bed_bath', 
 'gr_liv_area_log',
'built/remodel', 
'only_full_bath',
  'lot_area_log',
'year_remod/add']

Import Model 5:

In [47]:
model_5 = pickle.load(open('../model/linear_model_5.pkl', 'rb'))

Make predictions:

In [48]:
X_5 = kaggle[features_5]

In [49]:
logged_prediction_5 = model_5.predict(X_5)

In [50]:
final_predictions_5 = np.exp(logged_prediction_5)

Add predictions to dataframe:

In [51]:
kaggle['SalePrice'] = final_predictions_5

In [52]:
kaggle.head()

Unnamed: 0,Id,pid,ms_subclass,ms_zoning,lot_area,street,lot_shape,land_contour,utilities,lot_config,...,zone_FV,zone_RH,zone_RL,zone_RM,SalePrice,bed_bath,gr_liv_area_log,built/remodel,only_full_bath,lot_area_log
0,2658,902301120,190,RM,9142,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,1,137078.407379,8.0,7.564238,7262775000,2,17584.583237
1,2718,905108090,90,RL,9662,Pave,IR1,Lvl,AllPub,Inside,...,0,0,1,0,159505.803972,12.0,7.584265,7727161833,2,18049.105344
2,2414,528218130,60,RL,17104,Pave,IR1,Lvl,AllPub,Inside,...,0,0,1,0,202714.765609,10.5,7.31055,8072216216,3,14581.613179
3,1989,902207150,30,RM,8520,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,1,113778.956871,2.0,6.875232,7738221228,1,8760.566128
4,625,535105100,20,RL,9500,Pave,IR1,Lvl,AllPub,Inside,...,0,0,1,0,145100.872791,7.5,7.239933,7564163347,2,12767.711626


In [53]:
kaggle.rename(columns={'id':'Id'}, inplace=True)

Save for Kaggle submission:

In [54]:
submission2 = pd.DataFrame(kaggle[['Id','SalePrice']])
submission2 = submission2.sort_values('Id')

In [55]:
submission2.to_csv('../datasets/submission2_caress.csv', index = False)

## Third Submission

First, I'll need to add the features I engineered/dummied for this model:
- condition2_avg
- exterior_avg
- paved driveway
- central air
- heat
- heat_qc
- porch/deck

In [56]:
cond_dict = {'Artery': 1.19, 'Feedr': 1.29, 'Norm': 1.83, 'PosA': 3.91, 'PosN': 3.96, 'RRAe': 1.9, 'RRAn': 1.37, 'RRNn': 0.97}

In [57]:
kaggle['condition2_avg'] = kaggle['condition_2'].apply(lambda x: cond_dict[x])

In [58]:
kaggle['exterior_1st'].value_counts()

VinylSd    302
Wd Sdng    144
HdBoard    142
MetalSd    119
Plywood     69
CemntBd     35
BrkFace     24
Stucco      16
AsbShng     11
WdShing     11
BrkComm      3
AsphShn      1
PreCast      1
Name: exterior_1st, dtype: int64

In [59]:
ext_dict = {'PreCast': 1, 'AsbShng': 1.08, 'AsphShn': 0.82, 'BrkComm': 1.31, 'BrkFace': 2.0, 'CBlock': 1.05, 'CemntBd': 2.44, 'HdBoard': 1.61, 'ImStucc': 2.62, 'MetalSd': 1.56, 'Plywood': 1.71, 'Stone': 2.58, 'Stucco': 1.43, 'VinylSd': 2.19, 'Wd Sdng': 1.44, 'WdShing': 1.56}

In [60]:
kaggle['exterior_avg'] = kaggle['exterior_1st'].apply(lambda x: ext_dict[x])

In [61]:
kaggle['porch/deck'] = (np.log(1 + kaggle['wood_deck_sf'] + kaggle['open_porch_sf'] + kaggle['enclosed_porch'] + kaggle['3ssn_porch'] + kaggle['screen_porch']))**2







In [62]:
cent_air = pd.get_dummies(kaggle['central_air'], drop_first=True, prefix='CA')

In [63]:
heat_dummy = pd.get_dummies(kaggle['heating'], drop_first=True, prefix='heat')

In [64]:
heat_qc = pd.get_dummies(kaggle['heating_qc'], drop_first=True, prefix='heatqc')

In [65]:
drive = pd.get_dummies(kaggle['paved_drive'], drop_first=True, prefix='drive')

In [66]:
kaggle = pd.concat([kaggle, cent_air, heat_dummy, heat_qc, drive], axis=1)

In [67]:
features_6 = ['overall_qual',
 'garage_area',
 'mas_vnr_area',
 'neighborhood_avg',
 'ext_Fa',
 'ext_Gd',
 'ext_TA',
 'base_Fa',
 'base_Gd',
 'base_TA',
 'kitch_Fa',
 'kitch_Gd',
 'kitch_TA',
 'bed_bath',
 'gr_liv_area_log',
 'built/remodel',
 'only_full_bath',
 'lot_area_log',
 'year_remod/add',
 'exterior_avg',
'porch/deck',
 'drive_Y',
 'CA_Y',
 'heat_Grav',
 'heatqc_TA']

In [68]:
model_6 = pickle.load(open('../model/model_6.pkl', 'rb'))

In [69]:
X_6 = kaggle[features_6]

In [70]:
logged_prediction_6 = model_6.predict(X_6)

In [71]:
logged_prediction_6[0]

11.811583378902778

In [72]:
final_predictions_6 = np.exp(logged_prediction_6)

In [73]:
kaggle['SalePrice'] = final_predictions_6

In [74]:
kaggle['SalePrice'].head()

0    134804.839601
1    159739.721289
2    202476.486191
3    109005.098156
4    151824.879351
Name: SalePrice, dtype: float64

In [75]:
kaggle.rename(columns={'id':'Id'}, inplace=True)

In [76]:
submission3 = pd.DataFrame(kaggle[['Id','SalePrice']])
submission3 = submission3.sort_values('Id')

In [77]:
submission3.to_csv('../datasets/submission5_caress.csv', index = False)

## Fourth Submission 

First, I'll need to engineer a few more columns to get all of the features below:

In [78]:
features_7 = ['overall_qual',
 'garage_area',
 'mas_vnr_area',
 'neighborhood_avg',
 'bed_bath',
 'gr_liv_area',
 'built/remodel',
 'lot_area_log',
'porch/deck',
 'kitchen_score',
'basement_score',
'ext_score',
'bsmtfin_sf_1',
'total_bsmt_sf',
'overall_cond',      
'lot_area',    
'contour_score', 
'roof_score', 
'found_score',         
'cond2_score',
'fire',
'sale_score' ,
             ]

In [79]:
kitchens = list(kaggle['kitchen_qual'].unique())
kitchens = sorted(kitchens)

In [80]:
print(kitchens)

['Ex', 'Fa', 'Gd', 'Po', 'TA']


In [81]:
kitchen_list = [5, 2, 4,1, 3]

In [82]:
kitchen_dict = {kitchen : num 
               for(kitchen, num) in zip(kitchens, kitchen_list)}
print(kitchen_dict)

{'Ex': 5, 'Fa': 2, 'Gd': 4, 'Po': 1, 'TA': 3}


In [83]:
kaggle['kitchen_score'] = kaggle['kitchen_qual'].apply(lambda x: kitchen_dict[x])

In [84]:
basements = list(kaggle['bsmt_qual'].unique())

In [85]:
print(basements)

['Fa', 'Gd', 'TA', 'Ex', nan, 'Po']


In [86]:
bsmt_list = [2, 4, 3, 5, 0, 1]

In [87]:
bsmt_dict = {basement : num 
               for(basement, num) in zip(basements, bsmt_list)}
print(bsmt_dict)

{'Fa': 2, 'Gd': 4, 'TA': 3, 'Ex': 5, nan: 0, 'Po': 1}


In [88]:
kaggle['basement_score'] = kaggle['bsmt_qual'].apply(lambda x: bsmt_dict[x])

In [89]:
exts = list(kaggle['exter_qual'].unique())
exts = sorted(exts)

In [90]:
print(exts)

['Ex', 'Fa', 'Gd', 'TA']


In [91]:
ext_list = [5, 2, 4, 3]

In [92]:
ext_dict = {ext : num 
               for(ext, num) in zip(exts, ext_list)}
print(ext_dict)

{'Ex': 5, 'Fa': 2, 'Gd': 4, 'TA': 3}


In [93]:
kaggle['ext_score'] = kaggle['exter_qual'].apply(lambda x: ext_dict[x])

In [94]:
lot_dict = {'FR2':1.7, 'Inside': 1.7, 'Corner':1.8, 'FR3': 2.3, 'CulDSac':2.3 }
kaggle['lot_config_score'] = kaggle['lot_config'].apply(lambda x: lot_dict[x])

uti_dict = {'NoSewr':0, 'AllPub':1 }
kaggle['utilities_score'] = kaggle['utilities'].apply(lambda x: uti_dict[x])

land_dict = {'Bnk': 1.4, 'Lvl': 1.8, 'Low':2, 'HLS': 2.6 }
kaggle['contour_score'] = kaggle['land_contour'].apply(lambda x: land_dict[x])

roof_dict = {'Gambrel': 1.3, 'Mansard': 1.5, 'Gable': 1.7, 'Flat': 1.9, 'Shed': 2, 'Hip': 2.3}
kaggle['roof_score'] = kaggle['roof_style'].apply(lambda x: roof_dict[x])

found_dict = {'BrkTil': 1.3, 'Stone': 1.4, 'CBlock': 1.4, 'Wood': 1.7, 'PConc': 2.3, 'Slab':1}
kaggle['found_score'] = kaggle['foundation'].apply(lambda x: found_dict[x])

elc_dict = {'Mix': .5, 'FuseP': .5, 'FuseF': 1, 'FuseA': 1, 'SBrkr': 2, np.nan:0}
kaggle['elec_score'] = kaggle['electrical'].apply(lambda x: elc_dict[x])

street_dict = {'Grvl': 0, 'Pave': 1}
kaggle['street_score'] = kaggle['street'].apply(lambda x: street_dict[x])

cond2_dict = {'RRNn': .9, 'Artery': 1.2, 'Feedr': 1.3, 'RRAn': 1.3, 'Norm': 1.8, 'RRAe': 1.9, 'PosA': 3.9, 'PosN': 3.9}
kaggle['cond2_score'] = kaggle['condition_2'].apply(lambda x: cond2_dict[x])

mas_dict = {'BrkCmn': 1.5, 'None':1.5, 'BrkFace':2, 'Stone': 2.6, 'CBlock':1, np.nan:0}
kaggle['mas_score'] = kaggle['mas_vnr_type'].apply(lambda x: mas_dict[x])

fire_dict = {0:0, 1:1, 2:1, 3:1, 4:1}
kaggle['fire'] = kaggle['fireplaces'].apply(lambda x: fire_dict[x])

saletype_dict = {'ConLw': 1.2, 'Oth': 1.2, 'COD': 1.3, 'ConLD': 1.4, 
                 'WD ': 1.7, 'CWD': 1.8, 'ConLI': 2, 'Con': 2.4, 'New': 2.8, 'VWD':1.7}
kaggle['sale_score'] = kaggle['sale_type'].apply(lambda x: saletype_dict[x])

bldg_dict = {'2fmCon':1.2, 'Twnhs':1.3, 'Duplex': 1.5, '1Fam': 1.8, 'TwnhsE': 1.9}
kaggle['bldg_score'] = kaggle['bldg_type'].apply(lambda x: bldg_dict[x])



Load my model:

In [95]:
model_7 = pickle.load(open('../model/linear_model_7.pkl', 'rb'))

In [96]:
X_7 = kaggle[features_7]

Make predictions:

In [97]:
logged_prediction_7 = model_7.predict(X_7)

In [98]:
final_predictions_7 = np.exp(logged_prediction_7)

Add predictions to dataframe:

In [99]:
kaggle['SalePrice'] = final_predictions_7

In [100]:
kaggle['SalePrice'].head()

0    143750.524481
1    161719.442248
2    210380.216557
3    119973.004940
4    159832.233454
Name: SalePrice, dtype: float64

In [101]:
kaggle.rename(columns={'id':'Id'}, inplace=True)

Save as Kaggle submission:

In [102]:
submission4 = pd.DataFrame(kaggle[['Id','SalePrice']])
submission4 = submission4.sort_values('Id')

In [103]:
submission4.to_csv('../datasets/submission8_caress.csv', index = False)