In [1]:
import pandas as pd

# Read the training set from the working directory; later cells reuse `data`.
data = pd.read_csv(filepath_or_buffer='train.csv')
# Show the first five rows as the cell's output.
data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
# List every column name available in the training frame.
print(data.columns)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [9]:
#import the necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

# Impute missing LotFrontage values with the column median, writing the result
# to a separate column so the raw values stay untouched. The median is taken
# over every row of `data`, i.e. before the train/validation split below.
median_frontage = data['LotFrontage'].median()
data['LotFrontageFilled'] = data['LotFrontage'].fillna(median_frontage)

# Predictor columns fed to the model, and the target to be predicted.
features = ['YearBuilt', 'LotFrontageFilled', 'LotArea', 'TotRmsAbvGrd']
X = data.loc[:, features]
y = data['SalePrice']

# Hold out part of the rows for validation (fixed seed for a repeatable split).
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Fit a random forest on the training split; fit() returns the fitted estimator.
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Predict the held-out rows and report the mean absolute error.
forest_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, forest_preds))

32917.49916112198


In [10]:

# Read the competition test set; later cells reuse `test`.
test = pd.read_csv(filepath_or_buffer='test.csv')

# Summary statistics for LotFrontage (count excludes missing values).
test['LotFrontage'].describe()

count    1232.000000
mean       68.580357
std        22.376841
min        21.000000
25%        58.000000
50%        67.000000
75%        80.000000
max       200.000000
Name: LotFrontage, dtype: float64

In [11]:
# Replace missing LotFrontage values in the test set with that column's median.
# NOTE(review): this is the test-set median, whereas the training rows were
# filled with the training-set median -- confirm that the difference is intended.
median_frontage = test['LotFrontage'].median()
print(median_frontage)
test['LotFrontage'] = test['LotFrontage'].fillna(median_frontage)

67.0


In [12]:
# Build the test-set feature matrix, refit a forest on all training rows,
# predict the test set and write the result as a submission file.
features = ['YearBuilt', 'LotFrontage', 'LotArea','TotRmsAbvGrd']
X3 = test[features]

# The training matrix `X` names the imputed frontage column 'LotFrontageFilled',
# while the test frame keeps it as 'LotFrontage'. scikit-learn remembers the
# column names seen during fit() and checks them in predict(); a mismatch
# triggers a warning or an error depending on the installed version. Rename the
# training column and select the columns in the same order as the test matrix.
X_full = X.rename(columns={'LotFrontageFilled': 'LotFrontage'})[features]
assert list(X_full.columns) == list(X3.columns), "train/test feature columns differ"

# Instantiate a new random forest (fixed seed for repeatable results).
model = RandomForestRegressor(random_state=1)

# Fit on the full training set, then predict the test rows.
model.fit(X_full, y)
predictions = model.predict(X3)

# Pair each test Id with its predicted price and write the frame to CSV
# without the index column.
predictions_file = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': predictions
    })

predictions_file.to_csv('submission_rf.csv', index=False)