In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
train.dropna(axis=1).columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageCars', 'GarageArea', 'PavedDrive',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition', 'SalePrice'],
      dtype='object')

In [5]:
#pd.concat([train,test],axis=1).dropna(axis=1).columns.sort_values(ascending = True)
train['GarageCars'].value_counts()

2    824
1    369
3    181
0     81
4      5
Name: GarageCars, dtype: int64

In [6]:
lb_Neighborhood = LabelEncoder()
train['NeighborhoodCode'] = lb_Neighborhood.fit_transform(train['Neighborhood'])
test['NeighborhoodCode'] = lb_Neighborhood.fit_transform(test['Neighborhood'])



In [7]:
predictor_columns = ['LotArea', 'YearBuilt', 'GrLivArea', 'FullBath', 
                     'TotRmsAbvGrd','NeighborhoodCode','OverallQual']

In [8]:
train_X = train[predictor_columns]
test_X = test[predictor_columns]
train_Y = train.SalePrice

In [9]:
np.nan_to_num(train_X)
np.nan_to_num(test_X)

array([[11622,  1961,   896, ...,     5,    12,     5],
       [14267,  1958,  1329, ...,     6,    12,     6],
       [13830,  1997,  1629, ...,     6,     8,     5],
       ...,
       [20000,  1960,  1224, ...,     7,    11,     5],
       [10441,  1992,   970, ...,     6,    11,     5],
       [ 9627,  1993,  2000, ...,     9,    11,     7]])

In [10]:
forest_model = RandomForestRegressor()
forest_model.fit(train_X, train_Y)
forest_sales_price_prediction = forest_model.predict(test_X)

In [14]:
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': forest_sales_price_prediction})

In [15]:
my_submission.to_csv('my_submission.csv', index=False)

In [13]:
# Score: RMSE = 0.16344
#Submissions are evaluated on Root-Mean-Squared-Error (RMSE) 
#between the logarithm of the predicted value and the logarithm 
#of the observed sales price. 
#(Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)