In [1]:
import pandas as pd

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [2]:
#print summary of data
print(train_data.describe())
# count shows how many rows have non-missing values.

                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726   
std       1.112799    30.202904     20.645407   181.066207   456.098091   
min       1.000

In [3]:
print(train_data.columns)

#this is my target variable
y = train_data.SalePrice
# the head command returns the top few lines of data.
print(y.head())

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
house_overall_cond = train_data.OverallCond
#print(house_overall_cond.head())

selected_columns = ['LotArea', 'OverallCond']
data_house_twocols = train_data[selected_columns]

In [5]:
predictors = ['LotArea', 'YearBuilt', 'FullBath', 'BedroomAbvGr']
X = train_data[predictors]

# Decision Tree

In [6]:
from sklearn.tree import DecisionTreeRegressor

# Define model
decision_model = DecisionTreeRegressor()

# Fit model
decision_model.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [7]:
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(decision_model.predict(X.head()))

Making predictions for the following 5 houses:
   LotArea  YearBuilt  FullBath  BedroomAbvGr
0     8450       2003         2             3
1     9600       1976         2             3
2    11250       2001         2             3
3     9550       1915         1             3
4    14260       2000         2             4
The predictions are
[ 196750.  184250.  223500.  140000.  250000.]


<hr>

# Validation of Model


### testing on training data (without splitting)

In [8]:
from sklearn.metrics import mean_absolute_error

predicted_values = decision_model.predict(X)
mean_absolute_error(y, predicted_values)

495.48059360730593

### testing on training data (with splitting) 

In [9]:
from sklearn.model_selection import train_test_split

# split data into training and validation data, for both predictors and target
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(X,
                                                  y,random_state = 0)

# Fit model
decision_model.fit(train_X, train_y)

# get predicted prices on validation data
val_predictions = decision_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

43805.3068493


#### Just check the differences between the errors!!!! 

In [10]:
def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(predictors_train, targ_train)
    preds_val = model.predict(predictors_val)
    mae = mean_absolute_error(targ_val, preds_val)
    return(mae)

In [11]:
# compare MAE with differing values of max_leaf_nodes
for max_leaf_nodes in [5, 50, 75, 100, 200, 500, 2000, 5000, 10000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  37026
Max leaf nodes: 50  		 Mean Absolute Error:  35513
Max leaf nodes: 75  		 Mean Absolute Error:  36068
Max leaf nodes: 100  		 Mean Absolute Error:  36598
Max leaf nodes: 200  		 Mean Absolute Error:  39920
Max leaf nodes: 500  		 Mean Absolute Error:  43631
Max leaf nodes: 2000  		 Mean Absolute Error:  44027
Max leaf nodes: 5000  		 Mean Absolute Error:  44027
Max leaf nodes: 10000  		 Mean Absolute Error:  44027


<hr>
# Random Forest

The random forest uses <b>many trees</b>, and it makes a prediction by <b>average</b> the predictions of each component tree. It generally has much better predictive accuracy than a single decision tree and it works well with default parameters.

In [12]:
from sklearn.ensemble import RandomForestRegressor

forest_model = RandomForestRegressor()
forest_model.fit(train_X, train_y)
house_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, house_preds))

35595.9185466


# Handling Missing Values

In [13]:
print(train_data.isnull().sum())

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive

In [14]:
#remove y i.e. target from data
train_predictors = train_data.drop(['SalePrice'], axis=1)
train_numeric_predictors = train_predictors.select_dtypes(exclude=['object'])

X_train, X_test, y_train, y_test = train_test_split(train_numeric_predictors, 
                                                    y,
                                                    train_size=0.7, 
                                                    test_size=0.3, 
                                                    random_state=0)

def score_dataset(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)

### 1) Drop Columns with Missing Values

In [15]:
cols_with_missing = [col for col in X_train.columns 
                                 if X_train[col].isnull().any()]
red_train_data = X_train.drop(cols_with_missing, axis=1)
red_test_data = X_test.drop(cols_with_missing, axis=1)

print(score_dataset(red_train_data, red_test_data, y_train, y_test))

19334.3892694


### 2) Imputation

In [16]:
from sklearn.preprocessing import Imputer
my_imputer = Imputer()

imputed_X_train = my_imputer.fit_transform(X_train)
imputed_X_test = my_imputer.transform(X_test)
print(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test))

19727.0557078


### 3) Extension to Imputation

In [17]:
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()

cols_with_missing = (col for col in X_train.columns 
                                 if X_train[col].isnull().any())
for col in cols_with_missing:
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
    imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()

# Imputation
my_imputer = Imputer()
imputed_X_train_plus = my_imputer.fit_transform(imputed_X_train_plus)
imputed_X_test_plus = my_imputer.transform(imputed_X_test_plus)

print(score_dataset(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test))

19052.9015982
