# House price prediction

In [1]:
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings and anything else the libraries below would report.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, mean_squared_error
# NOTE(review): LinearDiscriminantAnalysis and preprocessing are not referenced
# in the visible cells — confirm whether they are still needed.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import preprocessing

# Project-local helper module; this notebook calls utils.clean and utils.model.
import utils

# Seed passed as random_state to train_test_split so the hold-out split is repeatable.
RANDOMSEED = 0

In [2]:
def rmse_log(y, y_pred):
    """Root mean squared error between log(y) and log(y_pred).

    This is the metric used on Kaggle for this competition.

    Parameters
    ----------
    y : array-like
        Observed prices.
    y_pred : array-like
        Predicted prices.

    Returns
    -------
    The square root of the mean squared difference of the two log-transformed
    inputs.
    """
    log_actual = np.log(y)
    log_predicted = np.log(y_pred)
    return np.sqrt(mean_squared_error(log_actual, log_predicted))


# Wrap the metric as a scikit-learn scorer. greater_is_better=False declares it
# a loss, so scikit-learn negates the value when comparing candidates.
rmse_log_scorer = make_scorer(rmse_log, greater_is_better=False)

## 1. Evaluation with only the training data

#### Import data and test/train split

In [3]:
# Load the labelled training set. The path is written with a forward slash so
# it resolves on Linux and macOS as well as on Windows; a backslash is only a
# path separator on Windows.
dataset_train = pd.read_csv('data/train.csv')

# Separate the feature columns from the target column.
X = dataset_train.drop("SalePrice", axis=1)
y = dataset_train["SalePrice"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOMSEED
)

#### Clean the data

In [4]:
# Pass both splits through the project's cleaning helper in a single call and
# rebind the names to the pair it returns.
X_train, X_test = utils.clean(X_train, X_test)

# Show the resulting column index and the shape of the training split, one per line.
print(X_train.columns, X_train.shape, sep='\n')

Index(['MasVnrType_BrkFace', 'LandContour_Bnk', 'BsmtCond_Fa', 'BsmtUnfSF',
       'SaleCondition_Family', 'GrLivArea', 'WoodDeckSF',
       'SaleCondition_Normal', 'BsmtFinType1_LwQ', 'BldgType_1Fam',
       ...
       'Exterior1st_VinylSd', 'HouseStyle_2.5Unf', 'GarageFinish_Unf',
       'Functional_Min1', 'GarageType_BuiltIn', 'Id', 'Utilities_AllPub',
       'GarageQual_Fa', 'PavedDrive_N', 'MSZoning_RM'],
      dtype='object', length=212)
(1168, 212)


#### Train the model

In [5]:
# Build the estimator through the project helper, handing it the training
# split, the task type and the log-RMSE scorer.
# NOTE(review): the recorded output of this cell reports scoring=None — confirm
# that utils.model actually applies rmse_log_scorer.
model = utils.model(X_train, y_train, 'regression', rmse_log_scorer)
# Fit on the training split. As the last expression of the cell, the returned
# object's repr is what the notebook displays.
model.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [0.03, 0.1, 3, 10, 30, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

#### Evaluate the model

In [6]:
# Predict prices for the held-out split and score them with the Kaggle metric.
predictions = model.predict(X_test)

# rmse_log is declared as rmse_log(y, y_pred): true prices first, predictions
# second. The value is an error measure (lower is better), not an accuracy,
# so it is named accordingly.
holdout_rmse_log = rmse_log(y_test, predictions)
print(holdout_rmse_log)

0.166267264398


## 2. Build the model on all of the training data and make predictions for the test data

#### Import the data

In [7]:
# Reload the full labelled training set and split it into features and target.
# Paths are written with forward slashes so they resolve on Linux and macOS as
# well as on Windows; a backslash is only a path separator on Windows.
dataset_train = pd.read_csv('data/train.csv')
X_train = dataset_train.drop("SalePrice", axis=1)
y_train = dataset_train["SalePrice"]

# The test file is read as-is into the feature frame to predict on.
X_test = pd.read_csv('data/test.csv')

#### Clean the data

In [8]:
# Pass the full training frame and the test frame through the project's
# cleaning helper in a single call and rebind both names to the returned pair.
# NOTE(review): re-running this cell feeds already-cleaned frames back into
# utils.clean — rerun the import cell first if a repeat is needed.
X_train, X_test = utils.clean(X_train, X_test)

#### Train the model

In [9]:
# Build the estimator through the project helper, this time from the full
# training data, again handing it the log-RMSE scorer.
# NOTE(review): the recorded output of this cell reports scoring=None — confirm
# that utils.model actually applies rmse_log_scorer.
model = utils.model(X_train, y_train, 'regression', rmse_log_scorer)
# Fit on all training rows. As the last expression of the cell, the returned
# object's repr is what the notebook displays.
model.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': ['sqrt', 0.4, 0.8, 0.9], 'min_samples_leaf': [2, 3, 4, 5], 'min_samples_split': [2, 5, 10, 20], 'n_estimators': [100, 150]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

#### Make predictions

In [10]:
# Predict sale prices for the cleaned test frame.
predictions = model.predict(X_test)

# Write the result as a Kaggle submission: an "Id" column and a "SalePrice"
# column, with no extra positional index column. The ids are read from the raw
# test file because the cleaned frame may no longer carry them unchanged.
# NOTE(review): assumes utils.clean keeps the test rows in their original order
# and drops none — confirm. If the row counts differ, building the frame raises
# instead of writing a misaligned file.
test_ids = pd.read_csv('data/test.csv', usecols=['Id'])['Id'].values
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': np.ravel(predictions)})
submission.to_csv("predictions.csv", index=False, columns=['Id', 'SalePrice'])