In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import (train_test_split, GridSearchCV)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (StandardScaler, OneHotEncoder, FunctionTransformer)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from scipy import stats
import numpy as np
import pandas as pd

# Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# load housing data
iowa_file_path = '../data/train.csv'
home_data = pd.read_csv(iowa_file_path)
# store sale price in Y and drop it from the dataframe
Y = home_data["SalePrice"]
X = home_data.drop(columns = ["Id", "SalePrice"])

# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=1)


In [8]:
# see a boxplot to visualize outliers

#X_train.boxplot(return_type='dict')
#plt.show()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1387.0,1141.0,1387.0,1387.0,1387.0,1387.0,1387.0,1379.0,1387.0,1387.0,...,1387.0,1387.0,1387.0,1387.0,1387.0,1387.0,1387.0,1387.0,1387.0,1387.0
mean,56.925018,70.200701,10522.15429,6.105984,5.580389,1971.530642,1984.909877,104.759971,443.899063,46.450613,...,474.33814,94.059841,47.011536,22.196828,3.412401,15.107426,2.90411,45.777938,6.33093,2007.807498
std,42.066996,24.412196,10129.934396,1.375791,1.103512,30.092827,20.647417,182.096893,456.217183,161.599516,...,212.750242,124.91808,66.849208,61.526278,29.36087,55.641056,41.216672,508.917656,2.68703,1.329975
min,20.0,21.0,1300.0,1.0,2.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,59.0,7537.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,336.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
50%,50.0,70.0,9480.0,6.0,5.0,1973.0,1994.0,0.0,385.0,0.0,...,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,70.0,80.0,11539.5,7.0,6.0,2001.0,2004.0,167.5,715.0,0.0,...,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,1418.0,736.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0


In [30]:
# We will train our classifier with the following features:

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = X.select_dtypes(exclude=object) 
num_features_names = numeric_features.columns

# features that need a log transformation
log_features_names = ["LotFrontage", "LotArea", "1stFlrSF", "GrLivArea", "OpenPorchSF"]
log_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', FunctionTransformer(np.log1p, validate=False)),
    ('logscaler', StandardScaler())])

#numeric features that require a normal transformation
numeric_features_names = [ x for x in num_features_names if x not in log_features_names]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features_names = X.select_dtypes(include=object).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('log', log_transformer, log_features_names),
        ('num', numeric_transformer, numeric_features_names),
        ('cat', categorical_transformer, categorical_features_names)
        
    ])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestRegressor(random_state=1, n_estimators = 500))])



In [33]:
# use boxplots to detect outliers

Xt = preprocessor.fit_transform(X_train)
print(type(Xt))

numeric_features = Xt.select_dtypes(exclude=object) 
num_features_names = numeric_features.columns.tolist()
print(type(num_features_names))

list_chunks = [num_features_names[i:i + 4] for i in range(0, len(num_features_names), 4)]

for sublist in list_chunks:
    print(type(sublist))
    numeric_features.boxplot(column = sublist)
    plt.show()

<class 'scipy.sparse.csr.csr_matrix'>


AttributeError: select_dtypes not found

In [None]:
#fitting and prediction
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
rf_val_mae = mean_absolute_error(y_test, y_predict)

print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))

#train the best model on the full dat
clf.fit(X,Y)

#path to file you will use for predictions
test_data_path = '../data/test.csv'
test_data = pd.read_csv(test_data_path)

X_test2 = test_data.drop(columns = ["Id"])

#make predictions which we will submit. 
y_pred2 = clf.predict(X_test2)

#The lines below shows how to save predictions in format used for competition scoring
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': y_pred2})
output.to_csv('../data/submission.csv', index=False)