## Using fastai to Predict House Sale Prices for the Kaggle Dataset
#### https://www.kaggle.com/c/house-prices-advanced-regression-techniques

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [1]:
from fastai.imports import *

from fastai.structured import *
from fastai.column_data import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics
PATH = "/home/jovyan/persist/data/House/"

In [2]:
def read_csv(fname):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    fname : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``fname``.
    """
    # Pass the argument itself: the quoted literal "fname" would make pandas
    # look for a file that is literally called "fname", whatever was passed in.
    return pd.read_csv(fname, low_memory=False)

def display_all(df):
    """Display ``df`` with pandas' row and column display limits raised to 1000."""
    wide_view = ("display.max_rows", 1000, "display.max_columns", 1000)
    # The raised limits only apply inside the context manager.
    with pd.option_context(*wide_view):
        display(df)

In [141]:
# Load the train and test CSVs from the PATH data directory.
df_house = pd.read_csv(PATH+"train.csv", low_memory=False)
df_house_test = pd.read_csv(PATH+"test.csv", low_memory=False)
# Placeholder target: transform_test hands 'SalePrice' to proc_df as the target
# column name, so the test frame gets a dummy column of that name. The value is
# overwritten with the model predictions further down.
df_house_test['SalePrice'] = 1

In [195]:
df_house.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
def find_cat_names(df):
    """Split the columns of ``df`` into categorical and continuous names.

    A column is treated as categorical when ``is_string_dtype`` reports True
    for it; every remaining column is reported as continuous. No column is
    excluded, so identifier or target columns present in ``df`` show up in the
    continuous list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be classified.

    Returns
    -------
    tuple of (list, list)
        ``(cat_var_names, cont_var_names)``, each in the column order of ``df``.
    """
    cat_var_names = [name for name, col in df.items() if is_string_dtype(col)]

    # Filter the column list rather than taking a set difference: iterating a
    # set of strings has no guaranteed order, so the result could change from
    # one kernel session to the next.
    cat_lookup = set(cat_var_names)
    cont_var_name = [name for name in df.columns if name not in cat_lookup]
    return cat_var_names, cont_var_name

In [4]:
# Classify the raw training columns. cont_var_name is simply "everything that
# is not a string column", so it also contains 'Id' and 'SalePrice'.
cat_var_names,cont_var_name = find_cat_names(df_house)

In [202]:
print(cat_var_names)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [203]:
print(cont_var_name)

['GarageArea', 'BsmtUnfSF', 'KitchenAbvGr', 'OpenPorchSF', 'YearBuilt', 'MiscVal', 'EnclosedPorch', 'TotRmsAbvGrd', 'GarageYrBlt', 'BsmtFullBath', 'TotalBsmtSF', 'SalePrice', 'WoodDeckSF', 'GarageCars', 'Id', 'MSSubClass', 'Fireplaces', 'OverallCond', 'MoSold', '3SsnPorch', 'LotArea', '2ndFlrSF', 'YearRemodAdd', 'BsmtFinSF1', 'PoolArea', 'HalfBath', 'FullBath', 'BedroomAbvGr', 'ScreenPorch', 'LotFrontage', '1stFlrSF', 'OverallQual', 'MasVnrArea', 'YrSold', 'LowQualFinSF', 'BsmtHalfBath', 'BsmtFinSF2', 'GrLivArea']


In [5]:
def get_cat_sz_embedding(df, cat_var_names):
    """Compute cardinalities and embedding sizes for the categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is copied before ``train_cats`` is applied, so the
        frame passed in is not the one handed to ``train_cats``.
    cat_var_names : list of str
        Columns expected to be categorical once ``train_cats`` has run.

    Returns
    -------
    tuple of (list, list)
        ``cat_sz`` as ``(column_name, n_categories + 1)`` pairs and ``emb_szs``
        as ``(n_categories + 1, embedding_width)`` pairs, where the width is
        ``min(50, (size + 1) // 2)``.
    """
    encoded = df.copy()
    train_cats(encoded)

    cat_sz = []
    emb_szs = []
    for name in cat_var_names:
        # NOTE(review): the extra slot presumably covers the missing-value
        # code -- confirm against how proc_df numbers categories.
        cardinality = len(encoded[name].cat.categories) + 1
        cat_sz.append((name, cardinality))
        emb_szs.append((cardinality, min(50, (cardinality + 1) // 2)))
    return cat_sz, emb_szs

In [6]:
# Sizes are derived from the training frame only; emb_szs is what get_learner
# receives later on.
cat_sz, emb_szs = get_cat_sz_embedding(df_house,cat_var_names)

In [192]:
cat_sz[:4]

[('MSZoning', 6), ('Street', 3), ('Alley', 3), ('LotShape', 5)]

In [193]:
emb_szs[:4]

[(6, 3), (3, 2), (3, 2), (5, 3)]

In [66]:
def transform_data(df, target_name,slist, do_scale=True):
    """Run ``train_cats`` and ``proc_df`` on a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; only a copy of it is processed.
    target_name : str
        Name of the target column handed to ``proc_df``.
    slist : list
        Passed to ``proc_df`` as ``skip_flds``.
    do_scale : bool, default True
        Selects which of the two return shapes below is produced.

    Returns
    -------
    tuple
        ``(frame, log_target, na_dict, mapper)`` when ``do_scale`` is true,
        otherwise ``(frame, target, na_dict)``.

    NOTE(review): the natural log of the target is taken only on the
    ``do_scale=True`` path; the unscaled path returns the target unchanged.
    Confirm that this asymmetry is intended.
    """
    working = df.copy()
    train_cats(working)

    if not do_scale:
        frame, target, na_dict = proc_df(working, target_name, skip_flds=slist)
        return frame, target, na_dict

    frame, target, na_dict, mapper = proc_df(
        working, target_name, skip_flds=slist, do_scale=do_scale
    )
    return frame, np.log(target), na_dict, mapper

def transform_test(df, target_name, cat_var_names, cont_var_name,
                   mapper, nas, do_scale=True, train_df=None):
    """Preprocess a test frame with the mapper and NA dictionary fitted on training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test frame; only a copy of it is processed.
    target_name : str
        Target column name handed to ``proc_df`` (the column must exist in ``df``).
    cat_var_names, cont_var_name : list
        Accepted for call compatibility; not used by this function.
    mapper
        Passed to ``proc_df`` as ``mapper``.
    nas : dict
        Passed to ``proc_df`` as ``na_dict``.
    do_scale : bool, default True
        Forwarded to ``proc_df``.
    train_df : pandas.DataFrame, optional
        Raw training frame. When given, the categorical columns of the test
        copy are encoded with the categories found in the training data
        (via ``apply_cats``), so a label gets the same integer code in both
        frames. When omitted, categories are derived from the test frame
        itself, which is the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        The processed feature frame (first element of the ``proc_df`` result).
    """
    df_copy = df.copy()
    if train_df is None:
        # Legacy path: categories come from the test data alone, so the codes
        # are not guaranteed to line up with those seen during training.
        train_cats(df_copy)
    else:
        # apply_cats uses the categorical columns of its second argument as the
        # template, so the training copy is categorised first.
        train_ref = train_df.copy()
        train_cats(train_ref)
        apply_cats(df_copy, train_ref)

    # proc_df returns three values without scaling and four with it (compare
    # transform_data), so index the result instead of unpacking a fixed count.
    processed = proc_df(df_copy, target_name,
                        skip_flds=[], mapper=mapper,
                        na_dict=nas, do_scale=do_scale)
    return processed[0]

In [67]:
# Fit the preprocessing on the training frame. y_ is log(SalePrice) because
# transform_data takes np.log of the target on its default do_scale=True path.
df_, y_, nas_, map_ = transform_data(df_house, 'SalePrice', [])
# Reuse the training mapper (map_) and NA dictionary (nas_) on the test frame.
df_test = transform_test(df_house_test, 'SalePrice', cat_var_names, 
                         cont_var_name, map_, nas_)

In [198]:
df_.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
0,-1.730865,0.073375,4,-0.220875,-0.207142,2,0,4,4,1,...,0,0,-0.087688,-1.599111,0.138777,9,5,-0.464385,-0.074227,-0.24236
1,-1.728492,-0.872563,4,0.46032,-0.091886,2,0,4,4,1,...,0,0,-0.087688,-0.48911,-0.614439,9,5,-0.464385,-0.074227,-0.24236
2,-1.72612,0.073375,4,-0.084636,0.07348,2,0,1,4,1,...,0,0,-0.087688,0.990891,0.138777,9,5,-0.464385,-0.074227,-0.24236
3,-1.723747,0.309859,4,-0.44794,-0.096897,2,0,1,4,1,...,0,0,-0.087688,-1.599111,-1.367655,9,1,-0.464385,-0.074227,-0.24236
4,-1.721374,0.073375,4,0.641972,0.375148,2,0,1,4,1,...,0,0,-0.087688,2.100892,0.138777,9,5,-0.464385,-0.074227,-0.24236


In [199]:
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
0,1.733238,-0.872563,3,0.46032,0.110763,2,0,4,4,1,...,3,0,-0.087688,-0.11911,1.64521,9,5,-0.464385,-0.074227,-0.24236
1,1.73561,-0.872563,4,0.505733,0.37585,2,0,1,4,1,...,0,1,25.116309,-0.11911,1.64521,9,5,-0.464385,-0.074227,-0.24236
2,1.737983,0.073375,4,0.187842,0.332053,2,0,1,4,1,...,3,0,-0.087688,-1.229111,1.64521,9,5,-0.464385,-0.074227,-0.24236
3,1.740356,0.073375,4,0.369494,-0.054002,2,0,1,4,1,...,0,0,-0.087688,-0.11911,1.64521,9,5,-0.464385,-0.074227,-0.24236
4,1.742728,1.492282,4,-1.219961,-0.552407,2,0,1,2,1,...,0,0,-0.087688,-1.969111,1.64521,9,5,-0.464385,-0.074227,-0.24236


In [196]:
# y_ holds log prices, so the output range is expressed in log space; the
# upper bound leaves 20% headroom above the largest training target.
max_log_y = np.max(y_)
y_range = (0, max_log_y*1.2)

In [164]:
def model_coln_data(path, df, y, cat_var_names, df_test=None,
                    train_ratio=0.9, bs=128):
    """Build a ``ColumnarModelData`` with a tail hold-out validation split.

    Parameters
    ----------
    path : str
        Passed through as the first argument of ``from_data_frame``.
    df : pandas.DataFrame
        Processed feature frame.
    y : array-like
        Targets aligned with ``df``.
    cat_var_names : list of str
        Passed as ``cat_flds``.
    df_test : pandas.DataFrame, optional
        Passed as ``test_df``.
    train_ratio : float, default 0.9
        Fraction of rows, counted from the top of ``df``, kept for training.
        The remaining rows at the end form the validation set; rows are not
        shuffled here.
    bs : int, default 128
        Batch size passed to ``from_data_frame``.

    Returns
    -------
    ColumnarModelData
    """
    n_train = int(len(df) * train_ratio)
    val_idx = list(range(n_train, len(df)))
    return ColumnarModelData.from_data_frame(path,
                                             val_idx,
                                             df,
                                             y,
                                             cat_flds=cat_var_names,
                                             bs=bs, test_df=df_test)

In [166]:
# Build the model data; model_coln_data keeps the first 90% of rows for
# training and uses the remaining tail rows for validation.
md = model_coln_data(PATH, df_, y_, cat_var_names, df_test)

In [167]:
# The second argument is the number of columns in df_ that are not categorical.
# NOTE(review): the remaining positional values (0.04, 1, [1000,500],
# [0.001,0.01]) presumably map to embedding dropout, output size, hidden-layer
# sizes and per-layer dropout -- confirm against the get_learner signature.
m = md.get_learner(emb_szs, len(df_.columns)-len(cat_var_names),
                   0.04, 1, [1000,500], [0.001,0.01], y_range=y_range)

In [168]:
# Leftover learning-rate experiments; the rate actually used is assigned in
# the cell that calls m.fit.
#lr = 1e-3
#lr = 10
m.lr_find()

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                           
    0      12.290099  18.181595 



In [169]:
def inv_y(a):
    """Map a log-space value (or array of values) back with ``np.exp``."""
    return np.exp(a)

def exp_rmspe(y_pred, targ):
    """Root mean squared percentage error, computed after exponentiating both inputs.

    ``y_pred`` and ``targ`` are passed through ``inv_y`` first; the error of
    each element is measured relative to the exponentiated target.
    """
    actual = inv_y(targ)
    predicted = inv_y(y_pred)
    relative_error = (actual - predicted) / actual
    mean_squared = (relative_error ** 2).mean()
    return math.sqrt(mean_squared)

In [170]:
lr = 1e-3
# Train for 5 epochs (the log shows epochs 0-4) and report exp_rmspe, which
# compares predictions and targets after exponentiating them.
m.fit(lr, 5, metrics=[exp_rmspe])

HBox(children=(IntProgress(value=0, description='Epoch', max=5), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmspe               
    0      14.986103  11.872955  34.161623 
    1      9.264621   0.597486   2.928461                
    2      5.850924   0.598637   0.516094                
    3      3.979023   0.283514   0.853877                
    4      2.847732   0.083399   0.318226                



[array([ 0.0834]), 0.31822583448156777]

In [171]:
# Predictions and targets, still in log space (exp_rmspe exponentiates them).
# NOTE(review): presumably the validation set, since the score computed from
# these matches the last epoch's validation metric -- confirm.
x,y=m.predict_with_targs()

In [172]:
exp_rmspe(x,y)

0.318225832587142

In [173]:
# NOTE(review): predict(True) presumably selects the test set that was passed
# as test_df when the model data was built -- confirm.
pred_test=m.predict(True)
# The model was trained on log prices, so exponentiate to get prices back.
pred_test = np.exp(pred_test)

In [189]:
# Replace the placeholder SalePrice column (set to 1 at load time) with the
# predicted prices.
df_house_test["SalePrice"] = pred_test

In [190]:
# Write only the Id and SalePrice columns; index=False keeps the DataFrame
# row index out of the file.
df_house_test[["Id", "SalePrice"]].to_csv(PATH+"submission.csv", 
                                          index=False)

In [191]:
df_house_test[["Id", "SalePrice"]].head()

Unnamed: 0,Id,SalePrice
0,1461,70140.164062
1,1462,113940.28125
2,1463,191330.484375
3,1464,191153.5625
4,1465,142605.71875


## Final Submission to Kaggle