# Blue Book for Bulldozers

This competition was designed to find out a way to better predict what could be the price of a bulldozers at an auction. Such systems are today used in car pricing algorithms.

The measurment scale is RMSLE (root mean squared log error) between the actual and predicted auction prices.

The training data set contains 400,000+ rows and 55 columns.

We use fastai library to ease the boring stuff.

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [89]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics
import feather

from sklearn.model_selection import GridSearchCV

In [3]:
Path = "data/bulldozers/"

In [84]:
df_raw = pd.read_csv(f'{Path}Train.csv', low_memory=False, parse_dates=["saledate"])
df_valid = pd.read_csv(f'{Path}Valid.csv', low_memory=False, parse_dates=["saledate"])
valid_soln = pd.read_csv(f'{Path}ValidSolution.csv', low_memory=False)

### Helper functions

In [5]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

def rmse(x,y): return math.sqrt(metrics.mean_squared_error(y,x))

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

### Adding datepart and training categories for training data

In [6]:
add_datepart(df_raw, 'saledate')

In [7]:
train_cats(df_raw)

In [8]:
df_raw.SalePrice = np.log(df_raw.SalePrice)

### Save the df in feather format for quick retrieval

In [9]:
os.makedirs('tmp', exist_ok=True)
feather.write_dataframe(df_raw, 'tmp/bulldozers')

In [44]:
df_raw = feather.read_dataframe('tmp/bulldozers')

In [45]:
labels = df_raw.SalePrice

### This section below is mostly identifying what works.

In [17]:
X_train, y_train, nas = proc_df(df_raw, 'SalePrice', subset=100000)
X_train, X_valid = split_vals(X_train, 80000)
y_train, y_valid = split_vals(y_train, 80000)

### Base Model

In [18]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(X_train, y_train)

print_score(m)



[0.10122434910770327, 0.3274163766283504, 0.9786488916963954, 0.7781367036122133]


### Important features

In [19]:
feature_importance = rf_feat_importance(m, X_train)

In [24]:
imps = np.arange(0.001, 0.01, 0.001)
for imp in imps:
    imp_features = feature_importance[feature_importance['imp']>imp]['cols']
    m = RandomForestRegressor(n_jobs=-1, n_estimators=40)
    X_train = X_train[imp_features]
    X_valid = X_valid[imp_features]
    m.fit(X_train[imp_features], y_train)
    print_score(m)

[0.08728941888435801, 0.31556663647057764, 0.9841228099865507, 0.7939052983587707]
[0.08725705795217713, 0.3144126501991329, 0.9841345801514753, 0.7954098655170512]
[0.08749104322842596, 0.3220755486343286, 0.9840493778290039, 0.7853157541849733]
[0.08794797666786947, 0.32358488697306165, 0.9838823343324153, 0.7832988956201943]
[0.08785222703675762, 0.31467136980774973, 0.9839174100787997, 0.7950730262883814]
[0.08785063271385743, 0.3169742264607269, 0.9839179938001147, 0.792062620482034]
[0.08808245205051785, 0.3171769561933986, 0.9838330077560471, 0.7917965510885301]
[0.0882972403907687, 0.32430197925918114, 0.9837540654659812, 0.7823373743582955]
[0.08833742757909394, 0.3227416301019736, 0.983739273905073, 0.7844268629684157]


In [72]:
# picking 0.06 because of an appropriate drop in rmse value with validation set
important_features = feature_importance[feature_importance['imp']>0.006]['cols']
df_important = df_raw[important_features]

### Here we will try to make the model more robust by making it independant of the timeframe.

In [73]:
df_new = df_important.copy()
df_new['ext']=1
x, y, _ = proc_df(df_new, 'ext', subset=100000)
y[:80000] = 0
m = RandomForestClassifier(n_jobs=-1, oob_score=True)
m.fit(x,y)

m.oob_score_

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])


0.99773

Since the oob score is so high, we can safely assume that the model can easily predict between training and validation times. So we can remove the most important feature.

In [49]:
rf_feat_importance(m,x)

Unnamed: 0,cols,imp
6,SalesID,0.603211
10,MachineID,0.162339
5,saleElapsed,0.156214
1,YearMade,0.036878
4,ModelID,0.012126
7,fiModelDesc,0.010358
8,fiBaseModel,0.009049
9,fiSecondaryDesc,0.002847
3,fiProductClassDesc,0.00186
14,saleDayofyear,0.001416


column correlations can cause unnecessary model dependencies and can reduce performance so we remove the dependant ones. 

In [74]:
x.corr() > 0.5

Unnamed: 0,Coupler_System,YearMade,ProductSize,fiProductClassDesc,ModelID,saleElapsed,SalesID,fiModelDesc,fiBaseModel,fiSecondaryDesc,MachineID,Enclosure,state,saleDay,saleDayofyear,fiModelDescriptor
Coupler_System,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
YearMade,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
ProductSize,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
fiProductClassDesc,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
ModelID,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
saleElapsed,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
SalesID,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
fiModelDesc,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
fiBaseModel,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
fiSecondaryDesc,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False


In [80]:
df_raw = feather.read_dataframe('tmp/bulldozers')

In [81]:
X_train, y_train, nas = proc_df(df_raw, 'SalePrice')

In [85]:
add_datepart(df_valid, 'saledate')
apply_cats(df_valid, df_raw)

In [86]:
X_valid, _, _ = proc_df(df_valid, na_dict=nas)

In [87]:
valid_soln.SalePrice = np.log(valid_soln.SalePrice)

In [76]:
feather.write_dataframe(X_valid, 'tmp/bulldozer-valid')
feather.write_dataframe(valid_soln, 'tmp/bulldozer-valid-soln')

In [83]:
X_valid = feather.read_dataframe('tmp/bulldozer-valid')

In [79]:
y_valid = feather.read_dataframe('tmp/bulldozer-valid-soln').SalePrice

In [88]:
# Applying important features and removing SalesID because it is biased on time.
X_train = X_train[important_features]
X_train.drop(['SalesID','fiBaseModel'], axis=1, inplace=True)

X_valid = X_valid[important_features]
X_valid.drop(['SalesID','fiBaseModel'], axis=1, inplace=True)

### Hyper Parameter Tuning

In [None]:
params = {
            'n_estimators' : np.arange(150,200,10),
            'max_features' : [0.5,'sqrt'],
            'min_samples_leaf' : [7,9],
            'min_samples_split' : [5,6]
         }
scorer_func= lambda y_true,y_pred: np.sqrt(metrics.mean_squared_log_error(y_true,y_pred))

scorer = metrics.make_scorer(scorer_func,greater_is_better=False)
m = RandomForestRegressor(n_jobs=-1)
m = GridSearchCV(m, cv=3, verbose=3, scoring=scorer, param_grid = params)
m.fit(X_train, y_train)

In [None]:
m = RandomForestRegressor(n_jobs=-1, n_estimators=140, max_features='sqrt', min_samples_leaf=6)
m.fit(X_train, y_train)

print_score(m)