`Predicting the future sale prices of bulldozers`

In [None]:
# importing modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Load the combined training + validation data. Sale dates are parsed as
# datetimes so the rows can be ordered chronologically in one chained step.
both = (
    pd.read_csv(
        "../input/bluebook-for-bulldozers/TrainAndValid.csv",
        low_memory=False,
        parse_dates=["saledate"],
    )
    .sort_values(by=["saledate"], ascending=True)
)

# Keep the raw frame untouched; all preprocessing happens on this copy.
both1 = both.copy()

## Feature engineering

In [None]:
# Remove the raw timestamp column from the frame and expand it into
# numeric date-part features appended at the end.
sale_dates = both1.pop("saledate").dt

both1["saleyear"] = sale_dates.year
both1["salemonth"] = sale_dates.month
both1["saleday"] = sale_dates.day
both1["saledayoftheweek"] = sale_dates.dayofweek
both1["saledayoftheyear"] = sale_dates.dayofyear


In [None]:
# Step 1: turn string (object) columns into ordered categoricals.
for label, cont in both1.items():
    if not pd.api.types.is_string_dtype(cont):
        continue
    as_category = cont.astype("category")
    both1[label] = as_category.cat.as_ordered()


# Step 2: for numeric columns that contain NaNs, record which rows were
# missing in a boolean flag column, then impute with the column median.
for label, cont in both1.items():
    if not pd.api.types.is_numeric_dtype(cont):
        continue
    null_mask = pd.isnull(cont)
    if null_mask.sum():
        both1[label + "_is_missing"] = null_mask
        both1[label] = cont.fillna(cont.median())


# Step 3: every remaining non-numeric column gets a missing-value flag and is
# replaced by its integer category code. Codes are shifted by +1 so that the
# -1 pandas uses for missing values becomes 0.
for labels, cont in both1.items():
    if pd.api.types.is_numeric_dtype(cont):
        continue
    both1[labels + "_is_missing"] = pd.isnull(cont)
    both1[labels] = pd.Categorical(cont).codes + 1

            

## Modelling on Training set

In [None]:
# Hold out every sale from 2012 as the validation set; everything else is
# used for training.
in_validation_year = both1["saleyear"] == 2012
val = both1[in_validation_year]
train = both1[~in_validation_year]

# Separate the features from the target for each split.
X_train = train.drop(columns="SalePrice")
Y_train = train["SalePrice"]
X_val = val.drop(columns="SalePrice")
Y_val = val["SalePrice"]

# Quick sanity check of the resulting shapes.
X_train.shape, Y_train.shape, X_val.shape, Y_val.shape

`Predicting on validation set`

In [None]:
%%time
est = RandomForestRegressor(n_jobs = -1, random_state = 17)
est.fit(X_train, Y_train)
pre = est.predict(X_val)

np.sqrt(mean_squared_log_error(Y_val, pre))


`Using RandomizedSearchCV to find the best parameters`


In [None]:
%%time

rf_param = {"n_estimators": [10,20,30,40,60,80,100],
            "max_depth": [None, 2, 6, 8, 10],
            "min_samples_leaf": [2,6,8,10],
            "min_samples_split": [2,4,6,10],
            "max_samples": [100000]}

rfr_cv = RandomizedSearchCV(RandomForestRegressor(n_jobs = -1,
                                                  random_state = 17),cv =5,
                                                  param_distributions = rf_param,
                                                  n_iter = 5,verbose  = True)

rfr_cv.fit(X_train, Y_train)

pr = rfr_cv.predict(X_val)

np.sqrt(mean_squared_log_error(Y_val, pr)), rfr_cv.best_params_


`Fitting a model with the best parameters found by hyperparameter tuning`

In [None]:
%%time
model = RandomForestRegressor(n_jobs = -1,random_state = 17)



param_grid = {"n_estimators": [40],
            "max_depth": [None],
            "min_samples_leaf": [6],
            "min_samples_split": [2,6],
            "max_samples": [None]}
RFR = GridSearchCV(model,
                   cv = 5,
                   param_grid = param_grid,
                   verbose =True)

RFR.fit(X_train, Y_train)





In [None]:
# Score the grid-searched model on the validation set.
pred = RFR.predict(X_val)
tuned_msle = mean_squared_log_error(Y_val, pred)

# Validation RMSLE together with the parameters the grid search selected.
np.sqrt(tuned_msle), RFR.best_params_

In [None]:
# Load the test set with parsed sale dates and order it chronologically.
test = (
    pd.read_csv(
        "../input/bluebook-for-bulldozers/Test.csv",
        low_memory=False,
        parse_dates=["saledate"],
    )
    .sort_values(by=["saledate"], ascending=True)
)


## Feature engineering on Test set

In [None]:
# Remove the raw timestamp column from the test frame and expand it into
# numeric date-part features appended at the end.
test_sale_dates = test.pop("saledate").dt

test["saleyear"] = test_sale_dates.year
test["salemonth"] = test_sale_dates.month
test["saleday"] = test_sale_dates.day
test["saledayoftheweek"] = test_sale_dates.dayofweek
test["saledayoftheyear"] = test_sale_dates.dayofyear


In [None]:
# Step 1: turn string (object) columns into ordered categoricals.
for labels, cont in test.items():
    if pd.api.types.is_string_dtype(cont):
        # Write back to the column currently being iterated (`labels`).
        # Using any other name here would overwrite an unrelated column, or
        # raise NameError on a fresh kernel.
        test[labels] = cont.astype("category").cat.as_ordered()


# Step 2: for numeric columns that contain NaNs, record which rows were
# missing in a boolean flag column, then impute with the column median.
# The median is computed from the test data itself.
for labels, cont in test.items():
    if pd.api.types.is_numeric_dtype(cont):
        if pd.isnull(cont).sum():
            test[labels + "_is_missing"] = pd.isnull(cont)
            test[labels] = cont.fillna(cont.median())

# Step 3: every remaining non-numeric column gets a missing-value flag and is
# replaced by its integer category code. Codes are shifted by +1 so that the
# -1 pandas uses for missing values becomes 0.
# NOTE(review): the categories are derived from the test values alone, so a
# given string is not guaranteed to receive the same code it had in the
# training frame - confirm, or reuse the training categories.
for labels, cont in test.items():
    if not pd.api.types.is_numeric_dtype(cont):

        test[labels + "_is_missing"] = pd.isnull(cont)

        test[labels] = pd.Categorical(cont).codes + 1


In [None]:
# The test frame only gets a numeric `<col>_is_missing` flag when that column
# actually has NaNs in the test data, so its column set and order can differ
# from the features the model was fitted on. Align to the training columns
# before predicting.
absent_columns = X_train.columns.difference(test.columns)
unexpected_absent = [
    col for col in absent_columns if not col.endswith("_is_missing")
]
if unexpected_absent:
    # Only missing-value flags can be safely defaulted; a real feature that
    # is absent from the test data must not be silently invented.
    raise ValueError(
        f"Test data lacks feature columns used in training: {unexpected_absent}"
    )

# Absent flags are filled with False (nothing was missing in the test data);
# columns that were not part of the training features are dropped.
test_features = test.reindex(columns=X_train.columns, fill_value=False)

predictions = RFR.predict(test_features)
predictions


In [None]:
# Pair each test SalesID with its predicted price and write the result as
# the submission file (no index column).
submission = pd.DataFrame(
    {
        "SalesID": test["SalesID"],
        "SalePrice": predictions,
    }
)

submission.to_csv("submission.csv", index=False)