 **Introduction**

The goal here is to predict the future sale price of a bulldozer, given its characteristics and previous examples of how much similar bulldozers have sold for.

***Importing libraries***

In [None]:
import pandas as pd 
import numpy as np
from sklearn.ensemble import  RandomForestRegressor
import re
from sklearn.impute import SimpleImputer
from IPython.display import display
from pandas.api.types import is_string_dtype, is_numeric_dtype

 **Data**
 

We have three main datasets:


*   Train Data
*   Valid Data
*   Test Data



**Parsing dates**

When we work with time series data, we want to enrich the time & date component as much as possible.

We can do that by telling pandas which of our columns has dates in it using the parse_dates parameter.

In [None]:
# Training data: `saledate` is handed to pandas as a date column on read.
data_set = pd.read_csv(
    '../input/bluebook-for-bulldozers/Train.zip',
    parse_dates=["saledate"],
    low_memory=False,
)

# Test data: loaded with the same options as the training data.
data_test = pd.read_csv(
    '../input/bluebook-for-bulldozers/Test.csv',
    parse_dates=["saledate"],
    low_memory=False,
)

In [None]:
# Summarise the training frame (columns, non-null counts, dtypes).
data_set.info()

**preprocessing Data**

This dataset contains a mix of continuous and categorical variables, so we will preprocess it before modelling.


**Add datetime parameters for saledate column**

We know from the problem description that `saledate` is a time-series column, so we split it into several integer fields (year, month and day).

In [None]:
# Training data: expand the sale date into separate year / month / day columns.
for new_col, date_attr in (("saleyear", "year"),
                           ("salemonth", "month"),
                           ("saleday", "day")):
    data_set[new_col] = getattr(data_set["saledate"].dt, date_attr)

In [None]:
# Test data: expand the sale date into separate year / month / day columns.
for new_col, date_attr in (("saleyear", "year"),
                           ("salemonth", "month"),
                           ("saleday", "day")):
    data_test[new_col] = getattr(data_test["saledate"].dt, date_attr)

In [None]:
# Remove the raw timestamp column from both frames, in place.
for frame in (data_set, data_test):
    frame.drop(columns=['saledate'], inplace=True)

**Convert string to categories**


In [None]:
# Show the distinct values of the `state` column.
data_set["state"].unique()

We can't use one-hot encoding because, as we can see, some features have more than 15 distinct values.


In [None]:

# Training data: turn every string-typed column into an ordered categorical.
string_columns = [
    name for name, series in data_set.items()
    if pd.api.types.is_string_dtype(series)
]
for name in string_columns:
    data_set[name] = data_set[name].astype("category").cat.as_ordered()


In [None]:
# Test data: turn every string-typed column into an ordered categorical.
#
# When the matching training column is already categorical, its dtype (and
# therefore its category list) is reused. This keeps the integer codes taken
# from these columns consistent between the two frames: the same label maps to
# the same code. Labels that never occur in the training data become NaN.
# Columns without a categorical counterpart in the training frame fall back to
# categories inferred from the test data alone.
for col, val in data_test.items():
    if not pd.api.types.is_string_dtype(val):
        continue
    train_dtype = data_set[col].dtype if col in data_set.columns else None
    if isinstance(train_dtype, pd.CategoricalDtype):
        data_test[col] = val.astype(train_dtype)
    else:
        data_test[col] = val.astype("category").cat.as_ordered()


We can't use `LabelEncoder` from scikit-learn because it can't handle NaN values.



In [None]:
# Training data: replace each non-numeric column by its integer category codes.
# pandas uses code -1 for missing values, so the +1 shift maps missing to 0.
non_numeric_columns = [
    name for name, series in data_set.items()
    if not pd.api.types.is_numeric_dtype(series)
]
for name in non_numeric_columns:
    data_set[name] = pd.Categorical(data_set[name]).codes + 1

In [None]:
# Test data: replace each non-numeric column by its integer category codes.
# pandas uses code -1 for missing values, so the +1 shift maps missing to 0.
non_numeric_columns = [
    name for name, series in data_test.items()
    if not pd.api.types.is_numeric_dtype(series)
]
for name in non_numeric_columns:
    data_test[name] = pd.Categorical(data_test[name]).codes + 1

**Handling missing value**

In [None]:
# Training data: fill the gaps of every numeric column with that column's median.
for name in list(data_set.columns):
    series = data_set[name]
    if not pd.api.types.is_numeric_dtype(series):
        continue
    if series.isnull().sum():
        data_set[name] = series.fillna(series.median())


In [None]:
# Test data: fill the gaps of every numeric column.
#
# The fill value is the median of the matching training column when one is
# available, so the test set is imputed with the statistic derived from the
# training data rather than from the test set itself. This also gives a usable
# value when a test column is entirely missing (its own median would be NaN).
# The test column's own median is kept as a fallback.
for col, val in data_test.items():
    if not pd.api.types.is_numeric_dtype(val):
        continue
    if not val.isnull().any():
        continue
    fill_value = val.median()
    if col in data_set.columns and pd.api.types.is_numeric_dtype(data_set[col]):
        train_median = data_set[col].median()
        if pd.notnull(train_median):
            fill_value = train_median
    data_test[col] = val.fillna(fill_value)


Check whether any null values remain.

In [None]:
# Print the name of every numeric training column that still holds nulls.
for name, series in data_set.items():
    if pd.api.types.is_numeric_dtype(series) and series.isnull().any():
        print(name)

In [None]:
# Pull the target out of the training frame (pop returns the column and
# removes it from the frame in one step).
y = data_set.pop('SalePrice')

# Keep the test-set identifiers aside, then remove the identifier column
# from both frames.
sales_id = data_test.pop('SalesID')
del data_set['SalesID']



***Scaling the Data***

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features.
# The scaler is fitted on the training data only; the test data is then
# transformed with those same training statistics. Refitting on the test set
# would put the two frames on different scales.
SC = StandardScaler()
feature_columns = list(data_set.columns)
data_set[feature_columns] = SC.fit_transform(data_set[feature_columns])
# Read from and write back to the same column list so values cannot end up
# under the wrong column name if the two frames order their columns differently.
data_test[feature_columns] = SC.transform(data_test[feature_columns])


Splitting the data


1.   Train Data
2.   Valid Data



**Calculate size of valid data**

In [None]:
# Ratio of test rows to training rows; used as the validation fraction so the
# validation split has roughly as many rows as the test set.
test_valid_size = len(data_test) / len(data_set)

test_valid_size

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a validation subset of the training data (fixed seed).
x_train, x_valid, y_train, y_valid = train_test_split(
    data_set,
    y,
    test_size=test_valid_size,
    random_state=44,
)

**Building an evaluation function**


The evaluation metric for this competition is the RMSLE (root mean squared log error) between the actual and
predicted auction prices

In [None]:
from sklearn.metrics import  mean_squared_log_error
def rmse(y_test, y_preds):
    """
    Return the root mean squared log error (RMSLE) between two sets of values.

    Despite its name, this is the square root of ``mean_squared_log_error``,
    not a plain root mean squared error.
    """
    squared_log_error = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(squared_log_error)
def show_evalution_score(model):
    """
    Print the model's score and RMSLE on the training and validation splits.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_valid`` and
    ``y_valid``. Each metric is computed right before it is printed.
    """
    reports = (
        ("Training score", lambda: model.score(x_train, y_train)),
        ("Valid score", lambda: model.score(x_valid, y_valid)),
        ("Training RMSLE", lambda: rmse(model.predict(x_train), y_train)),
        ("Valid RMSLE", lambda: rmse(model.predict(x_valid), y_valid)),
    )
    for label, compute in reports:
        print(label, compute())



**Train the model**

In [None]:
# Baseline: a random forest with default hyperparameters, using all CPU cores.
m = RandomForestRegressor(n_jobs=-1)
m.fit(X=x_train, y=y_train)




In [None]:
# Report training / validation metrics for the baseline model.
show_evalution_score(m)

There is a gap between the training and validation scores, which indicates overfitting.

**Hyperparameter tuning with RandomizedSearchCV**

We will use RandomizedSearchCV to find the best parameters.

In [None]:

from sklearn.model_selection import RandomizedSearchCV

# Different RandomForestRegressor hyperparameters
rf_grid = {"n_estimators": np.arange(20, 100, 20),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           # 0.5 -> half of the features, 1 -> a single feature,
           # "sqrt" -> sqrt(n_features), 1.0 -> all features.
           # The previous list read `0,5`, which Python parses as the two
           # values 0 and 5 (0 is not a valid max_features), and used "auto",
           # which recent scikit-learn releases no longer accept; 1.0 is its
           # equivalent for regressors.
           "max_features": [0.5, 1, "sqrt", 1.0],
           "max_samples": [10000]}

# Instantiate RandomizedSearchCV: 2 sampled candidates, 5-fold cross-validation
scv_model = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1,
                                                     random_state=12),
                               param_distributions=rf_grid,
                               n_iter=2,
                               cv=5,
                               verbose=True)

# Fit the RandomizedSearchCV
scv_model.fit(x_train, y_train)

In [None]:
# Show the hyperparameter combination selected by the search.
scv_model.best_params_

In [None]:
# Report training / validation metrics for the fitted search object.
show_evalution_score(scv_model)

**Train a model with the best hyperparameters**

Note: These were found after 100 iterations of RandomizedSearchCV.

In [None]:
# Random forest configured with the tuned hyperparameters (fixed seed, all cores).
best_model = RandomForestRegressor(
    n_estimators=40,
    max_features=.5,
    min_samples_split=14,
    min_samples_leaf=1,
    max_samples=None,
    random_state=12,
    n_jobs=-1,
)

In [None]:
# Train the tuned model on the training split.
best_model.fit(x_train, y_train)

In [None]:
# Report training / validation metrics for the tuned model.
show_evalution_score(best_model)

**Make predictions on test data**

In [None]:
# Predict sale prices for the preprocessed test frame and show the result.
test_pred = best_model.predict(X=data_test)
test_pred

In [None]:
# Build the submission frame: one row per test sale with its predicted price.
# The price column is named "SalePrice", matching the target column name used
# in the training data; it was previously misspelled "SalesPrice".
df_predict = pd.DataFrame({"SalesID": sales_id, "SalePrice": test_pred})

# Write without the index so the file holds only the two submission columns.
df_predict.to_csv("submission.csv", index=False)


In [None]:
# Display the submission frame.
df_predict