In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import *
import numpy as np

In [4]:
# Load the training data; 'saledate' is parsed into a datetime column on read.
bulldozer = pd.read_csv(
    'Train.csv',
    parse_dates=['saledate'],
    low_memory=False,
)

In [5]:
# Summarise the raw frame: column names, non-null counts and dtypes.
bulldozer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401125 entries, 0 to 401124
Data columns (total 53 columns):
SalesID                     401125 non-null int64
SalePrice                   401125 non-null int64
MachineID                   401125 non-null int64
ModelID                     401125 non-null int64
datasource                  401125 non-null int64
auctioneerID                380989 non-null float64
YearMade                    401125 non-null int64
MachineHoursCurrentMeter    142765 non-null float64
UsageBand                   69639 non-null object
saledate                    401125 non-null datetime64[ns]
fiModelDesc                 401125 non-null object
fiBaseModel                 401125 non-null object
fiSecondaryDesc             263934 non-null object
fiModelSeries               56908 non-null object
fiModelDescriptor           71919 non-null object
ProductSize                 190350 non-null object
fiProductClassDesc          401125 non-null object
state                

In [6]:
# Preview the first rows of the raw frame.
bulldozer.head()

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,saledate,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1139246,66000,999089,3157,121,3.0,2004,68.0,Low,2006-11-16,...,,,,,,,,,Standard,Conventional
1,1139248,57000,117657,77,121,3.0,1996,4640.0,Low,2004-03-26,...,,,,,,,,,Standard,Conventional
2,1139249,10000,434808,7009,121,3.0,2001,2838.0,High,2004-02-26,...,,,,,,,,,,
3,1139251,38500,1026470,332,121,3.0,2001,3486.0,High,2011-05-19,...,,,,,,,,,,
4,1139253,11000,1057373,17311,121,3.0,2007,722.0,Medium,2009-07-23,...,,,,,,,,,,


In [7]:
# Regression target: natural log of the sale price.
y = bulldozer['SalePrice'].pipe(np.log)

In [8]:
# preparing features

# Columns removed before selecting model inputs: the target itself, the
# free-text model description columns, the raw sale date, 'Drive_System'
# and 'state'.
dropped_columns = [
    'SalePrice',
    'fiModelDesc',
    'fiBaseModel',
    'fiSecondaryDesc',
    'fiModelSeries',
    'fiModelDescriptor',
    'fiProductClassDesc',
    'ProductGroupDesc',
    'saledate',
    'Drive_System',
    'state',
]
x0 = bulldozer.drop(dropped_columns, axis=1)

# Keep the first 11 remaining columns. The explicit .copy() makes x an
# independent frame, so column assignments made on x afterwards modify x
# itself instead of a slice of x0 (avoids SettingWithCopyWarning and
# ambiguous view/copy behaviour).
x = x0.iloc[:, 0:11].copy()
print(x.shape)
x.columns

(401125, 11)


Index(['SalesID', 'MachineID', 'ModelID', 'datasource', 'auctioneerID',
       'YearMade', 'MachineHoursCurrentMeter', 'UsageBand', 'ProductSize',
       'ProductGroup', 'Enclosure'],
      dtype='object')

In [9]:
# converting datetime into different features

sale_dt = bulldozer.saledate.dt

x['DayOfYear'] = sale_dt.dayofyear
# Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0.
# Use the ISO calendar week where available (cast from the nullable UInt32
# dtype to plain int64) and fall back to weekofyear on older pandas.
if hasattr(sale_dt, 'isocalendar'):
    x['WeekOfYear'] = sale_dt.isocalendar().week.astype('int64')
else:
    x['WeekOfYear'] = sale_dt.weekofyear
x['month_end'] = sale_dt.is_month_end
x['month_start'] = sale_dt.is_month_start
x['year'] = sale_dt.year
x['month'] = sale_dt.month

# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form may operate on a temporary copy
# and is a silent no-op under pandas Copy-on-Write.
x['MachineHoursCurrentMeter'] = x['MachineHoursCurrentMeter'].fillna(0)
# .ffill() replaces the deprecated fillna(method='ffill'); missing auctioneer
# ids take the value from the previous row.
x['auctioneerID'] = x['auctioneerID'].ffill()

x.shape

(401125, 17)

In [10]:
# imputing & One hot encoding

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Categorical columns that are imputed and one-hot encoded; every other column
# is passed through unchanged (remainder="passthrough").
categorical_features = ["UsageBand","ProductGroup","Enclosure","ProductSize"]

# handle_unknown='ignore' lets transform() accept categories that were not
# present when the encoder was fitted instead of raising.
one_hot = OneHotEncoder(handle_unknown='ignore')

# Missing categorical values are replaced by the literal "missing" before
# encoding, so they become their own one-hot column.
categorical_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
    ('one_hot', one_hot),
])

# Fix: the transformer list previously referenced the undefined name
# `categorical_transformer`, which raises NameError on a fresh kernel.
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_trans, categorical_features)],
    remainder="passthrough",
)

In [11]:
# splitting the data 

def split(x, y, n_train=361000):
    """Split features and target positionally into train and test parts.

    No shuffling is performed: the first ``n_train`` rows form the training
    set and every remaining row forms the test set.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame.
    y : pandas.Series
        Target values, positionally aligned with ``x``.
    n_train : int, default 361000
        Number of leading rows assigned to the training set.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    x_train = x.iloc[:n_train, :]
    x_test = x.iloc[n_train:, :]
    y_train = y.iloc[:n_train]
    y_test = y.iloc[n_train:]
    return x_train, x_test, y_train, y_test

# Apply the positional train/test split to the prepared features and target.
x_train,x_test,y_train,y_test = split(x,y)

In [12]:
# Sanity check: shapes of the four split parts, space-separated on one line.
print(*[part.shape for part in (x_train, x_test, y_train, y_test)])

(361000, 17) (40125, 17) (361000,) (40125,)


In [13]:
# fitting and transforming preprocessor

# Fit on the training rows only, so nothing about the held-out rows (here, the
# set of categories seen by the one-hot encoder) leaks into the fitted
# transformer. Categories that appear only in the test rows are handled by the
# encoder's handle_unknown='ignore' setting.
preprocessor.fit(x_train)
x_train_trans = preprocessor.transform(x_train)
x_test_trans = preprocessor.transform(x_test)

In [14]:
# Sanity check: train and test matrices must have the same number of columns.
print(*[matrix.shape for matrix in (x_train_trans, x_test_trans)])

(361000, 37) (40125, 37)


In [35]:
# random forest

# 100 trees, fixed seed for repeatable results, all CPU cores for fitting.
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=1,
    n_jobs=-1,
)

# fit() returns the estimator, so its repr is displayed as the cell output.
rf.fit(x_train_trans, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [36]:
# Predict the (log-scale) target for the held-out rows.
y_pred = rf.predict(x_test_trans)

In [37]:
# score() on a regressor returns R^2; this is measured on the training rows,
# so it indicates fit quality, not generalisation.
train_score = rf.score(x_train_trans, y_train)
# Plain string literal: the previous f-string had no placeholders.
print('training score = ', train_score)

training score =  0.9857061867901479


In [38]:
from math import sqrt

# y_test and y_pred are both on the log scale, so the root of their mean
# squared error is the root-mean-squared-log error of the sale price.
test_mse = mean_squared_error(y_test, y_pred)
# Plain string literal: the previous f-string had no placeholders.
print('RMSL ERROR = ', sqrt(test_mse))

RMSL ERROR =  0.29875735024828415


In [39]:
# save a model

import pickle

# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the previous bare open() left the handle open.
# NOTE: only unpickle this file from a trusted source - pickle.load can
# execute arbitrary code.
with open("Random_forest_model_1.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)