In [3]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [8]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import *
from sklearn.preprocessing import OneHotEncoder
#from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [5]:
# Load the auction training data. `saledate` is parsed as a datetime column
# (the date-feature cell further down reads it through the `.dt` accessor).
bulldozer = pd.read_csv('Train.csv',parse_dates=['saledate'],low_memory=False)


In [5]:
bulldozer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401125 entries, 0 to 401124
Data columns (total 53 columns):
SalesID                     401125 non-null int64
SalePrice                   401125 non-null int64
MachineID                   401125 non-null int64
ModelID                     401125 non-null int64
datasource                  401125 non-null int64
auctioneerID                380989 non-null float64
YearMade                    401125 non-null int64
MachineHoursCurrentMeter    142765 non-null float64
UsageBand                   69639 non-null object
saledate                    401125 non-null datetime64[ns]
fiModelDesc                 401125 non-null object
fiBaseModel                 401125 non-null object
fiSecondaryDesc             263934 non-null object
fiModelSeries               56908 non-null object
fiModelDescriptor           71919 non-null object
ProductSize                 190350 non-null object
fiProductClassDesc          401125 non-null object
state                

In [6]:
bulldozer.head()

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,saledate,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1139246,66000,999089,3157,121,3.0,2004,68.0,Low,2006-11-16,...,,,,,,,,,Standard,Conventional
1,1139248,57000,117657,77,121,3.0,1996,4640.0,Low,2004-03-26,...,,,,,,,,,Standard,Conventional
2,1139249,10000,434808,7009,121,3.0,2001,2838.0,High,2004-02-26,...,,,,,,,,,,
3,1139251,38500,1026470,332,121,3.0,2001,3486.0,High,2011-05-19,...,,,,,,,,,,
4,1139253,11000,1057373,17311,121,3.0,2007,722.0,Medium,2009-07-23,...,,,,,,,,,,


In [9]:
# Model the natural log of the sale price; the RMSE reported later is therefore
# computed on log prices.
y = np.log(bulldozer['SalePrice'])

In [10]:
# Remove the target from the feature frame. Reassigning (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second execution is a no-op rather than a KeyError.
bulldozer = bulldozer.drop(columns=['SalePrice'], errors='ignore')

In [11]:
# ID / numeric columns that are kept as they are; every other column except the
# raw sale date is treated as categorical in the next cell.
passthrough_cols = ['SalesID','MachineID','ModelID','datasource','auctioneerID','YearMade','MachineHoursCurrentMeter']

x1 = bulldozer.drop(columns=passthrough_cols + ['saledate'])
# Select by name rather than by position: iloc[:, 0:7] would silently include
# SalePrice if the target had not been dropped yet. .copy() makes x2 an
# independent frame, so filling its missing values later does not operate on a
# slice of `bulldozer`.
x2 = bulldozer[passthrough_cols].copy()


In [12]:
# Fill missing values and categorize and numericalize the data

# Treat missing auctioneer IDs and meter readings as 0. The result is assigned
# back instead of using inplace=True: x2 was sliced out of `bulldozer`, and
# in-place fills on a slice trigger pandas' SettingWithCopy machinery.
x2 = x2.fillna(value={'auctioneerID':0,'MachineHoursCurrentMeter':0})

def categorize(df):
    """Convert every column of ``df`` to pandas ``category`` dtype.

    Args:
        df: DataFrame whose columns are all re-typed. It is modified in place
            and nothing is returned.
    """
    for n in df:
        # astype returns a new Series and has no ``inplace`` argument (passing
        # one raises TypeError on current pandas), so assign the result back.
        df[n] = df[n].astype('category')

categorize(x1)        

# Give UsageBand an explicit Low < Medium < High ordering so that its integer
# codes follow that order. set_categories no longer accepts inplace=True
# (removed in pandas 2.0), so the re-categorised column is assigned back.
x1['UsageBand'] = x1['UsageBand'].cat.set_categories(['Low','Medium','High'],ordered=True)

def numericalize(df):
    """Replace each categorical column of ``df`` with its integer codes plus one.

    pandas encodes a missing category as code -1, so after the shift missing
    values become 0 and real categories start at 1. ``df`` is modified in
    place; nothing is returned.
    """
    for column in df.columns:
        codes = df[column].cat.codes
        df[column] = codes + 1
        
        
numericalize(x1)

In [13]:
# merge x2 & x1 

# x1 and x2 were both derived from `bulldozer` and still share its row index,
# so a column-wise concat lines the rows up directly. This avoids copying
# SalesID into x1 and avoids an outer merge, which orders its result by key and
# would break the row alignment with `y` if the file were not already sorted by
# SalesID.
x = pd.concat([x2, x1], axis=1)

In [12]:
x.isna().sum()

SalesID                     0
MachineID                   0
ModelID                     0
datasource                  0
auctioneerID                0
YearMade                    0
MachineHoursCurrentMeter    0
UsageBand                   0
fiModelDesc                 0
fiBaseModel                 0
fiSecondaryDesc             0
fiModelSeries               0
fiModelDescriptor           0
ProductSize                 0
fiProductClassDesc          0
state                       0
ProductGroup                0
ProductGroupDesc            0
Drive_System                0
Enclosure                   0
Forks                       0
Pad_Type                    0
Ride_Control                0
Stick                       0
Transmission                0
Turbocharged                0
Blade_Extension             0
Blade_Width                 0
Enclosure_Type              0
Engine_Horsepower           0
Hydraulics                  0
Pushblock                   0
Ripper                      0
Scarifier 

In [13]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 401125 entries, 0 to 401124
Data columns (total 51 columns):
SalesID                     401125 non-null int64
MachineID                   401125 non-null int64
ModelID                     401125 non-null int64
datasource                  401125 non-null int64
auctioneerID                401125 non-null float64
YearMade                    401125 non-null int64
MachineHoursCurrentMeter    401125 non-null float64
UsageBand                   401125 non-null int8
fiModelDesc                 401125 non-null int16
fiBaseModel                 401125 non-null int16
fiSecondaryDesc             401125 non-null int16
fiModelSeries               401125 non-null int8
fiModelDescriptor           401125 non-null int16
ProductSize                 401125 non-null int8
fiProductClassDesc          401125 non-null int8
state                       401125 non-null int8
ProductGroup                401125 non-null int8
ProductGroupDesc            401125 non-nu

In [14]:
# converting datetime into different features

sale_dt = bulldozer.saledate.dt

x['DayOfYear'] = sale_dt.dayofyear
# Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is its
# replacement (the ISO week number). Older pandas without isocalendar() still
# takes the original accessor.
if hasattr(sale_dt, 'isocalendar'):
    x['WeekOfYear'] = sale_dt.isocalendar().week.astype('int64')
else:
    x['WeekOfYear'] = sale_dt.weekofyear
x['month_end'] = sale_dt.is_month_end
x['month_start'] = sale_dt.is_month_start
x['year'] = sale_dt.year
x['month'] = sale_dt.month


x.shape

(401125, 57)

In [15]:
# splitting the data 

def split(x,y,n_train=389125):
    """Split features and target by position into a leading train part and a trailing test part.

    No shuffling is performed: the first ``n_train`` rows become the training
    set and all remaining rows become the test set.

    Args:
        x: feature DataFrame (sliced with ``.iloc``).
        y: target Series in the same row order as ``x``.
        n_train: number of leading rows used for training. The default,
            389125, is the cut-off this notebook has always used.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test = x.iloc[:n_train], x.iloc[n_train:]
    y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]
    return x_train,x_test,y_train,y_test

x_train,x_test,y_train,y_test = split(x,y)

In [16]:
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

(389125, 57) (12000, 57) (389125,) (12000,)


In [17]:
# random forest

#np.random.seed(42)

#rf = RandomForestRegressor(n_estimators=100,min_samples_leaf=4,min_samples_split=2,max_features=0.5,max_depth=20,random_state=1,n_jobs=-1,oob_score=True,verbose=1)


#rf.fit(x_train,y_train)

In [18]:
#y_pred = rf.predict(x_test)

In [18]:
from math import sqrt


def print_scores(x_train,x_test,y_train,y_test,y_pred,model):
    """Print train/test scores, the test RMSE and, when available, the OOB score.

    Args:
        x_train, y_train: data passed to ``model.score`` for the training score.
        x_test, y_test: data passed to ``model.score`` for the test score.
        y_pred: predictions for ``x_test``; compared with ``y_test`` for the RMSE.
            The notebook passes log prices, hence the "RMSL ERROR" label.
        model: fitted estimator exposing ``score``. ``oob_score_`` is optional.
    """
    print('training score = ',model.score(x_train,y_train))
    print('test score = ',model.score(x_test,y_test))
    print('RMSL ERROR = ',sqrt(mean_squared_error(y_test,y_pred)))
    # Not every estimator exposes oob_score_ (e.g. a search wrapper, or a
    # forest built without oob_score=True); skip the line instead of raising.
    oob = getattr(model, 'oob_score_', None)
    if oob is not None:
        print(f'oob score = {oob}')

In [20]:
#print_scores(x_train,x_test,y_train,y_test,y_pred,rf)

In [22]:
# Tuning hyperparameters with RandomizedSearchCV

#np.random.seed(42)

#grid = {
#        'n_estimators' : [40],
#       'max_depth': [30],
#        'max_features' : [0.5,0.75],
#        'min_samples_split' : [4,6],
#        'min_samples_leaf' : [3,4]
#       }

#estimator = RandomForestRegressor(n_jobs=-1,random_state=1,oob_score=True)

#rand_cv = RandomizedSearchCV(estimator=estimator,
#                             param_distributions=grid,
 #                            n_iter=5,
  #                           cv=5,
   #                          verbose=2
    #                        )



#rand_cv.fit(x_train,y_train)

In [23]:
#rand_cv.best_params_

In [24]:
#y_preds = rand_cv.predict(x_test)

In [25]:
#print_scores(x_train,x_test,y_train,y_test,y_preds,rand_cv)

In [20]:
#best model

np.random.seed(42)

# Random forest configured with the hyper-parameters selected as the best model.
best_rf = RandomForestRegressor(
    n_estimators=40,
    min_samples_leaf=3,
    max_features=0.5,
    max_depth=30,
    random_state=1,
    oob_score=True,
    n_jobs=-1,
)

In [21]:
%time best_rf.fit(x_train,y_train)

Wall time: 45.8 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
           oob_score=True, random_state=1, verbose=0, warm_start=False)

In [22]:
y_preds = best_rf.predict(x_test)

In [23]:
print_scores(x_train,x_test,y_train,y_test,y_preds,best_rf)

training score =  0.967206106156657
test score =  0.9059144260785029
RMSL ERROR =  0.22952898290481444
oob score = 0.9092433584968741


In [24]:
# finding important features

feature_importance = best_rf.feature_importances_

# Label the importances with the columns of the frame the model was fitted on
# (x_train) rather than `x`: once best_rf has been refitted on a reduced or
# transformed frame, zip() against x.columns would silently mislabel or
# truncate the importances.
attributes = x_train.columns
assert len(attributes) == len(feature_importance), 'x_train does not match the fitted model'

# Most important feature first.
feature = sorted(zip(feature_importance, attributes), reverse=True)

feature_df = pd.DataFrame(feature,columns=['imp','attr'])
feature_df

Unnamed: 0,imp,attr
0,0.169573,YearMade
1,0.100499,Coupler_System
2,0.097453,ProductSize
3,0.07213,fiProductClassDesc
4,0.069111,ModelID
5,0.06333,year
6,0.051025,Grouser_Tracks
7,0.048175,Hydraulics_Flow
8,0.040354,fiSecondaryDesc
9,0.034261,fiModelDesc


In [146]:
# creating new x of reduced features

# Names of the features whose importance exceeds 0.006, then the feature frame
# restricted to those columns.
to_keep = feature_df.loc[feature_df['imp']>0.006, 'attr'].values

x_reduced = x[to_keep]
to_keep

array(['YearMade', 'Coupler_System', 'ProductSize', 'fiProductClassDesc',
       'ModelID', 'year', 'Grouser_Tracks', 'Hydraulics_Flow',
       'fiSecondaryDesc', 'fiModelDesc', 'SalesID', 'Enclosure',
       'fiBaseModel', 'fiModelDescriptor', 'MachineID', 'ProductGroup',
       'Hydraulics', 'Tire_Size', 'DayOfYear', 'state', 'Drive_System',
       'WeekOfYear', 'Track_Type'], dtype=object)

In [26]:
# split train & test

x_train,x_test,y_train,y_test = split(x_reduced,y)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

(389125, 23) (12000, 23) (389125,) (12000,)


In [27]:
%time best_rf.fit(x_train,y_train)

Wall time: 27.5 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
           oob_score=True, random_state=1, verbose=0, warm_start=False)

In [28]:
y_preds = best_rf.predict(x_test)

In [29]:
print_scores(x_train,x_test,y_train,y_test,y_preds,best_rf)

training score =  0.9648487607827629
test score =  0.9075388949341153
RMSL ERROR =  0.2275388468836985
oob score = 0.9076846969891886


In [39]:
one_he = OneHotEncoder(handle_unknown='ignore')


In [72]:
def features_to_encode(df, max_categories=7):
    """List the columns of ``df`` that have fewer than ``max_categories`` distinct values.

    Distinct values are counted with ``Series.value_counts()`` using its
    defaults, so missing values are not counted as a level.

    Args:
        df: DataFrame to inspect; it is not modified.
        max_categories: exclusive upper bound on the number of distinct values
            for a column to be selected. Defaults to 7, the threshold this
            notebook has always used.

    Returns:
        List of column names, in the column order of ``df``.
    """
    return [
        n for n in df
        if len(df[n].value_counts()) < max_categories
    ]

In [147]:
# errors='ignore' keeps this self-reassigning cell re-runnable: on a second
# execution the columns are already gone and the drop becomes a no-op instead
# of raising KeyError.
x_reduced = x_reduced.drop(columns=['fiModelDesc','Grouser_Tracks'], errors='ignore')

In [148]:
def replacing(n):
    """Return ``n`` floored at 1900: values below 1900 become 1900, all others pass through."""
    return 1900 if n<1900 else n

# Floor implausible manufacture years at 1900. Series.clip does in one
# vectorised call what applying `replacing` to every row did.
x_reduced['YearMade'] = x_reduced['YearMade'].clip(lower=1900)

x_reduced.describe()


Unnamed: 0,YearMade,Coupler_System,ProductSize,fiProductClassDesc,ModelID,year,Hydraulics_Flow,fiSecondaryDesc,SalesID,Enclosure,...,fiModelDescriptor,MachineID,ProductGroup,Hydraulics,Tire_Size,DayOfYear,state,Drive_System,WeekOfYear,Track_Type
count,401125.0,401125.0,401125.0,401125.0,401125.0,401125.0,401125.0,401125.0,401125.0,401125.0,...,401125.0,401125.0,401125.0,401125.0,401125.0,401125.0,401125.0,401125.0,401125.0,401125.0
mean,1984.832189,0.115889,1.818219,32.262222,6889.70298,2004.095728,0.321483,36.685036,1919713.0,3.605195,...,12.233433,1217903.0,3.725524,4.253801,3.059847,179.977581,23.508318,0.809299,26.179864,0.458792
std,29.017302,0.342865,2.10783,22.596618,6221.777842,5.75419,0.926401,38.228243,909021.5,2.220353,...,29.04195,440992.0,1.725772,4.809368,5.950781,103.55993,15.732898,1.43685,14.788059,0.819459
min,1900.0,0.0,0.0,1.0,28.0,1989.0,0.0,0.0,1139246.0,0.0,...,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0
25%,1985.0,0.0,0.0,11.0,3259.0,2000.0,0.0,0.0,1418371.0,1.0,...,0.0,1088697.0,2.0,1.0,0.0,84.0,9.0,0.0,13.0,0.0
50%,1995.0,0.0,0.0,35.0,4604.0,2006.0,0.0,29.0,1639422.0,3.0,...,0.0,1279490.0,4.0,1.0,0.0,168.0,22.0,0.0,25.0,0.0
75%,2000.0,0.0,4.0,52.0,8724.0,2009.0,0.0,57.0,2242707.0,6.0,...,0.0,1468067.0,5.0,12.0,0.0,271.0,41.0,2.0,39.0,0.0
max,2013.0,2.0,6.0,74.0,37198.0,2011.0,3.0,175.0,6333342.0,6.0,...,139.0,2486330.0,6.0,12.0,17.0,365.0,53.0,4.0,53.0,2.0


In [149]:
categorical_features = features_to_encode(x_reduced)

In [150]:
categorical_features


['Coupler_System',
 'Hydraulics_Flow',
 'ProductGroup',
 'Drive_System',
 'Track_Type']

In [151]:
# One-hot encode the low-cardinality columns and pass every other column
# through unchanged. sparse_threshold=0 makes the transformer always return a
# dense array, which the pd.DataFrame(...) wrap in a later cell relies on.
preprocessor = ColumnTransformer([('onehot',one_he,categorical_features)],remainder='passthrough',sparse_threshold=0)

In [152]:
x_transformed = preprocessor.fit_transform(x_reduced)

In [153]:
x_transformed = pd.DataFrame(x_transformed)

In [154]:
x_train,x_test,y_train,y_test = split(x_transformed,y)

In [155]:
%time best_rf.fit(x_train,y_train)

Wall time: 33.7 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
           oob_score=True, random_state=1, verbose=0, warm_start=False)

In [156]:
y_preds = best_rf.predict(x_test)

In [157]:
print_scores(x_train,x_test,y_train,y_test,y_preds,best_rf)

training score =  0.9644514338472957
test score =  0.9064435823856054
RMSL ERROR =  0.22888261407214372
oob score = 0.9067866310799151
