In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Importing libraries for data exploration and analysis
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt

#Model from SciKit-Learn
from sklearn.ensemble import RandomForestRegressor 
from sklearn.linear_model import Lasso
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn import feature_selection
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

# Metrics evaluations from SciKit Learn
from sklearn.metrics import accuracy_score,precision_score
from sklearn.metrics import mean_squared_log_error, mean_absolute_error

# for displaying graph in the notebook
%matplotlib inline
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Problem Definition - Blue Book for Bulldozers
Predict the auction sale price of a piece of heavy equipment, based on its characteristics and past sales data, in order to create a "blue book" for bulldozers.


## Kaggle Link
 - https://www.kaggle.com/c/bluebook-for-bulldozers/data


## Data Source
The data is downloaded from the Kaggle "Blue Book for Bulldozers" competition (see the link above). The data for this competition is split into the following parts:
 - TrainAndValid.csv
 - Test.csv

## Model Evaluation
The evaluation metric for this competition is the RMSLE (root mean squared log error) between the actual and predicted auction prices.
    - `Root Mean Squared Error (RMSE)` : The square root of the mean of the squared differences between the model's predictions and the actual values.
    - `Root Mean Squared Logarithmic Error (RMSLE)` : The square root of the mean of the squared differences between the log of the model's predictions and the log of the actual values, $\sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\log(1+\hat{y}_i)-\log(1+y_i)\right)^2}$.

## Program Execution Strategy
 1. Data Acquisition- Getting the data ready
 2. Handling NaN data and convert categorical data into Numeric
 3. Choosing the right machine learning estimator/algorithm/model for this problem
 4. Fitting your chosen machine learning model to data and using it to make a prediction
 5. Evaluating a machine learning model
 6. Improving predictions through experimentation (hyperparameter tuning)
 7. Picking the right evaluation metric
 8. Feature Engineering - determining the important features

# Data Acquisition & Transformation

In [None]:
# Load the combined training + validation sales records from the Kaggle input directory.
df_bpp = pd.read_csv(
    "/kaggle/input/bluebook-for-bulldozers/TrainAndValid.csv",
    parse_dates=["saledate"],  # ask pandas to parse the sale date while reading
    low_memory=False,
)
# (number of rows, number of columns)
df_bpp.shape

# Data Pre-Processing and Analysis (EDA)

In [None]:
# Print the column names, non-null counts and dtypes of the raw bulldozer frame.
df_bpp.info()

In [None]:
# Show the first 3 rows, transposed so that each column of the frame is displayed as a row.
df_bpp.head(3).T

# Visualization ~ pattern recognition

In [None]:
# Scatter plot of sale price against sale ID, restricted to the first 500 rows.
ax = plt.subplot()
ax.scatter(df_bpp["SalesID"].head(500), df_bpp["SalePrice"].head(500), color="violet")
ax.set_title("History of SalesPrice by Sales ID")
ax.set_xlabel("Sales ID")
ax.set_ylabel("Sales Pricing");

In [None]:
# Histogram of the sale price column on a 10x6 inch figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_bpp['SalePrice'], color='lightblue', ax=ax);

In [None]:
# Scatter plot of sale price against model ID, restricted to the first 500 rows.
ax = plt.subplot()
ax.scatter(df_bpp["ModelID"].head(500), df_bpp["SalePrice"].head(500), color="lightgreen")
ax.set_title("Sales by Model Type")
ax.set_xlabel("Model Type")
ax.set_ylabel("Sales Pricing");

# Data Handling - Missing & Encode Categorical data
Handle missing (NaN) values and convert categorical data into numeric codes.

In [None]:
# Concise summary (columns, non-null counts, dtypes) of the bulldozer frame before encoding.
df_bpp.info()

In [None]:
# Re-type every string column as an ordered pandas categorical (in place, column by column).
for column_name, values in df_bpp.items():
    if not pd.api.types.is_string_dtype(values):
        continue  # leave non-string columns untouched
    df_bpp[column_name] = values.astype("category").cat.as_ordered()

In [None]:
# Impute missing values and encode non-numeric columns of `df_bpp`, in place.
#   * numeric columns that contain NaN  -> NaN replaced by the column MEAN
#   * every non-numeric column (the categoricals, and `saledate`)
#                                       -> replaced by its integer category code + 1
# No separate "is missing" indicator column is created: the earlier version assigned
# `pd.isnull(content)` to the very same column and then immediately overwrote it, so
# that store was dead code and has been removed. The resulting frame is unchanged.
for label, content in df_bpp.items():
    if pd.api.types.is_numeric_dtype(content):
        # Only rewrite numeric columns that actually have gaps.
        if content.isnull().any():
            df_bpp[label] = content.fillna(content.mean())
    else:
        # pandas encodes a missing category as -1, so the +1 shift maps "missing" to 0
        # and every real category to a positive integer.
        df_bpp[label] = pd.Categorical(content).codes + 1

In [None]:
# (number of rows, number of columns) after imputation and encoding.
df_bpp.shape

In [None]:
# Show the first 3 rows of the encoded frame, transposed so each column is displayed as a row.
df_bpp.head(3).T

# Categorisation of data into feature and label

In [None]:
# Separate the feature matrix (every column except the target) from the target column.
X = df_bpp.drop(columns=["SalePrice"])
Y = df_bpp["SalePrice"]
X.shape, Y.shape

In [None]:
# Random hold-out split: 75% of the rows to fit the model, 25% kept back to score it.
# The seed is fixed so the same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=0.25,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

#  Selecting Estimator ~ Random Forest Regression 
 Applying machine learning algorithms to find the patterns that predict the price of a bulldozer from its features.

In [None]:
# Baseline model: a random forest with default hyperparameters, using all CPU cores,
# verbose progress output and a fixed seed. `fit` returns the estimator itself.
model_rf = RandomForestRegressor(n_jobs=-1, verbose=2, random_state=42).fit(X_train, y_train)

# For a regressor, `score` is the coefficient of determination (R^2) on the given data;
# it is printed here as a percentage.
print("Random Forest Model Accuracy : {:.2f}%".format(model_rf.score(X_test, y_test) * 100))

In [None]:
# Predict sale prices for the held-out split with the baseline random forest (`model_rf`)
# and display the resulting array.
y_pred=model_rf.predict(X_test)
y_pred

# Hyperparameter Tuning
* The best way to think about hyperparameters is as the settings of an algorithm that can be adjusted to optimize performance, just as we might turn the knobs of a radio to get a clear signal.
* Random search is a technique where random combinations of the hyperparameters are used to find the best solution for the built model. It is similar to grid search, and yet it has proven to yield better results comparatively.

In [None]:
# Search space handed to RandomizedSearchCV for tuning the RandomForestRegressor.
# Each key is a RandomForestRegressor hyperparameter; each value lists the candidates
# the search may sample from.
random_grid = {"n_estimators": np.arange(10, 100, 10),
               "max_depth": [None, 3, 5, 10],
               "min_samples_split": np.arange(2, 20, 2),
               "min_samples_leaf": np.arange(1, 20, 2),
               # The int 1 means "one feature per split"; the float 1.0 means "all features".
               # 1.0 replaces the former "auto" alias, which meant "all features" for
               # regressors and is rejected by scikit-learn >= 1.3.
               "max_features": [0.5, 1, "sqrt", 1.0]}
# Display the grid of candidate values.
random_grid

In [None]:
# Randomised hyperparameter search: 2 sampled combinations (n_iter=2), each scored with
# 5-fold cross-validation (cv=5), run on all CPU cores.
# Both the forest and the parameter sampler are seeded so that a fresh
# "Restart & Run All" samples the same candidates and reports the same best_params_.
rscv_para = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=42),
                               param_distributions=random_grid,
                               n_iter=2, cv=5, verbose=True, n_jobs=-1,
                               random_state=42)
# Run the search on the training split.
rscv_para.fit(X_train, y_train)

In [None]:
# Display the hyperparameter combination that scored best in the randomised search.
rscv_para.best_params_

In [None]:
# Refit a forest with the hyperparameters actually selected by the randomised search.
# Taking them from `rscv_para.best_params_` (rather than hard-coding values copied from
# an earlier run) keeps this cell consistent with whatever the search just found.
model_rf_optimal = RandomForestRegressor(**rscv_para.best_params_,
                                         n_jobs=-1, verbose=2,
                                         random_state=42)
# Fit on the training split.
model_rf_optimal.fit(X_train, y_train)

# For a regressor, `score` is the coefficient of determination (R^2), shown as a percentage.
print(f"Optimsed RandomForestRegressor model Accuracy : {model_rf_optimal.score(X_test,y_test)*100:.2f}%")

In [None]:
# Predict sale prices for the held-out split with the tuned forest (`model_rf_optimal`)
# and display the resulting array.
y_Optimal_pred=model_rf_optimal.predict(X_test)
y_Optimal_pred

# Evaluating a machine learning model

## RandomForestRegressor

In [None]:
# Mean absolute error (MAE) of the baseline forest on the held-out split: the average
# absolute difference between actual and predicted sale prices.
# (Unweighted samples and uniform output averaging are the function's defaults.)
mean_absolute_error(y_test, y_pred)

In [None]:
# Mean squared log error (MSLE) of the baseline forest on the held-out split, together
# with its square root (RMSLE). RMSLE is the metric this notebook names as the
# competition's evaluation metric, so it is reported alongside the MSLE.
msle_rf = mean_squared_log_error(y_test, y_pred)
{"MSLE": msle_rf, "RMSLE": float(np.sqrt(msle_rf))}

## Optimised RandomForestRegressor

In [None]:
# Mean absolute error (MAE) of the tuned forest on the held-out split: the average
# absolute difference between actual and predicted sale prices.
# (Unweighted samples and uniform output averaging are the function's defaults.)
mean_absolute_error(y_test, y_Optimal_pred)

In [None]:
# Mean squared log error (MSLE) of the tuned forest on the held-out split, together
# with its square root (RMSLE). RMSLE is the metric this notebook names as the
# competition's evaluation metric, so it is reported alongside the MSLE.
msle_rf_optimal = mean_squared_log_error(y_test, y_Optimal_pred)
{"MSLE": msle_rf_optimal, "RMSLE": float(np.sqrt(msle_rf_optimal))}

# Predictions - SalesPrice

In [None]:
 # import test data and view row and column count
# Load the competition test file from the Kaggle input directory.
df_bpp_tst = pd.read_csv(
    "/kaggle/input/bluebook-for-bulldozers/Test.csv",
    parse_dates=["saledate"],  # ask pandas to parse the sale date while reading
    low_memory=False,
)
# (number of rows, number of columns)
df_bpp_tst.shape

In [None]:
# (number of rows, number of columns) of the raw test frame.
df_bpp_tst.shape

In [None]:
# Show the first 3 rows of the test frame, transposed so each column is displayed as a row.
df_bpp_tst.head(3).T

In [None]:
# Concise summary (columns, non-null counts, dtypes) of the bulldozer test frame.
df_bpp_tst.info()

In [None]:
# Re-type every string column of the test frame as an ordered pandas categorical
# (in place, column by column).
for column_name, values in df_bpp_tst.items():
    if not pd.api.types.is_string_dtype(values):
        continue  # leave non-string columns untouched
    df_bpp_tst[column_name] = values.astype("category").cat.as_ordered()

In [None]:
# Impute missing values and encode non-numeric columns of `df_bpp_tst`, in place.
#   * numeric columns that contain NaN  -> NaN replaced by the column MEAN
#   * every non-numeric column (the categoricals, and `saledate`)
#                                       -> replaced by its integer category code + 1
# No separate "is missing" indicator column is created: the earlier version assigned
# `pd.isnull(content)` to the very same column and then immediately overwrote it, so
# that store was dead code and has been removed. The resulting frame is unchanged.
#
# NOTE(review): the means and category codes here are computed from the test file alone,
# so a given code is not guaranteed to denote the same category as in the training
# frame - confirm whether the training-time categories/means should be reused instead.
for label, content in df_bpp_tst.items():
    if pd.api.types.is_numeric_dtype(content):
        # Only rewrite numeric columns that actually have gaps.
        if content.isnull().any():
            df_bpp_tst[label] = content.fillna(content.mean())
    else:
        # pandas encodes a missing category as -1, so the +1 shift maps "missing" to 0
        # and every real category to a positive integer.
        df_bpp_tst[label] = pd.Categorical(content).codes + 1

In [None]:
# Show the first 3 rows of the encoded test frame, transposed so each column is displayed as a row.
df_bpp_tst.head(3).T

In [None]:
# (number of rows, number of columns) of the test frame after imputation and encoding.
df_bpp_tst.shape

In [None]:
# Predict sale prices for the preprocessed test frame with the baseline forest (`model_rf`).
# NOTE(review): the tuned `model_rf_optimal` is not used here - confirm this is intended.
df_pred=model_rf.predict(df_bpp_tst)

In [None]:
# Assemble the submission table: one row per test sale, holding its SalesID and the
# predicted price. The price column is named "SalePrice" to match the name of the
# target column in the training data; it was previously written as "SalesPrice".
df_pred_sp = pd.DataFrame({
    "SalesID": df_bpp_tst["SalesID"],
    "SalePrice": df_pred,
})
df_pred_sp

In [None]:
# Write the predictions to the Kaggle working directory as CSV, without the row index.
df_pred_sp.to_csv("/kaggle/working/predict_salesprice.csv", index=False)

# Feature Engineering
* Feature importance refers to techniques that assign a score to input features based on how useful they are at predicting a target variable
* scikit learn random forest regressor feature importance for 'SalesPrice' of Bulldozer

In [None]:
# Display the baseline forest's feature importances: one value per feature the model was fitted on.
model_rf.feature_importances_

In [None]:
# Map each feature name to its importance in the baseline forest.
# The names must come from `X` (the frame the model was fitted on, i.e. without the
# `SalePrice` target). Zipping against `df_bpp.columns`, which still contains
# `SalePrice`, shifts every later label by one position and drops the last feature.
feature_dict = dict(zip(X.columns, model_rf.feature_importances_))
feature_dict

In [None]:
# Line plot of the importances: a one-row frame (index 0) is built from the mapping and
# transposed, so that the feature names run along the x-axis.
feature_df = pd.DataFrame([feature_dict])
feature_df.transpose().plot(kind="line", title="Price Prediction Feature Importance",
                            legend=False, color='blue');