In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Predicting the Sale Price of Bulldozers using Machine Learning
In this notebook we are going to go through an example machine learning project with the goal of predicting the sale price of bulldozers.
## 1. Problem definition
The goal of the contest is to predict the sale price of a particular piece of heavy equipment at auction based on its usage, equipment type, and configuration.  The data is sourced from auction result postings and includes information on usage and equipment configurations.

Fast Iron is creating a "blue book for bull dozers," for customers to value what their heavy equipment fleet is worth at auction.
## 2. Data

The data for this competition is split into three parts:

* Train.csv is the training set, which contains data through the end of 2011.
* Valid.csv is the validation set, which contains data from January 1, 2012 - April 30, 2012. You make predictions on this set throughout the majority of the competition. Your score on this set is used to create the public leaderboard.
* Test.csv is the test set, which won't be released until the last week of the competition. It contains data from May 1, 2012 - November 2012. Your score on the test set determines your final rank for the competition.

## 3. Evaluation
The evaluation metric for this competition is the RMSLE (root mean squared log error) between the actual and predicted auction prices.

## 4. Features
Kaggle provides a data dictionary detailing all of the features of the dataset. You can view this data dictionary on Google Sheets or look at "Data Dictionary.xlsx".

In [None]:
# Import data analysis tools 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

Now we've got our tools for data analysis ready, we can import the data and start to explore it.

For this project, we've downloaded the data from Kaggle

In [None]:
# import training and validation sets 
df=pd.read_csv("../input/bluebook-for-bulldozers/TrainAndValid.csv",
              low_memory=False)
df.head()

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
fig,ax= plt.subplots()
ax.scatter(df['saledate'][:1000],df['SalePrice'][:1000])

In [None]:
df['SalePrice'].plot.hist(bins=20)

In [None]:
df.saledate[:100]

In [None]:
df.saledate.dtype

### Parsing dates
When working with time series data, it's a good idea to make sure any date data is in the format of a datetime object (a Python data type which encodes specific information about dates).

In [None]:
df=pd.read_csv("../input/bluebook-for-bulldozers/TrainAndValid.csv",
              low_memory=False,
              parse_dates=['saledate'])

In [None]:
# With parse_dates... check dtype of "saledate"
df.info()

In [None]:
df.saledate.dtype

In [None]:
# '<M8[ns]' is NumPy's shorthand spelling of 'datetime64[ns]'. Compare the dtype
# objects themselves: calling type() on the two string literals only compares
# str with str, which is always True and says nothing about the dtypes.
np.dtype('<M8[ns]')==np.dtype('datetime64[ns]')

In [None]:
fig,ax=plt.subplots(figsize=[16,6])
ax.scatter(df['saledate'][:1000],df['SalePrice'][:1000])

In [None]:
df.head()

In [None]:
df.head().T

In [None]:
df.saledate[:10]

#### Sort DataFrame by saledate
As we're working on a time series problem and trying to predict future examples given past examples, it makes sense to sort our data by date.

In [None]:
#Sort Dataframe in date order

df.sort_values(by=["saledate"], inplace=True, ascending=True)

In [None]:
df.head()

In [None]:
df.saledate.head(12)

#### Make a copy of the original DataFrame

Since we're going to be manipulating the data, we'll make a copy of the original DataFrame and perform our changes there.

This will keep the original DataFrame intact if we need it again.

In [None]:
#Make a copy of the original Dataframe to preform edits on.
df_tmp=df.copy()

In [None]:
df_tmp

### Add datetime parameters for saledate column
Why?

So we can enrich our dataset with as much information as possible.

Because we imported the data using read_csv() and we asked pandas to parse the dates using parse_dates=["saledate"], we can now access the different datetime attributes of the saledate column.

In [None]:
# Derive calendar features from the parsed saledate column
sale_dt=df_tmp['saledate'].dt
for new_column, dt_attribute in [("saleYear", "year"),
                                 ("saleMonth", "month"),
                                 ("saleDay", "day"),
                                 ("saleDayOfWeek", "dayofweek"),
                                 ("saleDayOfYear", "dayofyear")]:
    # Each pair maps the new column name to the .dt accessor attribute it is read from
    df_tmp[new_column]=getattr(sale_dt, dt_attribute)

In [None]:
df_tmp.drop('saledate',axis=1,inplace=True)

In [None]:
df_tmp.head().T

In [None]:
# Check the different values of different columns
df_tmp.state.value_counts()

In [None]:
# Check for missing categories and different datatypes
df_tmp.info()

In [None]:
# Check for missing values
df_tmp.isna().sum()

### Convert strings to categories
One way to help turn all of our data into numbers is to convert the columns with the string datatype into a category datatype.

To do this we can use the pandas types API which allows us to interact and manipulate the types of data.

In [None]:
pd.api.types.is_string_dtype(df_tmp["UsageBand"])

In [None]:
# These columns contain strings
for label, content in df_tmp.items():
    if pd.api.types.is_string_dtype(content):
        print(label)

In [None]:
# If you're wondering what df.items() does, let's use a dictionary as an example
random_dict={'key1':'val1',
            'key2':'vam222'}
for key,value in random_dict.items():
    print('this is the key' ,key,' and this is it\'s content ',value)

In [None]:
# This will turn all of the string values into category values
for label , content in df_tmp.items():
    if pd.api.types.is_string_dtype(content):
        df_tmp[label]=content.astype('category').cat.as_ordered()

In [None]:
df_tmp.info()

In [None]:
df_tmp.head().T

In [None]:
df_tmp.state.cat.codes

In [None]:
states_code=df_tmp.state.cat.categories
dict(enumerate(states_code))

All of our string columns are now categorical, and thus we can turn the categories into numbers; however, the data is still missing values...

In [None]:
#the percentage of null value in the dataframe 
df_tmp.isnull().sum()/len(df_tmp)

In the format it's in, it's still good to be worked with, let's save it to file and reimport it so we can continue on.

## Save Processed Data


In [None]:
# Save preprocessed data
df_tmp.to_csv('train_tmp.csv',
             index=False)

In [None]:
# Import preprocessed data
df_tmp=pd.read_csv('train_tmp.csv',
                  low_memory=False)


In [None]:
df_tmp.head().T

Excellent, our processed DataFrame has the columns we added to it but it's still missing values.



In [None]:
df_tmp.isnull().sum()

## Fill missing values

From our experience with machine learning models, we know two things:

1. All of our data has to be numerical
2. There can't be any missing values

And as we've seen using df_tmp.isna().sum() our data still has plenty of missing values.

Let's fill them.

Filling numerical values first
We're going to fill any column with missing values with the median of that column.

In [None]:
for label , content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        print(label)

In [None]:
# Check for which numeric columns have null values
for label ,content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            print(label)
        

In [None]:
# Impute numeric columns that contain gaps, keeping a flag of where the gaps were
for column_name in list(df_tmp.columns):
    series=df_tmp[column_name]
    if not pd.api.types.is_numeric_dtype(series):
        continue
    gaps=series.isna()
    if not gaps.any():
        continue
    # Record which rows were originally empty before overwriting them
    df_tmp[column_name+"_is_missing"]=gaps
    # The median is less sensitive to outliers than the mean
    df_tmp[column_name]=series.fillna(series.median())

Why add a binary column indicating whether the data was missing or not?

We can easily fill all of the missing numeric values in our dataset with the median. However, a numeric value may be missing for a reason. In other words, absence of evidence may be evidence of absence. Adding a binary column which indicates whether the value was missing or not helps to retain this information.

In [None]:
# Check if there's any null values
for label ,content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            print(label)

In [None]:
# Check to see how many examples were missing
df_tmp.auctioneerID_is_missing.value_counts()

### Filling and turning categorical variables to numbers
Now we've filled the numeric values, we'll do the same with the categorical values at the same time as turning them into numbers.

In [None]:
# Check columns which *aren't* numeric
i=1
for label , content in df_tmp.items():
    if not pd.api.types.is_numeric_dtype(content):
        print(i, label)
        i+=1

In [None]:
i=1
for label ,content in df_tmp.items():
    if not pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            print(i, label)
            i+=1

In [None]:
# Encode every non-numeric column as integer category codes
for column_name in list(df_tmp.columns):
    series=df_tmp[column_name]
    if pd.api.types.is_numeric_dtype(series):
        continue
    # Keep a flag marking the rows that had no value
    df_tmp[column_name+"_is_missing"]=series.isna()
    # pandas gives missing values the code -1, so shifting by one maps missing to 0
    df_tmp[column_name]=pd.Categorical(series).codes+1

In [None]:
pd.Categorical(df_tmp.state).codes

In [None]:
df_tmp.info()

In [None]:
df_tmp.isna().sum()

In [None]:
df_tmp.head().T

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
%%time
# Instantiate a random forest regressor (n_jobs=-1 asks scikit-learn to use all available CPU cores)
model=RandomForestRegressor(n_jobs=-1)
# NOTE: nothing is fitted in this cell. A model is re-created and fitted further down,
# after the data has been split into training and validation sets.



According to the Kaggle data page, the validation set and test set are split according to dates.

This makes sense since we're working on a time series problem.

E.g. using past events to try and predict future events.

Knowing this, randomly splitting our data into train and test sets using something like train_test_split() wouldn't work.

Instead, we split our data into training, validation and test sets using the date each sample occurred.

In our case:

* Training = all samples up until 2011
* Valid = all samples from January 1, 2012 - April 30, 2012
* Test = all samples from May 1, 2012 - November 2012

For more on making good training, validation and test sets, check out the post "How (and why) to create a good validation set" by Rachel Thomas.

In [None]:
df_tmp.saleYear.value_counts()

In [None]:
%%time
# Split data into training and validation
df_val = df_tmp[df_tmp['saleYear']==2012]
df_train =df_tmp[df_tmp['saleYear'] != 2012]
len(df_val), len(df_train)

In [None]:
#  Split data into X & y
X_train , y_train= df_train.drop('SalePrice',axis=1),df_train.SalePrice
X_valid , y_valid = df_val.drop('SalePrice' , axis=1), df_val.SalePrice

X_train.shape , y_train.shape , X_valid.shape , y_valid.shape


### Building an evaluation function
According to Kaggle for the Bluebook for Bulldozers competition, the evaluation function they use is root mean squared log error (RMSLE).

RMSLE = generally you don't care as much if you're off by $10 as much as you'd care if you were off by 10%, you care more about ratios rather than differences. MAE (mean absolute error) is more about exact differences.

It's important to understand the evaluation metric you're going for.

Since Scikit-Learn doesn't have a function built-in for RMSLE, we'll create our own.

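We can do this by taking the square root of Scikit-Learn's mean_squared_log_error (MSLE). MSLE is the mean of the squared differences between log(1 + true value) and log(1 + prediction), i.e. mean squared error computed on log-transformed values (it is not the log of the MSE).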

We'll also calculate the MAE and R^2 for fun.

In [None]:
y_train

In [None]:
# Create evaluation function (the competition uses RMSLE)
from sklearn.metrics import mean_absolute_error , mean_squared_log_error ,r2_score

def rmsle(y_test,y_pred):
    """
    Root mean squared log error (RMSLE) of predictions against true labels.

    Computed as the square root of scikit-learn's mean_squared_log_error.
    """
    msle=mean_squared_log_error(y_test,y_pred)
    return np.sqrt(msle)

# Create function to evaluate model on a few different levels
def show_scores(model):
    """
    Score a fitted model on the notebook's training and validation splits.

    Reads the notebook-level X_train, y_train, X_valid and y_valid, predicts
    on both feature sets, and returns a dict holding MAE, RMSLE and R^2 for
    each split.
    """
    train_preds=model.predict(X_train)
    val_preds=model.predict(X_valid)
    # (label, metric function, true values, predictions) in the order they are reported
    report=(("Training MAE",mean_absolute_error,y_train,train_preds),
            ("Valid MAE",mean_absolute_error,y_valid,val_preds),
            ("Training RMSLE",rmsle,y_train,train_preds),
            ("Valid RMSLE",rmsle,y_valid,val_preds),
            ("Training R^2",r2_score,y_train,train_preds),
            ("Valid R^2",r2_score,y_valid,val_preds))
    return {label:metric(y_true,y_hat) for label,metric,y_true,y_hat in report}

### Testing our model on a subset (to tune the hyperparameters)

In [None]:
len(X_train)

In [None]:
X_train.info()

In [None]:
# Change max_samples value
model=RandomForestRegressor(n_jobs=-1,
                           random_state=42,
                           max_samples=10000)

In [None]:
%%time
# Cutting down on the max number of samples each estimator can see improves training time
model.fit(X_train,y_train)
### Wall time: 18.2 s

In [None]:
y_pi=model.predict(X_valid)

In [None]:
rmsle(y_pi,y_valid)

In [None]:
%%time
show_scores(model)

## Hyperparameter tuning with RandomizedSearchCV

In [None]:
# Find the best model hyperparameter
rs_mode.best_params_

In [None]:
%%time
from sklearn.model_selection import RandomizedSearchCV
# Different RandomForestRegressor hyperparameters
rf_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           "max_features": [0.5, 1, "sqrt", "auto"],
           "max_samples": [10000]}

# Instantiate RandomizedSearchCV model
rs_mode=RandomizedSearchCV(RandomForestRegressor(n_jobs=-1,
                                                 n_estimators=2),
                           param_distributions=rf_grid,
                           n_iter=2,
                           cv=5,
                           verbose=True)
# Fit the RandomizedSearchCV model
rs_mode.fit(X_train,y_train)

In [None]:
show_scores(rs_mode)


### Train a model with the best parameters
In a model I prepared earlier, I tried 100 different combinations of hyperparameters (setting n_iter to 100 in RandomizedSearchCV) and found the best results came from the ones you see below.

Note: This kind of search on my computer (n_iter = 100) took ~2-hours. So it's kind of a set and come back later experiment.

We'll instantiate a new model with these discovered hyperparameters and reset the max_samples back to its original value.

In [None]:
%%time
# Most ideal hyperparameters
# random_state is fixed (as for the earlier subset model) so the scores, feature
# importances and test predictions are reproducible between runs.
ideal_model=RandomForestRegressor(n_estimators=90,
                                    min_samples_leaf=1,
                                    min_samples_split=14,
                                    max_features=0.5,
                                    n_jobs=-1,
                                    max_samples=None,
                                    random_state=42)
ideal_model.fit(X_train,y_train)

In [None]:
show_scores(ideal_model)

With these new hyperparameters as well as using all the samples, we can see an improvement to our model's performance.

You can make a faster model by altering some of the hyperparameters. Particularly by lowering n_estimators since each increase in n_estimators is basically building another small model.

However, lowering of n_estimators or altering of other hyperparameters may lead to poorer results.

### Make predictions on test data


In [None]:
# Import the test data
df_test = pd.read_csv("../input/bluebook-for-bulldozers/Test.csv",
                      low_memory=False,
                      parse_dates=["saledate"])

df_test.head()

### Preprocessing the data (getting the test dataset in the same format as our training dataset)

In [None]:
def preprocess_data(df):
    """
    Return a transformed copy of ``df`` that is ready to be fed to the model.

    Steps:
      1. Expand ``saledate`` into saleYear, saleMonth, saleDay, saleDayOfWeek
         and saleDayOfYear, then drop ``saledate``.
      2. For each numeric column that contains missing values, add a boolean
         ``<column>_is_missing`` flag and fill the gaps with the column median.
      3. For each non-numeric column, add a boolean ``<column>_is_missing``
         flag and replace the values with integer category codes, where 0
         stands for a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with a datetime-typed ``saledate`` column.

    Returns
    -------
    pandas.DataFrame
        The transformed frame. The input frame is not modified, so the
        function can safely be called again on the same raw data.

    Notes
    -----
    Medians and category codes are computed from ``df`` alone.
    NOTE(review): the same string may therefore receive a different code here
    than it did in the training data -- confirm this is acceptable.
    """
    # Work on a copy so the caller's frame keeps its saledate column
    df = df.copy()

    sale_dt = df["saledate"].dt
    df["saleYear"] = sale_dt.year
    df["saleMonth"] = sale_dt.month
    df["saleDay"] = sale_dt.day
    df["saleDayOfWeek"] = sale_dt.dayofweek
    df["saleDayOfYear"] = sale_dt.dayofyear

    df = df.drop("saledate", axis=1)

    # Iterate over a snapshot of the column names: the loop adds new
    # "_is_missing" columns, which must not themselves be processed.
    for label in list(df.columns):
        content = df[label]
        if pd.api.types.is_numeric_dtype(content):
            missing = content.isna()
            if missing.any():
                # Add a binary column which tells us if the data was missing or not
                df[label+"_is_missing"] = missing
                # Fill missing numeric values with median
                df[label] = content.fillna(content.median())
        else:
            # Flag missing categorical values and turn categories into numbers
            df[label+"_is_missing"] = content.isna()
            # We add +1 to the category code because pandas encodes missing categories as -1
            df[label] = pd.Categorical(content).codes+1

    return df

In [None]:
# Process the test data 
df_test = preprocess_data(df_test)
df_test.head()

In [None]:
X_train.head()

In [None]:
# We can find how the columns differ using sets
set(X_train.columns) - set(df_test.columns)

In [None]:
# Manually adjust df_test to have auctioneerID_is_missing column
df_test["auctioneerID_is_missing"] = False
# scikit-learn checks at predict time that the feature names match those seen during
# fit, in the same order, so arrange the test columns in the training column order.
df_test = df_test[X_train.columns]
df_test.head()


Finally now our test dataframe has the same features as our training dataframe, we can make predictions!

In [None]:
# Make predictions on the test data
test_preds = ideal_model.predict(df_test)

In [None]:
test_preds

We've made some predictions but they're not in the same format Kaggle is asking for:

In [None]:
# Format predictions into the same format Kaggle is after
# The price column is named "SalePrice", matching the target column of the
# training data (it was previously misspelled "SalesPrice").
df_preds = pd.DataFrame({"SalesID": df_test["SalesID"],
                         "SalePrice": test_preds})
df_preds

In [None]:
# Export prediction data
df_preds.to_csv("test_predictions.csv", index=False)

In [None]:
submission=df_preds
submission.to_csv("submission.csv", index = False)

### Feature Importance
Feature importance seeks to figure out which different attributes of the data were most important when it comes to predicting the target variable (SalePrice).

In [None]:
# Find feature importance of our best model
ideal_model.feature_importances_

In [None]:
import seaborn as sns

# Helper function for plotting feature importance
def plot_features(columns, importances, n=20):
    """
    Draw a horizontal bar chart of the ``n`` largest feature importances.

    ``columns`` and ``importances`` are paired element-wise into a table,
    which is sorted by importance in descending order before the first ``n``
    rows are plotted with seaborn.
    """
    ranked = pd.DataFrame({"features": columns,
                           "feature_importance": importances})
    ranked = ranked.sort_values("feature_importance", ascending=False)
    top_n = ranked.reset_index(drop=True)[:n]

    sns.barplot(data=top_n,
                x="feature_importance",
                y="features",
                orient="h")

In [None]:
plot_features(X_train.columns,ideal_model.feature_importances_)

In [None]:
sum(ideal_model.feature_importances_)

In [None]:
df.ProductSize.isna().sum()

In [None]:
df.ProductSize.value_counts()