In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Problem definition

> How well can we predict the future sale price of a bulldozer, given its characteristics and previous examples of how much similar bulldozers have been sold for?

## 2. Data

The data is downloaded from the Kaggle Bluebook for Bulldozers competition https://www.kaggle.com/c/bluebook-for-bulldozers/overview/evaluation:
* Train.csv is the training set, which contains data through the end of 2011.
* Valid.csv is the validation set, which contains data from January 1, 2012 - April 30, 2012. You make predictions on this set throughout the majority of the competition. Your score on this set is used to create the public leaderboard.
* Test.csv is the test set, which won't be released until the last week of the competition. It contains data from May 1, 2012 - November 2012. Your score on the test set determines your final rank for the competition.

## 3. Evaluation
The evaluation metric for this competition is the RMSLE (root mean squared log error) between the actual and predicted auction prices.

## 4. Data evaluation

In [None]:
# Load the training data; `saledate` is parsed into datetimes while reading
df = pd.read_csv(
    "../input/blue-book-for-bulldozer/Train/Train.csv",
    parse_dates=["saledate"],
    low_memory=False,
)

In [None]:
# Summarise the columns: dtypes and non-null counts
df.info()

In [None]:
# Count missing values per column
df.isna().sum()

Plot the sale price against the sale date for the first 1,000 rows.

In [None]:
# Scatter the first 1,000 rows: sale price against sale date.
# Title and axis labels let the figure stand on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.scatter(df["saledate"][:1000], df["SalePrice"][:1000])
ax.set(title="Sale price by sale date (first 1,000 rows)",
       xlabel="Sale date",
       ylabel="Sale price");

In [None]:
# Order the rows chronologically by sale date, then preview the earliest dates
df = df.sort_values(by="saledate", ascending=True)
df["saledate"].head(20)

In [None]:
# Work on an independent (deep) copy so the sorted original stays available as a fallback
df_tmp = df.copy(deep=True)

In [None]:
# Derive calendar features from the `saledate` column
sale_dt = df_tmp["saledate"].dt
for feature_name, dt_attr in (
    ("saleYear", "year"),
    ("saleMonth", "month"),
    ("saleDay", "day"),
    ("saleDayOfWeek", "dayofweek"),
    ("saleDayOfYear", "dayofyear"),
):
    df_tmp[feature_name] = getattr(sale_dt, dt_attr)

In [None]:
# The date parts now live in their own columns, so the raw `saledate` column can go
df_tmp = df_tmp.drop(columns="saledate")

In [None]:
# Transpose the first 20 rows so each column is shown as a row
df_tmp.head(20).T

## 5. Modelling

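The data has lots of null values and object (string) columns. Everything needs to be converted to numbers, and the missing values filled, before we can fit a model.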

In [None]:
# Convert every string column into an ordered pandas categorical
string_columns = [
    name for name, column in df_tmp.items()
    if pd.api.types.is_string_dtype(column)
]
for name in string_columns:
    df_tmp[name] = df_tmp[name].astype("category").cat.as_ordered()

In [None]:
# Fraction of missing values in each column
df_tmp.isna().sum().div(len(df_tmp))

In [None]:
# Impute numeric columns with their median and record where values were missing.
# NOTE(review): the median is taken over every row of df_tmp, including the rows
# that are later held out as the validation split.
numeric_columns = [
    name for name, column in df_tmp.items()
    if pd.api.types.is_numeric_dtype(column)
]
for name in numeric_columns:
    column = df_tmp[name]
    if not column.isna().any():
        continue
    # Binary flag column marking the rows that had no value
    df_tmp[name + "_is_missing"] = column.isna()
    # Replace the gaps with the column median
    df_tmp[name] = column.fillna(column.median())

In [None]:
# Encode every non-numeric column as integer codes and flag the missing entries
non_numeric_columns = [
    name for name, column in df_tmp.items()
    if not pd.api.types.is_numeric_dtype(column)
]
for name in non_numeric_columns:
    column = df_tmp[name]
    # Binary flag column marking the rows that had no value
    df_tmp[name + "_is_missing"] = column.isna()
    # Missing values get code -1, so adding 1 maps them to 0 and real categories to 1..k
    df_tmp[name] = pd.Categorical(column).codes + 1

In [None]:
# Re-count missing values per column after filling and encoding
df_tmp.isna().sum()

Now that all of the data is numeric and the DataFrame has no missing values, we should be able to build a machine learning model.

In [None]:
# Hold out the 2011 sales for validation; every other year is used for training
df_val = df_tmp.loc[df_tmp["saleYear"].eq(2011)]
df_train = df_tmp.loc[df_tmp["saleYear"].ne(2011)]

len(df_val), len(df_train)

In [None]:
# Separate the feature columns (X) from the SalePrice target (y) for each split
X_train = df_train.drop(columns="SalePrice")
y_train = df_train["SalePrice"]
X_valid = df_val.drop(columns="SalePrice")
y_valid = df_val["SalePrice"]

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

In [None]:
def rmsle(y_test, y_preds):
    """
    Return the root mean squared log error of `y_preds` against the true labels `y_test`.
    """
    squared_log_error = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(squared_log_error)

# Evaluate a fitted model with several metrics on both data splits
def show_scores(model):
    """
    Return a dict of MAE, RMSLE and R^2 for `model` on the training and validation splits.

    Predictions are made on the notebook-level X_train and X_valid frames and
    compared against y_train and y_valid.
    """
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_valid)

    scores = {}
    scores["training MAE"] = mean_absolute_error(y_train, train_preds)
    scores["Valid MAE"] = mean_absolute_error(y_valid, val_preds)
    scores["Training RMSLE"] = rmsle(y_train, train_preds)
    scores["Valid RMSLE"] = rmsle(y_valid, val_preds)
    scores["Training R^2"] = r2_score(y_train, train_preds)
    scores["Valid R^2"] = r2_score(y_valid, val_preds)
    return scores

In [None]:
# Baseline forest: every tree is trained on at most 10,000 sampled rows.
# random_state is fixed (same seed as the tuned estimator below) so scores are repeatable.
model = RandomForestRegressor(n_jobs=-1, max_samples=10000, random_state=42)

In [None]:
%%time
# Capping the number of samples each estimator can see (max_samples, set when the
# model was created) cuts down the compute time of this fit
model.fit(X_train,y_train)

In [None]:
# MAE, RMSLE and R^2 of the baseline model on the training and validation splits
show_scores(model)

### Hyperparameters tuning with RandomizedSearchCV

In [None]:
%%time

# Different RandomForestRegressor hyperparameters to sample from
rf_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           # The int 1 means a single feature per split; the float 1.0 means all
           # features. 1.0 replaces "auto", which meant the same thing for
           # regressors but is rejected by recent scikit-learn releases.
           "max_features": [0.5, 1, "sqrt", 1.0],
           "max_samples": [10000]}

# Instantiate RandomizedSearchCV model; seeding the search itself makes the
# sampled candidate combinations repeatable between runs
rs_model = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1, random_state=42),
                              param_distributions=rf_grid,
                              n_iter=2,
                              cv=5,
                              verbose=True,
                              random_state=42)

# Fit the randomized search cv model
rs_model.fit(X_train, y_train)

In [None]:
# Hyperparameter combination selected by the randomized search
rs_model.best_params_

In [None]:
# MAE, RMSLE and R^2 of the fitted search object on the training and validation splits
show_scores(rs_model)

## Train a model with the best hyperparameters

In [None]:
%%time

# Rebuild a forest from the best hyperparameters found by the search.
# n_jobs and random_state are not part of rf_grid, so they are passed explicitly
# to match the estimator that was searched (parallel training, fixed seed).
ideal_model = RandomForestRegressor(n_jobs=-1,
                                    random_state=42,
                                    **rs_model.best_params_)
# Fit the ideal model
ideal_model.fit(X_train, y_train)

In [None]:
# MAE, RMSLE and R^2 of the tuned model on the training and validation splits
show_scores(ideal_model)

## Make predictions on test data

In [None]:
# Load the test data; `saledate` is parsed into datetimes while reading
df_test = pd.read_csv(
    "../input/blue-book-for-bulldozer/Test.csv",
    parse_dates=["saledate"],
    low_memory=False,
)

The test set needs the same preprocessing as the training data before the model can make predictions on it.

In [None]:
def preprocess_data(df):
    """
    Performs transformations on a copy of df and returns the transformed copy.

    The caller's DataFrame is left untouched, so the function can be re-run on
    the same raw frame without errors.

    Steps:
      1. Expand `saledate` into saleYear, saleMonth, saleDay, saleDayOfWeek and
         saleDayOfYear, then drop `saledate`.
      2. For every numeric column containing missing values, add a
         `<column>_is_missing` flag and fill the gaps with the column median.
      3. For every non-numeric column, add a `<column>_is_missing` flag and
         replace the values with integer category codes + 1 (0 = missing).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with a datetime `saledate` column.

    Returns
    -------
    pandas.DataFrame
        A new, fully numeric frame.
    """
    # Work on a copy so the input frame is not mutated
    df = df.copy()

    sale_dt = df["saledate"].dt
    df["saleYear"] = sale_dt.year
    df["saleMonth"] = sale_dt.month
    df["saleDay"] = sale_dt.day
    df["saleDayOfWeek"] = sale_dt.dayofweek
    df["saleDayOfYear"] = sale_dt.dayofyear

    df = df.drop(columns="saledate")

    # Fill numeric rows with the median
    for label, content in df.items():
        if pd.api.types.is_numeric_dtype(content):
            if content.isna().any():
                # Add a binary column which tells us if the data was missing
                df[label + "_is_missing"] = content.isna()
                # Fill missing numeric values with median
                df[label] = content.fillna(content.median())

    # Turn categorical variables into numbers and flag missing values
    for label, content in df.items():
        if not pd.api.types.is_numeric_dtype(content):
            # Add binary column to indicate whether the sample had missing data
            df[label + "_is_missing"] = content.isna()
            # NOTE(review): the codes come from the values present in this frame
            # only, so a given string can map to a different integer than in
            # another frame that was encoded separately.
            # Missing values have code -1, so +1 maps them to 0.
            df[label] = pd.Categorical(content).codes + 1

    return df

In [None]:
# Run the test frame through preprocess_data and preview the first rows
df_test = preprocess_data(df_test)
df_test.head()

In [None]:
# Give the test frame exactly the training feature columns, in the same order.
# Columns the test frame lacks (e.g. auctioneerID_is_missing) are created and
# filled with False; scikit-learn checks feature names and their order at predict time.
df_test = df_test.reindex(columns=X_train.columns, fill_value=False)

In [None]:
# Predict sale prices for the preprocessed test rows
# NOTE(review): predict expects the same feature columns the model was fitted on —
# confirm df_test lines up with X_train
test_preds = ideal_model.predict(df_test)

In [None]:
# Format predictions into the layout Kaggle is after: SalesID, SalePrice
# (the price column is named "SalePrice", matching the target column in the data)
df_preds = pd.DataFrame({"SalesID": df_test["SalesID"],
                         "SalePrice": test_preds})
df_preds

## Feature importance

Feature importance seeks to figure out which attributes were most important when it comes to predicting the target variable (SalePrice).

In [None]:
# Count how many feature importance values the fitted best model exposes
len(ideal_model.feature_importances_)

In [None]:
# Helper function for plotting feature importance
def plot_features(columns, importances, n=20):
    """
    Plot the `n` largest feature importances as a horizontal bar chart.

    Parameters
    ----------
    columns : sequence of str
        Feature names, aligned with `importances`.
    importances : sequence of float
        Importance value for each feature.
    n : int, default 20
        Number of top-ranked features to show.
    """
    ranked = (pd.DataFrame({"features": columns,
                            "feature_importances": importances})
              .sort_values("feature_importances", ascending=False)
              .reset_index(drop=True))
    # Slice labels and widths with the same `n` so the bars always line up
    top = ranked.iloc[:n]
    fig, ax = plt.subplots()
    ax.barh(top["features"], top["feature_importances"])
    ax.set_ylabel("Features")
    ax.set_xlabel("Feature importance")
    # Largest importance at the top of the chart
    ax.invert_yaxis()

In [None]:
# Plot the most important features of the tuned model (top 20 by default)
plot_features(X_train.columns, ideal_model.feature_importances_)

In [None]:
# Preview the first rows of the training features
X_train.head()