In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [None]:
df = pd.read_csv("../input/bluebook-for-bulldozers/TrainAndValid.csv",low_memory=False)

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
fig,ax = plt.subplots()
ax.scatter(df["saledate"][:1000],df["SalePrice"][:1000]);

In [None]:
df.saledate[:1000]

In [None]:
df.saledate.dtype

In [None]:
df.SalePrice.plot.hist()

## Parsing dates

When we work with time series data, we want to enrich the time & date component as much as possible.

We can do that by telling pandas which of our columns has dates in it using the parse_dates parameter.

In [None]:
# Import data again but this time parse dates

df=pd.read_csv("../input/bluebook-for-bulldozers/TrainAndValid.csv",low_memory=False,parse_dates=["saledate"])

In [None]:
df["saledate"][:1000]

In [None]:
fig , ax = plt.subplots()
ax.scatter(df["saledate"][:1000],df["SalePrice"][:1000]);

In [None]:
df.head()

In [None]:
df.head().T

In [None]:
df.saledate.head(20)

## Sort DataFrame by saledate
When working with time series data, it's a good idea to sort it by date.

In [None]:
df.sort_values(by=["saledate"],ascending=True,inplace=True)

In [None]:
df.saledate.head(10)

In [None]:
# Make a copy of the original DataFrame to perform edits on
df_tmp = df.copy()

## Add datetime parameters for saledate column

In [None]:
# Break saledate into plain numeric calendar features
sale_dt = df_tmp["saledate"].dt
for new_column, dt_attribute in [("saleYear", "year"),
                                 ("salemonth", "month"),
                                 ("saleDay", "day"),
                                 ("saleDayofweek", "dayofweek"),
                                 ("saleDayofYear", "dayofyear")]:
    df_tmp[new_column] = getattr(sale_dt, dt_attribute)

# The raw timestamp column is removed once its parts have been extracted
df_tmp.drop(columns=["saledate"], inplace=True)

In [None]:
df_tmp.head().T

In [None]:
# Check the different values of different columns
df_tmp.state.value_counts()

In [None]:
# Check for missing categories and different datatypes
df_tmp.info()

In [None]:
# Check for missing values
df_tmp.isna().sum()

## Convert strings to categories

In [None]:
pd.api.types.is_string_dtype(df_tmp["UsageBand"])

In [None]:
# These columns contain strings.
# Object dtype is tested explicitly because, from pandas 2.0 on, is_string_dtype()
# is False for an object column holding anything besides str (NaN included),
# which would hide every text column that has missing values.
for label,content in df_tmp.items():
    if pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
        print(label)

In [None]:
# This will turn all of the string values into ordered category values.
# Object dtype is tested explicitly because, from pandas 2.0 on, is_string_dtype()
# is False for an object column holding anything besides str (NaN included),
# which would silently skip every text column that has missing values.
for label,content in df_tmp.items():
    if pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
        df_tmp[label] = content.astype("category").cat.as_ordered()

In [None]:
df_tmp.info()

All of our string columns are now stored as categories, so we can turn them into numbers. However, the data still has missing values...

## Fill missing values

### Filling numerical values first

In [None]:
for label,content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        print(label)

In [None]:
# Check for which numeric columns have null values
for label,content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        if df_tmp[label].isna().sum():
            print(label)

In [None]:
# Impute numeric columns: flag the gaps first, then fill them with the column median
numeric_labels = [name for name, values in df_tmp.items()
                  if pd.api.types.is_numeric_dtype(values)]
for name in numeric_labels:
    values = df_tmp[name]
    was_missing = values.isna()
    if not was_missing.any():
        continue
    # Binary column recording whether the value was missing or not
    df_tmp[name+"missing"] = was_missing
    # The median (taken over every row of df_tmp) is more robust to outliers than the mean
    df_tmp[name] = values.fillna(values.median())

We can easily fill all of the missing numeric values in our dataset with the median. However, a numeric value may be missing for a reason. In other words, absence of evidence may be evidence of absence. Adding a binary column which indicates whether the value was missing or not helps to retain this information

In [None]:
# Check if there's any null values
for label,content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        if df_tmp[label].isna().sum():
            print(label)

### Filling and turning categorical variables to numbers

In [None]:
# Replace every non-numeric column with integer category codes
non_numeric_labels = [name for name, values in df_tmp.items()
                      if not pd.api.types.is_numeric_dtype(values)]
for name in non_numeric_labels:
    values = df_tmp[name]
    # Keep a record of which rows had no value before encoding
    df_tmp[name+"missing"] = values.isna()
    # pandas encodes a missing category as -1, so shifting by one maps missing to 0
    df_tmp[name] = pd.Categorical(values).codes + 1

In [None]:
df_tmp.info()

In [None]:
df_tmp.isna().sum()

In [None]:
df_tmp.head().T

In [None]:
%%time

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_jobs=-1)

model.fit(df_tmp.drop("SalePrice",axis=1),df_tmp["SalePrice"])

In [None]:
# Score the model
model.score(df_tmp.drop("SalePrice",axis=1),df_tmp.SalePrice)

### This metric is not reliable: the model is scored on the same data it was trained on (there is no train/validation split yet)

## Splitting data into train/valid sets

* Training = all samples up to the end of 2011
* Valid = all samples from January 1, 2012 - April 30, 2012
* Test = all samples from May 1, 2012 - November 2012

In [None]:
df_tmp.head()

In [None]:
# Hold out every sale made in 2012 for validation; all other rows are training data
is_2012_sale = df_tmp["saleYear"] == 2012
df_val = df_tmp[is_2012_sale]
df_train = df_tmp[~is_2012_sale]

len(df_val), len(df_train)

In [None]:
# Split data into X & y
X_train,Y_train = df_train.drop("SalePrice",axis=1),df_train.SalePrice
X_valid,Y_valid = df_val.drop("SalePrice",axis=1),df_val.SalePrice


## Building an evaluation function
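RMSLE is the square root of the mean squared log error (MSLE). MSLE is the mean squared error computed on the logarithms of the values, $\frac{1}{n}\sum_{i}\left(\log(1+y_i)-\log(1+\hat{y}_i)\right)^2$; it is not the log of the MSE.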

In [None]:
# Create evaluation function (the competition uses Root Mean Square Log Error)
from sklearn.metrics import mean_squared_log_error , mean_absolute_error

def rmsle(y_test,y_preds):
    """Return the root mean squared log error of y_preds against y_test.

    Computed as the square root of mean_squared_log_error(y_test, y_preds).
    """
    msle = mean_squared_log_error(y_test,y_preds)
    return np.sqrt(msle)

# Create function to evaluate our model
def show_scores(model):
    """Score a fitted model on the notebook's training and validation splits.

    Reads the notebook-level X_train / Y_train and X_valid / Y_valid.

    Returns a dict holding, for each split, the mean absolute error, the
    RMSLE, and the value of model.score() (stored under the "R^2" keys).
    """
    fitted_train = model.predict(X_train)
    fitted_valid = model.predict(X_valid)

    mae_scores = {"Training MAE": mean_absolute_error(Y_train, fitted_train),
                  "Valid MAE": mean_absolute_error(Y_valid, fitted_valid)}
    rmsle_scores = {"Training RMSLE": rmsle(Y_train, fitted_train),
                    "Valid RMSLE": rmsle(Y_valid, fitted_valid)}
    r2_scores = {"Training R^2": model.score(X_train, Y_train),
                 "Valid R^2": model.score(X_valid, Y_valid)}

    # Merge in the same key order as before: MAE, RMSLE, then R^2
    return {**mae_scores, **rmsle_scores, **r2_scores}

## Testing our model on a subset (to tune the hyperparameters)

In [None]:
# Change max samples in RandomForestRegressor
model = RandomForestRegressor(n_jobs=-1,max_samples=10000)

# Setting max_samples to 10000 means each of the trees (n_estimators, default 100) in our RandomForestRegressor is trained on only 10000 random samples from our DataFrame instead of the entire 400,000.

In [None]:
%%time
# Cutting down the max number of samples each tree can see improves training time
model.fit(X_train,Y_train)

In [None]:
show_scores(model)

## Hyperparameter tuning with RandomizedSearchCV

In [None]:
%%time
from sklearn.model_selection import RandomizedSearchCV

# Different RandomForestRegressor hyperparameters to sample from
rf_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           # The int 1 means "one feature per split"; the float 1.0 means "all features".
           # 1.0 replaces "auto", which was removed in scikit-learn 1.3 and meant
           # "all features" for a regressor.
           "max_features": [0.5, 1, "sqrt", 1.0],
           "max_samples": [10000]}

# random_state fixes which 20 candidates are drawn so the search is repeatable
rs_model = RandomizedSearchCV(RandomForestRegressor(),
                             param_distributions=rf_grid,
                             n_iter=20,
                             cv=5,
                             random_state=42)
rs_model.fit(X_train,Y_train)

In [None]:
rs_model.best_params_

In [None]:
show_scores(rs_model)

In a model I prepared earlier, I tried 100 different combinations of hyperparameters (setting n_iter to 100 in RandomizedSearchCV) and found the best results came from the ones you see below.

Note: This kind of search on my computer (n_iter = 100) took ~2-hours. So it's kind of a set and come back later experiment.

We'll instantiate a new model with these discovered hyperparameters and reset the max_samples back to its original value.

In [None]:
%%time
# Most ideal hyperparameters
ideal_model = RandomForestRegressor(n_estimators=90,
                                   min_samples_leaf=1,
                                   min_samples_split=14,
                                   max_features=0.5,
                                   n_jobs=-1,
                                   max_samples=None)
ideal_model.fit(X_train,Y_train)

In [None]:
show_scores(ideal_model)

## Make predictions on test data

In [None]:
df_test = pd.read_csv("../input/bluebook-for-bulldozers/Test.csv",parse_dates=["saledate"])
df_test.head()

### Preprocessing the data

In [None]:
def preprocess_data(df):
    """Turn a raw sales frame into an all-numeric feature frame.

    Works on a copy, so the frame passed in is left untouched.

    Steps:
      1. Split ``saledate`` into saleYear / salemonth / saleDay /
         saleDayofweek / saleDayofYear and drop the original column.
      2. Numeric columns that contain gaps get a boolean ``<column>missing``
         flag and are filled with that column's median.
      3. Non-numeric columns get a boolean ``<column>missing`` flag and are
         replaced by integer category codes shifted by one (0 = missing).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``saledate`` column.

    Returns
    -------
    pandas.DataFrame
        A new, fully numeric frame.

    NOTE(review): medians and category codes are computed from ``df`` itself,
    so the codes only line up with a separately encoded frame if both hold the
    same set of categories -- confirm this is acceptable for the test set.
    """
    # Copy first so the caller's frame keeps its saledate column and raw values
    df = df.copy()

    # Add datetime parameters for saledate
    sale_dates = df["saledate"].dt
    df["saleYear"] = sale_dates.year
    df["salemonth"] = sale_dates.month
    df["saleDay"] = sale_dates.day
    df["saleDayofweek"] = sale_dates.dayofweek
    df["saleDayofYear"] = sale_dates.dayofyear

    # Drop original saledate
    df = df.drop(columns=["saledate"])

    # Snapshot the column list: indicator columns added below must not be revisited
    for label in list(df.columns):
        content = df[label]
        if pd.api.types.is_numeric_dtype(content):
            # Fill numeric gaps with the median and remember where they were
            was_missing = content.isna()
            if was_missing.any():
                df[label+"missing"] = was_missing
                df[label] = content.fillna(content.median())
        else:
            # Turn categorical variables into numbers (+1 so that missing, coded -1, becomes 0)
            df[label+"missing"] = content.isna()
            df[label] = pd.Categorical(content).codes+1

    return df

In [None]:
df_test = preprocess_data(df_test)
df_test.head()

In [None]:
X_train.head()

In [None]:
# We can find how the columns differ using sets
set(X_train.columns)-set(df_test.columns)

In this case, it's because the test dataset wasn't missing any auctioneerID fields.

To fix it, we'll add the missing-value indicator column for auctioneerID to the test dataset, using exactly the column name reported by the set difference above, and fill it with False, since none of the auctioneerID fields are missing in the test dataset.

In [None]:
# Match test dataset columns to training dataset.
# Indicator columns in this notebook are named "<column>missing", so the column
# the test set lacks is "auctioneerIDmissing".
df_test["auctioneerIDmissing"] = False
# scikit-learn validates feature names and their order at predict time, so
# select the columns in exactly the order the model was trained on
# (this raises a KeyError if any training column is still absent).
df_test = df_test[X_train.columns]
df_test.head()

In [None]:
# Make predictions on the test dataset using the best model
test_preds = ideal_model.predict(df_test)

## Feature Importance

In [None]:
# Find feature importance of our best model
ideal_model.feature_importances_

In [None]:
import seaborn as sns
def plot_features(columns,importances,n=20):
    """Draw a horizontal bar chart of the n largest feature importances.

    columns and importances are paired element by element; the pairs are
    ranked by importance in descending order and only the first n are plotted.
    """
    ranked = pd.DataFrame({"features":columns,"feature_importance":importances})
    ranked = ranked.sort_values("feature_importance",ascending=False)
    ranked = ranked.reset_index(drop=True)
    top_n = ranked[:n]
    sns.barplot(x="feature_importance",y="features",data=top_n,orient="h")
    

In [None]:
plot_features(X_train.columns,ideal_model.feature_importances_)