In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Predicting The Sale Price of Bulldozers Using Machine Learning

In this notebook we're going to go through an example machine learning project with the goal of predicting the sale price of bulldozers.

## 1. Problem Definition

> How well can we predict the future sale price of a bulldozer, given its characteristics and previous examples of how much similar bulldozers have been sold for?

## 2. Data

The data is downloaded from the Kaggle Bluebook for Bulldozers competition: https://www.kaggle.com/c/bluebook-for-bulldozers/data

There are 3 main datasets:

* **Train.csv** is the training set, which contains data through the end of 2011.
* **Valid.csv** is the validation set, which contains data from January 1, 2012 - April 30, 2012. You make predictions on this set throughout the majority of the competition. Your score on this set is used to create the public leaderboard.
* **Test.csv** is the test set, which won't be released until the last week of the competition. It contains data from May 1, 2012 - November 2012. Your score on the test set determines your final rank for the competition

## 3. Evaluation 

The evaluation metric for this competition is the RMSLE (root mean squared log error) between the actual and predicted auction prices.

For more on the evaluation of this project check:
https://www.kaggle.com/c/bluebook-for-bulldozers/overview/evaluation

Note: The goal for most regression evaluation metrics is to minimize the error. For example, our goal for this project will be to build a machine learning model which minimises the RMSLE (root mean squared log error).


## 4. Features

Kaggle provides a data Dictionary detailing all of the features of the dataset. You can view this data dictionary on Kaggle https://www.kaggle.com/c/bluebook-for-bulldozers/data

## 5. Modelling

## 6. Experimentation

In [None]:
# Import Exploring and Visualizing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import the model used throughout the notebook
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Import the combined training and validation dataset.
# low_memory=False lets pandas infer each column's dtype from the whole file rather than chunk by chunk.

df = pd.read_csv("../input/bluebook-for-bulldozers/TrainAndValid.csv",
                 low_memory=False)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
sns.set_style("darkgrid")
fig, ax =plt.subplots()
ax.scatter(df["saledate"][:1000],df["SalePrice"][:1000])


In [None]:
df.SalePrice.plot.hist()

### Parsing dates

When we work with time series data, we want to enrich the time & date component as much as possible.

We can do that by telling pandas which of our columns has dates in it using the `parse_dates` parameter. 

In [None]:
# Import data again but this time parse date
df = pd.read_csv("../input/bluebook-for-bulldozers/TrainAndValid.csv",
                 low_memory=False,
                 parse_dates=["saledate"])

In [None]:
df.saledate[:100]

In [None]:
fig, ax = plt.subplots()
ax.scatter(df['saledate'][:1000],df['SalePrice'][:1000])

In [None]:
df.head().T # To check all columns in the dataset

In [None]:
df.saledate.head(20)

### Sort Dataframe by saledate

When working with time series data, it's a good idea to sort it by date.

In [None]:
# Sort DataFrame in date order
df.sort_values(by=["saledate"],inplace=True, ascending=True)
df.saledate.head(20)

In [None]:
df.head()

### Make a copy of original DataFrame

We make a copy of the original DataFrame so that when we manipulate the copy, we've still got our original data.

In [None]:
# Make a copy 
df_temp = df.copy()

In [None]:
df_temp

### Add datetime parameters for `saledate` column

In [None]:
# Derive calendar features from the saledate column: one new column per datetime attribute
date_parts = {"saleYear": "year",
              "saleMonth": "month",
              "saleDay": "day",
              "saleDayOfWeek": "dayofweek",
              "saleDayOfYear": "dayofyear"}
for new_column, dt_attribute in date_parts.items():
    df_temp[new_column] = getattr(df_temp["saledate"].dt, dt_attribute)

In [None]:
df_temp.head()

In [None]:
# Now we've enriched our DataFrame with date time features, now we can remove saledate column
df_temp.drop("saledate",axis=1,inplace=True)

In [None]:
# Check the values of different columns 
df_temp.state.value_counts()

## 5. Modelling

We have done enough EDA (we could always do more), so let's start to do some model-driven EDA.

In [None]:
df_temp.info()

### Convert string (object) into categories

One way we can turn all of our data  into numbers is by converting them into pandas categories.

We can check the different datatypes compatible with pandas here:
https://pandas.pydata.org/docs/reference/api/pandas.api.types.is_object_dtype.html

In [None]:
df_temp.head().T

In [None]:
pd.api.types.is_string_dtype(df_temp["UsageBand"])

In [None]:
# Find the columns which contain strings.
# Object-dtype columns are matched explicitly: pandas 2.x only reports an object
# column as "string" when every value is a str, so string columns that also hold
# missing values would otherwise be skipped.
for label, content in df_temp.items():
    if pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
        print(label)
# It will loop through all features and print the names of those columns
# which hold string data

In [None]:
# If you're wondering what df.items() does, here's an example below
random_dict = {"key1": "hello",
               "key2": "world!"}
for key, value in random_dict.items():
    print(f"This is the key:{key}")
    print(f"This is the value:{value}")    

In [None]:
# Turn every string column into an ordered pandas category.
# Object-dtype columns are matched explicitly: pandas 2.x only reports an object
# column as "string" when every value is a str, so string columns with missing
# values would otherwise be left unconverted.
for label, content in df_temp.items():
    if pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
        df_temp[label] = content.astype("category").cat.as_ordered()

In [None]:
df_temp.info()

In [None]:
# Let's check out the state column as ordered
df_temp.state.cat.categories

In [None]:
# It still have string in it but pandas has assigned hidden numbers to its values 

# Let's check those numbers

df_temp.state.cat.codes

Thanks to pandas categories we now have a way to access all our data in the form of numbers

But we still have bunch of missing values...

In [None]:
# Check the missing data (in percentage)

df_temp.isnull().sum()/len(df_temp)

In [None]:
df_temp.isna().sum()

## Fill missing values

### 1. Fill numeric missing values first

In [None]:
# Let's check which columns are numberic first
for label, content in df_temp.items():
    if pd.api.types.is_numeric_dtype(content):
        print(label)

In [None]:
# Check which numeric columns have null values
for label, content in df_temp.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            print(label)

In [None]:
# Fill missing numeric values with the column median
# First collect the numeric columns that actually contain nulls ...
numeric_with_gaps = [
    label
    for label, content in df_temp.items()
    if pd.api.types.is_numeric_dtype(content) and content.isnull().sum()
]
# ... then flag and fill each of them
for label in numeric_with_gaps:
    column = df_temp[label]
    # Binary column recording which rows were originally missing
    df_temp[label+"_is_missing"] = column.isnull()
    # Replace the gaps with the median of the observed values
    df_temp[label] = column.fillna(column.median())

In [None]:
df_temp.head()

In [None]:
# Let's Demonstrate how median is more robust (essential) than mean
# In most cases outliers in datasets really impact the mean which really doesn't
# make any sense but median will never be affected with outliers

hundreds = np.full((1000), 100)
hundreds_billion = np.append(hundreds, 1000000000)

np.mean(hundreds),np.mean(hundreds_billion), np.median(hundreds),np.median(hundreds_billion)

In [None]:
# Chech if there is any null numeric values
for label, content in df_temp.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            print(label)
# There will no output because there is no numeric missing values

In [None]:
# Check to see how many examples were missing
df_temp.auctioneerID_is_missing.value_counts()

In [None]:
df_temp.isna().sum()

In [None]:
df_temp.info()

### Filling and turning categorical variables into numbers

In [None]:
# Check for columns which are NOT numeric (these still need to be turned into numbers)
for label, content in df_temp.items():
    if not pd.api.types.is_numeric_dtype(content):
        print(label)     

In [None]:
# Turn categorical variables into numbers and fill missing
for label, content in df_temp.items():
    if not pd.api.types.is_numeric_dtype(content):
        # Add binary column to indicate whether sample had missing values
        df_temp[label+"_is_missing"] = pd.isnull(content)
        # Replace the values with their category codes. pandas encodes missing
        # values as -1, so adding 1 shifts them to 0 and keeps every code non-negative
        df_temp[label] = pd.Categorical(content).codes+1

In [None]:
pd.Categorical(df_temp["state"]).codes+1

In [None]:
df_temp.info()

In [None]:
df_temp.head().transpose()

In [None]:
df_temp.describe()

In [None]:
df_temp.dtypes

In [None]:
df_temp.columns

Now that all of our data is numeric and our dataframe has no missing values, we should be able to build a machine learning model.

In [None]:
df_temp.head()

In [None]:
%%time
# Instantiate model 
model = RandomForestRegressor(random_state=42)

# Fit the model 
model.fit(df_temp.drop("SalePrice",axis =1), df_temp["SalePrice"])

In [None]:
# Score the model
model.score(df_temp.drop("SalePrice",axis=1), df_temp["SalePrice"])

**Question:** Why doesn't the above metric hold water? (why isn't the metric reliable)

### Splitting data into train/validation sets

In [None]:
df_temp.saleYear

In [None]:
df_temp.saleYear.value_counts()

In [None]:
# Hold out the 2012 sales for validation; every other year is training data
sold_in_2012 = df_temp["saleYear"] == 2012
df_val = df_temp[sold_in_2012]
df_train = df_temp[~sold_in_2012]

len(df_val), len(df_train)

In [None]:
# Split data into X, y
X_train, y_train = df_train.drop("SalePrice", axis=1), df_train.SalePrice
X_valid, y_valid = df_val.drop("SalePrice",axis= 1), df_val.SalePrice

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

### Building an evaluation function

In [None]:
# Create evaluation function(The competition uses RMSLE)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score

def rmsle(y_test, y_preds):
    """
    Root mean squared log error (RMSLE) of the predictions `y_preds`
    measured against the true labels `y_test`.
    """
    msle = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(msle)

# Create function to evaluate model on a few different levels
def show_scores(model):
    """
    Score a fitted model on the notebook's training and validation splits.

    Predicts on the global `X_train` and `X_valid` and compares against the
    global `y_train` and `y_valid`. Returns a dict holding MAE, RMSLE and R^2
    for each split.
    """
    fitted_on_train = model.predict(X_train)
    fitted_on_valid = model.predict(X_valid)

    scores = {}
    scores["Training MAE"] = mean_absolute_error(y_train, fitted_on_train)
    scores["Valid MAE"] = mean_absolute_error(y_valid, fitted_on_valid)
    scores["Training RMSLE"] = rmsle(y_train, fitted_on_train)
    scores["Valid RMSLE"] = rmsle(y_valid, fitted_on_valid)
    scores["Training R^2"] = r2_score(y_train, fitted_on_train)
    scores["Valid R^2"] = r2_score(y_valid, fitted_on_valid)
    return scores
    

## Testing our model on a subset (to tune the hyperparameters)

In [None]:
# # This takes far too long... for experimenting 

# %%time
# model = RandomForestRegressor(n_jobs =-1,
#                               random_state=42)
# model.fit(X_train, y_train)

In [None]:
# One method is that we can slice tran sets into eg.10K to train on 
# model.fit(X_train[:10000], y_train[:10000])

In [None]:
# another way is to Change max_samples value 
model = RandomForestRegressor(random_state=42,
                              max_samples=10000)

In [None]:
%%time
# Cutting down on the max number of samples each estimator can see improves training time
model.fit(X_train, y_train)

In [None]:
show_scores(model)

### Hyperparameter tuning with RandomizedSearchCV

In [None]:
%%time
from sklearn.model_selection import RandomizedSearchCV

# Different RandomForestRegressor hyperparameters
rf_grid = {"n_estimators": np.arange(10,100,10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           "max_features": [0.5, 1,"sqrt", "auto"],
           "max_samples": [10000]}
# Instantiate randomized search cv
rs_model = RandomizedSearchCV(RandomForestRegressor(random_state=42),
                             param_distributions=rf_grid,
                             n_iter=2,
                             cv=5,
                             verbose=True)
# Fit the RandomizedSearchCV model
rs_model.fit(X_train, y_train)

In [None]:
# Find the best model hyperparameters

rs_model.best_params_

In [None]:
# Evaluate the RandomizedSearch model
show_scores(rs_model)

## Train a model with the best hyperparameters

**Note:** These were found after 100 iterations of `RandomizedSearchCV`

In [None]:
%%time

# Most ideal hyperparameters
ideal_model = RandomForestRegressor(n_estimators =40,
                                    min_samples_leaf=1,
                                    min_samples_split=14,
                                    max_features=0.5,
                                    max_samples=None,
                                    random_state=42)
# Fit the ideal model
ideal_model.fit(X_train, y_train)

In [None]:
# Show scores for ideal model (trained on all data)
show_scores(ideal_model)

In [None]:
# Show scores on rs_model (Only trained on ~10,000 examples)
show_scores(rs_model)

#### As we saw for ideal_model we had good decrease in RMSLE after finding best parameters


# Make Predictions On Test Dataset

In [None]:
# Import Test dataset
df_test = pd.read_csv("../input/bluebook-for-bulldozers/Test.csv",
                      low_memory=False,
                      parse_dates=["saledate"])
df_test.head()

In [None]:
df_test.isna().sum()

In [None]:
df_test.info()

In [None]:
df_test.columns

### Preprocessing the data (getting the test dataset in the same format as our training dataset)

In [None]:
def preprocess_data(df):
    """
    Turn a raw bulldozer sales frame into an all-numeric frame without missing values.

    The frame passed in is left untouched; all work happens on a copy.

    Steps:
      1. Expand `saledate` into saleYear, saleMonth, saleDay, saleDayOfWeek and
         saleDayOfYear, then drop `saledate`.
      2. Each numeric column containing nulls gets a boolean `<col>_is_missing`
         flag and its nulls are replaced by that column's median.
      3. Each non-numeric column gets a boolean `<col>_is_missing` flag and is
         replaced by its pandas category code + 1 (missing values become 0).

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a datetime `saledate` column (the `.dt` accessor is used).

    Returns
    -------
    pd.DataFrame
        A new, transformed frame.

    Notes
    -----
    Medians and category codes are computed from `df` itself. A numeric column
    only gets an `_is_missing` flag if it has nulls in `df`, so the set of flag
    columns can differ between two frames processed separately.
    """
    # Work on a copy so the caller's frame keeps its original columns
    df = df.copy()

    # Add date features extracted from the saledate column
    sale_dates = df.saledate.dt
    df["saleYear"] = sale_dates.year
    df["saleMonth"] = sale_dates.month
    df["saleDay"] = sale_dates.day
    df["saleDayOfWeek"] = sale_dates.dayofweek
    df["saleDayOfYear"] = sale_dates.dayofyear

    df = df.drop("saledate", axis=1)

    # Snapshot the column names: flag columns added below must not be visited
    for label in list(df.columns):
        content = df[label]
        if pd.api.types.is_numeric_dtype(content):
            if content.isnull().any():
                # Binary column which tells us if the value was missing
                df[label + "_is_missing"] = content.isnull()
                # Fill missing numeric values with the median
                df[label] = content.fillna(content.median())
        else:
            # Flag missing values, then turn the categories into numbers
            df[label + "_is_missing"] = content.isnull()
            # We add +1 to the category code because pandas encodes missing values as -1
            df[label] = pd.Categorical(content).codes + 1

    return df

In [None]:
# Processed test data
df_test = preprocess_data(df_test)

df_test.head()

> An error will occur if we run below cell because we have a missing feature

In [None]:
# # Make predictions on updated test data
# test_preds = ideal_model.predict(df_test)

In [None]:
# We can find how the columns differ using sets
set(X_train.columns) - set(df_test.columns)

In [None]:
# Manually adjust df_test to have auctioneerID_is_missing columns
df_test["auctioneerID_is_missing"] = False
df_test.head()

Finally now our test data dataframe has the same features as our training dataframe, we can make predictions!

In [None]:
# Make predictions on the test data.
# df_test has the same features as X_train but in a different column order
# (its *_is_missing flags were added in a different sequence), and scikit-learn
# expects DataFrame columns in the order seen during fit, so select them explicitly.
test_preds = ideal_model.predict(df_test[X_train.columns])

In [None]:
test_preds

We've made some predictions but they're not in the same format Kaggle is asking for:
https://www.kaggle.com/c/bluebook-for-bulldozers/overview/evaluation

In [None]:
# Format predictions into the layout Kaggle is after:
# one row per sale with a SalesID column and a SalePrice column
df_preds = pd.DataFrame({"SalesID": df_test["SalesID"],
                         "SalePrice": test_preds})
df_preds

# Features Importance

Feature importance seeks to figure out which different attributes of the data were most important when it comes to predicting the **target variable** (SalePrice).

In [None]:
# Find Feature importance of our model  
ideal_model.feature_importances_

In [None]:
# Helper function for plotting feature importance
def plot_features(columns, importances, n=20):
    """
    Draw a horizontal bar chart of the `n` most important features.

    Parameters
    ----------
    columns : sequence of feature names.
    importances : sequence of importance values, aligned with `columns`.
    n : how many of the top-ranked features to show (default 20).
    """
    ranked = (pd.DataFrame({"features": columns,
                            "features_importances": importances})
              .sort_values("features_importances", ascending=False)
              .reset_index(drop=True))

    # Slice both axes with the same n so labels and bar lengths always line up
    top = ranked[:n]

    # Plot the dataframe we created
    fig, ax = plt.subplots()
    ax.barh(top["features"], top["features_importances"])
    ax.set_ylabel("Features")
    ax.set_xlabel("Features Importance")
    # Largest importance at the top of the chart
    ax.invert_yaxis()

In [None]:
plot_features(X_train.columns, ideal_model.feature_importances_)