In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

# # Predicting the Sale Price of Bulldozers using Machine Learning

### 1. Problem definition

> How can we predict the future sale price of a bulldozer, given its characteristics and previous examples of how much similar bulldozers have been sold for?

### 2. Data

* The data is downloaded from the Kaggle Bluebook for Bulldozers competition: https://www.kaggle.com/c/bluebook-for-bulldozers/data


### 3. Evaluation

* The evaluation metric for this competition is the RMSLE (root mean squared log error) between the actual and predicted auction prices.

* For more on the evaluation of this project check:
https://www.kaggle.com/c/bluebook-for-bulldozers/overview/evaluation

* **Note:** The goal for most regression evaluation metrics is to minimize the error. For example, our goal for this project will be to build a machine learning model which minimizes RMSLE.

### 4. Features

Kaggle provides a data dictionary detailing all of the features of the dataset. You can view this data dictionary on Google Sheets:
https://docs.google.com/spreadsheets/d/181y-bLR8sbJLITkWG7ozKm8l33RyieQ2Fpgix-beSYT/edit?usp=sharing

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

PATH = "../input/bluebook-for-bulldozers/TrainAndValid.csv"
df = pd.read_csv(f"{PATH}", low_memory= False)

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.saledate[:1000]

In [None]:
df.saledate.dtype

In [None]:
fig,ax = plt.subplots()
ax.scatter(df['saledate'][:1000],df["SalePrice"][:1000])

In [None]:
df.SalePrice.plot.hist()

### Parsing dates

When we work with time series data, we want to enrich the time & date component as much as possible.

We can do that by telling pandas which of our columns has dates in it using the parse_dates parameter. 

In [None]:
# Import data again but this time parse dates
PATH = "../input/bluebook-for-bulldozers/TrainAndValid.csv"
df = pd.read_csv(f"{PATH}", low_memory= False, parse_dates=["saledate"])

In [None]:
df.saledate.dtype

In [None]:
df.saledate[:1000]

In [None]:
fig,ax = plt.subplots()
ax.scatter(df["saledate"][:1000],df["SalePrice"][:1000])

In [None]:
df.head()

In [None]:
df.head().T

In [None]:
df.saledate.head(20)

### Sort DataFrame by saledate

When working with time series data, it's a good idea to sort it by date.

In [None]:
# Sort DataFrame in date order
df.sort_values(by=["saledate"],inplace=True,ascending=True)

In [None]:
df.saledate.head(20)

In [None]:
df.head()

### Make a copy of the original DataFrame

We make a copy of the original dataframe so when we manipulate the copy, we've still got our original data.

In [None]:
# Make a copy
df_tmp = df.copy()

In [None]:
df_tmp.saledate.head()

### Add datetime parameters for saledate column

In [None]:
df_tmp[:1].saledate.dt.year

In [None]:
df_tmp[:1].saledate.dt.day

In [None]:
df_tmp[:1].saledate

In [None]:
# Expand saledate into numeric parts (new column name -> pandas .dt attribute).
# The tuple order fixes the order in which the columns are appended.
sale_dt = df_tmp.saledate.dt
for new_column, dt_attribute in (("saleYear", "year"),
                                 ("saleMonth", "month"),
                                 ("saleDay", "day"),
                                 ("saleDayOfWeek", "dayofweek"),
                                 ("saleDayOfYear", "dayofyear")):
    df_tmp[new_column] = getattr(sale_dt, dt_attribute)

In [None]:
df_tmp.head()

In [None]:
# Now we can remove the saledate column 
df_tmp.drop("saledate",axis=1,inplace=True)

In [None]:
# Check the values of different columns
df_tmp.state.value_counts()

In [None]:
len(df_tmp)

## 5. Modelling

We've done enough EDA (we could always do more) but let's start to do some model-driven EDA

In [None]:
df.info()

In [None]:
df_tmp["UsageBand"].dtype

In [None]:
df_tmp.isna().sum()

## Convert string to categories

One way we can turn our data into numbers is by converting them into pandas categories.

We can check the different datatypes compatible with pandas here:
https://pandas.pydata.org/pandas-docs/stable/reference/general_utility_functions.html#data-types-related-functionality

In [None]:
df_tmp.head()

In [None]:
pd.api.types.is_string_dtype(df_tmp["UsageBand"])

In [None]:
# Find the columns which contain strings
# label = columns
# content = values
for label,content in df_tmp.items():
    # On pandas >= 2.0, is_string_dtype() is False for an object column that
    # holds NaN next to its strings, so also accept plain object dtype.
    if pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
        print(label)

In [None]:
# If you are wondering what df.items() does, here's an example
random_dict = {"key1":"hello",
               "key2":"world"}
for key,value in random_dict.items():
    print(f"this is a key:{key}",
          f"this is a value:{value}")

In [None]:
# This will turn all of the string values into category values
for label,content in df_tmp.items():
    # On pandas >= 2.0, is_string_dtype() is False for an object column that
    # holds NaN next to its strings; checking object dtype as well keeps
    # columns with missing values from being silently skipped.
    if pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
        df_tmp[label] = content.astype("category").cat.as_ordered()

In [None]:
df_tmp.info()

In [None]:
df_tmp.state.cat.categories

In [None]:
df_tmp.state.value_counts()

In [None]:
df_tmp.state.cat.codes

Thanks to pandas Categories we now have a way to access all of our data in the form of numbers.

But we still have a bunch of missing data...

In [None]:
# Check misisng data
df_tmp.isnull().sum()/len(df_tmp)

### Save preprocessed data

In [None]:
df_tmp.head().T

In [None]:
df_tmp.isna().sum()

## Fill missing values

### Fill numerical missing values first

In [None]:
for label,content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        print(label)

In [None]:
df_tmp.ModelID

In [None]:
# Check for which numeric columns have null values
for label,content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            print(label)

In [None]:
# Median-impute every numeric column that has gaps, and remember where the gaps were
for label, content in df_tmp.items():
    if not pd.api.types.is_numeric_dtype(content):
        continue
    missing_mask = pd.isnull(content)
    if missing_mask.any():
        # Binary indicator column: True where the original value was absent
        df_tmp[label+"_is_missing"] = missing_mask
        # Replace the gaps with the column median
        df_tmp[label] = content.fillna(content.median())

In [None]:
# Demonstrate how median is more robust than mean
hundreds = np.full((1000,),100)
hundreds_billion = np.append(hundreds, 1000000000)
np.mean(hundreds), np.mean(hundreds_billion), np.median(hundreds), np.median(hundreds_billion)

In [None]:
hundreds_billion

In [None]:
# Check if there's any null numeric values
for label,content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            print(label)

In [None]:
# Check to see how many examples were missing
df_tmp.auctioneerID_is_missing.value_counts()

In [None]:
df_tmp.isna().sum()

### Filling and turning categorical variables into numbers

In [None]:
# Check for columns which aren't numeric
for label,content in df_tmp.items():
    if not pd.api.types.is_numeric_dtype(content):
        print(label)

In [None]:
pd.Categorical(df_tmp["state"])

In [None]:
pd.Categorical(df_tmp["state"]).codes + 1

In [None]:
# You can see that result can also be -1 
# Thus we added 1 to remove the negative sign
pd.Categorical(df_tmp["UsageBand"]).codes 

### Turn categorical variables into numbers and fill missing

In [None]:
for label, content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        continue
    # Flag the rows that had no value before the integer codes hide that fact
    df_tmp[label+"_is_missing"] = pd.isnull(content)
    # Category codes use -1 for missing, so shifting by one maps missing to 0
    category_codes = pd.Categorical(content).codes
    df_tmp[label] = category_codes + 1

In [None]:
pd.Categorical(df_tmp["state"]).codes+1

In [None]:
df_tmp.info()

In [None]:
df_tmp.head().T

In [None]:
df_tmp.isna().sum()

#### Now that all of our data is numeric and our dataframe has no missing values, we should be able to build a machine learning model.

In [None]:
df_tmp.head()

In [None]:
len(df_tmp)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Instantiate model
model = RandomForestRegressor(n_jobs=-1,
                              random_state=42,
                              n_estimators=100)

# Fit the model
model.fit(df_tmp.drop("SalePrice", axis=1), df_tmp["SalePrice"])

In [None]:
model.score(df_tmp.drop("SalePrice",axis=1), df_tmp["SalePrice"])

#### **Question:** Why doesn't the above metric hold water? (Why isn't the metric reliable?)

### Splitting data into train/validation sets

In [None]:
df_tmp.saleYear

In [None]:
df_tmp.saleYear.value_counts()

In [None]:
# Hold out every 2012 sale for validation; train on all other years
is_validation_year = df_tmp.saleYear == 2012
df_val = df_tmp[is_validation_year]
df_train = df_tmp[~is_validation_year]

len(df_val), len(df_train)

In [None]:
# Separate the features (x) from the target (y) for each split
x_train = df_train.drop("SalePrice",axis=1)
y_train = df_train["SalePrice"]
x_valid = df_val.drop("SalePrice",axis=1)
y_valid = df_val["SalePrice"]

x_train.shape,y_train.shape,x_valid.shape,y_valid.shape

In [None]:
y_train

### Building an Evaluation function

In [None]:
# Create evaluation function (the competition uses RMSLE)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error,r2_score

def rmsle(y_test,y_preds):
    '''
    Root mean squared log error (RMSLE) of predictions against true labels.

    y_test:  true target values
    y_preds: predicted target values
    Returns the square root of sklearn's mean_squared_log_error.
    '''
    squared_log_error = mean_squared_log_error(y_test,y_preds)
    return np.sqrt(squared_log_error)
# Report MAE, RMSLE and R^2 for a fitted model on both data splits
def show_scores(model):
    '''
    Score a fitted model on the training and validation sets.

    Reads the notebook-level x_train / y_train / x_valid / y_valid and
    returns a dict keyed "<split> <metric>" for MAE, RMSLE and R^2.
    '''
    splits = (("Training", y_train, model.predict(x_train)),
              ("Valid", y_valid, model.predict(x_valid)))
    metrics = (("MAE", mean_absolute_error),
               ("RMSLE", rmsle),
               ("R^2", r2_score))
    # Metric outer / split inner keeps the key order: Training X, Valid X, ...
    return {f"{split_name} {metric_name}": metric_fn(y_true, y_pred)
            for metric_name, metric_fn in metrics
            for split_name, y_true, y_pred in splits}

### Testing our model on a subset (to tune the hyperparameters)

In [None]:
# # This takes far too long for experimenting
# %%time
# model = RandomForestRegressor(n_jobs=-1,
#                               random_state=42,
#                               n_estimators=100)

#model.fit(x_train,y_train)

In [None]:
len(x_train),len(y_train)

In [None]:
# Change max_samples value 
# Version 0.22 has argument max_samples which saves time 
model = RandomForestRegressor(n_jobs=-1,
                              random_state=42,
                              n_estimators=100,
                              max_samples=10000)
model.fit(x_train,y_train)

In [None]:
# Since n_estimators = 100
x_train.shape[0]*100

In [None]:
show_scores(model)

### Hyperparameter tuning with RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Different RandomForestRegressor hyperparameters
rf_grid = {"n_estimators": np.arange(10,100,10),
           "max_depth": [None,3,5,10],
           "min_samples_split": np.arange(2,20,2),
           "min_samples_leaf": np.arange(1,20,2),
           # "auto" has been removed from scikit-learn; for a regressor it
           # meant "use all features", which is spelled 1.0 (float).
           # The int 1 still means "exactly one feature".
           "max_features": [0.5,1,"sqrt",1.0],
           "max_samples": [10000]}

# Instantiate RandomizedSearchCV model
# random_state fixes which candidates are sampled, so re-running the cell
# reproduces the same search.
# NOTE(review): cv=5 is plain K-fold over rows that were sorted by sale date
# earlier in the notebook — consider TimeSeriesSplit; confirm intent.
rs_model = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1,
                                                    random_state=42),
                             param_distributions =  rf_grid,
                             n_iter = 5,
                             cv = 5,
                             random_state = 42,
                             verbose = True)
rs_model.fit(x_train,y_train)

In [None]:
rs_model.best_params_

In [None]:
show_scores(rs_model)

### Train a model with the best hyperparameters

**Note:** These were found after 100 iterations of RandomizedSearchCV, i.e. n_iter = 100

In [None]:
%%time

#Most Ideal Hyperparameters
ideal_model = RandomForestRegressor(n_estimators = 40,
                                    min_samples_leaf = 1,
                                    min_samples_split = 14,
                                    max_features = 0.5,
                                    n_jobs = -1,
                                    max_samples = None,
                                    random_state = 42)

# Fit the ideal model
ideal_model.fit(x_train,y_train)

In [None]:
# Scores for ideal_model (trained on all the data)
# Valid RMSLE is better
show_scores(ideal_model)

In [None]:
# Scores for rs_model (trained on 10000 examples)
show_scores(rs_model)

### Make Predictions on Test data

In [None]:
# Import the test data
PATH = "../input/bluebook-for-bulldozers/Test.csv"
df_test = pd.read_csv(f"{PATH}", low_memory= False, parse_dates=["saledate"])
df_test.head()

In [None]:
df_test.isna().sum()

### Preprocessing the test data (getting the dataset in the same format as our training dataset)

In [None]:
def preprocess_data(df):
    '''
    Performs transformations on df and returns transformed df.

    Steps:
      * expand the datetime column "saledate" into saleYear, saleMonth,
        saleDay, saleDayOfWeek and saleDayOfYear, then drop "saledate";
      * numeric columns containing nulls get a "<col>_is_missing" boolean
        column and have the nulls replaced by the column median;
      * every non-numeric column gets a "<col>_is_missing" boolean column
        and is replaced by its category codes + 1 (0 means missing).

    The generated column names use the same spelling as the training frame
    ("saleDayOfYear", "<col>_is_missing") so the two frames line up.

    df: DataFrame with a datetime "saledate" column.
    Returns a new DataFrame; the frame passed in is not modified.

    NOTE(review): medians and category codes are computed from df itself, so
    they match the training data only if df has the same values/categories —
    confirm this is acceptable for the test set.
    '''
    # Work on a copy so the caller's frame is left untouched
    df = df.copy()

    df["saleYear"] = df.saledate.dt.year
    df["saleMonth"] = df.saledate.dt.month
    df["saleDay"] = df.saledate.dt.day
    df["saleDayOfWeek"] = df.saledate.dt.dayofweek
    df["saleDayOfYear"] = df.saledate.dt.dayofyear

    df = df.drop("saledate", axis=1)

    # Snapshot the column labels: indicator columns are added inside the loop
    for label in list(df.columns):
        content = df[label]
        if pd.api.types.is_numeric_dtype(content):
            # Fill the numeric rows with median (only where something is missing)
            if pd.isnull(content).sum():
                df[label+"_is_missing"] = pd.isnull(content)
                df[label] = content.fillna(content.median())
        else:
            # Fill categorical missing data and turn categories into numbers
            df[label+"_is_missing"] = pd.isnull(content)
            # We add +1 to the category code so missing (-1) becomes 0
            df[label] = pd.Categorical(content).codes+1

    return df

In [None]:
df_test = preprocess_data(df_test)
df_test.head()

In [None]:
x_train.head()

In [None]:
# We can find how the columns differ using sets
# Something is wrong with the output here:
# it should have displayed only 'auctioneerID_is_missing'
set(x_train.columns) - set(df_test.columns)

In [None]:
# Manually adjust df_test to have auctioneerID_is_missing column
df_test["auctioneerID_is_missing"] = False
df_test.head()

#### Finally our test dataframe has the same features as our training dataframe, we can make predictions!

In [None]:
# Make predictions on the test data
# Select the columns in the exact order the model was fitted on:
# 'auctioneerID_is_missing' was appended last to df_test, so its position
# differs from x_train, and the model expects the training column order.
test_preds = ideal_model.predict(df_test[x_train.columns])

In [None]:
test_preds

#### We've made some predictions but they're not in the same format Kaggle is asking for:
https://www.kaggle.com/c/bluebook-for-bulldozers/overview/evaluation

In [None]:
# Format predictions into the same format Kaggle is after:
# one row per sale with its SalesID and the predicted SalePrice
# (the target column is named "SalePrice", not "SalesPrice").
df_preds = pd.DataFrame({"SalesID": df_test["SalesID"],
                         "SalePrice": test_preds})
df_preds

In [None]:
# Export prediction data
df_preds.to_csv('test_predictions.csv', index = False)

### Feature Importance

Feature importance seeks to figure out which attributes of the data were most important when it comes to predicting the
**target variable** (SalePrice).

In [None]:
# Find feature importance of our best model
len(ideal_model.feature_importances_)

In [None]:
x_train.shape

In [None]:
def plot_features(columns,importances,n=20):
    '''
    Plot a horizontal bar chart of the n most important features.

    columns:     feature names, one per importance value
    importances: importance score for each feature (same length as columns)
    n:           how many of the top-ranked features to draw (default 20)
    '''
    df = (pd.DataFrame({"features":columns,
                        "feature_importances": importances})
         .sort_values("feature_importances", ascending = False)
         .reset_index(drop = True))
    # Keep labels and bar widths the same length: both are cut to n
    top = df[:n]
    # Plot the dataframe
    fig,ax = plt.subplots()
    ax.barh(top["features"], top["feature_importances"])
    ax.set_ylabel("Features")
    ax.set_xlabel("Feature Importance")
    # Most important feature at the top of the chart
    ax.invert_yaxis()
In [None]:
plot_features(x_train.columns,ideal_model.feature_importances_)