In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath /kaggle/input.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import libraries

In [None]:
import matplotlib.pyplot as plt
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_log_error, mean_absolute_error

# Helper function to import the dataset

In [None]:
def import_dataset(path):
    """Read the CSV at ``path`` into a DataFrame, parsing the ``saledate`` column as dates."""
    return pd.read_csv(path, parse_dates=['saledate'])

# Helper function to preprocess dataframe

In [None]:
def preprocess_dataframe_for_model(df):
    """
    Return a model-ready copy of ``df``.

    Steps, in order:
      1. Every string column is converted to an ordered categorical.
      2. ``enrich_df`` adds the sale-date part columns, then ``saledate`` is dropped.
      3. Numeric columns with missing values are filled with the column median;
         every non-numeric column is replaced by its category code + 1.

    The work is done on a copy, so the frame passed by the caller is left
    untouched and the calling cell can safely be re-run on the same raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing a datetime ``saledate`` column.

    Returns
    -------
    pandas.DataFrame
        New frame without ``saledate``, with the sale-date part columns added.

    NOTE(review): medians and category codes are computed from the frame that
    is passed in, so frames preprocessed separately (e.g. train and test) are
    not guaranteed to share the same code for a given string value.
    """
    # Work on a copy so the caller's raw frame is never mutated.
    df = df.copy()

    # change all string type to categorical type
    for label, content in df.items():
        if pd.api.types.is_string_dtype(content):
            df[label] = content.astype("category").cat.as_ordered()

    # enrich the dataframe (adds the sale* columns), then drop the raw date
    df = enrich_df(df)
    df = df.drop(columns="saledate")

    # fill the numerical missing values with median and non-numerical values with their (category no. + 1)
    for label, content in df.items():
        if pd.api.types.is_numeric_dtype(content):
            if content.isna().any():
                df[label] = content.fillna(content.median())
        else:
            # pandas uses code -1 for missing values, so +1 maps missing to 0
            df[label] = pd.Categorical(content).codes + 1
    return df

# Helper function to enrich the dataframe

In [None]:
def enrich_df(df):
    """
    Add date-part columns derived from ``saledate`` to ``df`` in place.

    Columns added: saleYear, saleMonth, saleDay, saleDayOfWeek, saleDayOfYear.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like ``saledate`` column (the ``.dt`` accessor is used).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, now carrying the new columns.
    """
    # new column name -> attribute of the pandas ``.dt`` accessor
    date_parts = {
        "saleYear": "year",
        "saleMonth": "month",
        "saleDay": "day",
        "saleDayOfWeek": "dayofweek",
        "saleDayOfYear": "dayofyear",
    }

    sale_dates = df["saledate"].dt
    for column, attribute in date_parts.items():
        df[column] = getattr(sale_dates, attribute)
    return df

# Helper function to evaluate model

In [None]:
# Create evaluation function (the competition uses Root Mean Square Log Error)

def rmsle(y_test, y_preds):
    """Return the root mean squared log error between ``y_test`` and ``y_preds``."""
    msle = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(msle)

# Create function to evaluate our model
def show_scores(model):
    """
    Score a fitted ``model`` on the training and validation splits.

    Reads the notebook-level ``X_train``/``y_train`` and ``X_valid``/``y_valid``,
    so the cells that define them must have been run first.

    Returns a dict with the MAE, the RMSLE and ``model.score`` (labelled R^2)
    for each of the two splits.
    """
    fitted_on_train = model.predict(X_train)
    fitted_on_valid = model.predict(X_valid)

    results = {}
    results["Training MAE"] = mean_absolute_error(y_train, fitted_on_train)
    results["Valid MAE"] = mean_absolute_error(y_valid, fitted_on_valid)
    results["Training RMSLE"] = rmsle(y_train, fitted_on_train)
    results["Valid RMSLE"] = rmsle(y_valid, fitted_on_valid)
    results["Training R^2"] = model.score(X_train, y_train)
    results["Valid R^2"] = model.score(X_valid, y_valid)
    return results

# Import train and valid dataset

In [None]:
# Load the combined train + valid CSV (import_dataset parses `saledate` as dates).
# NOTE(review): despite the "test_and_valid" name, this frame is read from TrainAndValid.csv.
df_test_and_valid = import_dataset("../input/bluebook-for-bulldozers/TrainAndValid.csv")

# Different attributes of train and valid dataframe

In [None]:
# Column dtypes and non-null counts of the raw train + valid frame.
df_test_and_valid.info()

In [None]:
# Number of missing values in each column of the raw frame.
df_test_and_valid.isna().sum()

In [None]:
# Summary statistics for the columns that DataFrame.describe() selects by default.
df_test_and_valid.describe()

In [None]:
# How many rows share each distinct sale date.
df_test_and_valid["saledate"].value_counts()

# Visual plots to understand data in a better way

In [None]:
# Sale price against sale date for the first 1000 rows of the raw frame.
first_dates = df_test_and_valid["saledate"][:1000]
first_prices = df_test_and_valid["SalePrice"][:1000]

fig, ax = plt.subplots(figsize=(15, 5))
ax.scatter(first_dates, first_prices)
ax.set_xlabel("Sale Date", fontsize=14)
ax.set_ylabel("Sale Price", fontsize=14);

In [None]:
# Histogram of all sale prices, using matplotlib's default binning.
sale_prices = df_test_and_valid["SalePrice"]

fig, ax = plt.subplots(figsize=(15, 5))
ax.hist(sale_prices)
ax.set_title("Distribution of sales", fontsize=16)
ax.set_xlabel('Price', fontsize=14)
ax.set_ylabel('Sales', fontsize=14);

# Preprocess train and valid dataframe

In [None]:
# Run the preprocessing helper on the train + valid frame; the model-ready result is bound to df_test_and_valid_modified.
df_test_and_valid_modified=preprocess_dataframe_for_model(df_test_and_valid)

In [None]:
# Peek at the first rows of the preprocessed frame.
df_test_and_valid_modified.head()

In [None]:
# Column dtypes and non-null counts after preprocessing.
df_test_and_valid_modified.info()

In [None]:
# Per-column count of missing values remaining after preprocessing.
df_test_and_valid_modified.isna().sum()

# Modelling

In [None]:
# Random forest regressor with a fixed seed (random_state=42) and n_jobs=-1.
# NOTE(review): `model` is not referenced by any later cell visible in this notebook;
# the grid search constructs its own RandomForestRegressor.
model = RandomForestRegressor(n_jobs=-1,random_state=42)

## Splitting train and valid data

In [None]:
# Rows sold in 2012 form the validation set; every other year is used for training.
sold_in_2012 = df_test_and_valid_modified["saleYear"] == 2012
df_train = df_test_and_valid_modified[~sold_in_2012]
df_valid = df_test_and_valid_modified[sold_in_2012]

In [None]:
# Separate the features (every column except SalePrice) from the target (SalePrice).
X_train = df_train.drop(columns=["SalePrice"])
y_train = df_train["SalePrice"]

X_valid = df_valid.drop(columns=["SalePrice"])
y_valid = df_valid["SalePrice"]

## Preparing for hyperparameter tuning

In [None]:
# Hyperparameter values tried for the random forest during the search.
search_grid = {
    "n_estimators": np.arange(10, 30, 5),
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": np.arange(2, 10, 4),
    "min_samples_leaf": np.arange(1, 10, 4),
    # int 1 = a single feature per split; float 1.0 = all features.
    # 1.0 replaces "auto", which meant "all features" for regressors and was
    # deprecated in scikit-learn 1.1 and removed in 1.3.
    "max_features": [0.5, 1, "sqrt", 1.0],
    "max_samples": [10000],
}

In [None]:
# Number of hyperparameter combinations = product of the option counts per parameter.
n_combinations = 1
for options in search_grid.values():
    n_combinations *= len(options)

# NOTE(review): the factor 2 presumably mirrors cv=2 in the search cell — keep them in sync.
print(f'Now we will fit {n_combinations*2} models')

# GridSearchCV for hyperparameter tuning

In [None]:
%%time
# Exhaustive search over every combination in search_grid, with 2-fold cross-validation.
# The forest is seeded (random_state=42) so that a Restart & Run All reproduces
# the same best parameters and scores.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the estimator's
# default score rather than by RMSLE.
ideal_model = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid=search_grid,
    n_jobs=-1,
    cv=2,
)
ideal_model.fit(X_train, y_train)

In [None]:
# ideal_model=pickle.load(open("./bulldozer-sale-price-predictor.pkl","rb"))

In [None]:
# Hyperparameter combination that the search selected as best.
ideal_model.best_params_

## How our model performs

In [None]:
# MAE, RMSLE and R^2 of the tuned model on the training and validation splits.
show_scores(ideal_model)

# Import test data

In [None]:
# Load the test CSV (import_dataset parses `saledate` as dates) and preview its first rows.
df_test=import_dataset('../input/bluebook-for-bulldozers/Test.csv')
df_test.head()

In [None]:
# Apply the same preprocessing helper to the test frame.
# NOTE(review): the helper derives medians and category codes from the frame it is given,
# so the test encoding is computed independently of the training data — confirm this is intended.
df_test_modified=preprocess_dataframe_for_model(df_test)

In [None]:
# Peek at the first rows of the preprocessed test frame.
df_test_modified.head()

In [None]:
# Per-column count of missing values remaining in the preprocessed test frame.
df_test_modified.isna().sum()

# Collecting predictions of our model

In [None]:
# Predict sale prices for the preprocessed test rows with the fitted search object.
# NOTE(review): assumes df_test_modified has the same columns, in the same order, as X_train — confirm.
test_preds=ideal_model.predict(df_test_modified)

In [None]:
# Start from an empty DataFrame for the submission.
df_preds=pd.DataFrame()

In [None]:
# Submission frame: the SalesID of each test row next to the price predicted for it.
df_preds = pd.DataFrame({
    "SalesID": df_test_modified["SalesID"],
    "SalePrice": test_preds,
})

In [None]:
# Display the submission frame.
df_preds

# Saving csv file for submission

In [None]:
# Write the submission to the working directory; index=False keeps the row index out of the file.
df_preds.to_csv("SalePrice-Submission.csv",index=False)

# Saving our model 

In [None]:
# Persist the fitted search object. The context manager guarantees the file is
# flushed and closed even if pickling raises; the original left the handle open.
with open('bulldozer-sale-price-predictor.pkl', "wb") as model_file:
    pickle.dump(ideal_model, model_file)

In [None]:
# df_preds=pd.read_csv("./SalePrice-Submission.csv")
# df_preds