In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading Data

In [None]:
# Read the combined training + validation file; `saledate` is parsed into
# datetimes so calendar features can be derived from it later.
df_train_val = pd.read_csv(
    '/kaggle/input/bluebook-for-bulldozers/TrainAndValid.csv',
    parse_dates=['saledate'],
    low_memory=False,
)

print(df_train_val.shape)

In [None]:
# Order rows chronologically (oldest sale first), then preview the first 20.
df_train_val = df_train_val.sort_values("saledate")
df_train_val.head(20)

# Preprocessing

In [None]:
# DataFrame.info() writes its report to stdout and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df_train_val.info()

In [None]:
# `df_test` is only bound in the "Testing" section further down, so referencing
# it here raised NameError on a fresh kernel (Restart & Run All). Read the file
# directly for this preview instead of relying on out-of-order execution.
# info() prints its own report and returns None, so it is not wrapped in print().
pd.read_csv('/kaggle/input/bluebook-for-bulldozers/Test.csv',
            low_memory=False,
            parse_dates=['saledate']).info()

**Dropping SalesID feature**

In [None]:
# Remove the SalesID column from the training frame.
df_train_val = df_train_val.drop(columns='SalesID')

In [None]:
# Running list of columns dropped from the training frame; the same list is
# used later to drop the matching columns from the test frame.
removed_features = ['SalesID']
print(removed_features)

In [None]:
# info() prints its report itself and returns None; calling it directly avoids
# the extra "None" line that print(df.info()) produced.
df_train_val.info()

**Splitting saledate feature**

In [None]:
# Derive numeric calendar parts from `saledate` (appended in this order), then
# drop the raw timestamp column.
df_train_val = df_train_val.assign(
    saleYear=lambda frame: frame["saledate"].dt.year,
    saleMonth=lambda frame: frame["saledate"].dt.month,
    saleDay=lambda frame: frame["saledate"].dt.day,
    saleDayofweek=lambda frame: frame["saledate"].dt.dayofweek,
    saleDayofyear=lambda frame: frame["saledate"].dt.dayofyear,
).drop(columns="saledate")

**Converting string to categorical data**

In [None]:
# Turn every string-typed column into an ordered categorical; other columns
# are left untouched.
for label, content in df_train_val.items():
    if not pd.api.types.is_string_dtype(content):
        continue
    as_category = content.astype('category')
    df_train_val[label] = as_category.cat.as_ordered()

In [None]:
# Re-inspect column dtypes after the string -> category conversion.
df_train_val.info()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Scatter sale price against sale year for the first 1000 rows of the frame.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(df_train_val['saleYear'].head(1000),
           df_train_val['SalePrice'].head(1000))

In [None]:
# Histogram of the target column.
df_train_val['SalePrice'].plot(kind='hist')

In [None]:
# Show the first five rows transposed so every column is visible as a row.
df_train_val.head(5).transpose()

**Dropping features with missing values > 70%**

In [None]:
# Percentage of missing values per column, computed once for the whole frame
# instead of twice per column inside the loop.
missing_pct = 100 * df_train_val.isna().sum() / len(df_train_val)

# Record (and report) every column whose missing share exceeds 70%.
for label, pct in missing_pct[missing_pct > 70].items():
    # Skip names already recorded so re-running this cell does not append
    # duplicates to removed_features.
    if label not in removed_features:
        removed_features.append(label)
    print(label, '{0:.2f}%'.format(pct))

In [None]:
# Show the full list of columns marked for removal so far.
print(removed_features)

In [None]:
# Columns marked for removal that are still present in the training frame.
list(set(removed_features) & set(df_train_val.columns))

In [None]:
# Drop only those marked columns that actually exist in the frame, so names
# already removed earlier do not raise KeyError.
df_train_val.drop(
    columns=list(set(removed_features) & set(df_train_val.columns)),
    inplace=True)

In [None]:
# Inspect the remaining columns after dropping the sparse features.
df_train_val.info()

**Filling numerical data and converting categorical data to numerical data**

In [None]:
# Non-numeric columns become integer category codes shifted by +1 (pandas uses
# code -1 for missing, so missing maps to 0); numeric columns that contain
# missing values are filled with the column median.
for label, content in df_train_val.items():
    if not pd.api.types.is_numeric_dtype(content):
        df_train_val[label] = pd.Categorical(content).codes + 1
    elif content.isna().any():
        df_train_val[label] = content.fillna(content.median())

In [None]:
# Report any column that still contains missing values, with its percentage.
for label, content in df_train_val.items():
    n_missing = content.isna().sum()
    if n_missing:
        print(label,
              '{0:.2f}%'.format(100 * n_missing / len(df_train_val)))

In [None]:
# Final dtype / non-null overview of the training frame before modelling.
df_train_val.info()

# Training

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error, mean_absolute_error

**Function for metrics**

In [None]:
def rmsle(y_test, y_preds):
    """Return the root mean squared log error between targets and predictions.

    Computed as the square root of scikit-learn's mean_squared_log_error.
    """
    msle = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(msle)

def show_scores(model, X_train, y_train, valid=False, X_valid=None, y_valid=None):
    """Return a dict of MAE, RMSLE and R^2 scores for a fitted model.

    Parameters:
        model: fitted estimator exposing ``predict`` and ``score``.
        X_train, y_train: training features and targets (always scored).
        valid: when truthy, validation scores are added as well.
        X_valid, y_valid: validation features and targets; only read when
            ``valid`` is truthy.

    Returns:
        dict keyed 'Training MAE', 'Training RMSLE', 'Training R^2' and, when
        ``valid`` is truthy, 'Valid MAE', 'Valid RMSLE', 'Valid R^2'.
    """
    train_preds = model.predict(X_train)
    scores = {
        'Training MAE': mean_absolute_error(y_train, train_preds),
        'Training RMSLE': rmsle(y_train, train_preds),
        'Training R^2': model.score(X_train, y_train),
    }

    # Training-only report requested: nothing more to compute.
    if not valid:
        return scores

    val_preds = model.predict(X_valid)
    scores.update({
        'Valid MAE': mean_absolute_error(y_valid, val_preds),
        'Valid RMSLE': rmsle(y_valid, val_preds),
        'Valid R^2': model.score(X_valid, y_valid),
    })
    return scores

In [None]:
# Registry of fitted models, keyed by a short label.
models = {}

**Basic Model**
* Without validation
* Without hyper-parameter tuning

In [None]:
# A fixed random_state makes the forest (and every score / submission derived
# from it) reproducible across kernel restarts; n_jobs=-1 uses all cores.
models['basic'] = RandomForestRegressor(n_jobs=-1, random_state=42)

In [None]:
# Fit the baseline on every available row: all columns except the target are
# features, SalePrice is the target.
models['basic'].fit(df_train_val.drop(columns='SalePrice'),
                    df_train_val['SalePrice'])

* Evaluation

In [None]:
# Training-only scores for the baseline (it was fitted on all rows, so there
# is no held-out split to evaluate on).
show_scores(model=models['basic'],
            X_train=df_train_val.drop(columns='SalePrice'),
            y_train=df_train_val['SalePrice'])

**Validation Model**
* With validation
* Without hyper-parameter tuning

* Split data into **training** and **validation**

In [None]:
# Distinct sale years present in the data.
df_train_val['saleYear'].unique()

In [None]:
# Time-based split: sales from 2012 form the validation set, everything else
# is used for training.
df_valid = df_train_val.loc[df_train_val['saleYear'] == 2012]
df_train = df_train_val.loc[~(df_train_val['saleYear'] == 2012)]

(df_train.shape, df_valid.shape)

In [None]:
# Separate features from the SalePrice target for both splits.
X_train = df_train.drop(columns="SalePrice")
y_train = df_train["SalePrice"]
X_valid = df_valid.drop(columns="SalePrice")
y_valid = df_valid["SalePrice"]

(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

* Fitting the data

In [None]:
# Default hyper-parameters; random_state is fixed so the validation scores
# reported below can be reproduced on a fresh run.
models['no_tuning'] = RandomForestRegressor(n_jobs=-1, random_state=42)

In [None]:
# Fit on the training split only; the 2012 rows stay held out.
models['no_tuning'].fit(X=X_train, y=y_train)

* Evaluation

In [None]:
# Training and validation scores for the untuned model.
show_scores(models['no_tuning'], X_train, y_train,
            valid=True, X_valid=X_valid, y_valid=y_valid)

**RandomizedSearchCV Model**
* With validation
* With hyper-parameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

* Hyperparameters to tune

In [None]:
# Search space sampled by RandomizedSearchCV.
# max_features: the int 1 means "one feature per split", the float 1.0 means
# "all features". 1.0 replaces the former "auto" option, which meant the same
# thing for regressors but has been removed from scikit-learn and now raises a
# parameter-validation error whenever it is sampled.
rf_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           "max_features": [0.5, 1, "sqrt", 1.0],
           "max_samples": [20000]}

* Fitting data

In [None]:
%%time
rs_model = RandomizedSearchCV(RandomForestRegressor(),
                              param_distributions=rf_grid,
                              n_iter=20,
                              cv=5,
                              verbose=True)

rs_model.fit(X_train, y_train)

* Best hyperparameters

In [None]:
# Keep the winning hyper-parameter combination; it is reused below to build
# the final tuned model.
best_params = rs_model.best_params_
best_params

* Evaluation

In [None]:
# Score the fitted search object itself on both splits.
show_scores(rs_model, X_train, y_train,
            valid=True, X_valid=X_valid, y_valid=y_valid)

**Adding rs_model to models dictionary**
* Fitting data

In [None]:
# Rebuild a forest from the best hyper-parameters found by the search and fit
# it on the training split. random_state is fixed so the reported scores and
# the resulting submission are reproducible.
# NOTE(review): best_params['max_samples'] is not forwarded here, so this
# model is not limited to the 20000-row bootstrap used during the search --
# presumably intentional; confirm.
models['rs'] = RandomForestRegressor(n_jobs=-1,
                                     random_state=42,
                                     n_estimators=best_params['n_estimators'],
                                     min_samples_split=best_params['min_samples_split'],
                                     min_samples_leaf=best_params['min_samples_leaf'],
                                     max_features=best_params['max_features'],
                                     max_depth=best_params['max_depth'])
models['rs'].fit(X_train, y_train)

* Evaluation

In [None]:
# Training and validation scores for the tuned model.
show_scores(models['rs'], X_train, y_train,
            valid=True, X_valid=X_valid, y_valid=y_valid)

# Testing

# Loading Data

In [None]:
# Read the test file with the same options used for the training file
# (saledate parsed into datetimes).
df_test = pd.read_csv(
    '/kaggle/input/bluebook-for-bulldozers/Test.csv',
    parse_dates=['saledate'],
    low_memory=False,
)

# Preprocessing

In [None]:
# Preview the first ten test rows.
df_test.iloc[:10]

In [None]:
# First five test rows, transposed so each column is shown as a row.
df_test.head(5).transpose()

**Splitting saledate feature**

In [None]:
# Same calendar features as derived for the training frame, appended in the
# same order; the raw timestamp column is then dropped.
df_test = df_test.assign(
    saleYear=lambda frame: frame['saledate'].dt.year,
    saleMonth=lambda frame: frame['saledate'].dt.month,
    saleDay=lambda frame: frame['saledate'].dt.day,
    saleDayofweek=lambda frame: frame['saledate'].dt.dayofweek,
    saleDayofyear=lambda frame: frame['saledate'].dt.dayofyear,
).drop(columns='saledate')

In [None]:
# Check that the new sale* columns were added and saledate is gone.
df_test.head(5).transpose()

In [None]:
# Keep the SalesID values aside before the column is dropped; they are needed
# again when the submission files are written.
salesID = df_test['SalesID']
salesID.head(5)

**Reducing features to match the model**

In [None]:
# Drop from the test frame every column that was removed from the training
# frame (restricted to names that actually exist in the test frame).
df_test.drop(
    columns=list(set(removed_features) & set(df_test.columns)),
    inplace=True)

In [None]:
# List the remaining test columns after dropping the removed features.
df_test.columns

**Filling missing numerical data and converting string data to numerical (categorical) codes**

In [None]:
# Fraction of missing values per test column (before imputation).
df_test.isna().mean()

In [None]:
# Encode the test set with the TRAINING data's medians and category vocabulary.
# Previously both were derived from the test set itself, so an integer code in
# the test frame did not refer to the same category the models were trained on
# (codes are positions in the sorted set of values seen in *that* frame).
# The training frame has already been overwritten with codes at this point, so
# the raw training columns are re-read here, restricted to the columns still
# present in the test frame.
train_reference = pd.read_csv(
    '/kaggle/input/bluebook-for-bulldozers/TrainAndValid.csv',
    low_memory=False,
    usecols=lambda name: name in df_test.columns)

for label, content in df_test.items():
    # Columns engineered in this notebook (the sale* date parts) do not exist
    # in the raw file; for those the column itself is the reference, which
    # reproduces the earlier behaviour.
    reference = train_reference[label] if label in train_reference.columns else content

    if pd.api.types.is_numeric_dtype(reference):
        if pd.isna(content).sum():
            # Same imputation value the training rows received.
            df_test[label] = content.fillna(reference.median())
        # NOTE(review): assumes a column that is numeric in the training file
        # is also numeric in the test file -- confirm for this dataset.
    else:
        # Reuse the training vocabulary: values unseen during training and
        # missing values both get code -1, i.e. 0 after the +1 shift, matching
        # how missing values were encoded for training.
        train_categories = reference.astype('category').cat.categories
        df_test[label] = pd.Categorical(content, categories=train_categories).codes + 1
        

In [None]:
# Fraction of missing values per test column (after imputation / encoding).
df_test.isna().mean()

# Predictions

In [None]:
# Predict the test set with every registered model, keyed by the model label.
test_preds = {label: model.predict(df_test) for label, model in models.items()}

# Saving submission files

In [None]:
# Write one submission CSV per model: SalesID alongside the predicted price.
for label, preds in test_preds.items():
    submission_path = 'my_submission_{}.csv'.format(label)
    pd.DataFrame({'SalesID': salesID, 'SalePrice': preds}).to_csv(submission_path, index=False)

print('Your submission was successfully saved!')