In [None]:
# Reload edited modules automatically before each cell runs
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output
%matplotlib inline

# Folder holding the input data; list its contents as a quick sanity check
datafolder = "../input"
!ls {datafolder}

In [None]:
import numpy as np 
import pandas as pd

from fastai.tabular.transform import *
from fastai.tabular.data import TabularDataBunch

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

## Data import & first look

In [None]:
# Read the training CSV, parsing 'saledate' as a datetime column
df_raw = pd.read_csv(f'{datafolder}/train/Train.csv', low_memory=False, parse_dates=['saledate'])
# (rows, columns) of the raw frame
df_raw.shape

In [None]:
# Preview the first 10 rows of the raw data
display(df_raw.head(10))
# Summary statistics for every column (all dtypes), transposed so each variable is one row
display(df_raw.describe(include='all').T)

## Performance measurement - Loss function

- The loss function is **root mean squared log error (RMSLE)**, which measures the **relative (percentage) price difference**
- A difference of logs describes a percentage increase/decrease, which is more meaningful than an absolute difference when comparing prices
  - log(a) - log(b) = log(a/b) 

In [None]:
# Store the log-price in a NEW column rather than overwriting SalePrice in place:
# re-running this cell then recomputes the same values instead of silently
# applying log a second time.
df_raw['logSalePrice'] = np.log(df_raw['SalePrice'])

## Feature engineering

### Principle
1. Convert to numerical values
    - most machine learning models only accept numerical data types, so any other type of data must be converted to numerical values
2. The more features the merrier
    - include all the features you can think of that might be helpful; redundancy is not a concern (subject to empirical testing)
3. Fill missing values (NA)
4. Normalization (e.g. zero mean, unit standard deviation)

### Practice
Docs: https://docs.fast.ai/tabular.transform.html
<br>
`from fastai.tabular.transform import *`
1. Convert to numerical values
    - categorical -> numerical
        - `Categorify()`
    - datetime -> numerical
        - `add_datepart()`
2. __data scientist's job!__
3. FillMissing()
    - categorical 
        - NA=-1 by default
        - +1 to all categories
        - now NA=0
    - numerical
        - NA replaced by median by default
        - other strategies: mean, mode, specific number etc.
4. Normalize()

### Automate everything!
`from fastai.tabular.data import TabularDataBunch`


In [None]:
# display the datatype and number of NAs of a dataframe
def info(df):
    """Display each column's dtype and missing-value count side by side.

    Args:
        df: DataFrame to inspect.

    Shows (via ``display``) a table indexed by column name with two columns,
    ``datatype`` and ``na_count``. Returns nothing.
    """
    summary = pd.concat(
        [df.dtypes.rename('datatype'), df.isna().sum().rename('na_count')],
        axis=1,
    )
    # Temporarily raise pandas' display limits so wide frames are not truncated.
    with pd.option_context('display.max_rows', 1000, 'display.max_columns', 1000):
        display(summary)
# dtypes and NA counts of the raw training data
info(df_raw)

In [None]:
# return list of columns of specific datatypes 
def datatype(df):
    """Group a DataFrame's column names by dtype family.

    Args:
        df: DataFrame whose columns are classified.

    Returns:
        Tuple ``(date_cols, num_cols, cat_cols)`` of column-name lists:
        datetime columns, number/bool columns, and object-dtype columns.
        The three lists are also printed.
    """
    def columns_of(*kinds):
        # Names of the columns whose dtype matches any of `kinds`, in frame order.
        return df.select_dtypes(include=list(kinds)).columns.tolist()

    date_cols, num_cols, cat_cols = (
        columns_of('datetime'),
        columns_of('number', 'bool'),
        columns_of('object'),  # object dtype may hold more than just strings
    )
    print(f"Date columns: {date_cols} \n\nNumerical columns:{num_cols} \n\nString columns: {cat_cols}")
    return date_cols, num_cols, cat_cols
# Column-name lists by dtype, computed on the raw frame (before the transforms in later cells)
date_cols, num_cols, cat_cols = datatype(df_raw)

In [None]:
# Convert the datetime column to numerical/boolean columns and, at the same time,
# add a set of generic date-derived features.
# NOTE(review): the return value is unused, so add_datepart presumably modifies df_raw
# in place and drops 'saledate'; re-running this cell on the same frame would then
# fail — confirm against the fastai docs.
add_datepart(df_raw, 'saledate')

# Convert the string columns to categorical dtype.
# num_cols was computed before add_datepart ran, so it does not list the new date-part columns.
cat_to_num = Categorify(cat_cols, num_cols)
cat_to_num(df_raw)

In [None]:
# After the transforms above, no column should remain datetime or plain-string dtype;
# re-inspect dtypes and NA counts to confirm
info(df_raw)

In [None]:
# Fill missing values.
# NOTE(review): cat_cols/num_cols still reflect the column set from before add_datepart —
# confirm that is the intended set of columns to fill.
fillNA = FillMissing(cat_cols,num_cols)
fillNA(df_raw)

In [None]:
# y: labels (log-price); x: features (every column except the raw and log prices)
y = df_raw['logSalePrice']
x = df_raw.drop(columns=['logSalePrice', 'SalePrice'])

In [None]:
# Automate the whole preprocessing pipeline with a fastai DataBunch.
# - Validation set: the last 2000 rows of df_raw (the range end is exclusive, so it must be
#   len(df_raw) to include the final row).
# - SalePrice is dropped before building the DataBunch: it is the un-logged target, and
#   because cont_names is not passed, fastai v1 treats every column that is not in
#   cat_names and is not dep_var as a continuous input feature (target leakage otherwise).
preprocessing = [Categorify, FillMissing, Normalize]
data = TabularDataBunch.from_df(f'{datafolder}/train/', df_raw.drop(columns=['SalePrice']),
                                dep_var='logSalePrice',
                                valid_idx=range(len(df_raw) - 2000, len(df_raw)),
                                procs=preprocessing, cat_names=cat_cols)

In [None]:
# Show the DataBunch's repr
data

## Random Forest

"bagging of weaker decision trees"
- bootstrapping (drawing with replacement): each weaker decision tree is trained on a subset of the data
- the less correlated the trees are, the better (most important)
- the more accurate the trees are, the better 

### Pros:
- works well universally
- suitable for any data type and for both classification and regression problems
- little to no statistical assumptions (e.g. independence, normally distributed data, linear relationships, explicitly modeled interactions)
- requires little to no preprocessing such as normalization
- doesn't tend to overfit, and overfitting is easy to prevent
- doesn't require a separate validation set: out-of-bag (OOB) samples let it estimate how well it generalizes from the training data alone

### How: 

In [None]:
# NOTE(review): model fit is left disabled. Before enabling it, confirm every column of x
# is numeric — the Categorify step earlier converts string columns to categorical dtype.
#randomforest = RandomForestRegressor(n_jobs=-1) # n_jobs=-1: use all CPUs, n_jobs=1: no parallelism
#randomforest.fit(x, y)