In [None]:
#import packages
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import time 
import datetime
import collections
# from plotnine import *
import seaborn as sns
import scipy.stats as stats

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge 
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor


import xgboost as xgb
import lightgbm as lgb


# Table of Contents
* [Overview](#1)
    - [Raw Data Structure](#1.1)
    - [Data Size](#1.2)
    - [Feature Type](#1.3)
    - [Missing Rate and Null Treatment](#1.4) 
* [Target Variable](#2)
    - [Time Series](#2.1)
    - [Unconditional Distribution](#2.2)
    - [Conditional Distribution](#2.3)
* [Features](#3)
    - [Categorical Features](#3.1)
        - [Distribution](#3.1.1)
        - [Correlation with Y](#3.1.2)
        - [Correlation within X](#3.1.3)
        - [Feature Importance via RF](#3.1.4)        
    - [Numerical Features](#3.2)
        - [Distribution](#3.2.1)
        - [Correlation with Y](#3.2.2)
        - [Correlation within X](#3.2.3)
        - [Feature Importance via RF](#3.2.4)
* [New Features](#4)
 
 

# EDA


<a id="1"></a>
## Overview

<a id="1.1"></a>
### Raw Data Structure
  - raw features include properties features in 2016 and 2017
  - target data includes house transaction data with time stamp and log error between transaction price and zestimate in 2016 and 2017
  - submission data: list of properties to predict.


<a id="1.2"></a>
### Data Size 

In [None]:
# Directory holding the competition CSV files (Kaggle input mount).
data_dir = '/kaggle/input/zillow-prize-1/'

# Targets (one file per year) followed by the matching property tables.
y_train_2016 = pd.read_csv(data_dir + "train_2016_v2.csv")
y_train_2017 = pd.read_csv(data_dir + "train_2017.csv")
X_train_2016 = pd.read_csv(data_dir + "properties_2016.csv")
X_train_2017 = pd.read_csv(data_dir + "properties_2017.csv")

# The sample submission is read only so its shape can be reported below.
df_submission = pd.read_csv(data_dir + "sample_submission.csv")

print(f"\nX_train_2016 shape: {X_train_2016.shape}\ny_train_2016 shape: {y_train_2016.shape}\n2017 X_train shape: {X_train_2017.shape} \
      \n y_train_2017 shape: {y_train_2017.shape}\ntest_data shape:{df_submission.shape}")
del df_submission

<a id="1.3"></a>
### Feature Types
* 52 float, 1 integer and 5 object type
* Many features have small number of unique values, likely to be categorical variables
* Based on the data dictionary and variable names, there are also quite a few categorical variables related to geographic location. We need to encode categorical variables if we want to use linear regression. 
* Encode string values using numeric integer code

In [None]:
# Per-column dtype and number of distinct (non-null) values in the 2016 property table.
data_type = X_train_2016.dtypes
data_cnt = X_train_2016.nunique()
print(f"frequencey count of differnet types:{collections.Counter(data_type)}")

# One row per feature; later cells add missing-rate columns to this frame.
data_summary = pd.DataFrame({'dtype': data_type, 'unique_cnt': data_cnt})
print(data_summary.sort_values(by=['dtype','unique_cnt']))

In [None]:
# Closer look at the columns pandas read as `object` dtype.
is_object = data_summary['dtype']=='object'
col_object = data_summary[is_object].index
X_train_2016[col_object].describe()

In [None]:
# String-valued columns that are replaced by integer codes.
col_str = ['propertycountylandusecode','propertyzoningdesc']


def clean_properties(df_raw, str_cols):
    """Return a cleaned copy of a raw property table.

    Steps (the input frame is not modified):
      * 'hashottuborspa' and 'fireplaceflag': missing -> 0, then cast to int.
      * 'taxdelinquencyflag': 1.0 where the raw value is 'Y', 0.0 otherwise
        (including missing).
      * every column in ``str_cols``: replaced by an integer code assigned in
        order of first appearance. The encoded column is moved to the end of
        the frame, as a side effect of the merge/drop/rename sequence.

    NOTE(review): the codes are derived independently for each table passed in,
    so the same string can map to different integers in 2016 and 2017.
    NOTE(review): ``unique()`` includes NaN and ``merge`` matches NaN keys, so
    missing strings receive their own code and no longer count as missing.
    """
    df = df_raw.copy()

    flag_cols = ['hashottuborspa','fireplaceflag']
    df[flag_cols] = df[flag_cols].fillna(value=0).astype(int)

    # Bug fix: the previous two-step assignment set the 'Y' rows to 1.0 and then
    # reset *every* row to 0.0, because by then no value equalled 'Y' any more.
    df['taxdelinquencyflag'] = (df['taxdelinquencyflag'] == 'Y').astype(float)

    for col in str_cols:
        uniq_val = df[col].unique()
        df_encoding = pd.DataFrame({col: uniq_val, col+'_e': np.arange(len(uniq_val))})
        df = pd.merge(df, df_encoding, on=[col], how='left')
        df = df.drop(columns=col).rename(columns={col+'_e': col})
    return df


# Same cleaning for both years; the raw tables are released afterwards to save memory.
X_2016_clean = clean_properties(X_train_2016, col_str)
X_2017_clean = clean_properties(X_train_2017, col_str)

del X_train_2016, X_train_2017


<a id="1.4"></a>
### Missing Rate and Null Treatment
* Missing Rate: 17 features with more than 90% missing, could drop these. 
* Null treatment: 9 features with significant missing rate between 0.5 to 0.9, worth deepdive to identify appropriate imputation strategy: impute 4 count variable missing to 0.

In [None]:
# Share of missing values per feature over the full 2016 property table.
data_summary['missing_rate'] = 1 - X_2016_clean.count()/X_2016_clean.shape[0]

# Same statistic restricted to the parcels present in the 2016 target file.
train_data = y_train_2016.merge(X_2016_clean, on=['parcelid'], how='left')
mr = 1 - train_data.count()/train_data.shape[0]
mr.name = 'missing_rate_sample'
data_summary = pd.concat([data_summary, mr], axis=1)

# Bucket the features by missing rate. Each assignment overwrites the previous
# one, so the lower bounds have to be applied in increasing order.
mr = data_summary.copy()
mr.loc[mr.missing_rate == 0, 'mr_type'] = 'No Missing'
for lower_bound, bucket in [(0, '(0,0.1]'), (0.1, '(0.1,0.5]'), (0.5, '(0.5,0.9]'), (0.9, '(0.9, 1)')]:
    mr.loc[mr.missing_rate > lower_bound, 'mr_type'] = bucket

# Bar chart of the number of features per bucket, annotated with the counts.
df_plt = mr.groupby('mr_type')['dtype'].count()
ax = df_plt.plot(kind='bar')
ax.set_xlabel('missing rate range')
ax.set_ylabel('No. of Features')
for bar_pos, n_features in enumerate(df_plt):
    ax.text(bar_pos, n_features + 0.5, str(n_features), color='blue', fontweight='bold')

In [None]:
# Features in the two middle missing-rate buckets, i.e. missing rate in (0.1, 0.9].
df_impute = mr[mr.mr_type.isin(['(0.5,0.9]','(0.1,0.5]'])]
X_2016_clean[df_impute.index].describe().transpose()

In [None]:
# Zero-fill the partially missing features, except the garage columns and the
# type/quality/count columns named below, which are left as missing.
keep_missing = {c for c in df_impute.index if 'garage' in c} | {
    'regionidneighborhood','numberofstories','unitcnt','airconditioningtypeid','buildingqualitytypeid','heatingorsystemtypeid'}
col1 = [c for c in df_impute.index if c not in keep_missing]
X_2016_clean[col1] = X_2016_clean[col1].fillna(value=0)

# Where poolcnt is 0, set the pool type ids and pool area to 0 as well.
col1 = ['pooltypeid2','pooltypeid7','pooltypeid10','poolsizesum']
no_pool = X_2016_clean.poolcnt==0
X_2016_clean.loc[no_pool, col1] = 0

In [None]:
# Recompute the per-feature missing rate on the current state of X_2016_clean.
data_summary['missing_rate'] = 1 - X_2016_clean.count()/X_2016_clean.shape[0]

# Rebuild the target-joined sample and replace the stale sample missing rate.
train_data = y_train_2016.merge(X_2016_clean, on=['parcelid'], how='left')
mr = 1 - train_data.count()/train_data.shape[0]
mr.name = 'missing_rate_sample'
data_summary = data_summary.drop(columns='missing_rate_sample')
data_summary = pd.concat([data_summary, mr], axis=1)

# Bucket the features by missing rate. Each assignment overwrites the previous
# one, so the lower bounds have to be applied in increasing order.
mr = data_summary.copy()
mr.loc[mr.missing_rate == 0, 'mr_type'] = 'No Missing'
for lower_bound, bucket in [(0, '(0,0.1]'), (0.1, '(0.1,0.5]'), (0.5, '(0.5,0.9]'), (0.9, '(0.9, 1)')]:
    mr.loc[mr.missing_rate > lower_bound, 'mr_type'] = bucket

# Bar chart of the number of features per bucket, annotated with the counts.
df_plt = mr.groupby('mr_type')['dtype'].count()
ax = df_plt.plot(kind='bar')
ax.set_xlabel('missing rate range')
ax.set_ylabel('No. of Features')
for bar_pos, n_features in enumerate(df_plt):
    ax.text(bar_pos, n_features + 0.5, str(n_features), color='blue', fontweight='bold')

<a id="2"></a>
## Target Variable
* Clearly there are quite a few outliers in the target variable

<a id="2.1"></a>
### Time Series
* Clear Seasonality in Volume, seasonal dummy as features
* Outliers each month
* Multiple transaction per properties: 320


In [None]:
# Daily transaction count for each year.
date_col ='transactiondate'
# Parse the date strings in one vectorised call instead of building a
# pd.Timestamp per row; the resulting 'date_' column is reused by later cells.
y_train_2016['date_'] = pd.to_datetime(y_train_2016[date_col])
y_train_2017['date_'] = pd.to_datetime(y_train_2017[date_col])

fig, ax = plt.subplots(2,1,figsize=(10,10))
ys = [y_train_2016, y_train_2017]
titles = ['Transaction Volume over Time in 2016', 'Transaction Volume over Time in 2017']
for i, y in enumerate(ys):
    # Number of (non-null) logerror records per transaction date.
    plti = y.groupby('date_')['logerror'].count()
    ax[i].plot(plti.index, plti.values)
    ax[i].set_title(titles[i])
    ax[i].set_xlabel('Transaction Date')
    ax[i].set_ylabel('Transaction Count')
    
    

In [None]:
# Monthly box plots of logerror, one panel per year.
fig, ax = plt.subplots(2,1,figsize=(10,10))
ys = [y_train_2016, y_train_2017]
titles = ['Logerror Distribution over Time in 2016', 'Log Error Distribution over Time in 2017']
for i, y in enumerate(ys):
    # Year-month key as an integer YYYYMM; written onto the frame itself.
    y['ym'] = y.date_.dt.year*100 + y.date_.dt.month
    sns.boxplot(x='ym',y='logerror', data=y, ax=ax[i])
    ax[i].set_title(titles[i])
    ax[i].set_xlabel('Year-Month')
    # Fix: the y axis shows logerror, not a transaction count.
    ax[i].set_ylabel('logerror')


* How many properties have more than one transactions in 2016/2017?


In [None]:
# Number of recorded transactions per parcel, separately for each year.
transaction_2016_cnt, transaction_2017_cnt = (
    targets.groupby('parcelid')['transactiondate'].count()
    for targets in (y_train_2016, y_train_2017)
)

# Parcels that appear more than once within the same year.
multiple_2016 = transaction_2016_cnt.loc[transaction_2016_cnt.gt(1)]
multiple_2017 = transaction_2017_cnt.loc[transaction_2017_cnt.gt(1)]
n_multiple = len(multiple_2016) + len(multiple_2017)
print(f"{n_multiple} properties have multiple transactions within 2016 or 2017")


<a id="2.2"></a>
### Unconditional Distribution 
* comparison between raw and winsorized data
* winsorize by yearmon at different threshold

In [None]:
# Histogram of the raw (un-winsorized) logerror, one panel per year, 500 bins each.
fig, ax = plt.subplots(1,2,figsize=(15,5))
yearly_targets = [('2016', y_train_2016), ('2017', y_train_2017)]
for axis, (year, targets) in zip(ax, yearly_targets):
    axis.hist(targets.logerror,500)
    axis.set_title('Histogram of log error in ' + year)
    axis.set_ylabel('Count')
    axis.set_xlabel('logerror')

In [None]:
#winsorize
def winsorize(df,date_col,data_col,limit=(0.01,0.99)):
    """Add winsorized copies of ``data_col`` to ``df`` and return ``df``.

    ``df`` is modified IN PLACE and the same object is returned. Other cells
    hold references to the frames they pass in, so this aliasing is intentional
    and must be kept.

    Columns written (``<lo>`` is ``str(limit[0])``):
      * ``ym``: integer year-month key (YYYYMM) derived from ``date_col``.
      * ``<data_col>_wc_<lo>``: values clipped to the quantiles computed
        separately within each ``ym`` group.
      * ``<data_col>_w_<lo>``: values clipped to the quantiles computed over
        the whole column.

    Parameters
    ----------
    df : pandas.DataFrame
    date_col : str
        Column holding dates (datetime-like, or anything ``pd.to_datetime``
        can parse).
    data_col : str
        Numeric column to winsorize.
    limit : sequence of two floats
        Lower and upper quantile, e.g. ``(0.01, 0.99)``.

    Raises
    ------
    ValueError
        If ``limit`` does not contain exactly two values.
    """
    lower_q, upper_q = limit
    quantiles = [lower_q, upper_q]

    def _clip_to_quantiles(values):
        # Clip one Series to its own lower/upper quantiles.
        bounds = values.quantile(quantiles)
        return values.clip(lower=bounds.iloc[0], upper=bounds.iloc[1])

    # Vectorised year-month key instead of a Python lambda per row.
    dates = pd.to_datetime(df[date_col])
    ym = dates.dt.year*100 + dates.dt.month
    # Keep the 64-bit integer key unless missing dates force a float column.
    df['ym'] = ym if ym.isna().any() else ym.astype('int64')

    suffix = str(lower_q)
    df[data_col+'_wc_'+suffix] = df.groupby('ym')[data_col].transform(_clip_to_quantiles)
    df[data_col+'_w_'+suffix] = _clip_to_quantiles(df[data_col])
    return df
y_train_2016 = winsorize(y_train_2016,'date_','logerror',[0.01,0.99])
y_train_2017 = winsorize(y_train_2017,'date_','logerror',[0.01,0.99])

# Top row: clipped over the whole year. Bottom row: clipped within each year-month.
fig, ax = plt.subplots(2,2,figsize=(15,10))
ax = ax.ravel()
panels = [
    (y_train_2016, 'logerror_w_0.01', '2016', 'winsorized at [0.01,0.99] logerror'),
    (y_train_2017, 'logerror_w_0.01', '2017', 'winsorized at [0.01,0.99] logerror'),
    (y_train_2016, 'logerror_wc_0.01', '2016', 'winsorized by ym at [0.01,0.99] logerror'),
    (y_train_2017, 'logerror_wc_0.01', '2017', 'winsorized by ym at [0.01,0.99] logerror'),
]
for axis, (targets, column, year, xlabel) in zip(ax, panels):
    axis.hist(targets[column],100)
    axis.set_title('Histogram of winsorized logerror in ' + year)
    axis.set_ylabel('Count')
    axis.set_xlabel(xlabel)


In [None]:
y_train_2016 = winsorize(y_train_2016,'date_','logerror',[0.05,0.95])
y_train_2017 = winsorize(y_train_2017,'date_','logerror',[0.05,0.95])

# Top row: clipped over the whole year. Bottom row: clipped within each year-month.
fig, ax = plt.subplots(2,2,figsize=(15,10))
ax = ax.ravel()
# Fix: the 2017 top-row panel used to be labelled "[0.01,0.99]" although it
# shows the 5%/95% winsorized column.
panels = [
    (y_train_2016, 'logerror_w_0.05', '2016', 'winsorized at [0.05,0.95] logerror'),
    (y_train_2017, 'logerror_w_0.05', '2017', 'winsorized at [0.05,0.95] logerror'),
    (y_train_2016, 'logerror_wc_0.05', '2016', 'winsorized by ym at [0.05,0.95] logerror'),
    (y_train_2017, 'logerror_wc_0.05', '2017', 'winsorized by ym at [0.05,0.95] logerror'),
]
for axis, (targets, column, year, xlabel) in zip(ax, panels):
    axis.hist(targets[column],100)
    axis.set_title('Histogram of winsorized logerror in ' + year)
    axis.set_ylabel('Count')
    axis.set_xlabel(xlabel)


<a id="2.3"></a>
### Conditional Distribution
* Simple 1-d Gaussian Mixture Model 

In [None]:
from sklearn import mixture 

# Fit a 3-component 1-D Gaussian mixture to the monthly-winsorized logerror of
# each year and overlay the weighted component densities on the histogram.
vname = 'logerror_wc_0.01'
# Fix: the 2017 title was missing the space before the variable name.
titles = ['GMM of 2016 '+vname,'GMM of 2017 '+vname]
# Listed explicitly so the cell does not depend on `ys` left over from an earlier cell.
ys = [y_train_2016, y_train_2017]
component_colors = ['blue', 'green', 'm']

fig, ax = plt.subplots(1,2,figsize=(15,5))
for i, y in enumerate(ys):
    f = y[[vname]]  # 2-D (n_samples, 1) input for GaussianMixture.fit

    # Fixed seed so the fitted components (and the figure) are reproducible;
    # the value 0 itself is arbitrary.
    g = mixture.GaussianMixture(n_components=len(component_colors),covariance_type='full',random_state=0)
    g.fit(f)
    weights = g.weights_
    means = g.means_
    covars = g.covariances_  # shape (n_components, 1, 1) for 'full' with one feature

    x_sorted = np.sort(f[vname].to_numpy())
    ax[i].hist(x_sorted, bins=100, histtype='bar', density=True, ec='red', alpha=0.5)
    for k, color in enumerate(component_colors):
        sigma = np.sqrt(covars[k, 0, 0])
        ax[i].plot(x_sorted, weights[k]*stats.norm.pdf(x_sorted, means[k, 0], sigma), c=color)
    ax[i].set_title(titles[i])

* Anything special for properties with multiple transactions?

In [None]:
# Compare normal densities fitted to parcels sold more than once in a year
# against parcels sold exactly once, on top of the overall histogram.
vname = 'logerror_wc_0.01'
fig, ax = plt.subplots(1,2,figsize=(15,5))
ax = ax.ravel()
# Fix: the 2017 title was missing the space before the variable name.
titles = ['Multiple vs Unique Transactions of 2016 '+vname,'Multiple vs Unique Transactions of 2017 '+vname]
# Listed explicitly so the cell does not depend on `ys` left over from an earlier cell.
ys = [y_train_2016, y_train_2017]
for i, y in enumerate(ys):
    cnt = y.groupby('parcelid')['transactiondate'].count()
    idx = cnt[cnt>1].index
    is_multiple = y.parcelid.isin(idx)
    f1 = y.loc[is_multiple,vname]   # parcels with more than one transaction
    f2 = y.loc[~is_multiple,vname]  # parcels with a single transaction

    x_sorted = np.sort(y[vname].to_numpy())
    ax[i].hist(x_sorted, bins=100, histtype='bar', density=True, ec='red', alpha=0.5)
    ax[i].plot(x_sorted,stats.norm.pdf(x_sorted,np.mean(f1),np.std(f1)), c='blue',label='multiple transactions')
    ax[i].plot(x_sorted,stats.norm.pdf(x_sorted,np.mean(f2),np.std(f2)), c='green',label='unique transaction')
    ax[i].set_title(titles[i])
    ax[i].legend()

<a id="3"></a>
## Features

<a id="3.1"></a>
### Categorical Features
* Filter by Missing Rate: three categorical variables with many unique values, drop ['propertyzoningdesc','censustractandblock','rawcensustractandblock']
* Distribution: class imbalance, drop ['pooltypeid10','pooltypeid2','fireplaceflag','taxdelinquencyflag']
* Encode categorical variables with One Hot Encoder. 
<!-- * Relationship with Target: scatter plot of group median
# * Relationship with each other: correlation, hierarchical clustering (gower taking too long), remove highly correlated ones. 
# * Feature Importance: RF
#  -->

In [None]:
# Categorical features are collected from three sources:
#   1. the string columns that were integer-encoded (col_str),
#   2. columns whose name contains 'typeid' or 'region', plus fips / census tract ids,
#   3. any remaining column with exactly one distinct raw value.
id_like = [c for c in X_2016_clean.columns if 'typeid' in c or 'region' in c]
named_categorical = col_str + id_like + ['fips','rawcensustractandblock','censustractandblock']

# Fix: iterate the columns in frame order instead of through a set difference.
# Set iteration order for strings changes between kernel restarts, which made
# the panel order and the dummy-column order downstream non-reproducible.
col_rest = [c for c in X_2016_clean.columns
            if c not in named_categorical and data_summary.loc[c,'unique_cnt']==1]
col_categorical = named_categorical + col_rest
print(f'{len(col_categorical)} categorical featrues identified')

col_categorical_filtered = [c for c in col_categorical if data_summary.loc[c,'missing_rate']<=0.5]
print(f'{len(col_categorical_filtered)} categorical features after filtering out high missing rate.')

# Column count before dummy creation; a later cell uses it to report how many were added.
l1 = X_2016_clean.shape[1] 
data_summary.loc[col_categorical_filtered,:].sort_values('unique_cnt')


<a id="3.1.1"></a>
#### Distribution
* Four variables have very high class imbalance or only one value for all observations, and should be dropped: pooltypeid10, pooltypeid2, fireplaceflag, taxdelinquencyflag

In [None]:
# Frequency chart: one bar plot of level counts per categorical feature.
col_drop = ['propertyzoningdesc','censustractandblock','rawcensustractandblock']
col_categorical_filtered = [c for c in col_categorical_filtered if c not in col_drop]
print(f'{len(col_categorical_filtered)} categorical features after filtering out features with too many unique values.')

# At least the original 3x5 grid; extra rows are added if there are more than
# 15 features, so the loop below cannot run off the end of the axes array.
n_cols = 5
n_rows = max(3, -(-len(col_categorical_filtered) // n_cols))
fig, ax = plt.subplots(n_rows, n_cols, figsize=(30, 5*n_rows))
ax = ax.ravel()
for i, c in enumerate(col_categorical_filtered):
    print(f"{c} start.")
    # groupby drops missing keys by default, so there is no need to copy the
    # non-null rows of the whole frame first.
    df_br = X_2016_clean.groupby(c)['parcelid'].count()
    bri = np.arange(df_br.shape[0])
    ax[i].bar(bri, df_br.values)
    # Fix: the title used to be set only inside the branch below, leaving
    # high-cardinality panels unlabelled.
    ax[i].set_title(c)
    if data_summary.loc[c,'unique_cnt']<100:
        # Level names are only readable as tick labels for small cardinalities.
        ax[i].set_xticks(bri)
        ax[i].set_xticklabels(df_br.index,rotation=45)
# Hide panels that have no feature assigned.
for unused_axis in ax[len(col_categorical_filtered):]:
    unused_axis.set_visible(False)

col_drop =  ['pooltypeid10','pooltypeid2','fireplaceflag','taxdelinquencyflag']
col_categorical_filtered = [c for c in col_categorical_filtered if c not in col_drop]
print(f'{len(col_categorical_filtered)} categorical features after filtering out features with highly imbalanced class distrubtion.')

<!-- <a id="3.1.2"></a>
#### Correlation with Y -->

In [None]:
# import category_encoders as ce

# c = 'propertycountylandusecode'

# # dummy_i = pd.get_dummies(X_2016_clean[c])
# encoder=ce.HashingEncoder(cols=[c],n_components=100)
# dummy_i = encoder.fit_transform(X_2016_clean.iloc[1:500000])

In [None]:
# c = 'propertycountylandusecode'
# encoder=ce.HashingEncoder(cols=c,n_components=50)
# dummy_i = encoder.fit_transform(X_2016_clean)

In [None]:
# One-hot encode every retained categorical feature with fewer than 100 distinct
# values; the last indicator column of each feature is dropped.
dummy_frames = []
for c in col_categorical_filtered:
    if X_2016_clean[c].nunique() >= 100:
        continue
    print(f"dummy for {c} created.")
    indicators = pd.get_dummies(X_2016_clean[c], prefix=f'dummy_{c}')
    dummy_frames.append(indicators.iloc[:, :-1])

# Append all indicator columns to the property table in one concat.
dummies = pd.concat(dummy_frames, axis=1)
X_2016_clean = pd.concat([X_2016_clean, dummies], axis=1)
l2 = X_2016_clean.shape[1]
print(f"{l2-l1} categorical dummies created.")

In [None]:
# fig, ax =plt.subplots(4,5, figsize=(30,20))
# ax = ax.ravel()


# for i, c in enumerate(col_categorical_filtered):
#     print(f"{c} start.")
#     df_plt = train_data.loc[train_data[c].notnull(),:]
#     df_br= df_plt.groupby(c)['logerror_wc_0.01'].median()
#     bri = np.arange(df_br.shape[0])
#     ax[i].scatter(bri, df_br.values)
#     if data_summary.loc[c,'unique_cnt']<100:
#         ax[i].set_xticks(bri)
#         ax[i].set_xticklabels(df_br.index,rotation=45)
#     ax[i].set_title(c)

<!-- <a id="3.1.3"></a>
#### Correlation within X -->

In [None]:
# col_categorical_filtered = [c for c in col_categorical_filtered if c not in ['pooltypeid2','pooltypeid7','pooltypeid10','fireplaceflag','taxdelinquencyflag']]
# print(f"Drop uninformative features. Left with {len(col_categorical_filtered)} categorical features.")
# fig, ax = plt.subplots(figsize=(15, 15)) 
# mask = np.zeros_like(train_data[col_categorical_filtered].corr())
# mask[np.triu_indices_from(mask)] = 1
# sns.heatmap(train_data[col_categorical_filtered].corr(method='spearman'), mask= mask, ax= ax, annot= True)

<a id="3.2"></a>
### Numeric Features

* Filter by Missing Rate
* Distribution: histogram, summary table: outlier, needs winsorize
* Relationship with Target: time series correlation stable.
* Relationship with each other: correlation, remove highly correlated ones
* Feature Importance: RF


In [None]:
# Numeric features = every column that is neither categorical nor a generated dummy.
col_dummy = [c for c in X_2016_clean.columns if 'dummy' in c]
excluded = set(col_categorical+col_dummy)
# Fix: keep frame order (de-duplicated) instead of iterating a set difference.
# Set order for strings changes between kernel restarts, which made the plot
# layout and the feature order fed to the random forest non-reproducible.
col_numeric = [c for c in dict.fromkeys(X_2016_clean.columns) if c not in excluded]
# NOTE(review): 'parcelid' is not excluded here, so the identifier appears to be
# carried along as a numeric feature. Confirm this is intended.

print(f'{len(col_numeric)} numeric featrues identified')
col_numeric_filtered = [c for c in col_numeric if data_summary.loc[c,'missing_rate']<0.5]
print(f'{len(col_numeric_filtered)} numeric features after filtering out high missing rate.')
data_summary.loc[col_numeric,:]

<a id="3.2.1"></a>
#### Distribution
* Many distributions indicate outliers. Some might need winsorization depending on the model we use. These include: unitcnt, finishedsquarefeet2, structuretaxvaluedollarcnt, landtaxvaluedollarcnt, calculatedfinishedsquarefeet, taxamount, lotsizesquarefeet, taxvaluedollarcnt, roomcnt.

In [None]:
#Histogram in full sample
# At least the original 4x6 grid; extra rows are added if there are more than
# 24 features, so the loop below cannot run off the end of the axes array.
n_cols = 6
n_rows = max(4, -(-len(col_numeric_filtered) // n_cols))
fig, ax =plt.subplots(n_rows, n_cols, figsize=(30, 5*n_rows))
ax = ax.ravel()
for i, c in enumerate(col_numeric_filtered):
    # Drop missing values of this one column only, rather than copying every
    # non-null row of the whole frame for each feature.
    ax[i].hist(X_2016_clean[c].dropna())
    ax[i].set_title(c)

In [None]:
# Summary statistics of the retained numeric features, shown one feature per row.
df_summary = X_2016_clean.loc[:, col_numeric_filtered].describe()
df_summary.transpose()

<a id="3.2.2"></a>
#### Correlation with Y

In [None]:
# Monthly Spearman correlation between each numeric feature and the winsorized target.
date_col = 'transactiondate'
train_data['date_'] = pd.to_datetime(train_data[date_col])
train_data = winsorize(train_data,'date_','logerror',[0.01,0.99])

target = 'logerror_wc_0.01'
# Fix: correlate only the columns that are plotted. Running groupby().corr() on
# the whole frame also pulls in string and datetime columns, which raises on
# pandas versions where numeric_only defaults to False and is wasted work otherwise.
corr_cols = ['ym', target] + col_numeric_filtered
corr1 = train_data[corr_cols].groupby('ym').corr(method='spearman').reset_index()
# Keep, for every month, the row holding the correlations with the target.
corr1 = corr1.loc[corr1.level_1==target,['ym']+col_numeric_filtered]
# First day of each YYYYMM month, used as the x axis.
corr1['date_'] = pd.to_datetime(corr1['ym'].astype(int).astype(str), format='%Y%m')

# At least the original 4x6 grid; grows if there are more than 24 features.
n_cols = 6
n_rows = max(4, -(-len(col_numeric_filtered) // n_cols))
fig, ax =plt.subplots(n_rows, n_cols, figsize=(30, 5*n_rows))
ax = ax.ravel()

for i, c in enumerate(col_numeric_filtered):
    ax[i].plot(corr1.date_, corr1[c].values)
    ax[i].set_title(c)
    ax[i].set_ylim([-0.1,0.1])
    ax[i].axhline(y=0,color='r',linestyle='--')

<a id="3.2.3"></a>
#### Correlation within X

In [None]:
#correlation with each other
# 'assessmentyear' is removed from the numeric feature list from here on.
col_numeric_filtered = [c for c in col_numeric_filtered if c!='assessmentyear']

# Compute the Spearman matrix once. The previous version also ran a throwaway
# Pearson .corr() only to obtain the shape of the mask.
corr_matrix = train_data[col_numeric_filtered].corr(method='spearman')
# Hide the upper triangle (including the diagonal); the matrix is symmetric.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(15, 15)) 
sns.heatmap(corr_matrix, mask= mask, ax= ax, annot= True)

In [None]:
# #clustering of features
# X = train_data[col_numeric_filtered]
# # setting distance_threshold=0 ensures we compute the full tree.
# model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)

# model = model.fit(X)
# plt.title("Hierarchical Clustering Dendrogram: Numerical Features")
# # plot the top three levels of the dendrogram
# plot_dendrogram(model, truncate_mode="level", p=3)
# plt.xlabel("Number of points in node (or index of point if no parenthesis).")
# plt.show()

<a id="3.2.4"></a>
#### Feature Importance via RF

In [None]:
# Random-forest feature importance using the numeric features only.
vname = 'logerror_wc_0.01'
train_data_temp = train_data[col_numeric_filtered+[vname]]
# Complete cases only: any row with a missing feature or target is dropped.
train_data_filtered = train_data_temp.dropna(how='any', axis=0)
print(f"data size reduced from {train_data.shape[0]} to {train_data_filtered.shape[0]}")
X = train_data_filtered[col_numeric_filtered]
Y = train_data_filtered[vname]

# Fixed seed so the importances and the MAE are reproducible across runs;
# the value 0 itself is arbitrary.
rf = RandomForestRegressor(max_depth=8, random_state=0)
rf.fit(X, Y)
y_pred = rf.predict(X)
print("Features sorted by their score:")
df_importance = pd.DataFrame(rf.feature_importances_,index=col_numeric_filtered,columns=['importance'])
ax = df_importance.sort_values('importance').plot(kind='barh')
ax.set_title('Feature Selection via RF for Numeric Variables')
# print(f'oob score is {rf.oob_score_}')
# Error measured on the training rows themselves, not on held-out data.
print(f"In sample MAE: {mean_absolute_error(Y, y_pred)}")

In [None]:
# #winsorize
# def winsorize(df,date_col,data_col,limit=[0.01,0.99]):
#     df['ym'] = df[date_col].apply(lambda x: x.year*100 + x.month)
#     df[data_col+'_wc_'+str(limit[0])] = df.groupby('ym')[data_col].transform(lambda x: x.clip(*x.quantile(limit)))
#     df[data_col+'_w_'+str(limit[0])] = df[data_col].transform(lambda x: x.clip(*x.quantile(limit)))
#     return df
# train_data['date_'] = train_data[date_col].apply(lambda x: pd.Timestamp(x))
# train_data = winsorize(train_data,'date_','logerror',[0.01,0.99])

# col1 = list(set(list(df_impute.index.values)) - 
# set([c for c in df_impute.index if 'garage' in c]+
#     ['regionidneighborhood','numberofstories','unitcnt','airconditioningtypeid','buildingqualitytypeid','heatingorsystemtypeid'])) 
# X_2017_clean[col1] = X_2017_clean[col1].fillna(value=0)
# #fill pool id and area with 0 if poolcnt = 0
# col1 = ['pooltypeid2','pooltypeid7','pooltypeid10','poolsizesum']
# X_2017_clean.loc[X_2017_clean.poolcnt==0,col1] = 0

# train_data_2017 = pd.merge(y_train_2017, X_2017_clean, on=['parcelid'],how='left')
# train_data_2017_tmp = train_data_2017[col_numeric_filtered+col_categorical_filtered+['logerror']]
# train_data_2017_filtered = train_data_2017_tmp.dropna(how='any', axis=0)
# y_test = train_data_2017_filtered['logerror']
# X_test = train_data_2017_filtered[col_numeric_filtered+col_categorical_filtered]
# Random-forest feature importance using numeric features plus categorical dummies.
train_data = pd.merge(y_train_2016, X_2016_clean, on=['parcelid'],how='left')
vname = 'logerror_wc_0.01'
col_keep = col_numeric_filtered+[vname]
col_dummy = [c for c in train_data.columns if "dummy_" in c]
colnames = col_keep + col_dummy
feature_cols = col_numeric_filtered+col_dummy

train_data_temp = train_data[colnames].copy()
# Missing feature values become the sentinel -999 so that no row is dropped.
# Fix: only the features are filled. The target is left untouched, so a missing
# target can no longer be turned silently into -999. The dropna() that used to
# follow was dead code, because nothing was missing after the fill.
train_data_temp[feature_cols] = train_data_temp[feature_cols].fillna(-999)
train_data_filtered = train_data_temp
print(f"data size reduced from {train_data.shape[0]} to {train_data_filtered.shape[0]}")
X = train_data_filtered[feature_cols]
Y = train_data_filtered[vname]

# Fixed seed so the importances and the MAE are reproducible across runs;
# the value 0 itself is arbitrary.
rf = RandomForestRegressor(max_depth=8, random_state=0)
rf.fit(X, Y)
y_pred = rf.predict(X)
print("Features sorted by their score:")
df_importance = pd.DataFrame(rf.feature_importances_,index=feature_cols,columns=['importance'])
ax = df_importance.sort_values('importance').plot(kind='barh',figsize=(10,10))
ax.set_title('Feature Selection via RF for All Variables')
# print(f'oob score is {rf.oob_score_}')
# Error measured on the training rows themselves, not on held-out data.
print(f"Initial MAE: {mean_absolute_error(Y, y_pred)}")

<a id="4"></a>
## New Features

* Interaction of important features: location interaction with volume, location interaction with area
* Higher order ones
* Seasonal dummies: seasonal volumes
* External data: AHS survey: https://www.census.gov/programs-surveys/ahs.html
* Expanding location related features
* Include macroeconomic features: mortgage rates

## Tasks
* Clean up steps: further imputation of missing values, rename variables? - DM, code up data_clean function - XH
* External data: Zip code, Census data, AHS data - SC, other external datasets - DM, XH 
* Model design: stage 2 discussion