## Preprocessing 

In [5]:
import pandas as pd

In this step, we want to create a reproducible process for transforming the raw feature set into model data.

Note: We're not going to perform any preprocessing in this step, we're just going to create a tool to do so at the next step.

The reason for this is that a lot of preprocessing steps are modelling decisions that should be informed by feedback, i.e. the model's performance. If we give ourselves a preprocessing tool instead of a script or a static, preprocessed data file, we'll be able to change things on the fly during the model build process to speed up iterations.

This is where feature engineering and missing value imputation take place.

### Create the Preprocessor Object

We'll create a preprocessor object (class) from the training data that can be applied to the test data as well as any new data in production. We'll use the fit and transform paradigm to ensure we're only learning information from the training data (and not the test data).

In [6]:
import sys
sys.path.insert(0, './scripts')

In [7]:
import inspect
from helpers import get_data
print(inspect.getsource(get_data))

def get_data(dset):
    
    """Create the training dataset (2016) or test dataset (2017)
    
    Keyword arguments:
    dset -- a string in {train, test}
    
    Returns:
    a tuple of pandas dataframe (X) and pandas series (y)
    """
    
    year = {'train':2016, 'test':2017}[dset]
    
    train = read_in_dataset('train_{0}'.format(year))
    properties = read_in_dataset('properties_{0}'.format(year))
    merged = merge_dataset(train, properties)
    
    if dset == 'train':
        merged = filter_duplicate_parcels(merged)
        
    y = merged.pop('logerror')
    return merged, y



Notice that if you pass the argument dset='train', you'll get the 2016 data, and if you pass the argument dset='test', you'll get the 2017 data. We can use the 2017 data (that wasn't included in the original competition) as an additional test set to see how our model would have performed on all of 2017 and not just the months that are scored as part of the competition.

In [9]:
train_X, train_y = get_data(dset='train')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [10]:
train_X.head()

Unnamed: 0,parcelid,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11016594,2016-01-01,1.0,,,2.0,3.0,,4.0,2.0,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0
1,14366692,2016-01-01,,,,3.5,4.0,,,3.5,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,12098116,2016-01-01,1.0,,,3.0,2.0,,4.0,3.0,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0
3,12643413,2016-01-02,1.0,,,2.0,2.0,,4.0,2.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0
4,14432541,2016-01-02,,,,2.5,4.0,,,2.5,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0


In [11]:
class preprocessor:
    """Minimal fit/transform preprocessor that drops a configurable set of columns."""

    def __init__(self, cols_to_filter=None):
        """Store the preprocessing configuration.

        Keyword arguments:
        cols_to_filter -- a list of column names to drop in transform;
                          None (the default) drops nothing
        """

        self.cols_to_filter = cols_to_filter

    def fit(self, X, y=None):
        """Learn any information from the training data we may need to transform the test data.

        Nothing is learned in this version; the method only exists so the
        object can be used with the fit/transform calling convention.

        Returns:
        the preprocessor itself
        """

        return self

    def transform(self, X, y=None):
        """Transform the training or test data based on the stored configuration.

        Keyword arguments:
        X -- a pandas dataframe
        y -- unused

        Returns:
        a new pandas dataframe without the filtered columns (X is not modified)
        """

        # DataFrame.drop(None, axis=1) raises, so an unset filter must be
        # turned into an explicit "drop nothing".
        cols = [] if self.cols_to_filter is None else self.cols_to_filter

        X_new = X.drop(cols, axis=1)

        return X_new

In [12]:
p = preprocessor(cols_to_filter = ['parcelid'])
p.transform(train_X).head()

Unnamed: 0,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,2016-01-01,1.0,,,2.0,3.0,,4.0,2.0,,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0
1,2016-01-01,,,,3.5,4.0,,,3.5,,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,2016-01-01,1.0,,,3.0,2.0,,4.0,3.0,,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0
3,2016-01-02,1.0,,,2.0,2.0,,4.0,2.0,,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0
4,2016-01-02,,,,2.5,4.0,,,2.5,,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0


### Deal with Datetime Columns

We are going to encode the datetime variables as month and year (disregarding the day, as it won't be included in the predicted results for the competition).

In [13]:
class preprocessor():
    """Fit/transform preprocessor that drops columns and expands date columns into month and year."""

    def __init__(self, cols_to_filter=None, datecols=None):
        """Store the preprocessing configuration.

        Keyword arguments:
        cols_to_filter -- a list of column names to drop in transform;
                          None (the default) drops nothing
        datecols -- a list of column names to parse as dates and replace
                    with '<name>_month' and '<name>_year' columns;
                    None (the default) leaves all columns as they are
        """

        self.cols_to_filter = cols_to_filter
        self.datecols = datecols

    def fit(self, X, y=None):
        """Nothing is learned in this version; returns the preprocessor itself."""

        return self

    def transform(self, X, y=None):
        """Drop the filtered columns and replace each date column by its month and year.

        Keyword arguments:
        X -- a pandas dataframe
        y -- unused

        Returns:
        a new pandas dataframe (X is not modified)
        """

        # DataFrame.drop(None, axis=1) raises, so an unset filter must be
        # turned into an explicit "drop nothing".
        cols = [] if self.cols_to_filter is None else self.cols_to_filter
        X_new = X.drop(cols, axis=1)

        for col in self.datecols or []:
            # Parse once and use the vectorised .dt accessor instead of a
            # row-wise apply.
            parsed = pd.to_datetime(X_new[col])
            X_new[col + '_month'] = parsed.dt.month
            X_new[col + '_year'] = parsed.dt.year
            X_new = X_new.drop(col, axis=1)

        return X_new
    

In [14]:
p = preprocessor(cols_to_filter = ['parcelid'], datecols = ['transactiondate'])
train_X_transformed = p.transform(train_X)

In [15]:
train_X_transformed.head()

Unnamed: 0,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,transactiondate_month,transactiondate_year
0,1.0,,,2.0,3.0,,4.0,2.0,,,...,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0,1,2016
1,,,,3.5,4.0,,,3.5,,,...,346458.0,585529.0,2015.0,239071.0,10153.02,,,,1,2016
2,1.0,,,3.0,2.0,,4.0,3.0,,,...,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0,1,2016
3,1.0,,,2.0,2.0,,4.0,2.0,,,...,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0,1,2016
4,,,,2.5,4.0,,,2.5,,,...,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0,1,2016


### Define an Imputation Strategy

An easy strategy would be to make an educated assumption that all the numeric variables are non-negative and encode missing values with -1. Let's first check that assumption.

In [47]:
# Column-wise minimum of every column that has at least one missing value.
# If none of these minimums is negative, -1 cannot collide with a real value.
train_X.loc[:, train_X.isna().sum() > 0].min()

airconditioningtypeid                    1
architecturalstyletypeid                 2
basementsqft                           100
buildingclasstypeid                      4
buildingqualitytypeid                    1
calculatedbathnbr                        1
decktypeid                              66
finishedfloor1squarefeet                44
calculatedfinishedsquarefeet             2
finishedsquarefeet12                     2
finishedsquarefeet13                  1056
finishedsquarefeet15                   560
finishedsquarefeet50                    44
finishedsquarefeet6                    257
fireplacecnt                             1
fullbathcnt                              1
garagecarcnt                             0
garagetotalsqft                          0
hashottuborspa                        True
heatingorsystemtypeid                    1
lotsizesquarefeet                      167
poolcnt                                  1
poolsizesum                             28
pooltypeid1

None of the minimums shown above is negative, so it looks like we can get away with imputing missing values with -1.

In [48]:
class preprocessor:
    """Fit/transform preprocessor: drops columns, expands date columns, imputes missing values with -1."""

    def __init__(self, cols_to_filter=None, datecols=None):
        """Store the preprocessing configuration.

        Keyword arguments:
        cols_to_filter -- a list of column names to drop in transform;
                          None (the default) drops nothing
        datecols -- a list of column names to parse as dates and replace
                    with '<name>_month' and '<name>_year' columns;
                    None (the default) leaves all columns as they are
        """

        self.cols_to_filter = cols_to_filter
        self.datecols = datecols

    def fit(self, X, y=None):
        """Nothing is learned in this version; returns the preprocessor itself."""

        return self

    def transform(self, X, y=None):
        """Filter columns, expand date columns and fill missing values with -1.

        Keyword arguments:
        X -- a pandas dataframe
        y -- unused

        Returns:
        a new pandas dataframe without missing values (X is not modified)
        """

        # filter -- operate on the argument X, not on a notebook global, so
        # the same object can be applied to the test data as well.
        cols = [] if self.cols_to_filter is None else self.cols_to_filter
        X_new = X.drop(cols, axis=1)

        # Expand dates BEFORE imputing: otherwise a missing date would be
        # replaced by -1 and then parsed as a timestamp next to the Unix
        # epoch, yielding a bogus month and year.
        for col in self.datecols or []:
            parsed = pd.to_datetime(X_new[col])
            X_new[col + '_month'] = parsed.dt.month
            X_new[col + '_year'] = parsed.dt.year
            X_new = X_new.drop(col, axis=1)

        # fill NA (this also covers month/year of missing dates)
        X_new = X_new.fillna(-1)

        return X_new

In [50]:
p = preprocessor(cols_to_filter=['parcelid'], datecols=['transactiondate'])
train_X_transformed = p.transform(train_X)

In [51]:
train_X_transformed.head()

Unnamed: 0,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,transactiondate_month,transactiondate_year
0,1.0,-1.0,-1.0,2.0,3.0,-1.0,4.0,2.0,-1.0,-1.0,...,122754.0,360170.0,2015.0,237416.0,6735.88,-1,-1.0,60371070000000.0,1,2016
1,-1.0,-1.0,-1.0,3.5,4.0,-1.0,-1.0,3.5,-1.0,-1.0,...,346458.0,585529.0,2015.0,239071.0,10153.02,-1,-1.0,-1.0,1,2016
2,1.0,-1.0,-1.0,3.0,2.0,-1.0,4.0,3.0,-1.0,-1.0,...,61994.0,119906.0,2015.0,57912.0,11484.48,-1,-1.0,60374640000000.0,1,2016
3,1.0,-1.0,-1.0,2.0,2.0,-1.0,4.0,2.0,-1.0,-1.0,...,171518.0,244880.0,2015.0,73362.0,3048.74,-1,-1.0,60372960000000.0,1,2016
4,-1.0,-1.0,-1.0,2.5,4.0,-1.0,-1.0,2.5,-1.0,-1.0,...,169574.0,434551.0,2015.0,264977.0,5488.96,-1,-1.0,60590420000000.0,1,2016


In [54]:
# Sanity check: imputation must leave no missing value in any column.
assert not train_X_transformed.isnull().any().any()

### Encoding Categorical/Discrete Features

In [81]:
class preprocessor:
    """Fit/transform preprocessor for the raw feature set.

    fit learns, from the training data only, which columns are categorical
    and which one-hot encoded columns exist. transform then drops the
    filtered columns, one-hot encodes the categorical columns, aligns the
    result to the columns seen during fit, expands date columns into month
    and year, and fills missing values with -1.
    """

    def __init__(self, cols_to_filter=None, datecols=None):
        """Store the preprocessing configuration.

        Keyword arguments:
        cols_to_filter -- a list of column names to drop; None (the default)
                          drops nothing
        datecols -- a list of column names to parse as dates and replace
                    with '<name>_month' and '<name>_year' columns;
                    None (the default) leaves all columns as they are
        """

        self.cols_to_filter = cols_to_filter
        self.datecols = datecols
        self.was_fit = False

    def _drop_filtered(self, X):
        """Return a copy of X without the columns listed in cols_to_filter."""

        # DataFrame.drop(None, axis=1) raises, so an unset filter must be
        # turned into an explicit "drop nothing".
        cols = [] if self.cols_to_filter is None else self.cols_to_filter
        return X.drop(cols, axis=1)

    def fit(self, X, y=None):
        """Learn the categorical columns and the one-hot encoded column layout from X.

        Keyword arguments:
        X -- a pandas dataframe (the training data)
        y -- unused

        Returns:
        the preprocessor itself
        """

        X_new = self._drop_filtered(X)
        datecols = self.datecols or []

        # Object-typed columns are treated as categorical, except date
        # columns: those must survive encoding so transform can parse them.
        object_cols = X_new.dtypes[X_new.dtypes == 'object'].index
        self.categorical_features = [
            col for col in object_cols
            if 'date' not in col and col not in datecols
        ]

        self.colnames = pd.get_dummies(
            X_new, columns=self.categorical_features, dummy_na=True
        ).columns

        # Only flag the object as fitted once everything above succeeded.
        self.was_fit = True

        return self

    def transform(self, X, y=None):
        """Apply the preprocessing learned in fit to X.

        Keyword arguments:
        X -- a pandas dataframe (training, test or new data)
        y -- unused

        Returns:
        a new pandas dataframe without missing values (X is not modified)

        Raises:
        RuntimeError -- if fit has not been called first
        """

        if not self.was_fit:
            raise RuntimeError("need to fit the preprocessor first")

        X_new = self._drop_filtered(X)
        X_new = pd.get_dummies(X_new, columns=self.categorical_features, dummy_na=True)

        # Align to the layout learned in fit: dummy columns missing from X
        # are added as all-zero, categories unseen during fit are dropped,
        # and the column order matches the training data.
        X_new = X_new.reindex(columns=self.colnames, fill_value=0)

        # Expand dates BEFORE imputing: otherwise a missing date would be
        # replaced by -1 and then parsed as a timestamp next to the Unix
        # epoch, yielding a bogus month and year.
        for col in self.datecols or []:
            parsed = pd.to_datetime(X_new[col])
            X_new[col + '_month'] = parsed.dt.month
            X_new[col + '_year'] = parsed.dt.year
            X_new = X_new.drop(col, axis=1)

        X_new = X_new.fillna(-1)

        return X_new

    def fit_transform(self, X, y=None):
        """Fit on X and return the transformed X."""

        return self.fit(X).transform(X)
    

In [82]:
# Configure which columns are dropped and which are expanded as dates,
# then learn the encoding from the training data.
p = preprocessor(
    cols_to_filter=[
        'rawcensustractandblock',
        'censustractandblock',
        'propertyzoningdesc',
        'regionidneighborhood',
        'regionidzip',
        'parcelid',
    ],
    datecols=['transactiondate'],
)
p.fit(train_X)

<__main__.preprocessor at 0xfad5c83148>

In [83]:
train_X_transformed = p.transform(train_X)

In [84]:
train_X_transformed.head()

Unnamed: 0,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,...,propertycountylandusecode_73,propertycountylandusecode_8800,propertycountylandusecode_96,propertycountylandusecode_nan,fireplaceflag_True,fireplaceflag_nan,taxdelinquencyflag_Y,taxdelinquencyflag_nan,transactiondate_month,transactiondate_year
0,1.0,-1.0,-1.0,2.0,3.0,-1.0,4.0,2.0,-1.0,-1.0,...,0,0,0,0,0,1,0,1,1,2016
1,-1.0,-1.0,-1.0,3.5,4.0,-1.0,-1.0,3.5,-1.0,-1.0,...,0,0,0,0,0,1,0,1,1,2016
2,1.0,-1.0,-1.0,3.0,2.0,-1.0,4.0,3.0,-1.0,-1.0,...,0,0,0,0,0,1,0,1,1,2016
3,1.0,-1.0,-1.0,2.0,2.0,-1.0,4.0,2.0,-1.0,-1.0,...,0,0,0,0,0,1,0,1,1,2016
4,-1.0,-1.0,-1.0,2.5,4.0,-1.0,-1.0,2.5,-1.0,-1.0,...,0,0,0,0,0,1,0,1,1,2016


In [85]:
# Sanity check: the encoded and imputed frame must contain no missing values.
assert not train_X_transformed.isna().any().any()