Import necessary packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

Import datasets and packages

In [2]:
df_lookup = pd.read_csv('../../data/raw/EXTR_LookUp.csv', dtype='str')
df_parcel = pd.read_csv('../../data/raw/EXTR_PARCEL.csv', dtype='str')
df_resbldg = pd.read_csv('../../data/raw/EXTR_ResBldg.csv', dtype='str')
df_rpsale = pd.read_csv('../../data/raw/EXTR_RPSale.csv', dtype='str')

Strip leading and trailing spaces

In [3]:
def strip_spaces(df):
    for col in df.columns:
        df[col] = df[col].str.strip()
    return df

df_lookup = strip_spaces(df_lookup)
df_parcel = strip_spaces(df_parcel)
df_resbldg = strip_spaces(df_resbldg)
df_rpsale = strip_spaces(df_rpsale)

Define function to remove *x* std deviations of data from an input

In [4]:
def remove_extremes(data, devct):
    data = pd.Series([float(num) for num in data])
    cleaned = data.loc[data>0].copy()

    std = cleaned.std()
    med = cleaned.median()

    cleaned = cleaned.loc[(cleaned > (med - std*devct)) & (cleaned < (med+std*devct))].copy() 
    return cleaned

**Create DataFrame for Porch metrics**

In [5]:
porch = df_resbldg[['SqFtOpenPorch', 'SqFtEnclosedPorch']].copy()
porch['Parcel_ID'] = df_resbldg['Major'] + '-' + df_resbldg['Minor']
porch = porch[['Parcel_ID', 'SqFtOpenPorch', 'SqFtEnclosedPorch']].copy()

porch['SqFtOpenPorch'] = porch['SqFtOpenPorch'].astype('int')
porch['SqFtEnclosedPorch'] = porch['SqFtEnclosedPorch'].astype('int')
porch

Unnamed: 0,Parcel_ID,SqFtOpenPorch,SqFtEnclosedPorch
0,009800-0720,0,0
1,009802-0140,380,0
2,009830-0020,360,0
3,009830-0160,690,0
4,010050-0180,60,0
...,...,...,...
181505,197220-1902,0,0
181506,197220-2224,0,0
181507,197220-5172,0,0
181508,197220-5173,0,0


**Create DataFrame for Heating metrics**

In [6]:
from custom_functions import one_hot
from custom_functions import get_lookups

heating = one_hot(df_resbldg['HeatSystem'], name_lookup = get_lookups(108, df_lookup))
heating['binary_notforced'] = heating.drop('Forced Air', axis=1).sum(axis=1)
heating['binary_forced'] = heating['Forced Air']

**Bedroom and garage specs**

In [7]:
garage = df_resbldg[['SqFtGarageAttached', 'SqFtGarageBasement']].astype('int')
bedrooms = df_resbldg['Bedrooms'].astype('int')

**Aggregate Porch, Heating and Bedroom/Garage metrics**

In [8]:
porch_heating_rooms = pd.concat([porch, heating, garage, bedrooms], axis=1)
porch_heating_rooms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181510 entries, 0 to 181509
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Parcel_ID           181510 non-null  object
 1   SqFtOpenPorch       181510 non-null  int32 
 2   SqFtEnclosedPorch   181510 non-null  int32 
 3   Floor-Wall          181510 non-null  int32 
 4   Gravity             181510 non-null  int32 
 5   Radiant             181510 non-null  int32 
 6   Elec BB             181510 non-null  int32 
 7   Forced Air          181510 non-null  int32 
 8   Hot Water           181510 non-null  int32 
 9   Heat Pump           181510 non-null  int32 
 10  Other               181510 non-null  int32 
 11  binary_notforced    181510 non-null  int64 
 12  binary_forced       181510 non-null  int32 
 13  SqFtGarageAttached  181510 non-null  int32 
 14  SqFtGarageBasement  181510 non-null  int32 
 15  Bedrooms            181510 non-null  int32 
dtypes:

**Pricing data aggregation**

In [9]:
# Parcel_ID = df_rpsale['Major'].copy() + '-' + df_rpsale['Minor'].copy()
# Sale_ID   = Parcel_ID + '-' + df_rpsale['ExciseTaxNbr']
# sale = pd.concat([Parcel_ID, Sale_ID], axis=1)
# sale.columns = ['Parcel_ID', 'Sale_ID']
# sale['SalePrice'] = df_rpsale['SalePrice'].astype('int')
# sale['DocumentDate'] = df_rpsale['DocumentDate'].astype(np.datetime64)
# sale

## Narrow to Residential Homes

In [25]:
res_sales = df_rpsale.copy()
principal_use_codes_to_keep = ['100']
principal_use_codes_to_keep

['100']

In [26]:
x = res_sales.PrincipalUse.unique()

In [27]:
np.delete(x, principal_use_codes_to_keep)

  len(np.delete(x, principal_use_codes_to_keep))
  len(np.delete(x, principal_use_codes_to_keep))


12

In [31]:
principal_use_codes_to_keep  = '6'

In [32]:
np.setdiff1d(res_sales.PrincipalUse.unique(), principal_use_codes_to_keep)

array(['0', '1', '10', '11', '2', '3', '4', '5', '7', '8', '9'],
      dtype=object)

In [23]:
len(x)

12

In [33]:
res_sales = df_rpsale.copy()

principal_use_codes_to_keep = ['6']
principal_use_codes_to_drop = np.setdiff1d(res_sales.PrincipalUse.unique(), principal_use_codes_to_keep)

property_class_codes_to_keep = ['7', '8']
property_class_codes_to_drop = np.setdiff1d(res_sales.PropertyClass.unique(), property_class_codes_to_keep)

property_type_codes_to_keep = ['2', '3', '6', '10', '11', '12', '13', '18', '19']
property_type_codes_to_drop = np.setdiff1d(res_sales.PropertyType.unique(), property_type_codes_to_keep)

for code in principal_use_codes_to_drop:
    res_sales['PrincipalUse'].replace(to_replace=code, value=np.nan, inplace=True)

for code in property_class_codes_to_drop:
    res_sales['PropertyClass'].replace(to_replace=code, value=np.nan, inplace=True)

for code in property_type_codes_to_drop:
    res_sales['PropertyType'].replace(to_replace=code, value=np.nan, inplace=True)

res_sales.dropna(inplace=True)
res_sales.reset_index(inplace=True, drop=True)
res_sales['ParcelID'] = res_sales.Major + '-' + res_sales.Minor
res_sales


Unnamed: 0,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,PlatLot,PlatBlock,SellerName,BuyerName,PropertyType,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning,ParcelID
0,2743355,638580,0110,07/14/2015,190000,20150715002686,,,,,,,GINGRICH AUDREY B,NATION FRED I,3,6,3,N,N,N,N,1,8,15,638580-0110
1,2841697,894677,0240,12/21/2016,818161,20161228000896,,,,,,,QUADRANT CORPORATION,KANG JING,2,6,3,N,N,N,N,1,8,,894677-0240
2,2860712,408330,4150,03/15/2017,0,20170426000922,,,,,,,MALLAHAN MATTHEW JOSEPH,MALLAHAN CAROLYN S,3,6,15,N,N,N,N,11,8,18 31 38,408330-4150
3,2813396,510140,4256,07/25/2016,0,20160802000964,,,,,,,OBRANOVICH THOMAS M,OBRANOVICH THOMAS M+KAREN J,2,6,15,N,N,N,N,13,8,18 31 51,510140-4256
4,2899303,126320,0110,10/30/2017,375000,,,,,,,,ADAMS DANIEL J,SOLIS FRANSISCO FIGUEROA+MIRNA L MOLINA SANTOS,3,6,3,N,N,N,N,1,8,,126320-0110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246328,2755549,438920,1058,08/20/2015,1000000,20150914001478,,,,,,,PARKLAND HOMES INC,FALLING BROOK LLC,3,6,3,N,N,N,N,1,7,10 56,438920-1058
246329,2935450,219331,0270,06/07/2018,850000,20180612000762,,,,,,,LLANOS MIGUEL ANGEL III,NARAYANAN PRAKASH,11,6,3,N,N,N,N,1,8,,219331-0270
246330,2942886,886030,0550,07/16/2016,900000,20180720000629,,,,,,,PREUGSCHAT GARY N+SANDY K,POKAR NITIN+KINJABLBAHEN PATEL,3,6,3,N,N,N,N,1,8,,886030-0550
246331,2845806,769791,0030,07/16/2016,133000,20170125000407,,,,,,,SAY SORIN+ANG+DIEP JULIA,SAY SORIN,3,6,15,N,N,N,N,1,8,15 18 51,769791-0030


In [34]:
res_sales.PrincipalUse.value_counts()

6    246333
Name: PrincipalUse, dtype: int64