In [1]:
from datetime import datetime
# Start a wall-clock timer; the end of this cell prints the time elapsed since this point.
start_time = datetime.now()

# Libraries to install
# %pip install pandas-profiling

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Elapsed wall-clock time since start_time was captured at the top of this cell.
load_duration = datetime.now() - start_time
print('LOAD DURATION: ', load_duration)

LOAD DURATION:  0:00:00.597930


In [2]:
%%time
# Read the property records from the CSV file (path is relative to the working directory).
raw_csv_path = 'NY property data.csv'
property_data = pd.read_csv(raw_csv_path)

CPU times: user 5.77 s, sys: 557 ms, total: 6.32 s
Wall time: 6.32 s


## Data Cleaning: Fill in Missing Values

1. Make a list of the owner names to remove. Remove all records with those owners. Make sure you get all this right.
2. Fill in the missing ZIPs
3. Fill in missing FULLVAL, AVLAND, AVTOT
4. Fill in missing STORIES
5. Fill in LTFRONT, LTDEPTH, BLDFRONT, BLDDEPTH

### Fill in Missing Zip

In [3]:
# Positional row numbers of every record whose ZIP is null, and how many there are.
zip_is_null = property_data['ZIP'].isnull()
missing_zips = np.where(zip_is_null)[0]
len(missing_zips)

29890

If the records immediately before and after a record with a missing ZIP share the same ZIP, fill the missing value with that ZIP.

In [4]:
# Fill a missing ZIP only when the rows directly above and below agree on a ZIP.
# missing_zips holds positional row numbers that are used as .loc labels, so this
# relies on property_data having the default 0..n-1 index.
# Rows are visited in ascending order, so a ZIP filled earlier in this loop can
# act as the "row above" for a later missing row.
last_row = len(property_data) - 1
for row in missing_zips:
    # The first and last rows have only one neighbour; skip them instead of
    # raising KeyError on label -1 or len(property_data).
    if row == 0 or row == last_row:
        continue
    zip_above = property_data.loc[row - 1, 'ZIP']
    zip_below = property_data.loc[row + 1, 'ZIP']
    # NaN never compares equal to NaN, so two missing neighbours do not match.
    if zip_above == zip_below:
        property_data.loc[row, 'ZIP'] = zip_above

In [5]:
# Recompute the positions of records that still have a null ZIP and count them.
zip_series = property_data['ZIP']
missing_zips = np.where(zip_series.isnull())[0]
len(missing_zips)

16437

For the remaining records with a missing ZIP, I simply copy the ZIP from the record directly above.

In [6]:
# Carry the last known ZIP forward into every row that is still missing one.
# ffill() produces the same values as copying from the row above one row at a
# time (runs of consecutive gaps all receive the nearest ZIP above them), does
# it in a single vectorised pass, and leaves a missing ZIP in the very first
# row as NaN instead of raising KeyError on label -1.
property_data['ZIP'] = property_data['ZIP'].ffill()

In [7]:
# Final count of records whose ZIP is still null.
still_missing = pd.isnull(property_data['ZIP']).to_numpy()
missing_zips = np.where(still_missing)[0]
len(missing_zips)

0

### Fill in Missing FULLVAL, AVLAND, AVTOT

Calculate means for AVTOT, AVLAND, FULLVAL by taxclass, avoiding records with zeros

In [8]:
# Treat a null FULLVAL like a zero so the record is picked up by the "== 0"
# imputation that follows. replace('NaN', 0) cannot do this: it searches for the
# string 'NaN' rather than a null, and its result was never assigned back.
property_data['FULLVAL'] = property_data['FULLVAL'].fillna(0)
# Mean FULLVAL per tax class, computed only over records with a non-zero value.
temp = property_data[property_data['FULLVAL'] != 0]
mean_fullval = temp.groupby('TAXCLASS')['FULLVAL'].mean()
print(mean_fullval)

TAXCLASS
1     5.698435e+05
1A    3.352842e+05
1B    5.613639e+05
1C    7.615359e+05
1D    2.233614e+07
2     7.998018e+05
2A    8.640037e+05
2B    1.252989e+06
2C    7.728799e+05
3     1.112765e+05
4     3.211928e+06
Name: FULLVAL, dtype: float64


In [9]:
# Treat a null AVLAND like a zero so the record is picked up by the "== 0"
# imputation that follows. replace('NaN', 0) cannot do this: it searches for the
# string 'NaN' rather than a null, and its result was never assigned back.
property_data['AVLAND'] = property_data['AVLAND'].fillna(0)
# Mean AVLAND per tax class, computed only over records with a non-zero value.
temp = property_data[property_data['AVLAND'] != 0]
mean_avland = temp.groupby('TAXCLASS')['AVLAND'].mean()
print(mean_avland)

TAXCLASS
1      14896.433976
1A      2247.369138
1B     14781.268478
1C      8225.658898
1D    709303.793103
2      90830.095031
2A     31363.390619
2B     54446.810503
2C     25639.066531
3      43368.352941
4     600949.433397
Name: AVLAND, dtype: float64


In [10]:
# Treat a null AVTOT like a zero so the record is picked up by the "== 0"
# imputation that follows. replace('NaN', 0) cannot do this: it searches for the
# string 'NaN' rather than a null, and its result was never assigned back.
property_data['AVTOT'] = property_data['AVTOT'].fillna(0)
# Mean AVTOT per tax class, computed only over records with a non-zero value.
temp = property_data[property_data['AVTOT'] != 0]
mean_avtot = temp.groupby('TAXCLASS')['AVTOT'].mean()
print(mean_avtot)

TAXCLASS
1     2.501609e+04
1A    1.436972e+04
1B    1.478410e+04
1C    2.898447e+04
1D    1.166866e+06
2     3.599121e+05
2A    7.961158e+04
2B    1.785982e+05
2C    1.170444e+05
3     5.007444e+04
4     1.488705e+06
Name: AVTOT, dtype: float64


Substitute decent values for AVTOT, AVLAND, FULLVAL from averages by taxclass

In [11]:
%%time
# Replace zero FULLVAL / AVLAND / AVTOT with the tax-class mean of that SAME
# column. Each column is paired with its own mean series so that AVLAND and
# AVTOT are not filled with FULLVAL averages, which are on a very different scale.
value_means = (
    ('FULLVAL', mean_fullval),
    ('AVLAND', mean_avland),
    ('AVTOT', mean_avtot),
)
for column, class_means in value_means:
    # Iterate over the classes present in this column's own mean series, so a
    # class missing from one series cannot raise KeyError for another column.
    for tax_class, class_mean in class_means.items():
        needs_fill = (property_data[column] == 0) & (property_data['TAXCLASS'] == tax_class)
        property_data.loc[needs_fill, column] = class_mean

CPU times: user 5.6 s, sys: 60.9 ms, total: 5.66 s
Wall time: 5.57 s


### Fill in Missing STORIES

In [12]:
# Records with a null STORIES value, and how many there are.
stories_missing = property_data['STORIES'].isnull()
temp = property_data.loc[stories_missing]
len(temp)

56264

In [13]:
# Number of records whose STORIES value equals zero.
int((property_data['STORIES'] == 0).sum())

0

In addition, there should not be a value zero for this field, so we treat zeros as missing values as well.
I'm grouping by TAXCLASS, since that should be a pretty good description of the nature of the buildings

In [14]:
# Tax-class breakdown of the records currently held in temp.
temp.loc[:, 'TAXCLASS'].value_counts()

1B    24736
4     22354
3      4635
2      3435
1       897
2C      138
2B       34
2A       30
1A        5
Name: TAXCLASS, dtype: int64

In [15]:
# Mean STORIES per tax class; mean() skips null values.
by_taxclass = property_data.groupby('TAXCLASS')
mean_stories = by_taxclass['STORIES'].mean()
print(mean_stories)

TAXCLASS
1      2.111641
1A     1.656837
1B     4.000000
1C     3.052748
1D     1.068966
2     16.095110
2A     2.844574
2B     4.004494
2C     4.745097
3      1.333333
4      5.446968
Name: STORIES, dtype: float64


In [16]:
# Mark null STORIES with 0, then overwrite every 0 with the mean for its tax class.
property_data['STORIES'] = property_data['STORIES'].fillna(0)
for tax_class, class_mean in mean_stories.items():
    zero_in_class = (property_data['STORIES'] == 0) & (property_data['TAXCLASS'] == tax_class)
    property_data.loc[zero_in_class, 'STORIES'] = class_mean

In [17]:
# Preview the first five records, one column per record so every field is visible.
first_rows = property_data.head()
first_rows.T

Unnamed: 0,0,1,2,3,4
RECORD,1,2,3,4,5
BBLE,1000010101,1000010201,1000020001,1000020023,1000030001
B,1,1,1,1,1
BLOCK,1,1,2,2,3
LOT,101,201,1,23,1
EASEMENT,,,,,
OWNER,U S GOVT LAND & BLDGS,U S GOVT LAND & BLDGS,DEPT OF GENERAL SERVI,DEPARTMENT OF BUSINES,PARKS AND RECREATION
BLDGCL,P7,Z9,Y7,T2,Q1
TAXCLASS,4,4,4,4,4
LTFRONT,500,27,709,793,323


### Fill in Missing LOT and Building Sizes

In [18]:
# The four size columns are treated here as using 0 (and 1) in place of nulls
# for unknown sizes. Turn both placeholders into NaN so that the group-wise
# means computed afterwards ignore them.
size_columns = ['LTFRONT', 'LTDEPTH', 'BLDFRONT', 'BLDDEPTH']
for size_col in size_columns:
    is_placeholder = property_data[size_col].isin([0, 1])
    property_data.loc[is_placeholder, size_col] = np.nan

In [19]:
# Per-tax-class mean of each size column. mean() skips NaN but would count 0,
# which is why placeholder sizes are converted to NaN before this step.
size_means_by_class = property_data.groupby('TAXCLASS')[['LTFRONT', 'LTDEPTH', 'BLDFRONT', 'BLDDEPTH']].mean()
mean_LTFRONT = size_means_by_class['LTFRONT']
mean_LTDEPTH = size_means_by_class['LTDEPTH']
mean_BLDFRONT = size_means_by_class['BLDFRONT']
mean_BLDDEPTH = size_means_by_class['BLDDEPTH']

In [20]:
# Impute every null lot/building dimension with the tax-class mean of that SAME
# dimension. Each column is paired with its own mean series; filling depths and
# building sizes from the LTFRONT means would distort every size-based ratio.
means_by_column = {
    'LTFRONT': mean_LTFRONT,
    'LTDEPTH': mean_LTDEPTH,
    'BLDFRONT': mean_BLDFRONT,
    'BLDDEPTH': mean_BLDDEPTH,
}
for column, class_means in means_by_column.items():
    # Fallback for a tax class with no usable value in this column (its class
    # mean is NaN): use the column-wide mean, taken before any filling.
    overall_mean = property_data[column].mean()
    for tax_class, class_mean in class_means.items():
        fill_value = overall_mean if pd.isnull(class_mean) else class_mean
        needs_fill = property_data[column].isnull() & (property_data['TAXCLASS'] == tax_class)
        property_data.loc[needs_fill, column] = fill_value

## Feature Engineering: Add New Variables
- S1 (`ltsize`) = LTFRONT × LTDEPTH
- S2 (`bldsize`) = BLDFRONT × BLDDEPTH
- S3 (`bldvol`) = S2 × STORIES

In [21]:
# Store ZIP as text and keep its first three characters as the coarser zip3 key.
zip_as_text = property_data['ZIP'].astype(str)
property_data['ZIP'] = zip_as_text
property_data['zip3'] = zip_as_text.str.slice(0, 3)

In [22]:
# Size features: front x depth products for lot and building, and the building
# product multiplied by STORIES.
lot_product = property_data['LTFRONT'].mul(property_data['LTDEPTH'])
building_product = property_data['BLDFRONT'].mul(property_data['BLDDEPTH'])
property_data['ltsize'] = lot_product
property_data['bldsize'] = building_product
property_data['bldvol'] = building_product.mul(property_data['STORIES'])

In [23]:
# Nine value-per-size ratios: r1-r3 use FULLVAL, r4-r6 use AVLAND and r7-r9 use
# AVTOT, each divided by ltsize, bldsize and bldvol in turn.
ratio_number = 0
for value_col in ('FULLVAL', 'AVLAND', 'AVTOT'):
    for size_col in ('ltsize', 'bldsize', 'bldvol'):
        ratio_number += 1
        property_data['r' + str(ratio_number)] = property_data[value_col] / property_data[size_col]

In [24]:
# Mean of the nine ratio variables within each ZIP, zip3, TAXCLASS and B group.
ninevars = ['r' + str(k) for k in range(1, 10)]
zip5_mean, zip3_mean, taxclass_mean, borough_mean = (
    property_data.groupby(group_key)[ninevars].mean()
    for group_key in ('ZIP', 'zip3', 'TAXCLASS', 'B')
)

In [25]:
# Attach each table of group means to every record; the suffix identifies the grouping.
group_joins = [
    (zip5_mean, 'ZIP', '_zip5'),
    (zip3_mean, 'zip3', '_zip3'),
    (taxclass_mean, 'TAXCLASS', '_taxclass'),
    (borough_mean, 'B', '_boro'),
]
for group_means, join_key, suffix in group_joins:
    property_data = property_data.join(group_means, on=join_key, rsuffix=suffix)

In [26]:
# Overwrite each joined group-mean column with (record value / group mean).
rsuffix = ['_zip5', '_zip3', '_taxclass', '_boro']
for ratio_name in ninevars:
    own_value = property_data[ratio_name]
    for suffix in rsuffix:
        scaled_name = ratio_name + suffix
        property_data[scaled_name] = own_value / property_data[scaled_name]

In [27]:
# Display every column label currently present in property_data.
property_data.columns

Index(['RECORD', 'BBLE', 'B', 'BLOCK', 'LOT', 'EASEMENT', 'OWNER', 'BLDGCL',
       'TAXCLASS', 'LTFRONT', 'LTDEPTH', 'EXT', 'STORIES', 'FULLVAL', 'AVLAND',
       'AVTOT', 'EXLAND', 'EXTOT', 'EXCD1', 'STADDR', 'ZIP', 'EXMPTCL',
       'BLDFRONT', 'BLDDEPTH', 'AVLAND2', 'AVTOT2', 'EXLAND2', 'EXTOT2',
       'EXCD2', 'PERIOD', 'YEAR', 'VALTYPE', 'zip3', 'ltsize', 'bldsize',
       'bldvol', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8', 'r9',
       'r1_zip5', 'r2_zip5', 'r3_zip5', 'r4_zip5', 'r5_zip5', 'r6_zip5',
       'r7_zip5', 'r8_zip5', 'r9_zip5', 'r1_zip3', 'r2_zip3', 'r3_zip3',
       'r4_zip3', 'r5_zip3', 'r6_zip3', 'r7_zip3', 'r8_zip3', 'r9_zip3',
       'r1_taxclass', 'r2_taxclass', 'r3_taxclass', 'r4_taxclass',
       'r5_taxclass', 'r6_taxclass', 'r7_taxclass', 'r8_taxclass',
       'r9_taxclass', 'r1_boro', 'r2_boro', 'r3_boro', 'r4_boro', 'r5_boro',
       'r6_boro', 'r7_boro', 'r8_boro', 'r9_boro'],
      dtype='object')

In [None]:
# Summary statistics for the ratio variables. Columns are selected by name
# pattern (lower-case 'r' followed by a digit), so this cell does not depend on
# a `cols` list having been defined beforehand. describe is actually called
# here; a bare `.describe` only displays the bound method.
property_data.filter(regex=r'^r\d').describe()

In [36]:
# Names of the 45 ratio variables: the nine raw ratios first, followed by the
# same nine under each grouping suffix.
base_ratios = ['r' + str(k) for k in range(1, 10)]
group_suffixes = ['', '_zip5', '_zip3', '_taxclass', '_boro']
cols = [ratio + suffix for suffix in group_suffixes for ratio in base_ratios]

{}
# Write summary statistics rounded to two decimals, one row per variable, to stats_on_vars.csv.
var_stats = property_data[cols].describe().round(2)
var_stats.T.to_csv('stats_on_vars.csv')