In [2]:
# Python 2 & 3 Compatibility
from __future__ import print_function, division

# Necessary imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
from seaborn import plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

%matplotlib inline

In [3]:
from dbfread import DBF

In [4]:
table = DBF('../data/raw/hennepin_county_metrogis_parcels/hennepin_county_metrogis_parcels.dbf', load=True, raw=True)

In [6]:
cols = table.field_names
cols

['COUNTY_ID',
 'PIN',
 'BLDG_NUM',
 'PREFIX_DIR',
 'PREFIXTYPE',
 'STREETNAME',
 'STREETTYPE',
 'SUFFIX_DIR',
 'UNIT_INFO',
 'CITY',
 'CITY_USPS',
 'ZIP',
 'ZIP4',
 'PLAT_NAME',
 'BLOCK',
 'LOT',
 'ACRES_POLY',
 'ACRES_DEED',
 'USE1_DESC',
 'USE2_DESC',
 'USE3_DESC',
 'USE4_DESC',
 'MULTI_USES',
 'LANDMARK',
 'OWNER_NAME',
 'OWNER_MORE',
 'OWN_ADD_L1',
 'OWN_ADD_L2',
 'OWN_ADD_L3',
 'TAX_NAME',
 'TAX_ADD_L1',
 'TAX_ADD_L2',
 'TAX_ADD_L3',
 'HOMESTEAD',
 'EMV_LAND',
 'EMV_BLDG',
 'EMV_TOTAL',
 'TAX_CAPAC',
 'TOTAL_TAX',
 'SPEC_ASSES',
 'TAX_EXEMPT',
 'XUSE1_DESC',
 'XUSE2_DESC',
 'XUSE3_DESC',
 'XUSE4_DESC',
 'DWELL_TYPE',
 'HOME_STYLE',
 'FIN_SQ_FT',
 'GARAGE',
 'GARAGESQFT',
 'BASEMENT',
 'HEATING',
 'COOLING',
 'YEAR_BUILT',
 'NUM_UNITS',
 'SALE_DATE',
 'SALE_VALUE',
 'SCHOOL_DST',
 'WSHD_DIST',
 'GREEN_ACRE',
 'OPEN_SPACE',
 'AG_PRESERV',
 'AGPRE_ENRD',
 'AGPRE_EXPD',
 'PARC_CODE',
 'Shape_area',
 'Shape_len']

In [7]:
df = pd.DataFrame(table.records)

In [8]:
df.sample(5)

Unnamed: 0,COUNTY_ID,PIN,BLDG_NUM,PREFIX_DIR,PREFIXTYPE,STREETNAME,STREETTYPE,SUFFIX_DIR,UNIT_INFO,CITY,...,SCHOOL_DST,WSHD_DIST,GREEN_ACRE,OPEN_SPACE,AG_PRESERV,AGPRE_ENRD,AGPRE_EXPD,PARC_CODE,Shape_area,Shape_len
192608,b'053',b'053-3011922410056',b'7102 ',b' ',b' ',b'OLIVE LA N ',b' ',b' ',b' ',b'MAPLE GROVE ',...,b'279 ',b'Lower Minnesota River ...,b'N',b'N',b'N',b' ',b' ',b' 0',b' 1.27749839011e+003',b' 1.51534686669e+002'
20117,b'053',b'053-2802924410601',b'52 ',b' ',b' ',b'GROVELAND TER ',b' ',b' ',b'A316 ',b'MINNEAPOLIS ',...,b'001 ',b'Middle Mississippi ...,b'N',b'N',b'N',b' ',b' ',b' 0',b' 6.76664192289e+003',b' 3.83155196280e+002'
309456,b'053',b'053-0602824440102',b'3750 ',b' ',b' ',b'HUNTINGTON AVE S ',b' ',b' ',b' ',b'ST. LOUIS PARK ',...,b'283 ',b'Minnehaha Creek ...,b'N',b'N',b'N',b' ',b' ',b' 0',b' 8.78432493880e+002',b' 1.25042085119e+002'
201688,b'053',b'053-2611922120038',b'76 ',b' ',b' ',b'ADDRESS UNASSIGNED ',b' ',b' ',b' ',b'MAPLE GROVE ',...,b'279 ',b' ...,b'N',b'N',b'N',b' ',b' ',b' 0',b' 1.65218755863e+003',b' 2.20717937983e+002'
213156,b'053',b'053-1811922130017',b'18053 ',b' ',b' ',b'90TH PL N ',b' ',b' ',b' ',b'MAPLE GROVE ',...,b'279 ',b'Lower Minnesota River ...,b'N',b'N',b'N',b' ',b' ',b' 0',b' 7.27030556515e+002',b' 1.15103267059e+002'


In [9]:
df.shape

(429570, 67)

In [17]:
def decode_binary(df, cols):
    decode = lambda x : x.decode('utf-8')
    for col in cols:
        df[col].apply(decode)
        
def decode_string(df, cols):
    decode = lambda x : string(x)
    for col in cols:
        df[col].apply(decode)

In [18]:
decode_binary(df, cols)
df.head()

Unnamed: 0,COUNTY_ID,PIN,BLDG_NUM,PREFIX_DIR,PREFIXTYPE,STREETNAME,STREETTYPE,SUFFIX_DIR,UNIT_INFO,CITY,...,SCHOOL_DST,WSHD_DIST,GREEN_ACRE,OPEN_SPACE,AG_PRESERV,AGPRE_ENRD,AGPRE_EXPD,PARC_CODE,Shape_area,Shape_len
0,b'053',b'053-0911821410021',b'54 ',b' ',b' ',b'ADDRESS UNASSIGNED ',b' ',b' ',b' ',b'CRYSTAL ',...,b'281 ',b'Shingle Creek ...,b'N',b'N',b'N',b' ',b' ',b' 0',b' 7.25247600000e-002',b' 9.55874206426e-001'
1,b'053',b'053-1311821120005',b'4400 1/2 ',b' ',b' ',b'LYNDALE AVE N ',b' ',b' ',b' ',b'MINNEAPOLIS ',...,b'001 ',b'Shingle Creek ...,b'N',b'N',b'N',b' ',b' ',b' 0',b' 7.25380050000e-002',b' 9.55961240593e-001'
2,b'053',b'053-1311724440113',b'2325 ',b' ',b' ',b'MONTCLAIR LA ',b' ',b' ',b' ',b'MOUND ',...,b' ',b' ...,b'N',b'N',b'N',b' ',b' ',b' 0',b' 7.25352550000e-002',b' 9.55943566274e-001'
3,b'053',b'053-2911821440091',b'28 ',b' ',b' ',b'ADDRESS UNASSIGNED ',b' ',b' ',b' ',b'GOLDEN VALLEY ',...,b'281 ',b'Bassett Creek ...,b'N',b'N',b'N',b' ',b' ',b' 0',b' 7.25358400000e-002',b' 9.55947921559e-001'
4,b'053',b'053-0111823440003',b'80 ',b' ',b' ',b'ADDRESS UNASSIGNED ',b' ',b' ',b' ',b'MEDINA ',...,b'284 ',b' ...,b'N',b'N',b'N',b' ',b' ',b' 0',b' 7.25419200000e-002',b' 9.55987121801e-001'


In [None]:
decode_string(df, cols)
df.head()

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429570 entries, 0 to 429569
Data columns (total 67 columns):
COUNTY_ID     429570 non-null object
PIN           429570 non-null object
BLDG_NUM      429570 non-null object
PREFIX_DIR    429570 non-null object
PREFIXTYPE    429570 non-null object
STREETNAME    429570 non-null object
STREETTYPE    429570 non-null object
SUFFIX_DIR    429570 non-null object
UNIT_INFO     429570 non-null object
CITY          429570 non-null object
CITY_USPS     429570 non-null object
ZIP           429570 non-null object
ZIP4          429570 non-null object
PLAT_NAME     429570 non-null object
BLOCK         429570 non-null object
LOT           429570 non-null object
ACRES_POLY    429570 non-null object
ACRES_DEED    429570 non-null object
USE1_DESC     429570 non-null object
USE2_DESC     429570 non-null object
USE3_DESC     429570 non-null object
USE4_DESC     429570 non-null object
MULTI_USES    429570 non-null object
LANDMARK      429570 non-null object
O

In [25]:
type(df['STREETNAME'].ix[1])

bytes

In [None]:
clean_b = lambda x: re.sub(r'(b\')(\w*)(\')',r'\2', x)
census['name'] = census['name'].apply(clean_dashes)

In [22]:
import re
x = "b'LYNDALE AVE N '"
y = re.sub(r'(b\')(\w*)(\')',r'\2', x)
y

"b'LYNDALE AVE N '"