# Prepare

### Project Requirements

- Data Prep: Column data types are appropriate for the data they contain
- Data Prep: Missing values are investigated and handled
- Data Prep: Outliers are investigated and handled

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from env import get_db_url
import acquire
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = acquire.get_zillow_data()

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77380 entries, 0 to 77379
Data columns (total 68 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            77380 non-null  int64  
 1   parcelid                      77380 non-null  int64  
 2   airconditioningtypeid         24953 non-null  float64
 3   architecturalstyletypeid      206 non-null    float64
 4   basementsqft                  50 non-null     float64
 5   bathroomcnt                   77380 non-null  float64
 6   bedroomcnt                    77380 non-null  float64
 7   buildingclasstypeid           15 non-null     float64
 8   buildingqualitytypeid         49671 non-null  float64
 9   calculatedbathnbr             76771 non-null  float64
 10  decktypeid                    614 non-null    float64
 11  finishedfloor1squarefeet      6023 non-null   float64
 12  calculatedfinishedsquarefeet  77184 non-null  float64
 13  f

In [4]:
# sets thresh hold to 50 percent nulls
threshold = df.shape[0] * .50

# remove columns with high nulls
df = df.dropna(axis=1, thresh=threshold)

df.columns

Index(['id', 'parcelid', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid',
       'calculatedbathnbr', 'calculatedfinishedsquarefeet',
       'finishedsquarefeet12', 'fips', 'fullbathcnt', 'heatingorsystemtypeid',
       'latitude', 'longitude', 'lotsizesquarefeet',
       'propertycountylandusecode', 'propertylandusetypeid',
       'propertyzoningdesc', 'rawcensustractandblock', 'regionidcity',
       'regionidcounty', 'regionidzip', 'roomcnt', 'unitcnt', 'yearbuilt',
       'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear',
       'landtaxvaluedollarcnt', 'taxamount', 'censustractandblock', 'logerror',
       'transactiondate', 'heatingorsystemdesc', 'propertylandusedesc'],
      dtype='object')

In [5]:
df.propertylandusedesc.value_counts()

Single Family Residential                     52319
Condominium                                   19294
Duplex (2 Units, Any Combination)              2009
Planned Unit Development                       1944
Quadruplex (4 Units, Any Combination)           727
Triplex (3 Units, Any Combination)              535
Cluster Home                                    333
Mobile Home                                      74
Manufactured, Modular, Prefabricated Homes       58
Residential General                              37
Cooperative                                      29
Commercial/Office/Residential Mixed Used         15
Townhouse                                         6
Name: propertylandusedesc, dtype: int64

#### should only need to analyze single family residental

In [6]:
# only list single family residental
df = df [(df.propertylandusedesc == 'Single Family Residential')]

In [7]:
df.bedroomcnt.value_counts()

3.0     23302
4.0     15210
2.0      8317
5.0      3967
6.0       634
1.0       611
0.0       135
7.0       106
8.0        24
9.0         8
10.0        2
14.0        1
11.0        1
12.0        1
Name: bedroomcnt, dtype: int64

In [8]:
df.bathroomcnt.value_counts()

2.0     21839
3.0     10650
1.0      9536
2.5      3932
4.0      2225
3.5       916
1.5       839
5.0       803
4.5       686
6.0       320
5.5       224
0.0       119
7.0        88
8.0        53
6.5        47
7.5        16
9.0        13
10.0        5
11.0        3
8.5         3
18.0        1
13.0        1
Name: bathroomcnt, dtype: int64

In [9]:
### lets only anaylze >0 and <6
df = df[(df.bedroomcnt <= 6 ) & (df.bathroomcnt <= 6 ) & (df.bedroomcnt > 0 ) & (df.bathroomcnt > 0) ]

In [10]:
df.bedroomcnt.value_counts()

3.0    23294
4.0    15189
2.0     8315
5.0     3891
1.0      607
6.0      545
Name: bedroomcnt, dtype: int64

In [11]:
df.bathroomcnt.value_counts()

2.0    21827
3.0    10621
1.0     9518
2.5     3930
4.0     2202
3.5      913
1.5      839
5.0      787
4.5      679
6.0      302
5.5      223
Name: bathroomcnt, dtype: int64

In [12]:
# keep only properties less than 3000 square feet. same as regression project.
df = df[df.calculatedfinishedsquarefeet <= 3000 ]    

In [13]:
# keep only properties less than 1m.
df = df[df.taxvaluedollarcnt <= 100000]  

In [14]:
df.unitcnt.value_counts()

1.0    4899
Name: unitcnt, dtype: int64

In [15]:
df.roomcnt.value_counts()
#same as bedrooms. will drop

0.0     4925
6.0      584
7.0      442
5.0      247
8.0      204
4.0       68
9.0       48
3.0        8
10.0       6
2.0        2
11.0       1
Name: roomcnt, dtype: int64

In [16]:
df.calculatedbathnbr.value_counts()
# already showed in bathrooms as a float

2.0    3593
1.0    2390
3.0     281
2.5     134
1.5     131
4.0       5
3.5       1
Name: calculatedbathnbr, dtype: int64

In [17]:
df.propertylandusedesc.value_counts()

Single Family Residential    6535
Name: propertylandusedesc, dtype: int64

In [18]:
df = df.dropna()

In [19]:
df

Unnamed: 0,id,parcelid,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate,heatingorsystemdesc,propertylandusedesc
28,707014,12036177,2.0,3.0,4.0,2.0,1851.0,1851.0,6037.0,2.0,...,32264.0,85035.0,2016.0,52771.0,1232.08,6.037189e+13,0.206470,2017-01-02,Floor/Wall,Single Family Residential
44,930752,12106936,2.0,3.0,6.0,2.0,1447.0,1447.0,6037.0,2.0,...,57734.0,70755.0,2016.0,13021.0,1050.39,6.037462e+13,0.075156,2017-01-02,Floor/Wall,Single Family Residential
67,1628261,11016518,2.0,4.0,8.0,2.0,1625.0,1625.0,6037.0,2.0,...,54667.0,97097.0,2016.0,42430.0,1307.93,6.037107e+13,-0.060470,2017-01-02,Central,Single Family Residential
70,1061201,11018202,1.0,2.0,4.0,1.0,812.0,812.0,6037.0,1.0,...,21513.0,40746.0,2016.0,19233.0,668.70,6.037110e+13,0.021262,2017-01-02,Floor/Wall,Single Family Residential
86,843278,12579560,1.0,2.0,4.0,1.0,1027.0,1027.0,6037.0,1.0,...,27521.0,49034.0,2016.0,21513.0,1179.11,6.037544e+13,-0.025721,2017-01-02,Floor/Wall,Single Family Residential
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77356,620802,12885530,1.0,2.0,4.0,1.0,820.0,820.0,6037.0,1.0,...,49067.0,60841.0,2016.0,11774.0,904.27,6.037403e+13,0.003457,2017-09-19,Floor/Wall,Single Family Residential
77365,347372,11608641,2.0,2.0,6.0,2.0,1281.0,1281.0,6037.0,2.0,...,24621.0,97499.0,2016.0,72878.0,1391.79,6.037269e+13,-0.816510,2017-09-19,Central,Single Family Residential
77367,2747021,11318911,1.0,1.0,5.0,1.0,624.0,624.0,6037.0,1.0,...,49024.0,81706.0,2016.0,32682.0,1131.24,6.037901e+13,0.013268,2017-09-19,Floor/Wall,Single Family Residential
77373,1373391,10722691,2.0,3.0,6.0,2.0,1570.0,1570.0,6037.0,2.0,...,46784.0,72026.0,2016.0,25242.0,1000.70,6.037135e+13,0.081196,2017-09-19,Central,Single Family Residential


In [21]:
df.shape

(4716, 34)

In [23]:
df =df.drop(columns= ['finishedsquarefeet12', 'fullbathcnt', 'calculatedbathnbr',
                      'propertyzoningdesc', 'unitcnt', 'propertylandusedesc',
                      'assessmentyear', 'roomcnt', 'regionidcounty', 'propertylandusetypeid',
                      'heatingorsystemtypeid', 'id', 'heatingorsystemdesc', 'buildingqualitytypeid'],axis=1)

In [26]:
df.shape

(4716, 20)

In [27]:
df.columns

Index(['parcelid', 'bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet',
       'fips', 'latitude', 'longitude', 'lotsizesquarefeet',
       'propertycountylandusecode', 'rawcensustractandblock', 'regionidcity',
       'regionidzip', 'yearbuilt', 'structuretaxvaluedollarcnt',
       'taxvaluedollarcnt', 'landtaxvaluedollarcnt', 'taxamount',
       'censustractandblock', 'logerror', 'transactiondate'],
      dtype='object')

In [37]:
def overview(df):
    print('--- Shape: {}'.format(df.shape))
    print('--- Info')
    df.info()
    print('--- Column Descriptions')
    print(df.describe(include='all'))
    return df

In [38]:
overview(df)

--- Shape: (4716, 20)
--- Info
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4716 entries, 28 to 77378
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      4716 non-null   int64  
 1   bathroomcnt                   4716 non-null   float64
 2   bedroomcnt                    4716 non-null   float64
 3   calculatedfinishedsquarefeet  4716 non-null   float64
 4   fips                          4716 non-null   float64
 5   latitude                      4716 non-null   float64
 6   longitude                     4716 non-null   float64
 7   lotsizesquarefeet             4716 non-null   float64
 8   propertycountylandusecode     4716 non-null   object 
 9   rawcensustractandblock        4716 non-null   float64
 10  regionidcity                  4716 non-null   float64
 11  regionidzip                   4716 non-null   float64
 12  yearbuilt                    

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fips,latitude,longitude,lotsizesquarefeet,propertycountylandusecode,rawcensustractandblock,regionidcity,regionidzip,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate
28,12036177,2.0,3.0,1851.0,6037.0,34103373.0,-118293280.0,6714.0,0100,6.037189e+07,12447.0,96008.0,1920.0,32264.0,85035.0,52771.0,1232.08,6.037189e+13,0.206470,2017-01-02
44,12106936,2.0,3.0,1447.0,6037.0,34166370.0,-118151336.0,7283.0,0100,6.037462e+07,47019.0,96291.0,1901.0,57734.0,70755.0,13021.0,1050.39,6.037462e+13,0.075156,2017-01-02
67,11016518,2.0,4.0,1625.0,6037.0,34282275.0,-118492692.0,8427.0,0100,6.037107e+07,12447.0,96370.0,1956.0,54667.0,97097.0,42430.0,1307.93,6.037107e+13,-0.060470,2017-01-02
70,11018202,1.0,2.0,812.0,6037.0,34276431.0,-118447368.0,7150.0,0100,6.037110e+07,12447.0,96366.0,1917.0,21513.0,40746.0,19233.0,668.70,6.037110e+13,0.021262,2017-01-02
86,12579560,1.0,2.0,1027.0,6037.0,33816016.0,-118271776.0,5574.0,0100,6.037544e+07,10723.0,96229.0,1951.0,27521.0,49034.0,21513.0,1179.11,6.037544e+13,-0.025721,2017-01-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77356,12885530,1.0,2.0,820.0,6037.0,34044147.0,-117762663.0,5717.0,0100,6.037403e+07,20008.0,96506.0,1949.0,49067.0,60841.0,11774.0,904.27,6.037403e+13,0.003457,2017-09-19
77365,11608641,2.0,2.0,1281.0,6037.0,34032660.0,-118400587.0,12000.0,0100,6.037269e+07,12447.0,96015.0,1954.0,24621.0,97499.0,72878.0,1391.79,6.037269e+13,-0.816510,2017-09-19
77367,11318911,1.0,1.0,624.0,6037.0,34667122.0,-118222003.0,45398.0,0100,6.037901e+07,5534.0,97319.0,1944.0,49024.0,81706.0,32682.0,1131.24,6.037901e+13,0.013268,2017-09-19
77373,10722691,2.0,3.0,1570.0,6037.0,34194943.0,-118629218.0,7499.0,0100,6.037135e+07,12447.0,96342.0,1958.0,46784.0,72026.0,25242.0,1000.70,6.037135e+13,0.081196,2017-09-19


In [40]:
#adding same features from regression project
df['transactiondate'] = df.transactiondate.astype('str')
df['transaction_month'] = df.transactiondate.str.split('-',expand=True)[1]

In [41]:
df

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fips,latitude,longitude,lotsizesquarefeet,propertycountylandusecode,rawcensustractandblock,...,regionidzip,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate,transaction_month
28,12036177,2.0,3.0,1851.0,6037.0,34103373.0,-118293280.0,6714.0,0100,6.037189e+07,...,96008.0,1920.0,32264.0,85035.0,52771.0,1232.08,6.037189e+13,0.206470,2017-01-02,01
44,12106936,2.0,3.0,1447.0,6037.0,34166370.0,-118151336.0,7283.0,0100,6.037462e+07,...,96291.0,1901.0,57734.0,70755.0,13021.0,1050.39,6.037462e+13,0.075156,2017-01-02,01
67,11016518,2.0,4.0,1625.0,6037.0,34282275.0,-118492692.0,8427.0,0100,6.037107e+07,...,96370.0,1956.0,54667.0,97097.0,42430.0,1307.93,6.037107e+13,-0.060470,2017-01-02,01
70,11018202,1.0,2.0,812.0,6037.0,34276431.0,-118447368.0,7150.0,0100,6.037110e+07,...,96366.0,1917.0,21513.0,40746.0,19233.0,668.70,6.037110e+13,0.021262,2017-01-02,01
86,12579560,1.0,2.0,1027.0,6037.0,33816016.0,-118271776.0,5574.0,0100,6.037544e+07,...,96229.0,1951.0,27521.0,49034.0,21513.0,1179.11,6.037544e+13,-0.025721,2017-01-02,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77356,12885530,1.0,2.0,820.0,6037.0,34044147.0,-117762663.0,5717.0,0100,6.037403e+07,...,96506.0,1949.0,49067.0,60841.0,11774.0,904.27,6.037403e+13,0.003457,2017-09-19,09
77365,11608641,2.0,2.0,1281.0,6037.0,34032660.0,-118400587.0,12000.0,0100,6.037269e+07,...,96015.0,1954.0,24621.0,97499.0,72878.0,1391.79,6.037269e+13,-0.816510,2017-09-19,09
77367,11318911,1.0,1.0,624.0,6037.0,34667122.0,-118222003.0,45398.0,0100,6.037901e+07,...,97319.0,1944.0,49024.0,81706.0,32682.0,1131.24,6.037901e+13,0.013268,2017-09-19,09
77373,10722691,2.0,3.0,1570.0,6037.0,34194943.0,-118629218.0,7499.0,0100,6.037135e+07,...,96342.0,1958.0,46784.0,72026.0,25242.0,1000.70,6.037135e+13,0.081196,2017-09-19,09


In [42]:
df['tax_rate'] = (df.taxamount/df.taxvaluedollarcnt) * 100

In [43]:
df

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fips,latitude,longitude,lotsizesquarefeet,propertycountylandusecode,rawcensustractandblock,...,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate,transaction_month,tax_rate
28,12036177,2.0,3.0,1851.0,6037.0,34103373.0,-118293280.0,6714.0,0100,6.037189e+07,...,1920.0,32264.0,85035.0,52771.0,1232.08,6.037189e+13,0.206470,2017-01-02,01,1.448909
44,12106936,2.0,3.0,1447.0,6037.0,34166370.0,-118151336.0,7283.0,0100,6.037462e+07,...,1901.0,57734.0,70755.0,13021.0,1050.39,6.037462e+13,0.075156,2017-01-02,01,1.484545
67,11016518,2.0,4.0,1625.0,6037.0,34282275.0,-118492692.0,8427.0,0100,6.037107e+07,...,1956.0,54667.0,97097.0,42430.0,1307.93,6.037107e+13,-0.060470,2017-01-02,01,1.347034
70,11018202,1.0,2.0,812.0,6037.0,34276431.0,-118447368.0,7150.0,0100,6.037110e+07,...,1917.0,21513.0,40746.0,19233.0,668.70,6.037110e+13,0.021262,2017-01-02,01,1.641143
86,12579560,1.0,2.0,1027.0,6037.0,33816016.0,-118271776.0,5574.0,0100,6.037544e+07,...,1951.0,27521.0,49034.0,21513.0,1179.11,6.037544e+13,-0.025721,2017-01-02,01,2.404678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77356,12885530,1.0,2.0,820.0,6037.0,34044147.0,-117762663.0,5717.0,0100,6.037403e+07,...,1949.0,49067.0,60841.0,11774.0,904.27,6.037403e+13,0.003457,2017-09-19,09,1.486284
77365,11608641,2.0,2.0,1281.0,6037.0,34032660.0,-118400587.0,12000.0,0100,6.037269e+07,...,1954.0,24621.0,97499.0,72878.0,1391.79,6.037269e+13,-0.816510,2017-09-19,09,1.427492
77367,11318911,1.0,1.0,624.0,6037.0,34667122.0,-118222003.0,45398.0,0100,6.037901e+07,...,1944.0,49024.0,81706.0,32682.0,1131.24,6.037901e+13,0.013268,2017-09-19,09,1.384525
77373,10722691,2.0,3.0,1570.0,6037.0,34194943.0,-118629218.0,7499.0,0100,6.037135e+07,...,1958.0,46784.0,72026.0,25242.0,1000.70,6.037135e+13,0.081196,2017-09-19,09,1.389359


In [44]:
    df['yearbuilt'] = df['yearbuilt'].astype(int)
    df["bedroomcnt"] = df["bedroomcnt"].astype(int)
    df["calculatedfinishedsquarefeet"] = df["calculatedfinishedsquarefeet"].astype(int)
    df["fips"] = df["fips"].astype(int)
    df["lotsizesquarefeet"] = df["lotsizesquarefeet"].astype(int)
    df["rawcensustractandblock"] = df["rawcensustractandblock"].astype(int)
    df["regionidcity"] = df["regionidcity"].astype(int)
    df["regionidzip"] = df["regionidzip"].astype(int)
    df["censustractandblock"] = df["censustractandblock"].astype(int)
    df["structuretaxvaluedollarcnt"] = df["structuretaxvaluedollarcnt"].astype(int)
    df["taxvaluedollarcnt"] = df["taxvaluedollarcnt"].astype(int)
    df["landtaxvaluedollarcnt"] = df["landtaxvaluedollarcnt"].astype(int)
    df["taxamount"] = df["taxamount"].astype(int)
    df.yearbuilt = df.yearbuilt.astype(object) 
    df['age'] = 2017-df['yearbuilt']
    df = df.drop(columns='yearbuilt')
    df['age'] = df['age'].astype('int')

In [45]:
overview(df)

--- Shape: (4716, 22)
--- Info
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4716 entries, 28 to 77378
Data columns (total 22 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      4716 non-null   int64  
 1   bathroomcnt                   4716 non-null   float64
 2   bedroomcnt                    4716 non-null   int64  
 3   calculatedfinishedsquarefeet  4716 non-null   int64  
 4   fips                          4716 non-null   int64  
 5   latitude                      4716 non-null   float64
 6   longitude                     4716 non-null   float64
 7   lotsizesquarefeet             4716 non-null   int64  
 8   propertycountylandusecode     4716 non-null   object 
 9   rawcensustractandblock        4716 non-null   int64  
 10  regionidcity                  4716 non-null   int64  
 11  regionidzip                   4716 non-null   int64  
 12  structuretaxvaluedollarcnt   

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fips,latitude,longitude,lotsizesquarefeet,propertycountylandusecode,rawcensustractandblock,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount,censustractandblock,logerror,transactiondate,transaction_month,tax_rate,age
28,12036177,2.0,3,1851,6037,34103373.0,-118293280.0,6714,0100,60371892,...,32264,85035,52771,1232,60371892012001,0.206470,2017-01-02,01,1.448909,97
44,12106936,2.0,3,1447,6037,34166370.0,-118151336.0,7283,0100,60374616,...,57734,70755,13021,1050,60374616001002,0.075156,2017-01-02,01,1.484545,116
67,11016518,2.0,4,1625,6037,34282275.0,-118492692.0,8427,0100,60371066,...,54667,97097,42430,1307,60371066461002,-0.060470,2017-01-02,01,1.347034,61
70,11018202,1.0,2,812,6037,34276431.0,-118447368.0,7150,0100,60371095,...,21513,40746,19233,668,60371095003018,0.021262,2017-01-02,01,1.641143,100
86,12579560,1.0,2,1027,6037,33816016.0,-118271776.0,5574,0100,60375437,...,27521,49034,21513,1179,60375437025006,-0.025721,2017-01-02,01,2.404678,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77356,12885530,1.0,2,820,6037,34044147.0,-117762663.0,5717,0100,60374025,...,49067,60841,11774,904,60374025022022,0.003457,2017-09-19,09,1.486284,68
77365,11608641,2.0,2,1281,6037,34032660.0,-118400587.0,12000,0100,60372690,...,24621,97499,72878,1391,60372690003005,-0.816510,2017-09-19,09,1.427492,63
77367,11318911,1.0,1,624,6037,34667122.0,-118222003.0,45398,0100,60379011,...,49024,81706,32682,1131,60379011012002,0.013268,2017-09-19,09,1.384525,73
77373,10722691,2.0,3,1570,6037,34194943.0,-118629218.0,7499,0100,60371352,...,46784,72026,25242,1000,60371352011004,0.081196,2017-09-19,09,1.389359,59


In [46]:
df.isnull().sum()

parcelid                        0
bathroomcnt                     0
bedroomcnt                      0
calculatedfinishedsquarefeet    0
fips                            0
latitude                        0
longitude                       0
lotsizesquarefeet               0
propertycountylandusecode       0
rawcensustractandblock          0
regionidcity                    0
regionidzip                     0
structuretaxvaluedollarcnt      0
taxvaluedollarcnt               0
landtaxvaluedollarcnt           0
taxamount                       0
censustractandblock             0
logerror                        0
transactiondate                 0
transaction_month               0
tax_rate                        0
age                             0
dtype: int64

In [49]:
def get_upper_outliers(s, k):
    '''
    Given a series and a cutoff value, k, returns the upper outliers for the
    series.

    The values returned will be either 0 (if the point is not an outlier), or a
    number that indicates how far away from the upper bound the observation is.
    '''
    q1, q3 = s.quantile([.25, .75])
    iqr = q3 - q1
    upper_bound = q3 + k * iqr
    return s.apply(lambda x: max([x - upper_bound, 0]))

def add_upper_outlier_columns(df, k):
    '''
    Add a column with the suffix _outliers for all the numeric columns
    in the given dataframe.
    '''
    # outlier_cols = {col + '_outliers': get_upper_outliers(df[col], k)
    #                 for col in df.select_dtypes('number')}
    # return df.assign(**outlier_cols)

    for col in df.select_dtypes('number'):
        df[col + '_outliers'] = get_upper_outliers(df[col], k)

    return df

add_upper_outlier_columns(df, k=1.5)

df.head()

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,fips,latitude,longitude,lotsizesquarefeet,propertycountylandusecode,rawcensustractandblock,...,regionidcity_outliers,regionidzip_outliers,structuretaxvaluedollarcnt_outliers,taxvaluedollarcnt_outliers,landtaxvaluedollarcnt_outliers,taxamount_outliers,censustractandblock_outliers,logerror_outliers,tax_rate_outliers,age_outliers
28,12036177,2.0,3,1851,6037,34103373.0,-118293280.0,6714,100,60371892,...,0.0,0.0,0,0,0.0,0.0,0,0.033818,0.0,1.0
44,12106936,2.0,3,1447,6037,34166370.0,-118151336.0,7283,100,60374616,...,0.0,0.0,0,0,0.0,0.0,0,0.0,0.0,20.0
67,11016518,2.0,4,1625,6037,34282275.0,-118492692.0,8427,100,60371066,...,0.0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0
70,11018202,1.0,2,812,6037,34276431.0,-118447368.0,7150,100,60371095,...,0.0,0.0,0,0,0.0,0.0,0,0.0,0.0,4.0
86,12579560,1.0,2,1027,6037,33816016.0,-118271776.0,5574,100,60375437,...,0.0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0


In [50]:
outlier_cols = [col for col in df if col.endswith('_outliers')]
for col in outlier_cols:
    print('~~~\n' + col)
    data = df[col][df[col] > 0]
    print(data.describe())

~~~
parcelid_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: parcelid_outliers, dtype: float64
~~~
bathroomcnt_outliers
count    3.0
mean     0.5
std      0.0
min      0.5
25%      0.5
50%      0.5
75%      0.5
max      0.5
Name: bathroomcnt_outliers, dtype: float64
~~~
bedroomcnt_outliers
count    56.000000
mean      0.607143
std       0.312094
min       0.500000
25%       0.500000
50%       0.500000
75%       0.500000
max       1.500000
Name: bedroomcnt_outliers, dtype: float64
~~~
calculatedfinishedsquarefeet_outliers
count     66.000000
mean     228.852273
std      197.802819
min        6.125000
25%       77.375000
50%      167.125000
75%      314.875000
max      790.125000
Name: calculatedfinishedsquarefeet_outliers, dtype: float64
~~~
fips_outliers
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: fips_outliers, dtype: float64
~~~
latitude_outliers
co

In [52]:
#removing outliers
df = remove_outliers(df, 3, ['lotsizesquarefeet', 'structuretaxvaluedollarcnt','rawcensustractandblock']) 

# Put all together in prepare.py