# Dean Property Consulting 2024

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [101]:
df.head()

Unnamed: 0,date,sale_ask,units,floors,sf,annual$
0,2010-01-03,770000.0,1.0,2.5,2722,36033.0
1,2010-01-06,1397478.0,2.0,3.0,3780,63202.0
2,2010-01-10,505000.0,1.0,1.0,800,30089.0
3,2010-01-11,450000.0,1.0,1.0,650,31473.0
4,2010-01-11,1530000.0,3.0,3.0,3200,69400.0


In [130]:
df[df['sf'] == 5164.00]

Unnamed: 0,date,sale_ask,units,floors,sf,annual$
8338,2015-04-07,930000.0,0.0,0.0,5164.0,30668.0


In [142]:
df[df['sf'] == 0.]

Unnamed: 0,date,sale_ask,units,floors,sf,annual$
1059,2012-05-02,2350000.0,0.0,0.0,0.0,74391.0
1060,2012-05-02,2350000.0,0.0,0.0,0.0,74391.0
1498,2012-09-20,1570000.0,0.0,0.0,0.0,45048.0
1517,2012-09-27,1800000.0,0.0,0.0,0.0,51245.0
1518,2012-09-27,1800000.0,0.0,0.0,0.0,51245.0
...,...,...,...,...,...,...
40325,2023-10-26,1965000.0,0.0,1.0,0.0,113609.0
40329,2023-10-26,1400000.0,1.0,0.0,0.0,80943.0
40341,2023-10-27,375000.0,0.0,0.0,0.0,21681.0
40363,2023-11-02,810000.0,0.0,0.0,0.0,45209.0


In [146]:
# load dataset
df = pd.read_csv('../data/df_lownull.csv', parse_dates=True)
df = df.drop(columns=['Unnamed: 0'])

# clean column names (lowercase and remove spaces)
df.columns = [col.lower().replace(' ', '').replace('/', '_') for col in df.columns]

# clean and convert columns
df['annual$'] = df['annual$'].str.replace('$', '').str.replace(',', '').astype(float)
df['date'] = df['date'].astype('datetime64[ns]')
df['sale_ask'] = df['sale_ask'].astype('float')

# clean the 'sf' column before filtering
df['sf'] = (
    df['sf']
    .str.replace(',', '')             # remove commas
    .str.replace(' \t-  ', '')        # remove tabs
    .str.replace(' ', '')             # remove spaces
    .str.replace('$', '')             # remove $ signs
    .str.replace('sf', '')            # remove extra letters
    .replace('', np.nan)              # replace empty strings with nan
)
df['sf'] = df['sf'].astype('float')  # convert 'sf' to float

# drop rows with nan values in specific columns
df = df.dropna(subset=['date', 'sale_ask', 'units', 'floors', 'sf', 'annual$'])

# drop rows where any of 'sf', 'units', or 'floors' equals zero
df = df[~(df[['sf', 'units', 'floors']] == 0).any(axis=1)]

# update and convert remaining columns
df.at[12080, 'units'] = 1
df['units'] = df['units'].astype('float')
df['floors'] = df['floors'].astype('float')

# check results
print(df.shape)
df[df['sf'] == 0]  # should return an empty dataframe

(39687, 6)


Unnamed: 0,date,sale_ask,units,floors,sf,annual$


In [148]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39687 entries, 0 to 40683
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      39687 non-null  datetime64[ns]
 1   sale_ask  39687 non-null  float64       
 2   units     39687 non-null  float64       
 3   floors    39687 non-null  float64       
 4   sf        39687 non-null  float64       
 5   annual$   39687 non-null  float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 3.1 MB


In [149]:
df.head()

Unnamed: 0,date,sale_ask,units,floors,sf,annual$
0,2010-01-03,770000.0,1.0,2.5,2722.0,36033.0
1,2010-01-06,1397478.0,2.0,3.0,3780.0,63202.0
2,2010-01-10,505000.0,1.0,1.0,800.0,30089.0
3,2010-01-11,450000.0,1.0,1.0,650.0,31473.0
4,2010-01-11,1530000.0,3.0,3.0,3200.0,69400.0


In [89]:
for col in df.columns:
    print(col, set(df[col].apply(type)))

date {<class 'pandas._libs.tslibs.timestamps.Timestamp'>}
sale_ask {<class 'float'>}
units {<class 'str'>}
floors {<class 'str'>}
sf {<class 'str'>}
annual$ {<class 'float'>}


In [75]:
df.shape

(40684, 6)

In [76]:
df.head()

Unnamed: 0,date,sale_ask,units,floors,sf,annual$
0,2010-01-03,770000.0,1,2.5,2722,36033.0
1,2010-01-06,1397478.0,2,3.0,3780,63202.0
2,2010-01-10,505000.0,1,1.0,800,30089.0
3,2010-01-11,450000.0,1,1.0,650,31473.0
4,2010-01-11,1530000.0,3,3.0,3200,69400.0


In [77]:
# apply the filter to a column in the DataFrame
desired_dtype = float
filtered_rows = df[df['floors'].apply(lambda x: isinstance(x, desired_dtype))]
filtered_rows

Unnamed: 0,date,sale_ask,units,floors,sf,annual$
5103,2014-06-09,3869350.0,1,,3836.0,127049.0
7614,2015-02-10,1111000.0,,,,33856.0
10106,2015-09-16,830000.0,,,,25293.0
10426,2015-10-08,1100000.0,,,2700.0,33683.0
12136,2016-03-27,275000.0,"1 bdr, 1 bath, 17 weeks, 500 SF, $480 common, ...",,500.0,8715.0
13427,2016-08-07,305000.0,,,500.0,10707.0
13693,2016-09-04,1850000.0,,,,49081.0
14578,2016-12-04,1850000.0,,,,53357.0
14579,2016-12-05,1900000.0,,,,54799.0


In [79]:
df.at[12136, 'units'] = 1
# apply the filter to a column in the DataFrame
desired_dtype = float
filtered_rows = df[df['floors'].apply(lambda x: isinstance(x, desired_dtype))]
filtered_rows

Unnamed: 0,date,sale_ask,units,floors,sf,annual$
5103,2014-06-09,3869350.0,1.0,,3836.0,127049.0
7614,2015-02-10,1111000.0,,,,33856.0
10106,2015-09-16,830000.0,,,,25293.0
10426,2015-10-08,1100000.0,,,2700.0,33683.0
12136,2016-03-27,275000.0,1.0,,500.0,8715.0
13427,2016-08-07,305000.0,,,500.0,10707.0
13693,2016-09-04,1850000.0,,,,49081.0
14578,2016-12-04,1850000.0,,,,53357.0
14579,2016-12-05,1900000.0,,,,54799.0


In [86]:
# apply the filter to a column in the DataFrame
desired_dtype = str
filtered_rows = df[df['floors'].apply(lambda x: isinstance(x, desired_dtype))]
filtered_rows.shape

(40654, 6)

In [71]:
for col in df.columns:
    print(col, set(df[col].apply(type)))

date {<class 'pandas._libs.tslibs.timestamps.Timestamp'>, <class 'pandas._libs.tslibs.nattype.NaTType'>}
sale_ask {<class 'float'>}
units {<class 'int'>, <class 'str'>}
floors {<class 'float'>, <class 'str'>}
sf {<class 'float'>, <class 'str'>}
annual$ {<class 'float'>}


In [34]:
for col in df.columns:
    print('column:', col)
    print('unique values:', df[col].unique())
    print('minimum:', df[col].min())
    print('maximum:', df[col].max())
    print('\n')

column: date
unique values: ['2010-01-03' '2010-01-06' '2010-01-10' ... '2023-12-12' '2023-12-13'
 '2023-12-14']
minimum: 2010-01-03
maximum: 2023-12-14


column: sale_ask
unique values: [ 770000. 1397478.  505000. ... 2924000. 2035482. 1366000.]
minimum: 0.0
maximum: 102000000.0


column: units
unique values: ['1' '2' '3' '8' '4' '5' '7' '6' '10' '35' '53' '12' '0' '9' '19' '24'
 '40' '21' '36' '26' '16' '20' nan '14' '60' '23' '33' '15' '37' '13' '30'
 '17' '11' '27' '18'
 '2 bdr, 1 bath, 2 weeks, common $445, tax $1968, Corcoran'
 '1 bdr, 1 bath, 17 weeks, 500 SF, $480 common, 57% tax Corcoran' '192'
 '22' '25' '31' '80' '41' '32' '68' '134' '34' '42' '54' '70' '49' '28'
 '48' '43' '39' '38' '29' '62' '46' '56' '44' '52' '47' '51' '86' '167']


TypeError: '<=' not supported between instances of 'str' and 'float'