# Imports

In [442]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Load Data

In [443]:
data = pd.read_csv('data/DC_Properties.csv')

  interactivity=interactivity, compiler=compiler, result=result)


# Data Cleaning

## remove rows with missing values

In [444]:
# since our many concern is the price of the building, rows without the price have only a small to none value
data = data[data["PRICE"].notnull()]

# remove rows with strange prices
data = data.loc[(data["PRICE"] <= 25000000) & (data["PRICE"] >= 40000)]

# unqualifed data is not representative for the market
data = data[data.QUALIFIED != "U"]

## drop columns

In [445]:
# see above
data = data.drop(["QUALIFIED"], axis=1)
# this column has only one value "2018-07-22 18:01:43" => useless
data = data.drop(["GIS_LAST_MOD_DTTM"], axis=1)
# since the unique id is equal to the row number we don't need it
data = data.drop(["Unnamed: 0"], axis=1)
# many many missing values
data = data.drop(["LIVING_GBA", "CMPLX_NUM"], axis=1)
# these columns have nothing to say
data = data.drop(["STATE", "CITY", "SOURCE", "BLDG_NUM"], axis=1)
# these columns contain only unique texts which can't be computed
data = data.drop(["NATIONALGRID", "FULLADDRESS"], axis=1)
# in our other notebook we proved that there is a realy high (< 0.999) correlation between x, y and longitude and latititude because in both show the longitude and latitdude of a building
data = data.drop(["X", "Y"], axis=1)
# this column has way to many unique values => makes the onehot encoded data really big. It correlates strongly with the other location information 
data = data.drop(["CENSUS_BLOCK"], axis=1)
# we have to many columns with location information
data = data.drop(["ZIPCODE", "ASSESSMENT_NBHD", "CENSUS_TRACT", "WARD", "SQUARE", "QUADRANT"], axis=1)

## data insertion

In [446]:
data.loc[data['YR_RMDL'].isnull(), 'YR_RMDL'] = data['EYB']
data.loc[data['AYB'].isnull(), 'AYB'] = data['EYB']
# set the missing sale date to the year it was build
data["SALEDATE"] = np.where(data["SALEDATE"].isnull(), data["AYB"], data["SALEDATE"])
# set missing values with the mean, mode or median
data.NUM_UNITS[data.NUM_UNITS.isnull()] =  data.NUM_UNITS.mode().iloc[0]
data.STORIES=data.STORIES.round()
data.STORIES[data.STORIES.isnull()] =  data.STORIES.mode().iloc[0]
data.GBA[data.GBA.isnull()] =  data.GBA.mean()
data.STYLE[data.STYLE.isnull()] =  data.STYLE.mode().iloc[0]
data.STRUCT[data.STRUCT.isnull()] =  data.STRUCT.mode().iloc[0]
data.GRADE[data.GRADE.isnull()] =  data.GRADE.mode().iloc[0]
data.CNDTN[data.CNDTN.isnull()] =  data.CNDTN.mode().iloc[0]
data.EXTWALL[data.EXTWALL.isnull()] =  data.EXTWALL.mode().iloc[0]
data.ROOF[data.ROOF.isnull()] =  data.ROOF.mode().iloc[0]
data.INTWALL[data.INTWALL.isnull()] =  data.INTWALL.mode().iloc[0]
data.KITCHENS[data.KITCHENS.isnull()] =  data.KITCHENS.mode().iloc[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexi

In [447]:
# these values make no sense
data.ROOMS = data.ROOMS.astype(int)
data = data[data.ROOMS != 0]
data = data[data.AC != "0"]
data.STORIES = data.STORIES.astype(int)
data = data[data.STORIES < 50]
data = data[data.FIREPLACES < 20]
data = data[data.FIREPLACES < 40]
data = data[data.ASSESSMENT_SUBNBHD.notnull()]

## data convertion

In [448]:
#convert saledate to datetime
data['SALEDATE'] = pd.to_datetime(data['SALEDATE'])
#Calculating the difference in years between Last Sale Date and Year Built
data['SalevYB']=data['SALEDATE'].dt.year - data['AYB']
#Calculating the difference in years between Last Sale Date and Year Improved
data['SalevYI']=data['SALEDATE'].dt.year - data['EYB']

data['SalevYR']=data['SALEDATE'].dt.year - data['YR_RMDL']

data = data.drop(["SALEDATE", "EYB", "AYB", "YR_RMDL"], axis=1)

## final results

In [449]:
missing_values = data.isnull().sum()
print("All Data:")
print("Row Count: " + str(data.shape[0]))
print("Col Count: " + str(data.shape[1]))
print()
print(missing_values)

All Data:
Row Count: 77223
Col Count: 29

BATHRM                0
HF_BATHRM             0
HEAT                  0
AC                    0
NUM_UNITS             0
ROOMS                 0
BEDRM                 0
STORIES               0
PRICE                 0
QUALIFIED             0
SALE_NUM              0
GBA                   0
STYLE                 0
STRUCT                0
GRADE                 0
CNDTN                 0
EXTWALL               0
ROOF                  0
INTWALL               0
KITCHENS              0
FIREPLACES            0
USECODE               0
LANDAREA              0
LATITUDE              0
LONGITUDE             0
ASSESSMENT_SUBNBHD    0
SalevYB               0
SalevYI               0
SalevYR               0
dtype: int64


In [450]:
data.describe()

Unnamed: 0,BATHRM,HF_BATHRM,NUM_UNITS,ROOMS,BEDRM,STORIES,PRICE,SALE_NUM,GBA,KITCHENS,FIREPLACES,USECODE,LANDAREA,LATITUDE,LONGITUDE,SalevYB,SalevYI,SalevYR
count,77223.0,77223.0,77223.0,77223.0,77223.0,77223.0,77223.0,77223.0,77223.0,77223.0,77223.0,77223.0,77223.0,77223.0,77223.0,77223.0,77223.0,77223.0
mean,1.899162,0.455913,1.155148,6.113502,2.724111,2.055022,523844.4,1.891535,1705.527111,1.173588,0.462427,14.485412,2069.942595,38.911831,-77.012934,69.16701,45.22505,18.211841
std,0.979668,0.572225,0.531226,2.612614,1.351563,0.343191,611253.7,1.43426,633.313668,0.564643,0.795199,3.865696,2599.152189,0.029976,0.039791,36.572381,28.254165,23.811674
min,0.0,0.0,0.0,1.0,0.0,0.0,40000.0,1.0,252.0,0.0,0.0,11.0,0.0,38.819731,-77.113909,-14.0,-21.0,-26.0
25%,1.0,0.0,1.0,4.0,2.0,2.0,236600.0,1.0,1360.0,1.0,0.0,11.0,585.0,38.894172,-77.035304,50.0,30.0,1.0
50%,2.0,0.0,1.0,6.0,3.0,2.0,400000.0,1.0,1726.985225,1.0,0.0,13.0,1452.0,38.912721,-77.017776,77.0,45.0,8.0
75%,2.0,1.0,1.0,7.0,3.0,2.0,650000.0,3.0,1726.985225,1.0,1.0,16.0,2519.0,38.930909,-76.987067,96.0,57.0,35.0
max,11.0,11.0,6.0,31.0,20.0,20.0,23960290.0,15.0,14966.0,44.0,13.0,117.0,187301.0,38.982532,-76.909758,262.0,195.0,1995.0


## save to csv

In [451]:
data.to_csv("data/data_cleaned.csv", index=False)