# Imports

In [381]:
import pandas as pd
import numpy as np

# Load Data

In [382]:
# Load the raw DC property records.
# NOTE(review): the recorded output of this cell is the tail of a warning raised
# while loading -- presumably pandas' DtypeWarning about mixed-type columns;
# confirm on a fresh run. low_memory=False parses the file in one pass, so each
# column's dtype is inferred from all of its values instead of chunk by chunk.
data = pd.read_csv('data/DC_Properties.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


# Data Exploration

In [383]:
# shape is (rows, columns): 158957 rows and 49 columns
data.shape

(158957, 49)

In [384]:
# Per-column count of null cells in the raw frame.
missing_values = data.isnull().sum()

# Row total first, then an empty line, then the per-column null counts.
row_total = len(data.index)
print("Row Count: " + str(row_total), end="\n\n")
print(missing_values)

Row Count: 158957

Unnamed: 0                 0
BATHRM                     0
HF_BATHRM                  0
HEAT                       0
AC                         0
NUM_UNITS              52261
ROOMS                      0
BEDRM                      0
AYB                      271
YR_RMDL                78029
EYB                        0
STORIES                52305
SALEDATE               26770
PRICE                  60741
QUALIFIED                  0
SALE_NUM                   0
GBA                    52261
BLDG_NUM                   0
STYLE                  52261
STRUCT                 52261
GRADE                  52261
CNDTN                  52261
EXTWALL                52261
ROOF                   52261
INTWALL                52261
KITCHENS               52262
FIREPLACES                 0
USECODE                    0
LANDAREA                   0
GIS_LAST_MOD_DTTM          0
SOURCE                     0
CMPLX_NUM             106696
LIVING_GBA            106696
FULLADDRESS            5

In [385]:
# Shape of the frame restricted to rows that have a "STYLE" value;
# the remaining 52,261 rows have none.
data.dropna(subset=["STYLE"]).shape

(106696, 49)

In [386]:
# Keep a row when at least one structural attribute is present. The row count
# matches the "STYLE"-only filter, i.e. the same rows also lack a value for
# "STRUCT", "GRADE", "CNDTN", "EXTWALL", "ROOF" and "INTWALL".
structure_cols = ["STRUCT", "STYLE", "GRADE", "CNDTN", "EXTWALL", "ROOF", "INTWALL"]
data[data[structure_cols].notnull().any(axis=1)].shape

(106696, 49)

In [387]:
# Same filter as the (disabled) row-dropping command in the data cleaning section:
# keep the rows in which at least one structural attribute is present.
structure_cols = ["STRUCT", "STYLE", "GRADE", "CNDTN", "EXTWALL", "ROOF", "INTWALL"]
has_structure_info = data[structure_cols].notnull().any(axis=1)
data_dropped_rows = data[has_structure_info]

In [388]:
# Per-column count of null cells once the rows without structural info are gone.
missing_values = data_dropped_rows.isnull().sum()

# Row total first, then an empty line, then the per-column null counts.
row_total = len(data_dropped_rows.index)
print("Row Count: " + str(row_total), end="\n\n")
print(missing_values)

Row Count: 106696

Unnamed: 0                 0
BATHRM                     0
HF_BATHRM                  0
HEAT                       0
AC                         0
NUM_UNITS                  0
ROOMS                      0
BEDRM                      0
AYB                      241
YR_RMDL                57417
EYB                        0
STORIES                   44
SALEDATE               22513
PRICE                  48796
QUALIFIED                  0
SALE_NUM                   0
GBA                        0
BLDG_NUM                   0
STYLE                      0
STRUCT                     0
GRADE                      0
CNDTN                      0
EXTWALL                    0
ROOF                       0
INTWALL                    0
KITCHENS                   1
FIREPLACES                 0
USECODE                    0
LANDAREA                   0
GIS_LAST_MOD_DTTM          0
SOURCE                     0
CMPLX_NUM             106696
LIVING_GBA            106696
FULLADDRESS             

1) In this reduced frame no row has a value for CMPLX_NUM or LIVING_GBA => both columns are useless here.
2) Many values are missing for SALEDATE and YR_RMDL. They may not really be missing: it could be that these houses were simply never sold and/or remodeled before. In the following we will probably handle them this way.

There is a column called "Unnamed: 0", let's see what's in there.

In [389]:
# Show the unnamed first column of the CSV to see what it holds.
data.loc[:, "Unnamed: 0"]

0              0
1              1
2              2
3              3
4              4
5              5
6              6
7              7
8              8
9              9
10            10
11            11
12            12
13            13
14            14
15            15
16            16
17            17
18            18
19            19
20            20
21            21
22            22
23            23
24            24
25            25
26            26
27            27
28            28
29            29
           ...  
158927    158927
158928    158928
158929    158929
158930    158930
158931    158931
158932    158932
158933    158933
158934    158934
158935    158935
158936    158936
158937    158937
158938    158938
158939    158939
158940    158940
158941    158941
158942    158942
158943    158943
158944    158944
158945    158945
158946    158946
158947    158947
158948    158948
158949    158949
158950    158950
158951    158951
158952    158952
158953    158953
158954    1589

It's just the same as the index (probably the unique id column).

In [390]:
# Distinct values found in the STATE column.
pd.unique(data["STATE"])

array(['DC', nan], dtype=object)

In [391]:
# Distinct values found in the CITY column.
pd.unique(data["CITY"])

array(['WASHINGTON', nan], dtype=object)

Since our dataset is only about Washington DC, these columns have no use.

In [392]:
# Distinct values found in the ZIPCODE column.
pd.unique(data["ZIPCODE"])

array([ 20009.,  20037.,  20036.,  20005.,  20052.,  20008.,  20001.,
        20024.,  20006.,  20003.,  20002.,  20007.,  20016.,  20015.,
        20011.,  20010.,  20012.,  20017.,  20018.,  20019.,  20020.,
        20032.,     nan,  20392.,  20004.])

# Data Cleaning

## remove rows with missing values

In [393]:
# Disabled alternative: the command below would throw away ~33% of the data.
# Maybe we will keep the data and run two separate tests instead: one that
# throws away the rows where the data is missing and one that throws away the
# affected columns.
# data = data[data["STRUCT"].notnull() | data["STYLE"].notnull() | data["GRADE"].notnull() | data["CNDTN"].notnull() | data["EXTWALL"].notnull() | data["ROOF"].notnull() | data["INTWALL"].notnull()]

# Our main concern is the price of the building, so rows without a price are of
# little to no value.
data = data.dropna(subset=["PRICE"])

# Only very few rows (~200) are missing a value in one of these columns.
required_cols = [
    "X",
    "Y",
    "QUADRANT",
    "AYB",
    "WARD",
    "ASSESSMENT_NBHD",
    "CENSUS_TRACT",
    "LONGITUDE",
    "LATITUDE",
    "ZIPCODE",
]
data = data.dropna(subset=required_cols)

## drop columns

In [394]:
# Remove, in a single pass, the columns that carry no usable information.
uninformative_cols = [
    # holds only one value, "2018-07-22 18:01:43" => useless
    "GIS_LAST_MOD_DTTM",
    # the unique id equals the row number, so it is not needed
    "Unnamed: 0",
    # a great many missing values
    "LIVING_GBA",
    "CMPLX_NUM",
    # these columns have nothing to say
    "STATE",
    "CITY",
]
data = data.drop(uninformative_cols, axis=1)


## data insertion

In [395]:
# Work out both null masks up front, before any column is overwritten.
sale_date_missing = data["SALEDATE"].isnull()
never_remodeled = data["YR_RMDL"].isnull()

# A missing sale date is replaced by the year the building was built (AYB).
data["SALEDATE"] = np.where(sale_date_missing, data["AYB"], data["SALEDATE"])

# WAS_REMODELED says whether a remodel year was recorded (1) or not (0);
# the gaps in YR_RMDL itself are then filled with the placeholder -1.
data["WAS_REMODELED"] = np.where(never_remodeled, 0, 1)
data["YR_RMDL"] = np.where(never_remodeled, -1, data["YR_RMDL"])

## final results

In [398]:
def print_frame_summary(title, frame, null_counts, leading_rule=False):
    """Print a heading, the row/column counts of a frame and its null counts.

    title        -- heading line printed above the counts
    frame        -- DataFrame whose shape is reported
    null_counts  -- per-column null totals, printed as given
    leading_rule -- when True, first print an empty line, a dashed rule and
                    another empty line to set the summary apart from the previous one
    """
    if leading_rule:
        print()
        print("--------------------------------------------------")
        print()
    print(title)
    print("Row Count: " + str(frame.shape[0]))
    print("Col Count: " + str(frame.shape[1]))
    print()
    print(null_counts)


# Variant 1: the cleaned frame as it is.
missing_values = data.isnull().sum()
print_frame_summary("All Data:", data, missing_values)

# Variant 2: keep every row and drop the columns listed below instead.
sparse_cols = [
    "NUM_UNITS",
    "STORIES",
    "GBA",
    "STYLE",
    "STRUCT",
    "GRADE",
    "CNDTN",
    "EXTWALL",
    "ROOF",
    "INTWALL",
    "KITCHENS",
    "FULLADDRESS",
    "NATIONALGRID",
    "ASSESSMENT_SUBNBHD",
    "CENSUS_BLOCK",
]
data_col = data.drop(sparse_cols, axis=1)
missing_values_col = data_col.isnull().sum()
print_frame_summary("Data with dropped columns:", data_col, missing_values_col, leading_rule=True)

# Variant 3: keep every column and drop the rows that miss any of the columns below.
# NOTE(review): NUM_UNITS and GBA are dropped in variant 2 but are not checked
# here -- confirm they are null in exactly the same rows as the structural columns.
row_filter_cols = [
    "STRUCT",
    "STYLE",
    "GRADE",
    "CNDTN",
    "EXTWALL",
    "ROOF",
    "INTWALL",
    "STORIES",
    "KITCHENS",
    "FULLADDRESS",
    "NATIONALGRID",
    "ASSESSMENT_SUBNBHD",
    "CENSUS_BLOCK",
]
data_row = data.dropna(subset=row_filter_cols)
missing_values_row = data_row.isnull().sum()
print_frame_summary("Data with dropped rows:", data_row, missing_values_row, leading_rule=True)





All Data:
Row Count: 98003
Col Count: 44

BATHRM                    0
HF_BATHRM                 0
HEAT                      0
AC                        0
NUM_UNITS             40299
ROOMS                     0
BEDRM                     0
AYB                       0
YR_RMDL                   0
EYB                       0
STORIES               40332
SALEDATE                  0
PRICE                     0
QUALIFIED                 0
SALE_NUM                  0
GBA                   40299
BLDG_NUM                  0
STYLE                 40299
STRUCT                40299
GRADE                 40299
CNDTN                 40299
EXTWALL               40299
ROOF                  40299
INTWALL               40299
KITCHENS              40300
FIREPLACES                0
USECODE                   0
LANDAREA                  0
SOURCE                    0
FULLADDRESS           40629
ZIPCODE                   0
NATIONALGRID          40625
LATITUDE                  0
LONGITUDE                 0
ASSESS

# Use Logistic Regression