# D.C. Properties - Fixing the data

This notebook fixes the most important columns of the DC Properties dataset and selects the most relevant columns.

## Imports and Config setting

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

## Data loading and Selection

Define a series of parameters that will be used in the notebook

In [3]:
# Notebook parameters: file locations used by this cleaning stage
# Raw CSV that is read when the data is loaded
input_data_path = "0_dc_properties_raw_zipped.csv"
# Fixed CSV that is written when the results are saved
output_data_path = "1_dc_properties_fixed_zipped.csv"


Load the data file and give a preview of it

In [4]:
# Read the zip-compressed raw CSV; its first column is used as the row index
data_df = pd.read_csv(
    input_data_path,
    compression='zip',
    index_col=0,
    low_memory=False,
)
data_df

Unnamed: 0,BATHRM,HF_BATHRM,HEAT,AC,NUM_UNITS,ROOMS,BEDRM,AYB,YR_RMDL,EYB,STORIES,SALEDATE,PRICE,QUALIFIED,SALE_NUM,GBA,BLDG_NUM,STYLE,STRUCT,GRADE,CNDTN,EXTWALL,ROOF,INTWALL,KITCHENS,FIREPLACES,USECODE,LANDAREA,GIS_LAST_MOD_DTTM,SOURCE,CMPLX_NUM,LIVING_GBA,FULLADDRESS,CITY,STATE,ZIPCODE,NATIONALGRID,LATITUDE,LONGITUDE,ASSESSMENT_NBHD,ASSESSMENT_SUBNBHD,CENSUS_TRACT,CENSUS_BLOCK,WARD,SQUARE,X,Y,QUADRANT
0,4,0,Warm Cool,Y,2.0,8,4,1910.0,1988.0,1972,3.0,2003-11-25 00:00:00,1095000.0,Q,1,2522.0,1,3 Story,Row Inside,Very Good,Good,Common Brick,Metal- Sms,Hardwood,2.0,5,24,1680,2018-07-22 18:01:43,Residential,,,1748 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23061 09289,38.914680,-77.040832,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW
1,3,1,Warm Cool,Y,2.0,11,5,1898.0,2007.0,1972,3.0,2000-08-17 00:00:00,,U,1,2567.0,1,3 Story,Row Inside,Very Good,Good,Common Brick,Built Up,Hardwood,2.0,4,24,1680,2018-07-22 18:01:43,Residential,,,1746 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23067 09289,38.914683,-77.040764,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW
2,3,1,Hot Water Rad,Y,2.0,9,5,1910.0,2009.0,1984,3.0,2016-06-21 00:00:00,2100000.0,Q,3,2522.0,1,3 Story,Row Inside,Very Good,Very Good,Common Brick,Built Up,Hardwood,2.0,4,24,1680,2018-07-22 18:01:43,Residential,,,1744 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23074 09289,38.914684,-77.040678,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW
3,3,1,Hot Water Rad,Y,2.0,8,5,1900.0,2003.0,1984,3.0,2006-07-12 00:00:00,1602000.0,Q,1,2484.0,1,3 Story,Row Inside,Very Good,Good,Common Brick,Built Up,Hardwood,2.0,3,24,1680,2018-07-22 18:01:43,Residential,,,1742 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23078 09288,38.914683,-77.040629,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW
4,2,1,Warm Cool,Y,1.0,11,3,1913.0,2012.0,1985,3.0,,,U,1,5255.0,1,3 Story,Semi-Detached,Very Good,Good,Common Brick,Neopren,Hardwood,1.0,0,13,2032,2018-07-22 18:01:43,Residential,,,1804 NEW HAMPSHIRE AVENUE NW,WASHINGTON,DC,20009.0,18S UJ 23188 09253,38.914383,-77.039361,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158952,1,0,Forced Air,Y,,3,1,1938.0,2006.0,1938,,2015-04-03 00:00:00,399900.0,Q,4,,1,,,,,,,,,0,16,394,2018-07-22 18:01:38,Condominium,2786.0,639.0,,,,20001.0,,38.911840,-77.019420,Old City 2,040 B Old City 2,4801.0,,Ward 6,0477,-77.019422,38.911848,NW
158953,1,0,Forced Air,Y,,4,2,1938.0,2006.0,1938,,2013-10-04 00:00:00,416000.0,Q,1,,1,,,,,,,,,0,16,506,2018-07-22 18:01:38,Condominium,2786.0,820.0,,,,20001.0,,38.911840,-77.019420,Old City 2,040 B Old City 2,4801.0,,Ward 6,0477,-77.019422,38.911848,NW
158954,2,0,Forced Air,Y,,4,2,1920.0,2007.0,1920,,2008-09-30 00:00:00,600000.0,U,1,,1,,,,,,,,,0,16,467,2018-07-22 18:01:38,Condominium,2880.0,1167.0,,,,20001.0,,38.911840,-77.019420,Old City 2,040 B Old City 2,4801.0,,Ward 6,0477,-77.019422,38.911848,NW
158955,1,0,Warm Cool,Y,,2,0,1965.0,,1965,,2015-04-14 00:00:00,215100.0,Q,3,,1,,,,,,,,,0,17,332,2018-07-22 18:01:38,Condominium,2275.0,447.0,,,,20024.0,,38.872953,-77.018230,Southwest Waterfront,,11000.0,,Ward 6,0504,-77.018232,38.872961,SW


## Drop some nulls

One of the most important aspects of this dataset is the condition of the building. Later on we will try to predict that condition, so we drop the rows that don't have a value for it.

In [5]:
# Keep only the rows whose condition (CNDTN) is known
has_condition = data_df['CNDTN'].notnull()
data_df = data_df[has_condition]

In [6]:
# Count the missing values in each column
data_df.isna().sum()

BATHRM                     0
HF_BATHRM                  0
HEAT                       0
AC                         0
NUM_UNITS                  0
ROOMS                      0
BEDRM                      0
AYB                      241
YR_RMDL                57417
EYB                        0
STORIES                   44
SALEDATE               22513
PRICE                  48796
QUALIFIED                  0
SALE_NUM                   0
GBA                        0
BLDG_NUM                   0
STYLE                      0
STRUCT                     0
GRADE                      0
CNDTN                      0
EXTWALL                    0
ROOF                       0
INTWALL                    0
KITCHENS                   1
FIREPLACES                 0
USECODE                    0
LANDAREA                   0
GIS_LAST_MOD_DTTM          0
SOURCE                     0
CMPLX_NUM             106696
LIVING_GBA            106696
FULLADDRESS              656
CITY                     645
STATE         

## Pre-processing

Originally the data stores SALEDATE as a timestamp. Timestamps are usually painful to work with, and for the purposes of this exercise we only need to know the year of sale.

In [7]:
# SALEDATE conversion: extract the year as YR_SALE, then drop the raw timestamp.
# `assign` returns a new frame instead of writing a column into `data_df`,
# which at this point is a filtered selection of the loaded data; writing into
# it in place is what triggers pandas' SettingWithCopyWarning.
# Rows without a SALEDATE end up with a missing YR_SALE.
data_df = (
    data_df
    .assign(YR_SALE=pd.DatetimeIndex(data_df['SALEDATE']).year)
    .drop(columns='SALEDATE')
)
data_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,BATHRM,HF_BATHRM,HEAT,AC,NUM_UNITS,ROOMS,BEDRM,AYB,YR_RMDL,EYB,STORIES,PRICE,QUALIFIED,SALE_NUM,GBA,BLDG_NUM,STYLE,STRUCT,GRADE,CNDTN,EXTWALL,ROOF,INTWALL,KITCHENS,FIREPLACES,USECODE,LANDAREA,GIS_LAST_MOD_DTTM,SOURCE,CMPLX_NUM,LIVING_GBA,FULLADDRESS,CITY,STATE,ZIPCODE,NATIONALGRID,LATITUDE,LONGITUDE,ASSESSMENT_NBHD,ASSESSMENT_SUBNBHD,CENSUS_TRACT,CENSUS_BLOCK,WARD,SQUARE,X,Y,QUADRANT,YR_SALE
0,4,0,Warm Cool,Y,2.0,8,4,1910.0,1988.0,1972,3.0,1095000.0,Q,1,2522.0,1,3 Story,Row Inside,Very Good,Good,Common Brick,Metal- Sms,Hardwood,2.0,5,24,1680,2018-07-22 18:01:43,Residential,,,1748 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23061 09289,38.914680,-77.040832,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW,2003.0
1,3,1,Warm Cool,Y,2.0,11,5,1898.0,2007.0,1972,3.0,,U,1,2567.0,1,3 Story,Row Inside,Very Good,Good,Common Brick,Built Up,Hardwood,2.0,4,24,1680,2018-07-22 18:01:43,Residential,,,1746 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23067 09289,38.914683,-77.040764,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW,2000.0
2,3,1,Hot Water Rad,Y,2.0,9,5,1910.0,2009.0,1984,3.0,2100000.0,Q,3,2522.0,1,3 Story,Row Inside,Very Good,Very Good,Common Brick,Built Up,Hardwood,2.0,4,24,1680,2018-07-22 18:01:43,Residential,,,1744 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23074 09289,38.914684,-77.040678,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW,2016.0
3,3,1,Hot Water Rad,Y,2.0,8,5,1900.0,2003.0,1984,3.0,1602000.0,Q,1,2484.0,1,3 Story,Row Inside,Very Good,Good,Common Brick,Built Up,Hardwood,2.0,3,24,1680,2018-07-22 18:01:43,Residential,,,1742 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23078 09288,38.914683,-77.040629,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW,2006.0
4,2,1,Warm Cool,Y,1.0,11,3,1913.0,2012.0,1985,3.0,,U,1,5255.0,1,3 Story,Semi-Detached,Very Good,Good,Common Brick,Neopren,Hardwood,1.0,0,13,2032,2018-07-22 18:01:43,Residential,,,1804 NEW HAMPSHIRE AVENUE NW,WASHINGTON,DC,20009.0,18S UJ 23188 09253,38.914383,-77.039361,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106691,2,0,Forced Air,N,2.0,8,4,1953.0,,1962,2.0,,U,1,1600.0,1,2 Story,Multi,Average,Average,Common Brick,Built Up,Hardwood,2.0,0,23,6337,2018-07-22 18:01:43,Residential,,,123 JOLIET STREET SW,WASHINGTON,DC,20032.0,18S UH 25829 98897,38.821651,-77.006283,Congress Heights,016 A Congress Heights,10900.0,010900 2000,Ward 8,6254,-77.006347,38.821799,SW,
106692,2,0,Forced Air,N,2.0,10,5,1953.0,,1962,2.0,100000.0,U,1,1600.0,1,2 Story,Multi,Average,Average,Common Brick,Built Up,Hardwood,2.0,0,23,5348,2018-07-22 18:01:43,Residential,,,127 JOLIET STREET SW,WASHINGTON,DC,20032.0,18S UH 25818 98885,38.821534,-77.006407,Congress Heights,016 A Congress Heights,10900.0,010900 2000,Ward 8,6254,-77.006347,38.821799,SW,2012.0
106693,2,0,Forced Air,N,2.0,10,4,1953.0,,1953,2.0,,U,1,1600.0,1,2 Story,Multi,Average,Average,Common Brick,Built Up,Hardwood,2.0,0,23,3466,2018-07-22 18:01:43,Residential,,,131 JOLIET STREET SW,WASHINGTON,DC,20032.0,18S UH 25815 98879,38.821481,-77.006446,Congress Heights,016 A Congress Heights,10900.0,010900 2000,Ward 8,6254,-77.006347,38.821799,SW,2009.0
106694,2,0,Forced Air,N,2.0,10,4,1953.0,2017.0,1971,2.0,215000.0,U,4,1600.0,1,2 Story,Multi,Average,Good,Common Brick,Comp Shingle,Hardwood,2.0,0,23,3046,2018-07-22 18:01:43,Residential,,,135 JOLIET STREET SW,WASHINGTON,DC,20032.0,18S UH 25807 98865,38.821356,-77.006528,Congress Heights,016 A Congress Heights,10900.0,010900 2000,Ward 8,6254,-77.006347,38.821799,SW,2017.0


The other relevant value that we already looked at is condition. Let's take a look at the count of each value of it.

In [8]:
# How many rows carry each condition label
data_df.CNDTN.value_counts()

Average      58217
Good         37497
Very Good     8130
Excellent     1338
Fair          1320
Poor           175
Default         19
Name: CNDTN, dtype: int64

Given that it is not clear what "Default" means (compared to the other values), and that it is present in such a small number of rows, let's go ahead and delete those rows.

In [9]:
# Discard the rows whose condition is labelled 'Default'
is_default = data_df['CNDTN'] == 'Default'
data_df = data_df[~is_default]
data_df

Unnamed: 0,BATHRM,HF_BATHRM,HEAT,AC,NUM_UNITS,ROOMS,BEDRM,AYB,YR_RMDL,EYB,STORIES,PRICE,QUALIFIED,SALE_NUM,GBA,BLDG_NUM,STYLE,STRUCT,GRADE,CNDTN,EXTWALL,ROOF,INTWALL,KITCHENS,FIREPLACES,USECODE,LANDAREA,GIS_LAST_MOD_DTTM,SOURCE,CMPLX_NUM,LIVING_GBA,FULLADDRESS,CITY,STATE,ZIPCODE,NATIONALGRID,LATITUDE,LONGITUDE,ASSESSMENT_NBHD,ASSESSMENT_SUBNBHD,CENSUS_TRACT,CENSUS_BLOCK,WARD,SQUARE,X,Y,QUADRANT,YR_SALE
0,4,0,Warm Cool,Y,2.0,8,4,1910.0,1988.0,1972,3.0,1095000.0,Q,1,2522.0,1,3 Story,Row Inside,Very Good,Good,Common Brick,Metal- Sms,Hardwood,2.0,5,24,1680,2018-07-22 18:01:43,Residential,,,1748 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23061 09289,38.914680,-77.040832,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW,2003.0
1,3,1,Warm Cool,Y,2.0,11,5,1898.0,2007.0,1972,3.0,,U,1,2567.0,1,3 Story,Row Inside,Very Good,Good,Common Brick,Built Up,Hardwood,2.0,4,24,1680,2018-07-22 18:01:43,Residential,,,1746 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23067 09289,38.914683,-77.040764,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW,2000.0
2,3,1,Hot Water Rad,Y,2.0,9,5,1910.0,2009.0,1984,3.0,2100000.0,Q,3,2522.0,1,3 Story,Row Inside,Very Good,Very Good,Common Brick,Built Up,Hardwood,2.0,4,24,1680,2018-07-22 18:01:43,Residential,,,1744 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23074 09289,38.914684,-77.040678,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW,2016.0
3,3,1,Hot Water Rad,Y,2.0,8,5,1900.0,2003.0,1984,3.0,1602000.0,Q,1,2484.0,1,3 Story,Row Inside,Very Good,Good,Common Brick,Built Up,Hardwood,2.0,3,24,1680,2018-07-22 18:01:43,Residential,,,1742 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23078 09288,38.914683,-77.040629,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW,2006.0
4,2,1,Warm Cool,Y,1.0,11,3,1913.0,2012.0,1985,3.0,,U,1,5255.0,1,3 Story,Semi-Detached,Very Good,Good,Common Brick,Neopren,Hardwood,1.0,0,13,2032,2018-07-22 18:01:43,Residential,,,1804 NEW HAMPSHIRE AVENUE NW,WASHINGTON,DC,20009.0,18S UJ 23188 09253,38.914383,-77.039361,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106691,2,0,Forced Air,N,2.0,8,4,1953.0,,1962,2.0,,U,1,1600.0,1,2 Story,Multi,Average,Average,Common Brick,Built Up,Hardwood,2.0,0,23,6337,2018-07-22 18:01:43,Residential,,,123 JOLIET STREET SW,WASHINGTON,DC,20032.0,18S UH 25829 98897,38.821651,-77.006283,Congress Heights,016 A Congress Heights,10900.0,010900 2000,Ward 8,6254,-77.006347,38.821799,SW,
106692,2,0,Forced Air,N,2.0,10,5,1953.0,,1962,2.0,100000.0,U,1,1600.0,1,2 Story,Multi,Average,Average,Common Brick,Built Up,Hardwood,2.0,0,23,5348,2018-07-22 18:01:43,Residential,,,127 JOLIET STREET SW,WASHINGTON,DC,20032.0,18S UH 25818 98885,38.821534,-77.006407,Congress Heights,016 A Congress Heights,10900.0,010900 2000,Ward 8,6254,-77.006347,38.821799,SW,2012.0
106693,2,0,Forced Air,N,2.0,10,4,1953.0,,1953,2.0,,U,1,1600.0,1,2 Story,Multi,Average,Average,Common Brick,Built Up,Hardwood,2.0,0,23,3466,2018-07-22 18:01:43,Residential,,,131 JOLIET STREET SW,WASHINGTON,DC,20032.0,18S UH 25815 98879,38.821481,-77.006446,Congress Heights,016 A Congress Heights,10900.0,010900 2000,Ward 8,6254,-77.006347,38.821799,SW,2009.0
106694,2,0,Forced Air,N,2.0,10,4,1953.0,2017.0,1971,2.0,215000.0,U,4,1600.0,1,2 Story,Multi,Average,Good,Common Brick,Comp Shingle,Hardwood,2.0,0,23,3046,2018-07-22 18:01:43,Residential,,,135 JOLIET STREET SW,WASHINGTON,DC,20032.0,18S UH 25807 98865,38.821356,-77.006528,Congress Heights,016 A Congress Heights,10900.0,010900 2000,Ward 8,6254,-77.006347,38.821799,SW,2017.0


Finally, it will be kind of painful to work with a string value in our target variable. Therefore, let's encode it to a numerical value. 

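Note: The order of the encoding is important: the values are listed from worst (Poor) to best (Excellent), so that a larger code means a better condition.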

In [11]:
# Encode the CNDTN values as ordinal codes.
# The list runs from worst to best condition and each label is encoded as its
# position in the list (Poor -> 0.0 ... Excellent -> 5.0).
condition_values = ['Poor', 'Fair', 'Average', 'Good', 'Very Good', 'Excellent']

# Labels that are not in `condition_values` are encoded as -1 instead of raising
ordinal_encoder = OrdinalEncoder(categories=[condition_values], handle_unknown='use_encoded_value', unknown_value=-1)
encoded_data = pd.DataFrame(ordinal_encoder.fit_transform(data_df[['CNDTN']]), columns=['CNDTN'])

# `encoded_data` has a fresh 0..n-1 index, so the remaining columns get the same
# index before both are placed side by side; otherwise concat would align on
# mismatched labels. The encoded CNDTN ends up as the last column.
data_df = data_df.drop(columns='CNDTN').reset_index(drop=True)
data_df = pd.concat([data_df, encoded_data], axis=1)
data_df

Unnamed: 0,BATHRM,HF_BATHRM,HEAT,AC,NUM_UNITS,ROOMS,BEDRM,AYB,YR_RMDL,EYB,STORIES,PRICE,QUALIFIED,SALE_NUM,GBA,BLDG_NUM,STYLE,STRUCT,GRADE,EXTWALL,ROOF,INTWALL,KITCHENS,FIREPLACES,USECODE,LANDAREA,GIS_LAST_MOD_DTTM,SOURCE,CMPLX_NUM,LIVING_GBA,FULLADDRESS,CITY,STATE,ZIPCODE,NATIONALGRID,LATITUDE,LONGITUDE,ASSESSMENT_NBHD,ASSESSMENT_SUBNBHD,CENSUS_TRACT,CENSUS_BLOCK,WARD,SQUARE,X,Y,QUADRANT,YR_SALE,CNDTN
0,4,0,Warm Cool,Y,2.0,8,4,1910.0,1988.0,1972,3.0,1095000.0,Q,1,2522.0,1,3 Story,Row Inside,Very Good,Common Brick,Metal- Sms,Hardwood,2.0,5,24,1680,2018-07-22 18:01:43,Residential,,,1748 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23061 09289,38.914680,-77.040832,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW,2003.0,3.0
1,3,1,Warm Cool,Y,2.0,11,5,1898.0,2007.0,1972,3.0,,U,1,2567.0,1,3 Story,Row Inside,Very Good,Common Brick,Built Up,Hardwood,2.0,4,24,1680,2018-07-22 18:01:43,Residential,,,1746 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23067 09289,38.914683,-77.040764,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW,2000.0,3.0
2,3,1,Hot Water Rad,Y,2.0,9,5,1910.0,2009.0,1984,3.0,2100000.0,Q,3,2522.0,1,3 Story,Row Inside,Very Good,Common Brick,Built Up,Hardwood,2.0,4,24,1680,2018-07-22 18:01:43,Residential,,,1744 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23074 09289,38.914684,-77.040678,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW,2016.0,4.0
3,3,1,Hot Water Rad,Y,2.0,8,5,1900.0,2003.0,1984,3.0,1602000.0,Q,1,2484.0,1,3 Story,Row Inside,Very Good,Common Brick,Built Up,Hardwood,2.0,3,24,1680,2018-07-22 18:01:43,Residential,,,1742 SWANN STREET NW,WASHINGTON,DC,20009.0,18S UJ 23078 09288,38.914683,-77.040629,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW,2006.0,3.0
4,2,1,Warm Cool,Y,1.0,11,3,1913.0,2012.0,1985,3.0,,U,1,5255.0,1,3 Story,Semi-Detached,Very Good,Common Brick,Neopren,Hardwood,1.0,0,13,2032,2018-07-22 18:01:43,Residential,,,1804 NEW HAMPSHIRE AVENUE NW,WASHINGTON,DC,20009.0,18S UJ 23188 09253,38.914383,-77.039361,Old City 2,040 D Old City 2,4201.0,004201 2006,Ward 2,0152,-77.040429,38.914881,NW,,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106672,2,0,Forced Air,N,2.0,8,4,1953.0,,1962,2.0,,U,1,1600.0,1,2 Story,Multi,Average,Common Brick,Built Up,Hardwood,2.0,0,23,6337,2018-07-22 18:01:43,Residential,,,123 JOLIET STREET SW,WASHINGTON,DC,20032.0,18S UH 25829 98897,38.821651,-77.006283,Congress Heights,016 A Congress Heights,10900.0,010900 2000,Ward 8,6254,-77.006347,38.821799,SW,,2.0
106673,2,0,Forced Air,N,2.0,10,5,1953.0,,1962,2.0,100000.0,U,1,1600.0,1,2 Story,Multi,Average,Common Brick,Built Up,Hardwood,2.0,0,23,5348,2018-07-22 18:01:43,Residential,,,127 JOLIET STREET SW,WASHINGTON,DC,20032.0,18S UH 25818 98885,38.821534,-77.006407,Congress Heights,016 A Congress Heights,10900.0,010900 2000,Ward 8,6254,-77.006347,38.821799,SW,2012.0,2.0
106674,2,0,Forced Air,N,2.0,10,4,1953.0,,1953,2.0,,U,1,1600.0,1,2 Story,Multi,Average,Common Brick,Built Up,Hardwood,2.0,0,23,3466,2018-07-22 18:01:43,Residential,,,131 JOLIET STREET SW,WASHINGTON,DC,20032.0,18S UH 25815 98879,38.821481,-77.006446,Congress Heights,016 A Congress Heights,10900.0,010900 2000,Ward 8,6254,-77.006347,38.821799,SW,2009.0,2.0
106675,2,0,Forced Air,N,2.0,10,4,1953.0,2017.0,1971,2.0,215000.0,U,4,1600.0,1,2 Story,Multi,Average,Common Brick,Comp Shingle,Hardwood,2.0,0,23,3046,2018-07-22 18:01:43,Residential,,,135 JOLIET STREET SW,WASHINGTON,DC,20032.0,18S UH 25807 98865,38.821356,-77.006528,Congress Heights,016 A Congress Heights,10900.0,010900 2000,Ward 8,6254,-77.006347,38.821799,SW,2017.0,3.0


## Save Data

Finally, let's save our results so we can continue using them in the next notebook.

In [12]:
# Write the fixed frame as a zip-compressed CSV, using a fresh 0..n-1 index
output_df = data_df.reset_index(drop=True)
output_df.to_csv(output_data_path, compression='zip')