<b>Create Sidebar Navigation - best viewed in full screen</b>

In [1]:
%%javascript
$('<div id="toc"></div>').css({position: 'fixed', top: '120px', left: 0}).appendTo(document.body);
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js');

<IPython.core.display.Javascript object>

# Read In Data

In [2]:
# Import dependencies
import pandas as pd
import re
import numpy as np

In [3]:
# Source data: CSV converted from the .xlsx (Excel workbook) file published at
# https://www.wake.gov/departments-government/tax-administration/data-files-statistics-and-reports/real-estate-property-data-files
source_csv = "../resources/original/wake_county_residential_data_original.csv"

# Only these columns are loaded from the file
wanted_columns = [
    "Street_Number", "Street_Prefix", "Street_Name", "Street_Type", "Street_Suffix",
    "Planning_Jurisdiction", "Zoning", "Deeded_Acreage", "Assessed_Building_Value",
    "Assessed_Land_Value", "BILLING_CLASS", "Year_Built", "UNITS", "HEATED_AREA",
    "Year_of_Addition", "Remodeled_Year", "DESIGN_STYLE", "BATH", "PHYSICAL_CITY",
    "PHYSICAL_ZIP_CODE",
]

# Street_Suffix holds conflicting types, so it is read as a string to avoid an error.
# Every missing value, in every column, is then replaced by the placeholder "OTHER".
housing_df = pd.read_csv(
    source_csv,
    usecols=wanted_columns,
    dtype={"Street_Suffix": str},
).fillna("OTHER")

housing_df.head()

Unnamed: 0,Street_Number,Street_Prefix,Street_Name,Street_Type,Street_Suffix,Planning_Jurisdiction,Zoning,Deeded_Acreage,Assessed_Building_Value,Assessed_Land_Value,BILLING_CLASS,Year_Built,UNITS,HEATED_AREA,Year_of_Addition,Remodeled_Year,DESIGN_STYLE,BATH,PHYSICAL_CITY,PHYSICAL_ZIP_CODE
0,1506,OTHER,WAKE FOREST,RD,OTHER,RA,NX-3,0.32,0,112906,1,0,0,0,0,0,OTHER,OTHER,RALEIGH,27604.0
1,6012,OTHER,TRIANGLE,DR,OTHER,RA,IX-3,2.0,524503,392040,1,1979,0,4500,0,0,A,I,RALEIGH,27617.0
2,6012,OTHER,TRIANGLE,DR,OTHER,RA,IX-3,2.0,524503,392040,1,1989,0,4008,0,0,A,I,RALEIGH,27617.0
3,1601,OTHER,WAKE FOREST,RD,OTHER,RA,IX-3,1.38,374128,245076,1,1993,0,8500,0,0,A,I,RALEIGH,27604.0
4,1831,OTHER,CAPITAL,BLVD,OTHER,RA,IX-3,1.11,238511,578774,1,1968,0,3088,0,0,A,I,RALEIGH,27604.0


In [4]:
# Number of rows right after loading
housing_df.shape[0]

439238

## Combine and Clean Address Fields

In [5]:
# Build the site address from street number, prefix, name, type, and suffix
address_columns = ["Street_Number", "Street_Prefix", "Street_Name", "Street_Type", "Street_Suffix"]

# Blank out components that are exactly the "OTHER" placeholder (was NaN).
# Whole-cell matching is used instead of a regex over the joined string so that street
# names which merely contain those letters (e.g. "MOTHER", "BROTHERS") are left intact.
address_parts = housing_df[address_columns].astype(str).replace("OTHER", "")

# Join the components with single spaces, collapse the gaps left by blank components,
# and trim any leading/trailing space
housing_df["Address"] = (
    address_parts["Street_Number"]
    .str.cat(address_parts[address_columns[1:]], sep=" ")
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Drop street number, prefix, name, type, and suffix columns now that they are combined
housing_df = housing_df.drop(columns=address_columns)

housing_df.head()

Unnamed: 0,Planning_Jurisdiction,Zoning,Deeded_Acreage,Assessed_Building_Value,Assessed_Land_Value,BILLING_CLASS,Year_Built,UNITS,HEATED_AREA,Year_of_Addition,Remodeled_Year,DESIGN_STYLE,BATH,PHYSICAL_CITY,PHYSICAL_ZIP_CODE,Address
0,RA,NX-3,0.32,0,112906,1,0,0,0,0,0,OTHER,OTHER,RALEIGH,27604.0,1506 WAKE FOREST RD
1,RA,IX-3,2.0,524503,392040,1,1979,0,4500,0,0,A,I,RALEIGH,27617.0,6012 TRIANGLE DR
2,RA,IX-3,2.0,524503,392040,1,1989,0,4008,0,0,A,I,RALEIGH,27617.0,6012 TRIANGLE DR
3,RA,IX-3,1.38,374128,245076,1,1993,0,8500,0,0,A,I,RALEIGH,27604.0,1601 WAKE FOREST RD
4,RA,IX-3,1.11,238511,578774,1,1968,0,3088,0,0,A,I,RALEIGH,27604.0,1831 CAPITAL BLVD


# Narrow Down Dataset
Goal: single-family homes in Wake County

## Filter: Has City

In [6]:
# Discard rows whose city is the "OTHER" placeholder, i.e. locations without a city
has_city = housing_df["PHYSICAL_CITY"].ne("OTHER")
housing_df = housing_df[has_city]
housing_df.head()

Unnamed: 0,Planning_Jurisdiction,Zoning,Deeded_Acreage,Assessed_Building_Value,Assessed_Land_Value,BILLING_CLASS,Year_Built,UNITS,HEATED_AREA,Year_of_Addition,Remodeled_Year,DESIGN_STYLE,BATH,PHYSICAL_CITY,PHYSICAL_ZIP_CODE,Address
0,RA,NX-3,0.32,0,112906,1,0,0,0,0,0,OTHER,OTHER,RALEIGH,27604.0,1506 WAKE FOREST RD
1,RA,IX-3,2.0,524503,392040,1,1979,0,4500,0,0,A,I,RALEIGH,27617.0,6012 TRIANGLE DR
2,RA,IX-3,2.0,524503,392040,1,1989,0,4008,0,0,A,I,RALEIGH,27617.0,6012 TRIANGLE DR
3,RA,IX-3,1.38,374128,245076,1,1993,0,8500,0,0,A,I,RALEIGH,27604.0,1601 WAKE FOREST RD
4,RA,IX-3,1.11,238511,578774,1,1968,0,3088,0,0,A,I,RALEIGH,27604.0,1831 CAPITAL BLVD


In [7]:
# Number of rows left after the city filter
housing_df.shape[0]

436807

## Filter: Residential District

In [8]:
# Residential districts are identified by the prefix of the zoning code
residential_pattern = '^R|^LD|^MD|^HD|^MH|^MO'
is_residential = housing_df["Zoning"].str.contains(residential_pattern)

housing_df = housing_df.loc[is_residential]
housing_df.head()

Unnamed: 0,Planning_Jurisdiction,Zoning,Deeded_Acreage,Assessed_Building_Value,Assessed_Land_Value,BILLING_CLASS,Year_Built,UNITS,HEATED_AREA,Year_of_Addition,Remodeled_Year,DESIGN_STYLE,BATH,PHYSICAL_CITY,PHYSICAL_ZIP_CODE,Address
5,RA,R-10,13.83,0,394155,3,0,0,0,0,0,OTHER,OTHER,RALEIGH,27610.0,609 BAILEY DR
6,RA,RX-3,0.26,7960,509783,1,0,0,0,0,0,OTHER,OTHER,RALEIGH,27607.0,103 CHAMBERLAIN ST
8,RA,R-4,0.21,134321,115000,2,1964,1,1828,0,0,I,C,RALEIGH,27610.0,2457 BERTIE DR
9,RA,R-4,0.46,114933,45000,2,1970,1,1240,0,0,A,A,RALEIGH,27610.0,2848 PROVIDENCE RD
10,RA,R-4,0.43,132624,90000,2,1999,1,1037,0,0,A,C,RALEIGH,27606.0,409 S LAKESIDE DR


In [9]:
# Number of rows left after the zoning filter
housing_df.shape[0]

328436

## Filter: Owned by Individual(s)

In [10]:
# Billing class 2 marks property billed to individuals (as opposed to corporations, HOA, etc.)
individually_owned = housing_df["BILLING_CLASS"].eq(2)

# Apply the filter, then drop the billing class column, which is now constant
housing_df = housing_df[individually_owned].drop(columns="BILLING_CLASS")

housing_df.head()

Unnamed: 0,Planning_Jurisdiction,Zoning,Deeded_Acreage,Assessed_Building_Value,Assessed_Land_Value,Year_Built,UNITS,HEATED_AREA,Year_of_Addition,Remodeled_Year,DESIGN_STYLE,BATH,PHYSICAL_CITY,PHYSICAL_ZIP_CODE,Address
8,RA,R-4,0.21,134321,115000,1964,1,1828,0,0,I,C,RALEIGH,27610.0,2457 BERTIE DR
9,RA,R-4,0.46,114933,45000,1970,1,1240,0,0,A,A,RALEIGH,27610.0,2848 PROVIDENCE RD
10,RA,R-4,0.43,132624,90000,1999,1,1037,0,0,A,C,RALEIGH,27606.0,409 S LAKESIDE DR
11,WE,R3,0.46,118723,32000,1900,1,2261,0,0,A,C,WENDELL,27591.0,540 MARSHBURN RD
13,RA,R-4,0.76,0,202500,0,0,0,0,0,OTHER,OTHER,RALEIGH,27604.0,1612 BENNETT ST


In [11]:
# Number of rows left after the billing class filter
housing_df.shape[0]

276429

## Filter: Not Empty Lot

In [12]:
# A zero year built marks an empty lot (no house on it)
was_built = housing_df["Year_Built"] != 0

# A zero heated area means there is no valid square footage
has_heated_area = housing_df["HEATED_AREA"] != 0

# Keep only rows that satisfy both conditions
housing_df = housing_df.loc[was_built & has_heated_area]

housing_df.head()

Unnamed: 0,Planning_Jurisdiction,Zoning,Deeded_Acreage,Assessed_Building_Value,Assessed_Land_Value,Year_Built,UNITS,HEATED_AREA,Year_of_Addition,Remodeled_Year,DESIGN_STYLE,BATH,PHYSICAL_CITY,PHYSICAL_ZIP_CODE,Address
8,RA,R-4,0.21,134321,115000,1964,1,1828,0,0,I,C,RALEIGH,27610.0,2457 BERTIE DR
9,RA,R-4,0.46,114933,45000,1970,1,1240,0,0,A,A,RALEIGH,27610.0,2848 PROVIDENCE RD
10,RA,R-4,0.43,132624,90000,1999,1,1037,0,0,A,C,RALEIGH,27606.0,409 S LAKESIDE DR
11,WE,R3,0.46,118723,32000,1900,1,2261,0,0,A,C,WENDELL,27591.0,540 MARSHBURN RD
14,RA,R-2,0.96,358290,140000,1971,1,3770,0,0,A,J,RALEIGH,27613.0,8712 W LAKE CT


In [13]:
# Number of rows left after removing empty lots
housing_df.shape[0]

262670

## Filter: Single Unit

In [14]:
# Single-unit lots only (i.e. not apartments)
single_unit = housing_df["UNITS"].eq(1)

# Apply the filter, then drop the units column, which is now constant
housing_df = housing_df[single_unit].drop(columns=["UNITS"])

housing_df.head()

Unnamed: 0,Planning_Jurisdiction,Zoning,Deeded_Acreage,Assessed_Building_Value,Assessed_Land_Value,Year_Built,HEATED_AREA,Year_of_Addition,Remodeled_Year,DESIGN_STYLE,BATH,PHYSICAL_CITY,PHYSICAL_ZIP_CODE,Address
8,RA,R-4,0.21,134321,115000,1964,1828,0,0,I,C,RALEIGH,27610.0,2457 BERTIE DR
9,RA,R-4,0.46,114933,45000,1970,1240,0,0,A,A,RALEIGH,27610.0,2848 PROVIDENCE RD
10,RA,R-4,0.43,132624,90000,1999,1037,0,0,A,C,RALEIGH,27606.0,409 S LAKESIDE DR
11,WE,R3,0.46,118723,32000,1900,2261,0,0,A,C,WENDELL,27591.0,540 MARSHBURN RD
14,RA,R-2,0.96,358290,140000,1971,3770,0,0,A,J,RALEIGH,27613.0,8712 W LAKE CT


In [15]:
# Number of rows left after the single-unit filter
housing_df.shape[0]

260250

# Alter Desired Fields

## Create: House Age

In [16]:
# House age in years, measured against a fixed reference year
reference_year = 2023
housing_df["Age"] = reference_year - housing_df["Year_Built"]

# Age replaces the year built column
housing_df = housing_df.drop(columns="Year_Built")

housing_df.head()

Unnamed: 0,Planning_Jurisdiction,Zoning,Deeded_Acreage,Assessed_Building_Value,Assessed_Land_Value,HEATED_AREA,Year_of_Addition,Remodeled_Year,DESIGN_STYLE,BATH,PHYSICAL_CITY,PHYSICAL_ZIP_CODE,Address,Age
8,RA,R-4,0.21,134321,115000,1828,0,0,I,C,RALEIGH,27610.0,2457 BERTIE DR,59
9,RA,R-4,0.46,114933,45000,1240,0,0,A,A,RALEIGH,27610.0,2848 PROVIDENCE RD,53
10,RA,R-4,0.43,132624,90000,1037,0,0,A,C,RALEIGH,27606.0,409 S LAKESIDE DR,24
11,WE,R3,0.46,118723,32000,2261,0,0,A,C,WENDELL,27591.0,540 MARSHBURN RD,123
14,RA,R-2,0.96,358290,140000,3770,0,0,A,J,RALEIGH,27613.0,8712 W LAKE CT,52


## Create: Total Value

In [17]:
# Convert the assessed building and land values to integers.
# Any "," separators are stripped first; the stripping is limited to these two columns
# so that text in the other columns (e.g. Address, Zoning) is never modified.
value_columns = ["Assessed_Building_Value", "Assessed_Land_Value"]
housing_df = housing_df.assign(**{
    column: pd.to_numeric(
        housing_df[column].astype(str).str.replace(",", "", regex=False)
    ).astype("int")
    for column in value_columns
})

# Create total value column to combine house and land values
housing_df["Total_Value"] = housing_df["Assessed_Building_Value"] + housing_df["Assessed_Land_Value"]

# Drop assessed value columns now that they are combined
housing_df = housing_df.drop(columns=value_columns)

housing_df.head()

Unnamed: 0,Planning_Jurisdiction,Zoning,Deeded_Acreage,HEATED_AREA,Year_of_Addition,Remodeled_Year,DESIGN_STYLE,BATH,PHYSICAL_CITY,PHYSICAL_ZIP_CODE,Address,Age,Total_Value
8,RA,R-4,0.21,1828,0,0,I,C,RALEIGH,27610.0,2457 BERTIE DR,59,249321
9,RA,R-4,0.46,1240,0,0,A,A,RALEIGH,27610.0,2848 PROVIDENCE RD,53,159933
10,RA,R-4,0.43,1037,0,0,A,C,RALEIGH,27606.0,409 S LAKESIDE DR,24,222624
11,WE,R3,0.46,2261,0,0,A,C,WENDELL,27591.0,540 MARSHBURN RD,123,150723
14,RA,R-2,0.96,3770,0,0,A,J,RALEIGH,27613.0,8712 W LAKE CT,52,498290


## Create: Remodel/Addition (Boolean)

In [18]:
# Flag houses that record an addition year or a remodel year: 1 if either is non-zero, else 0
has_addition = housing_df["Year_of_Addition"] != 0
was_remodeled = housing_df["Remodeled_Year"] != 0
housing_df["Remodel_Addition"] = np.where(has_addition | was_remodeled, 1, 0)

# The flag replaces the two year columns
housing_df = housing_df.drop(columns=["Year_of_Addition", "Remodeled_Year"])

housing_df.head()

Unnamed: 0,Planning_Jurisdiction,Zoning,Deeded_Acreage,HEATED_AREA,DESIGN_STYLE,BATH,PHYSICAL_CITY,PHYSICAL_ZIP_CODE,Address,Age,Total_Value,Remodel_Addition
8,RA,R-4,0.21,1828,I,C,RALEIGH,27610.0,2457 BERTIE DR,59,249321,0
9,RA,R-4,0.46,1240,A,A,RALEIGH,27610.0,2848 PROVIDENCE RD,53,159933,0
10,RA,R-4,0.43,1037,A,C,RALEIGH,27606.0,409 S LAKESIDE DR,24,222624,0
11,WE,R3,0.46,2261,A,C,WENDELL,27591.0,540 MARSHBURN RD,123,150723,0
14,RA,R-2,0.96,3770,A,J,RALEIGH,27613.0,8712 W LAKE CT,52,498290,0


## Alter: Planning Jurisdictions to Integer

In [19]:
# Frequency of each planning jurisdiction code present in the data
jurisdiction_column = housing_df["Planning_Jurisdiction"]
jurisdiction_column.value_counts()

RA    107843
WC     48947
CA     38979
FV     14712
AP     13948
GA     10217
HS      8576
MO      5831
RO      3704
WE      2959
ZB      2506
WF      1182
KN       488
AN       338
DU        20
Name: Planning_Jurisdiction, dtype: int64

In [20]:
# Encode planning jurisdiction codes as integers for easier analysis and machine learning
jurisdiction_codes = {
    "AN": 0, "AP": 1, "CA": 2, "DU": 3, "FV": 4, "GA": 5, "HS": 6, "KN": 7,
    "MO": 8, "RA": 9, "RD": 10, "RO": 11, "WC": 12, "WE": 13, "WF": 14,
}

# Any jurisdiction missing from the table (e.g. "ZB") falls into this catch-all code
unmapped_jurisdiction = 15

# map() yields NaN for codes not in the table; fill those with the catch-all, then cast to int
housing_df["Planning_Jurisdiction"] = (
    housing_df["Planning_Jurisdiction"]
    .map(jurisdiction_codes)
    .fillna(unmapped_jurisdiction)
    .astype("int")
)

housing_df.head()

Unnamed: 0,Planning_Jurisdiction,Zoning,Deeded_Acreage,HEATED_AREA,DESIGN_STYLE,BATH,PHYSICAL_CITY,PHYSICAL_ZIP_CODE,Address,Age,Total_Value,Remodel_Addition
8,9,R-4,0.21,1828,I,C,RALEIGH,27610.0,2457 BERTIE DR,59,249321,0
9,9,R-4,0.46,1240,A,A,RALEIGH,27610.0,2848 PROVIDENCE RD,53,159933,0
10,9,R-4,0.43,1037,A,C,RALEIGH,27606.0,409 S LAKESIDE DR,24,222624,0
11,13,R3,0.46,2261,A,C,WENDELL,27591.0,540 MARSHBURN RD,123,150723,0
14,9,R-2,0.96,3770,A,J,RALEIGH,27613.0,8712 W LAKE CT,52,498290,0


## Alter: Design Styles to Integer

In [21]:
# Frequency of each design style code present in the data
design_style_column = housing_df["DESIGN_STYLE"]
design_style_column.value_counts()

A        198991
C         31877
D          7629
G          7623
I          7119
N          3361
J          2062
K           699
M           390
O           299
L           128
F            33
B            22
H             9
E             7
OTHER         1
Name: DESIGN_STYLE, dtype: int64

In [22]:
# Encode design style letters as integers for easier analysis and machine learning:
# "A" -> 0, "B" -> 1, ... "O" -> 14
style_codes = {letter: code for code, letter in enumerate("ABCDEFGHIJKLMNO")}

# Anything else (e.g. the "OTHER" placeholder for a missing style) gets this sentinel
missing_style = 15

# map() yields NaN for values not in the table; fill those with the sentinel, then cast to int
housing_df["DESIGN_STYLE"] = (
    housing_df["DESIGN_STYLE"]
    .map(style_codes)
    .fillna(missing_style)
    .astype("int")
)

# Drop houses without a design style (set to the sentinel)
housing_df = housing_df.loc[housing_df["DESIGN_STYLE"] != missing_style]

housing_df.head()

Unnamed: 0,Planning_Jurisdiction,Zoning,Deeded_Acreage,HEATED_AREA,DESIGN_STYLE,BATH,PHYSICAL_CITY,PHYSICAL_ZIP_CODE,Address,Age,Total_Value,Remodel_Addition
8,9,R-4,0.21,1828,8,C,RALEIGH,27610.0,2457 BERTIE DR,59,249321,0
9,9,R-4,0.46,1240,0,A,RALEIGH,27610.0,2848 PROVIDENCE RD,53,159933,0
10,9,R-4,0.43,1037,0,C,RALEIGH,27606.0,409 S LAKESIDE DR,24,222624,0
11,13,R3,0.46,2261,0,C,WENDELL,27591.0,540 MARSHBURN RD,123,150723,0
14,9,R-2,0.96,3770,0,J,RALEIGH,27613.0,8712 W LAKE CT,52,498290,0


In [23]:
# Number of rows left after dropping houses without a design style
housing_df.shape[0]

260249

## Alter: Bath to Float

In [24]:
# Frequency of each bath code present in the data
bath_column = housing_df["BATH"]
bath_column.value_counts()

D        100030
C         53039
F         46923
E         24845
J         13322
A         12110
B          9872
H            73
I            21
OTHER        13
G             1
Name: BATH, dtype: int64

In [25]:
# Translate bath letter codes into a number of baths, stored as float
# NOTE: the original code F means 3.5+; there will be some discrepancies in the data analysis because of this
bath_counts = {"A": 1.0, "B": 1.5, "C": 2.0, "D": 2.5, "E": 3.0, "F": 3.5}

# Codes with no bath count (G = limited plumbing; H = no plumbing; I = adequate; J = no fixtures;
# OTHER = unknown) are set to this value and removed below
no_bath_count = 0

# map() yields NaN for codes not in the table; fill those with 0, then cast to float
housing_df["BATH"] = (
    housing_df["BATH"]
    .map(bath_counts)
    .fillna(no_bath_count)
    .astype("float")
)

# Remove houses whose bath code has no count
housing_df = housing_df.loc[housing_df["BATH"] != no_bath_count]

housing_df.head()

Unnamed: 0,Planning_Jurisdiction,Zoning,Deeded_Acreage,HEATED_AREA,DESIGN_STYLE,BATH,PHYSICAL_CITY,PHYSICAL_ZIP_CODE,Address,Age,Total_Value,Remodel_Addition
8,9,R-4,0.21,1828,8,2.0,RALEIGH,27610.0,2457 BERTIE DR,59,249321,0
9,9,R-4,0.46,1240,0,1.0,RALEIGH,27610.0,2848 PROVIDENCE RD,53,159933,0
10,9,R-4,0.43,1037,0,2.0,RALEIGH,27606.0,409 S LAKESIDE DR,24,222624,0
11,13,R3,0.46,2261,0,2.0,WENDELL,27591.0,540 MARSHBURN RD,123,150723,0
15,5,R2,0.51,996,0,1.0,RALEIGH,27603.0,605 WOODLAND RD,67,140801,0


In [26]:
# Number of rows left after removing houses without a bath count
housing_df.shape[0]

246819

## Alter: Zip Code to Integer

In [27]:
# Rows without a zip code carry the "OTHER" placeholder; exclude them
has_zip = housing_df["PHYSICAL_ZIP_CODE"].ne("OTHER")
housing_df = housing_df[has_zip]

# Store the remaining zip codes as integers
housing_df = housing_df.astype({"PHYSICAL_ZIP_CODE": "int"})

housing_df.head()

Unnamed: 0,Planning_Jurisdiction,Zoning,Deeded_Acreage,HEATED_AREA,DESIGN_STYLE,BATH,PHYSICAL_CITY,PHYSICAL_ZIP_CODE,Address,Age,Total_Value,Remodel_Addition
8,9,R-4,0.21,1828,8,2.0,RALEIGH,27610,2457 BERTIE DR,59,249321,0
9,9,R-4,0.46,1240,0,1.0,RALEIGH,27610,2848 PROVIDENCE RD,53,159933,0
10,9,R-4,0.43,1037,0,2.0,RALEIGH,27606,409 S LAKESIDE DR,24,222624,0
11,13,R3,0.46,2261,0,2.0,WENDELL,27591,540 MARSHBURN RD,123,150723,0
15,5,R2,0.51,996,0,1.0,RALEIGH,27603,605 WOODLAND RD,67,140801,0


In [28]:
# Number of rows left after removing houses without a zip code
housing_df.shape[0]

246770

# Clean DataFrame

## Reorder and Rename Columns

In [29]:
# Final column order
column_order = [
    "Total_Value", "Address", "PHYSICAL_CITY", "PHYSICAL_ZIP_CODE", "Planning_Jurisdiction",
    "Zoning", "Deeded_Acreage", "HEATED_AREA", "Age", "BATH", "Remodel_Addition", "DESIGN_STYLE",
]

# Shorter names for the source columns
new_names = {
    "PHYSICAL_CITY": "City",
    "PHYSICAL_ZIP_CODE": "Zip_Code",
    "Deeded_Acreage": "Acreage",
    "HEATED_AREA": "Sqft",
    "BATH": "Bath",
    "DESIGN_STYLE": "Style",
}

# Reorder, rename, and renumber the rows from 0 after all the filtering above
housing_df = (
    housing_df[column_order]
    .rename(columns=new_names)
    .reset_index(drop=True)
)

housing_df.head()

Unnamed: 0,Total_Value,Address,City,Zip_Code,Planning_Jurisdiction,Zoning,Acreage,Sqft,Age,Bath,Remodel_Addition,Style
0,249321,2457 BERTIE DR,RALEIGH,27610,9,R-4,0.21,1828,59,2.0,0,8
1,159933,2848 PROVIDENCE RD,RALEIGH,27610,9,R-4,0.46,1240,53,1.0,0,0
2,222624,409 S LAKESIDE DR,RALEIGH,27606,9,R-4,0.43,1037,24,2.0,0,0
3,150723,540 MARSHBURN RD,WENDELL,27591,13,R3,0.46,2261,123,2.0,0,0
4,140801,605 WOODLAND RD,RALEIGH,27603,5,R2,0.51,996,67,1.0,0,0


## Check For and Drop Duplicates

In [30]:
# Compare the frame's shape with the number of distinct addresses;
# more rows than distinct addresses means some addresses are repeated
frame_shape = housing_df.shape
distinct_addresses = housing_df["Address"].nunique()
print(frame_shape)
print(distinct_addresses)

(246770, 12)
241903


In [31]:
# Keep the first row for each address and discard later repeats.
# NOTE(review): the key is Address alone, so identical street addresses in different
# cities or zip codes would also be collapsed — confirm this is intended.
housing_df = housing_df.drop_duplicates(subset=["Address"], keep="first")

In [32]:
# After de-duplication the row count should equal the number of distinct addresses
frame_shape = housing_df.shape
distinct_addresses = housing_df["Address"].nunique()
print(frame_shape)
print(distinct_addresses)

(241903, 12)
241903


# Export to CSV

In [33]:
# Write the cleaned data to a CSV file, leaving out the row index
clean_csv_path = "../resources/in_progress/housing_data_clean.csv"
housing_df.to_csv(clean_csv_path, index=False)