In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)

pd.options.mode.chained_assignment = None



# Now we need to read in the data
cleaning_df = pd.read_csv('CLEANING_SITE_ADDRE.csv')


In [2]:
# Preview the raw frame as loaded from the CSV (pandas elides the middle rows).
cleaning_df

Unnamed: 0,Street_Number,Street_Prefix,Street_Name,Street_Type,Street_Suffix,CITY,ZONING,SALE_PRICE,SALE_DATE,BILLING_CLASS,YEAR_BUILT,UNITS,DESIGN_STYLE,CITY.1,ZIP_CODE,TYPE_AND_USE,SITE_ADDRE
0,1506,,WAKE FOREST,RD,,RAL,NX-3,0,,1,0,0,,RALEIGH,27604.0,,1506 WAKE FOREST RD
1,6012,,TRIANGLE,DR,,RAL,IX-3,740000,3/5/2012,1,1979,0,A,RALEIGH,27617.0,20.0,6012 TRIANGLE DR
2,6012,,TRIANGLE,DR,,RAL,IX-3,740000,3/5/2012,1,1989,0,A,RALEIGH,27617.0,34.0,6012 TRIANGLE DR
3,1601,,WAKE FOREST,RD,,RAL,IX-3,185000,10/8/1984,1,1993,0,A,RALEIGH,27604.0,85.0,1601 WAKE FOREST RD
4,1831,,CAPITAL,BLVD,,RAL,IX-3,750000,6/17/2016,1,1968,0,A,RALEIGH,27604.0,200.0,1831 CAPITAL BLVD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
439031,0,,UNKNOWN ADDRESS,DR,,,UK,0,,6,0,0,,,0.0,,0 UNKNOWN ADDRESS DR
439032,0,,VARIOUS,DR,,ANG,UK,0,,6,0,0,,ANGIER,0.0,,0 VARIOUS DR
439033,0,,UNKNOWN ADDRESS,DR,,ANG,UK,0,,6,0,0,,ANGIER,0.0,,0 UNKNOWN ADDRESS DR
439034,0,,VARIOUS MSD,DR,,MOR,UK,0,,6,0,0,,MORRISVILLE,,,0 VARIOUS MSD DR


In [3]:
# Remove rows whose Street_Name contains VARIOUS or UNKNOWN (case-insensitive),
# e.g. "UNKNOWN ADDRESS" or "VARIOUS MSD".
# na=False makes a missing Street_Name count as "no match": the mask stays
# strictly boolean, so `~` cannot fail on NaN entries, and such rows are kept.
street_name_is_placeholder = cleaning_df['Street_Name'].str.contains(
    'VARIOUS|UNKNOWN', case=False, na=False
)

# .copy() yields an independent frame so that later column assignments on
# cleaning_rows_df are unambiguous and never touch cleaning_df.
cleaning_rows_df = cleaning_df[~street_name_is_placeholder].copy()

In [4]:
# Convert ZIP_CODE to an integer column: values that cannot be parsed as numbers
# are coerced to NaN, and every NaN is then replaced by 0 before the int cast.
# .assign returns a new frame rather than writing a column into the filtered
# frame, so this step does not depend on the chained-assignment warning being
# silenced.
cleaning_rows_df = cleaning_rows_df.assign(
    ZIP_CODE=pd.to_numeric(cleaning_rows_df['ZIP_CODE'], errors='coerce').fillna(0).astype(int)
)

In [5]:
# Inspect the frame after the street-name filter and the ZIP_CODE conversion.
cleaning_rows_df

Unnamed: 0,Street_Number,Street_Prefix,Street_Name,Street_Type,Street_Suffix,CITY,ZONING,SALE_PRICE,SALE_DATE,BILLING_CLASS,YEAR_BUILT,UNITS,DESIGN_STYLE,CITY.1,ZIP_CODE,TYPE_AND_USE,SITE_ADDRE
0,1506,,WAKE FOREST,RD,,RAL,NX-3,0,,1,0,0,,RALEIGH,27604,,1506 WAKE FOREST RD
1,6012,,TRIANGLE,DR,,RAL,IX-3,740000,3/5/2012,1,1979,0,A,RALEIGH,27617,20.0,6012 TRIANGLE DR
2,6012,,TRIANGLE,DR,,RAL,IX-3,740000,3/5/2012,1,1989,0,A,RALEIGH,27617,34.0,6012 TRIANGLE DR
3,1601,,WAKE FOREST,RD,,RAL,IX-3,185000,10/8/1984,1,1993,0,A,RALEIGH,27604,85.0,1601 WAKE FOREST RD
4,1831,,CAPITAL,BLVD,,RAL,IX-3,750000,6/17/2016,1,1968,0,A,RALEIGH,27604,200.0,1831 CAPITAL BLVD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438994,0,W,GANNON,AVE,,ZEB,,0,,1,0,0,,ZEBULON,27597,,0 W GANNON AVE
438995,316,,FAYETTEVILLE,ST,,RAL,BUS,25000,4/5/2008,6,1995,0,A,RALEIGH,27601,83.0,316 FAYETTEVILLE ST
438996,316,,FAYETTEVILLE,ST,,RAL,BUS,25000,4/5/2008,6,1985,4,A,RALEIGH,27601,59.0,316 FAYETTEVILLE ST
438997,224,,CHAPEL HILL,RD,,,GC,25000000,11/11/2014,6,2019,20,A,WAKE FOREST,27587,7.0,224 CHAPEL HILL RD


In [6]:
# Keep only the records whose BILLING_CLASS equals 2.
# NOTE(review): class 2 presumably marks residential parcels, given the name of
# the final output — confirm against the county's billing-class code list.
is_billing_class_2 = cleaning_rows_df['BILLING_CLASS'].eq(2)
cleaning_rows_df2 = cleaning_rows_df[is_billing_class_2]

In [7]:
# Remove the abbreviated CITY code, TYPE_AND_USE, and the individual street
# component columns; the combined SITE_ADDRE column is kept.
cleaning_rows_df2 = cleaning_rows_df2.drop(
    labels=[
        'CITY',
        'TYPE_AND_USE',
        'Street_Number',
        'Street_Prefix',
        'Street_Name',
        'Street_Type',
        'Street_Suffix',
    ],
    axis='columns',
)

In [9]:
# CITY.1 holds the full city name; give it the plain CITY label now that the
# abbreviated CITY column has been dropped.
cleaning_rows_df2 = cleaning_rows_df2.rename({'CITY.1': 'CITY'}, axis='columns')

In [10]:
# Inspect the BILLING_CLASS == 2 subset after dropping and renaming columns.
cleaning_rows_df2

Unnamed: 0,ZONING,SALE_PRICE,SALE_DATE,BILLING_CLASS,YEAR_BUILT,UNITS,DESIGN_STYLE,CITY,ZIP_CODE,SITE_ADDRE
8,R-4,34500,1/1/1974,2,1964,1,I,RALEIGH,27610,2457 BERTIE DR
9,R-4,35500,5/18/1983,2,1970,1,A,RALEIGH,27610,2848 PROVIDENCE RD
10,R-4,0,,2,1999,1,A,RALEIGH,27606,409 S LAKESIDE DR
11,R3,37500,9/16/2004,2,1900,1,A,WENDELL,27591,540 MARSHBURN RD
13,R-4,5000,1/1/1971,2,0,0,,RALEIGH,27604,1612 BENNETT ST
...,...,...,...,...,...,...,...,...,...,...
438876,,0,,2,0,0,,YOUNGSVILLE,27596,5829 JACK JONES RD
438878,,0,,2,0,0,,RALEIGH,27607,2408 EVERETT AVE
438879,,0,,2,0,0,,RALEIGH,27607,2406 EVERETT AVE
438888,,0,,2,0,0,,RALEIGH,27607,3302 HALL PL


In [11]:
# Final column layout: address and location first, then zoning, sale, and
# building attributes.
output_columns = [
    'SITE_ADDRE',
    'CITY',
    'ZIP_CODE',
    'ZONING',
    'SALE_PRICE',
    'SALE_DATE',
    'BILLING_CLASS',
    'YEAR_BUILT',
    'UNITS',
    'DESIGN_STYLE',
]
WAKE_COUNTY_HOUSING_DATA = cleaning_rows_df2.loc[:, output_columns]

In [12]:
# Inspect the final, reordered frame before it is written to disk.
WAKE_COUNTY_HOUSING_DATA

Unnamed: 0,SITE_ADDRE,CITY,ZIP_CODE,ZONING,SALE_PRICE,SALE_DATE,BILLING_CLASS,YEAR_BUILT,UNITS,DESIGN_STYLE
8,2457 BERTIE DR,RALEIGH,27610,R-4,34500,1/1/1974,2,1964,1,I
9,2848 PROVIDENCE RD,RALEIGH,27610,R-4,35500,5/18/1983,2,1970,1,A
10,409 S LAKESIDE DR,RALEIGH,27606,R-4,0,,2,1999,1,A
11,540 MARSHBURN RD,WENDELL,27591,R3,37500,9/16/2004,2,1900,1,A
13,1612 BENNETT ST,RALEIGH,27604,R-4,5000,1/1/1971,2,0,0,
...,...,...,...,...,...,...,...,...,...,...
438876,5829 JACK JONES RD,YOUNGSVILLE,27596,,0,,2,0,0,
438878,2408 EVERETT AVE,RALEIGH,27607,,0,,2,0,0,
438879,2406 EVERETT AVE,RALEIGH,27607,,0,,2,0,0,
438888,3302 HALL PL,RALEIGH,27607,,0,,2,0,0,


In [13]:
# Write the result to CSV in the working directory. The DataFrame index is
# written as the first, unnamed column (index=True is the default, spelled out
# here to make that explicit).
WAKE_COUNTY_HOUSING_DATA.to_csv('WAKE_COUNTY_HOUSING_DATA.csv', index=True)
