## Summary of data
## Properties from San Jose, CA
## 6099 condos
## 92 different neighborhoods

In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.path as path

from string import punctuation

In [2]:
def make_histogram(data, bins):
    fig, ax = plt.subplots()

    # histogram our data with numpy
    n, bins = np.histogram(data, bins)

    # get the corners of the rectangles for the histogram
    left = np.array(bins[:-1])
    right = np.array(bins[1:])
    bottom = np.zeros(len(left))
    top = bottom + n


    # we need a (numrects x numsides x 2) numpy array for the path helper
    # function to build a compound path
    XY = np.array([[left, left, right, right], [bottom, top, top, bottom]]).T

    # get the Path object
    barpath = path.Path.make_compound_path_from_polys(XY)

    # make a patch out of it
    patch = patches.PathPatch(barpath)
    ax.add_patch(patch)

    # update the view limits
    ax.set_xlim(left[0], right[-1])
    ax.set_ylim(bottom.min(), top.max())

    plt.show()

In [43]:
# convert string percentages to numerical
def convert_percentage(df):
    for col in df.columns:
        c = col.split(" ")
        if c[-1] == "MoM" or c[-1] == "YoY": df[col] = [float(str(x).strip("%")) for x in df[col]]
    
    df['Average Sale To List'] = [float(str(x).strip("%")) for x in df['Average Sale To List']]

In [4]:
### read in data
def read_data(filename, property_type):
    df = pd.read_csv(filename)
    df.columns = [x.strip(" ") for x in df.columns]
    df['property_type'] = property_type
    df.head()
    return (df)

In [5]:
### add columns for city, state, and neighborhood
def parse_region(df):
    df['location'] = [x.split(",") for x in df.Region]
    df['city'] = [x[0].strip(",").lower() for x in df.location]
    df['state'] = [x[1].split(" ")[1].lower().strip() for x in df.location]
    df['neighborhood'] = [x[1].split(" ")[3].strip().lower() for x in df.location]
    df = df[df.city == 'san jose']
    df = df.drop('location', 1)
    df.head()

In [6]:
### convert Median Sale Price to a float
def convert_med_sale_pr(df):
    df[u'Median Sale Price'] = [x.strip("$").replace(",", "") for x in df["Median Sale Price"]]
    df[u'Median Sale Price'] = [int(x) if x[len(x)-1] != "K" else int(x.strip("K"))*1000 
                                for x in df["Median Sale Price"]]

In [44]:
### given the filename and property type return
### a dataframe with string fields converted to
### numerical and regions parsed out
def clean_data(filename, property_type):
    df = read_data(filename, property_type)
    convert_med_sale_pr(df)
    parse_region(df)
    convert_percentage(df)
    return (df)

In [37]:
# merge several dataframes together
# files = list of tuples, 
# each tuple should contain filename first then property type

# How to deal with NA's? fill with 0 for now ...
def merge_data(files):
    frames = []
    for f in files:
        frames.append(clean_data(f[0], f[1]))
    return (pd.concat(frames).fillna(0))

In [51]:
### convert neighborhood to a numerical value
def convert_neighborhood(df):
    neighborhoods = df.neighborhood.unique()
    df['neighborhood_num'] = [np.nonzero(df.neighborhood == x)[0][0] for x in df.neighborhood]

In [52]:
files = [('sanjose_condos.csv', 'condo'), ('sanjose_townhouse2.csv', 'townhouse'),
        ('sanjose_multiunit.csv', 'multiunit'), ("sanjose_singlefamily.csv", "singlefamily")]
total = merge_data(files)
convert_neighborhood(total)

In [40]:
len(total.index)

15419

In [41]:
neighborhoods = np.unique(total['neighborhood'])
print (neighborhoods, len(neighborhoods)) # 92 neighborhoods

(array(['alexander', 'alma-almaden', 'almaden', 'anderson', 'area',
       'atlanta', 'barbera-stokes', 'berryessa', 'blackford',
       'bonita-24th', 'branham-jarvis', 'branham-kirk', 'brigadoon',
       'broadway-palmhaven', 'brooktree', 'bucknall', 'buena', 'calabazas',
       'cambrian', 'cambrian-pioneer', 'carlton', 'carson',
       'cedarville-giannotta', 'central', 'chaboya', 'cherrywood',
       'citaldi', 'clayton', 'commodore', 'crossgate', 'cypress-jurdo',
       'de', 'deer', 'del', 'dovehill', 'downtown', 'east', 'eden',
       'erikson', 'estates', 'evergreen', 'everydale-neimen', 'flickinger',
       'fowler', 'gilchrist', 'great', 'greylands', 'hammer', 'hayes',
       'heritage', 'hidden', 'hillsdale', 'hillview', 'holy', 'joaquin',
       'kenwood', 'king', 'kooser', 'lanai-cunningham', 'loma', 'lone',
       'los', 'lynbrook', 'lynhaven', 'mayfair', 'mckay-ringwood',
       'meadow', 'meadows', 'millic-phelps', 'miner', 'mirassou',
       'morrill', 'mount', 'muril

In [42]:
total.describe() # descriptive statistics

Unnamed: 0,Median Sale Price,Median Sale Price MoM,Median Sale Price YoY,Homes Sold,Homes Sold MoM,Homes Sold YoY,New Listings,New Listings MoM,New Listings YoY,Inventory,Inventory MoM,Inventory YoY,Days on Market,Days on Market MoM,Days on Market YoY,Average Sale To List,Average Sale To List MoM,Average Sale To List YoY
count,15419.0,15419.0,15419.0,15419.0,15419.0,15419.0,15419.0,15419.0,15419.0,15419.0,15419.0,15419.0,15419.0,15419.0,15419.0,15419.0,15419.0,15419.0
mean,678097.0,1.441598,13.424645,108.108113,7.759433,18.569492,135.838641,6.04178,12.451365,317.767949,11.687794,10.796414,26.702056,-0.146702,-3.628964,103.11002,0.039043,0.771289
std,296352.0,10.651508,21.476353,646.383152,47.591472,92.052994,837.190488,45.285962,80.893613,2164.869063,65.519173,91.913587,27.307119,20.172737,35.939966,4.922048,2.6155,5.076339
min,97000.0,-61.5,-65.5,1.0,-85.7,-92.9,0.0,-87.5,-92.9,0.0,-87.5,-94.7,0.0,-435.0,-548.0,0.0,-23.1,-42.5
25%,460000.0,-1.6,0.0,3.0,-16.7,-28.6,3.0,-18.2,-28.6,1.0,-12.85,-33.3,11.0,-2.0,-10.0,100.3,-0.9,-1.6
50%,633000.0,0.0,10.4,6.0,0.0,0.0,6.0,0.0,0.0,2.0,0.0,0.0,17.0,0.0,0.0,102.5,0.0,0.4
75%,830000.0,3.5,21.3,10.0,20.0,33.3,11.0,19.2,25.0,5.0,7.0,0.0,32.0,3.0,7.0,105.3,0.9,3.4
max,2400000.0,169.6,335.7,10715.0,700.0,1400.0,13145.0,500.0,1000.0,28027.0,900.0,1500.0,556.0,515.0,535.0,137.6,38.1,61.2


In [None]:
make_histogram(df['Median Sale Price'], 50)

In [None]:
sum(pd.isnull(df['New Listings YoY'])) # how to deal with missing values?

In [None]:
df = df.fillna(0)

In [None]:
df.describe()