## Summary of data
## Properties from San Jose, CA
## 5,753 listings
## 92 different neighborhoods

In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.path as path

from string import punctuation

In [2]:
def make_histogram(data, bins):
    fig, ax = plt.subplots()

    # histogram our data with numpy
    n, bins = np.histogram(data, bins)

    # get the corners of the rectangles for the histogram
    left = np.array(bins[:-1])
    right = np.array(bins[1:])
    bottom = np.zeros(len(left))
    top = bottom + n


    # we need a (numrects x numsides x 2) numpy array for the path helper
    # function to build a compound path
    XY = np.array([[left, left, right, right], [bottom, top, top, bottom]]).T

    # get the Path object
    barpath = path.Path.make_compound_path_from_polys(XY)

    # make a patch out of it
    patch = patches.PathPatch(barpath)
    ax.add_patch(patch)

    # update the view limits
    ax.set_xlim(left[0], right[-1])
    ax.set_ylim(bottom.min(), top.max())

    plt.show()

In [13]:
def convert_percentage(df):
    for col in df.columns:
        c = col.split(" ")
        if c[-1] == "MoM" or c[-1] == "YoY": df[col] = [float(str(x).strip("%")) for x in df[col]]
    
    df['Average Sale To List'] = [float(str(x).strip("%")) for x in df['Average Sale To List']]

In [8]:
### read in data
def read_data(filename, property_type):
    df = pd.read_csv(filename)
    df.columns = [x.strip(" ") for x in df.columns]
    df['property_type'] = property_type
    df.head()
    return (df)

In [5]:
### add columns for city, state, and neighborhood
def parse_region(df):
    df['location'] = [x.split(",") for x in df.Region]
    df['city'] = [x[0].strip(",").lower() for x in df.location]
    df['state'] = [x[1].split(" ")[1].lower().strip() for x in df.location]
    df['neighborhood'] = [x[1].split(" ")[3].strip().lower() for x in df.location]
    df = df[df.city == 'san jose']
    df = df.drop('location', 1)
    df.head()

In [6]:
### convert Median Sale Price to a float
def convert_med_sale_pr(df):
    df[u'Median Sale Price'] = [x.strip("$").replace(",", "") for x in df["Median Sale Price"]]
    df[u'Median Sale Price'] = [int(x) if x[len(x)-1] != "K" else int(x.strip("K"))*1000 
                                for x in df["Median Sale Price"]]

In [14]:
df = read_data("sanjose_condos.csv", "condo")
convert_med_sale_pr(df)
parse_region(df)
convert_percentage(df)

In [15]:
df.head()

Unnamed: 0,Region,Month of Period End,Median Sale Price,Median Sale Price MoM,Median Sale Price YoY,Homes Sold,Homes Sold MoM,Homes Sold YoY,New Listings,New Listings MoM,...,Days on Market MoM,Days on Market YoY,Average Sale To List,Average Sale To List MoM,Average Sale To List YoY,property_type,location,city,state,neighborhood
0,"San Jose, CA - Alexander",January 2012,152000,-1.0,18.8,12,-7.7,100.0,12.0,0.0,...,4.0,-26.0,103.4,2.3,10.2,condo,"[San Jose, CA - Alexander]",san jose,ca,alexander
1,"San Jose, CA - Alexander",February 2012,146000,-4.0,-1.7,10,-16.7,42.9,11.0,-8.3,...,2.0,-23.0,103.4,0.0,8.9,condo,"[San Jose, CA - Alexander]",san jose,ca,alexander
2,"San Jose, CA - Alexander",March 2012,146000,0.0,-7.0,13,30.0,116.7,12.0,9.1,...,3.0,-22.0,102.0,-1.4,4.7,condo,"[San Jose, CA - Alexander]",san jose,ca,alexander
3,"San Jose, CA - Alexander",April 2012,140000,-3.6,-7.4,10,-23.1,25.0,5.0,-58.3,...,1.0,-21.0,100.5,-1.5,2.4,condo,"[San Jose, CA - Alexander]",san jose,ca,alexander
4,"San Jose, CA - Alexander",May 2012,150000,7.0,12.8,11,10.0,0.0,8.0,60.0,...,1.0,-5.0,100.8,0.3,2.3,condo,"[San Jose, CA - Alexander]",san jose,ca,alexander


In [16]:
len(df.index)

6099

In [17]:
neighborhoods = np.unique(df['neighborhood'])
print (neighborhoods, len(neighborhoods))

(array(['alexander', 'alma-almaden', 'almaden', 'anderson', 'area',
       'atlanta', 'barbera-stokes', 'berryessa', 'blackford',
       'bonita-24th', 'branham-jarvis', 'branham-kirk', 'brigadoon',
       'broadway-palmhaven', 'brooktree', 'bucknall', 'buena', 'calabazas',
       'cambrian', 'cambrian-pioneer', 'carlton', 'carson',
       'cedarville-giannotta', 'central', 'chaboya', 'cherrywood',
       'citaldi', 'clayton', 'commodore', 'crossgate', 'cypress-jurdo',
       'de', 'deer', 'del', 'dovehill', 'downtown', 'east', 'eden',
       'erikson', 'estates', 'evergreen', 'everydale-neimen', 'flickinger',
       'fowler', 'gilchrist', 'great', 'greylands', 'hammer', 'hayes',
       'heritage', 'hidden', 'hillsdale', 'hillview', 'holy', 'joaquin',
       'kenwood', 'king', 'kooser', 'lanai-cunningham', 'loma', 'lone',
       'los', 'lynbrook', 'lynhaven', 'mayfair', 'mckay-ringwood',
       'meadow', 'meadows', 'millic-phelps', 'miner', 'mirassou',
       'morrill', 'mount', 'muril

In [18]:
df.describe()



Unnamed: 0,Median Sale Price,Median Sale Price MoM,Median Sale Price YoY,Homes Sold,Homes Sold MoM,Homes Sold YoY,New Listings,New Listings MoM,New Listings YoY,Inventory,Inventory MoM,Inventory YoY,Days on Market,Days on Market MoM,Days on Market YoY,Average Sale To List,Average Sale To List MoM,Average Sale To List YoY
count,6099.0,6078.0,6059.0,6099.0,6078.0,6059.0,6072.0,6034.0,6008.0,5564.0,5234.0,5158.0,6095.0,6072.0,6049.0,6099.0,6078.0,6059.0
mean,667441.5,1.828397,15.379898,166.00951,6.90181,18.771282,208.738307,6.244962,12.468309,528.149892,17.35021,17.218767,24.227892,-0.170619,-3.376426,103.227332,0.034847,0.821439
std,294510.2,13.193925,25.687286,881.015046,45.459799,96.89716,1140.236187,45.000474,82.396511,3028.361492,79.290597,116.312958,20.550244,17.651727,30.297723,3.870555,2.405655,4.583911
min,125000.0,-61.5,-65.5,1.0,-85.7,-92.9,1.0,-87.5,-92.9,1.0,-87.5,-94.7,1.0,-327.0,-383.0,77.7,-18.5,-27.8
25%,450000.0,,,5.0,,,,,,,,,,,,100.7,,
50%,625000.0,,,8.0,,,,,,,,,,,,102.7,,
75%,824000.0,,,11.0,,,,,,,,,,,,105.2,,
max,2248000.0,169.6,335.7,10715.0,600.0,1400.0,13145.0,500.0,900.0,28027.0,700.0,1500.0,357.0,220.0,270.0,127.9,24.3,28.5


In [None]:
make_histogram(df['Median Sale Price'], 50)

In [19]:
sum(pd.isnull(df['New Listings YoY']))

91

In [27]:
df = df.fillna(0)

In [28]:
df.describe()

Unnamed: 0,Median Sale Price,Median Sale Price MoM,Median Sale Price YoY,Homes Sold,Homes Sold MoM,Homes Sold YoY,New Listings,New Listings MoM,New Listings YoY,Inventory,Inventory MoM,Inventory YoY,Days on Market,Days on Market MoM,Days on Market YoY,Average Sale To List,Average Sale To List MoM,Average Sale To List YoY
count,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0,6099.0
mean,667441.5,1.822102,15.279029,166.00951,6.878046,18.648172,207.814232,6.178406,12.282276,481.820954,14.88949,14.562125,24.212002,-0.169864,-3.348746,103.227332,0.034727,0.816052
std,294510.2,13.171622,25.632984,881.015046,45.383256,96.590724,1137.793492,44.76459,81.793372,2896.324563,73.700978,107.143564,20.552866,17.612609,30.174792,3.870555,2.40151,4.569333
min,125000.0,-61.5,-65.5,1.0,-85.7,-92.9,0.0,-87.5,-92.9,0.0,-87.5,-94.7,0.0,-327.0,-383.0,77.7,-18.5,-27.8
25%,450000.0,-2.0,2.5,5.0,-18.2,-30.0,5.0,-19.45,-32.75,2.0,-25.0,-50.0,11.0,-3.0,-11.0,100.7,-1.0,-1.7
50%,625000.0,0.1,11.8,8.0,0.0,0.0,8.0,0.0,0.0,3.0,0.0,0.0,17.0,0.0,0.0,102.7,0.0,0.8
75%,824000.0,4.3,23.1,11.0,20.0,33.3,12.0,22.2,30.0,5.0,25.0,25.0,30.0,3.0,8.0,105.2,1.0,3.5
max,2248000.0,169.6,335.7,10715.0,600.0,1400.0,13145.0,500.0,900.0,28027.0,700.0,1500.0,357.0,220.0,270.0,127.9,24.3,28.5
