# Summary

### Did exploratory analysis on 1990 CA housing price data

### Computed a population density proxy measure (done on my computer using ad-hoc C++ algo; uploaded onto Kaggle)

### Analyzed distribution of datapoints w/ respect to the density measure. Found clear clustering

### Ran a quick multiple regression model

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
from matplotlib import cm
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Locations of the two datasets inside the Kaggle input directory.
FPATH = '/kaggle/input/housing/'
GPATH = '/kaggle/input/caliboundaries/'

# Print the full path of every file found under the input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Processing dataframes

## Main dataframe and feature engineering

In [None]:
# Load the housing table, then add projected coordinates and ratio features.
df = pd.read_csv(FPATH + "housing.csv")

# Projection used for the maps: longitude is shifted by l0 and latitude goes
# through asinh(tan(latitude * pi / 180)).
l0 = -125
df['long_mercator'] = df['longitude'] - l0
df['lat_mercator'] = np.arcsinh(np.tan(df['latitude'] * np.pi / 180))

# Each entry is (new column, numerator column, denominator column).
ratio_features = [
    ('pop_per_household', 'population', 'households'),
    ('rooms_per_house', 'total_rooms', 'households'),
    ('bedrooms_per_house', 'total_bedrooms', 'households'),
    ('bedrooms_per_room', 'total_bedrooms', 'total_rooms'),
    ('rooms_per_person', 'total_rooms', 'population'),
    ('bedrooms_per_person', 'total_bedrooms', 'population'),
]
for new_column, numerator, denominator in ratio_features:
    df[new_column] = df[numerator] / df[denominator]

## Reading CA boundary points
(for visualization purposes, mainly)

In [None]:
# Read the boundary polygons into `pts`, a list of polygons where each polygon
# is a list of [x, y] vertices.
#   * a line containing a comma is parsed as comma-separated floats; the first
#     value is shifted by l0 and the second goes through the same
#     asinh(tan(.)) transform that is applied to df['latitude']
#   * any other line (no comma) starts a new, empty polygon
# At most MAX_BOUNDARY_LINES lines are read, matching the previous limit.
MAX_BOUNDARY_LINES = 1500

pts = []
# The context manager guarantees the file is closed, even if parsing fails.
with open(GPATH + 'pts.txt', 'r') as f:
    for line_number, a in enumerate(f):
        if line_number >= MAX_BOUNDARY_LINES:
            break
        if ',' not in a:
            pts.append([])
            continue
        # float() ignores surrounding whitespace, so the newline needs no
        # manual slicing (slicing would eat a digit on an unterminated last line).
        tup = [float(field) for field in a.split(',')]
        tup[0] -= l0
        tup[1] = np.arcsinh(np.tan(tup[1] * np.pi / 180))
        if not pts:
            # A vertex appearing before any separator line opens the first polygon.
            pts.append([])
        pts[-1].append(tup)

In [None]:
# Draw every boundary polygon as a filled light-blue shape.
for polygon in pts:
    xs, ys = [], []
    for vertex in polygon:
        xs.append(vertex[0])
        ys.append(vertex[1])
    plt.fill(xs, ys, facecolor='lightblue')

## Reading population density proxy measure
(I calculated this using population and long/lat data via a separate C++ program)

In [None]:
# Read one density value per row of df from densities.txt.  The first line of
# the file is skipped and the following len(df) lines are each parsed as a float.
n_rows = len(df)
# The context manager guarantees the file is closed, even if parsing fails.
with open(GPATH + 'densities.txt', 'r') as f2:
    f2.readline()  # first line is not a density value
    # float() tolerates the trailing newline; a file that is too short still
    # fails loudly with ValueError on the empty string returned at EOF.
    ds = [float(f2.readline()) for _ in range(n_rows)]
df['density'] = ds

# Preliminary Analysis

## Function to easily display summary statistics for any sub-dataframe of df
I decided not to use .describe() since each block group (geographical district) has a different # households. The "cumulative" function calculates summary stats, weighted by # households

In [None]:
# Columns that the exploratory plots and summaries below iterate over.
critical = ['housing_median_age', 'median_income', 'median_house_value', 'pop_per_household', 'rooms_per_house',
            'bedrooms_per_house', 'bedrooms_per_room', 'rooms_per_person', 'bedrooms_per_person', 'density']
# Wider column list: everything in `critical` plus the raw and projected
# coordinates and the room/bedroom totals.
sumlist = ['longitude','latitude','housing_median_age','total_rooms','total_bedrooms','median_income','median_house_value',
           'long_mercator','lat_mercator','pop_per_household','rooms_per_house','bedrooms_per_house','bedrooms_per_room',
           'rooms_per_person','bedrooms_per_person','density']

In [None]:
# This calculates summary statistics for a df, weighted by # households

def cumulative(duf, cols):
    """Return household-weighted summary statistics for columns of *duf*.

    Args:
        duf: DataFrame holding a 'households' column (the weights) and every
            column named in *cols*.  It is not modified.
        cols: iterable of column names to summarise.

    Returns:
        A float DataFrame indexed by 'count', 'mean', 'std', 'min', '25%',
        '50%', '75%', 'max', with one column per entry of *cols*.
        'count', 'min' and 'max' are unweighted; 'mean' and 'std' are weighted
        by 'households'; each quantile is the smallest value at which the
        cumulative household count reaches that share of the total.
        Rows where the summarised column is NaN are left out of every
        statistic for that column, including the weight totals.  A column
        with no usable rows (or zero total weight) gets count 0 (or its row
        count) and NaN for the remaining statistics.
    """
    stat_names = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
    columns = list(dict.fromkeys(cols))  # keep order, drop repeated names
    df_summary = pd.DataFrame(index=stat_names, columns=columns, dtype=float)

    for s in columns:
        # Pair each value with its weight so that dropping NaN values also
        # drops their households from the denominators.
        pairs = pd.DataFrame({'value': duf[s], 'weight': duf['households']})
        pairs = pairs.dropna(subset=['value']).sort_values('value')
        values = pairs['value']
        weights = pairs['weight']
        total = weights.sum()

        # Single-step .loc[row, col] assignment writes into df_summary itself;
        # chained df_summary[s].loc[...] may write to a temporary copy.
        df_summary.loc['count', s] = len(values)
        if values.empty or not total > 0:
            continue

        cumsum = weights.cumsum()
        df_summary.loc['min', s] = values.iloc[0]
        df_summary.loc['25%', s] = values[cumsum >= total / 4.0].iloc[0]
        df_summary.loc['50%', s] = values[cumsum >= total / 2.0].iloc[0]
        df_summary.loc['75%', s] = values[cumsum >= 3 * total / 4.0].iloc[0]
        df_summary.loc['max', s] = values.iloc[-1]

        avg = (values * weights).sum() / total
        df_summary.loc['mean', s] = avg
        df_summary.loc['std', s] = np.sqrt(((values - avg) ** 2 * weights).sum() / total)

    return df_summary

In [None]:
cumulative(df,sumlist)

In [None]:
df.describe()

## Correlation and Outlier Analysis
I chose certain variables, examined their relationships with other variables, and looked at outliers

### Each variable vs. median_house_value

In [None]:
# One scatter plot per critical variable: the variable on the x-axis and
# median_house_value on the y-axis.
house_value = df['median_house_value']
for column in critical:
    plt.scatter(df[column], house_value, s=.1)
    plt.title(column)
#     if column == 'bedrooms_per_room':
#         plt.xlim([0,0.5])
    plt.show()

### Density proxy measure vs. each variable

In [None]:
# One scatter plot per critical variable with density on the x-axis.  The
# y-axis is clipped at the variable's 95th percentile.
density_values = df['density']
for column in critical:
    series = df[column]
    plt.scatter(density_values, series, s=.1)
    plt.title(column)
    plt.ylim([0, series.quantile(.95)])
#     if column == 'bedrooms_per_room':
#         plt.xlim([0,0.5])
    plt.show()

In [None]:
plt.scatter(df['density'], df['median_house_value'], s=.1)

In [None]:
cumulative(df, critical)

### Price vs. Median Income

In [None]:
plt.scatter(df['median_income'], df['median_house_value'], s=.1)
plt.plot([6, 14], [0, 400000])

In [None]:
# Keep the districts lying strictly below the straight line through
# (median_income=6, value=0) and (median_income=14, value=400000).
line_slope = (400000 - 0) / (14 - 6)
line_value = line_slope * (df['median_income'] - 6)
df_outlier1 = df.loc[df['median_house_value'] - 0 < line_value]

In [None]:
cumulative(df, critical)

In [None]:
df_outlier1

In [None]:
# 18501 is the most visible outlier (i.e. high income, low house value)
    # State Park. 
# 19006 has an insanely high pop_per_household
    # Rooms per house stats are all fine, but WTF with household size
    # Checked coordinates on Google Maps. Turns out it's a state prison + med facility
# Outliers are in much less dense areas
# More rooms per house
# But ppl per house roughly the same
df_outlier1[critical]

### Price vs. Rooms per House

In [None]:
plt.scatter(df['rooms_per_house'], df['median_house_value'], s = .1)
# plt.xlim([0,15])

In [None]:
# don't know why 1914 is so whack
    # all really small looking houses
    # maybe a 'house' is broken into many small structures?
    # maybe clerical error?
    
# all seem pretty rural. SUPER LOW DENSITY
    # a bunch seem clustered around Lake Tahoe (i created a map)
    # less pop per house, med house value, income
df[df['rooms_per_house'] > 40]

# df.loc[df['rooms_per_house'] > 40][critical].describe()

#### Where are districts with abnormally high rooms_per_house?

In [None]:
# New 10x15 figure with one axes filling it, then the boundary polygons drawn
# in a neutral colour as a backdrop.
fig = plt.figure(figsize=(10, 15))
axs = fig.add_axes([0, 0, 1, 1])

for outline in pts:
    outline_x = [vertex[0] for vertex in outline]
    outline_y = [vertex[1] for vertex in outline]
    plt.fill(outline_x, outline_y, facecolor='whitesmoke')

def f(x):
    """Return the colour for one row, given its income and house value.

    *x* must support lookup by 'median_income' and 'median_house_value'.
    The result is 'red' when the house value lies strictly below the line
    through (income 6, value 0) and (income 14, value 400000), and
    'whitesmoke' otherwise.
    """
    line_value = (400000-0)/(14-6)*(x['median_income']-6)
    return 'red' if (x['median_house_value']-0) < line_value else 'whitesmoke'

# Colour tag per district produced by f() from median_income and
# median_house_value.
df['A'] = df[['median_income','median_house_value']].apply(f, axis=1)

# .loc[(df['median_house_value']-0) < (400000-0)/(14-6)*(df['median_income']-6)]


# df.plot.scatter('long_mercator', 'lat_mercator', 20, ax = axs, c=df['A'], figsize = (10,15), cmap=cm.get_cmap('Spectral'), zorder = 2, alpha = 1)
# Plot only the districts with more than 40 rooms per house on the current axes.
df.loc[df['rooms_per_house'] > 40].plot.scatter('long_mercator', 'lat_mercator', 10, ax=axs, figsize = (10,15), zorder = 2, alpha=1)
# Saved under its own name: the density-cluster map further down also writes
# "densities.svg", so sharing that name meant this figure was always overwritten.
fig.savefig("high_rooms_per_house.svg")

#### Created overlapping histograms comparing high rooms-per-house districts (orange) vs. all districts (blue) on several important variables

In [None]:
# For every critical variable, overlay two histograms on twin y-axes: all
# districts on the left axis and the rooms_per_house > 40 districts (orange)
# on the right axis.
high_rooms = df.loc[df['rooms_per_house'] > 40]
for c in critical:
    print ("PLOTTING: ", c)
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    high_rooms[c].hist(ax=ax2, color='orange')
    df[c].hist(ax=ax1)
    plt.tight_layout()
    if c == 'density':
        plt.xlim(0, 5000)
    plt.show()

### Price vs. Bedrooms_per_person

In [None]:
plt.scatter(df['bedrooms_per_person'], df['median_house_value'], s = .1)
# plt.xlim([0,4])

In [None]:
# 1979 is a ski resort. A lot of overlap with rooms per house outliers. Again, very few houses

df.loc[df['bedrooms_per_person'] > 6]

### Price vs. bedrooms_per_room

In [None]:
cumulative(df,critical)

In [None]:
# rooms_per_house is centered about 1.5-2 (vs. 5-ish for total), which means these houses are generally quite small
# Surprisingly, these are more expensive than on average. But that's also bc they're in denser regions (higher rent)
cumulative(df.loc[df['bedrooms_per_room'] > .6], critical)

## Geographical Analysis

### In the next 2 cells, we examine the distribution of population density values for the dataset. We find several distinct clusters.

In [None]:
l0 = -125
# df['R'] = df['housing_median_age'].apply(lambda x: 'blue' if x > 30 else 'red')
col = ['darkred', 'tomato', 'darkorange','forestgreen','lightskyblue','thistle']
# col = ['whitesmoke', 'darkorange','whitesmoke','whitesmoke','whitesmoke']

# Lower density bound of each colour band, densest band first; anything below
# the last bound gets the final colour in `col`.
density_floors = [20000, 9200, 6500, 3700, 500]


def _density_colour(x):
    """Return the colour of the first band whose lower bound *x* reaches."""
    for floor, colour in zip(density_floors, col):
        if x >= floor:
            return colour
    return col[5]


df['R'] = df['density'].apply(_density_colour)
df.head()

In [None]:
# Histogram of the density column with the cluster boundaries marked as dashed
# red vertical lines.
fig = plt.figure(figsize=(20,8))
axs = fig.add_axes([0,0,1,1])
df.hist('density', ax=axs, bins=800)
plt.title("Population Density Histogram", fontsize = 30)
plt.ylabel("Frequency", fontsize = 20)
plt.xlabel("Density Estimates", fontsize = 20)
for boundary in (500, 3700, 6500, 9200, 20000):
    plt.axvline(x=boundary, c='red', linestyle='--')
# Set the y-limit before saving so the file on disk matches the figure shown.
plt.ylim([0,600])
plt.savefig('histo.svg')

We find clear clustering in this histogram of density proxy values. Let's break the data points into 6 clusters, as marked above, and explore each. (NOTE: the boundary of the least dense cluster is somewhat arbitrary. I used 500 people per square mile, as described in https://www.ers.usda.gov/topics/rural-economy-population/rural-classifications/what-is-rural/) 

### Cool visualization below

In [None]:
# Map of all districts coloured by df['R'], drawn over the boundary polygons
# and written to densities.svg.
fig = plt.figure(figsize=(10, 15))
axs = fig.add_axes([0, 0, 1, 1])

for outline in pts:
    outline_x = [vertex[0] for vertex in outline]
    outline_y = [vertex[1] for vertex in outline]
    plt.fill(outline_x, outline_y, facecolor='whitesmoke')

scatter_options = dict(ax=axs, c=df['R'], figsize=(13, 15), zorder=2, alpha=1)
df.plot.scatter('long_mercator', 'lat_mercator', .4, **scatter_options)
fig.savefig("densities.svg")

### Now, let's name the 6 groups
* RR: rural
* RS: rural suburban
* SS: suburban
* SU: suburban-urban
* UU: urban
* CC: city center

In [None]:
# Split df into six sub-frames by density band; each band includes its lower
# bound and excludes its upper bound.
density_col = df['density']
df_RR = df.loc[density_col < 500]
df_RS = df.loc[(density_col >= 500) & (density_col < 3700)]
df_SS = df.loc[(density_col >= 3700) & (density_col < 6500)]
df_SU = df.loc[(density_col >= 6500) & (density_col < 9200)]
df_UU = df.loc[(density_col >= 9200) & (density_col < 20000)]
df_CC = df.loc[density_col >= 20000]

In [None]:
df_list = [df_RR, df_RS, df_SS, df_SU, df_UU, df_CC]

### For each of the critical variables, we plot its median (red dot) and IQR (blue line) for each of the 6 groups (labeled 0 to 5)
#### There are some nice trends

In [None]:
# The weighted summary of a group does not depend on which variable is being
# plotted, so compute it once per group instead of once per (variable, group).
# The result is kept out of the name `cm`, which is the matplotlib colormap
# module imported at the top of the notebook.
group_summaries = [cumulative(sub, critical) for sub in df_list]

# One figure per critical variable: for each group i, a light-blue vertical
# line from the 25% to the 75% value and a red dot at the 50% value.
for c in critical:
    fig = plt.figure(figsize=(10,10))
    axs = fig.add_axes([0,0,1,1])
    plt.title(c)
    for i, summary in enumerate(group_summaries):
        stats = summary[c]
        plt.plot([i,i],[stats.loc['25%'],stats.loc['75%']], color='lightblue')
        plt.plot(i,stats.loc['50%'], 'ro', markersize=12)
    plt.show()

We unravel some clean relationships between population density clusters and other variables in df.

Now let's analyze each cluster separately

SS: weaker pop per household,
SU, UU, and (less so) CC: weak bedrooms/rooms per person, pop per household, med income
CC: {weak: }, {moderate: }

In [None]:
# Scatter plots restricted to df_UU: each critical variable against
# median_house_value, with the x-axis clipped at the variable's 95th percentile.
uu_house_value = df_UU['median_house_value']
for column in critical:
    feature = df_UU[column]
    plt.scatter(feature, uu_house_value, s=.1)
    plt.title(column)
    plt.xlim([0, feature.quantile(.95)])
    plt.show()

### We find below that breaking the dataset into the 6 groups improves the "predictive power" (as judged by Pearson correlation coefficient) of some variables

In [None]:
# df holds string columns ('ocean_proximity', 'A', 'R') at this point; restrict
# to numeric columns explicitly, since DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0 instead of silently skipping it.
df.select_dtypes(include='number').corr()['median_house_value']

In [None]:
# For each density group, print the variables whose correlation with
# median_house_value exceeds 0.4 in absolute value.
lizt = ['RR','RS','SS','SU','UU','CC']
for label, d in zip(lizt, df_list):
    print('-'*20+label+'-'*20)
    # Numeric columns only: the group frames also carry string columns, which
    # DataFrame.corr() rejects in pandas >= 2.0.
    cmat = d.select_dtypes(include='number').corr()
    with_value = cmat['median_house_value']
    print(with_value.loc[abs(with_value) > .4])

# Housing Value Prediction

## Multiple Regression

In [None]:
means = [154375.371074, 221558.781652, 209589.774185, 216382.263378, 209971.951837, 239581.831597]

In [None]:
# Making the ocean_proximity variable quantitative

def makebin(x):
    """Return the integer code for the 'ocean_proximity' entry of *x*.

    *x* must support lookup by 'ocean_proximity'.  Codes: 'INLAND' 0,
    '<1H OCEAN' 1, 'NEAR BAY' 2, 'NEAR OCEAN' 3, 'ISLAND' 4.  Any other
    category raises KeyError.
    """
    category = x['ocean_proximity']
    codes = {'INLAND': 0, '<1H OCEAN': 1, 'NEAR BAY': 2, 'NEAR OCEAN': 3, 'ISLAND': 4}
    return codes[category]

# Encode ocean_proximity row by row with makebin(), then drop the helper and
# text columns from df.
encoded_proximity = df[['ocean_proximity']].apply(makebin, axis=1)
df['ocean_proximity_bin'] = encoded_proximity
columns_to_drop = ['R', 'A', 'long_mercator', 'lat_mercator', 'ocean_proximity']
df = df.drop(columns=columns_to_drop)
df.head()

### Multiple regression for each cluster, separately
(Printed RMSE, but didn't do further analysis of model)

In [None]:
from sklearn import linear_model

df_list_names = ['RR','RS','SS','SU','UU','CC']

# Fit one ordinary least-squares model per density group and print its RMSE.
# The RMSE is computed on the same rows the model was fitted on.
for name, cluster in zip(df_list_names, df_list):
    # assign() builds a new frame, so the .loc slices held in df_list are not
    # written to (avoids pandas' SettingWithCopy problem).
    d = cluster.assign(ocean_proximity_bin=cluster[['ocean_proximity']].apply(makebin, axis=1))
    d = d.drop(['R', 'A', 'long_mercator', 'lat_mercator', 'ocean_proximity'],axis=1)
#     print(d.columns)
    d = d.dropna()
    # Every remaining column except the target is used as a predictor.
    variables = [column for column in d.columns if column != 'median_house_value']
#     variables = ['median_income']
    X = d[variables]
    y = d['median_house_value']
    regr = linear_model.LinearRegression()
    regr.fit(X, y)

    # evaluate
    # Predict on the same DataFrame used for fitting so the feature names match.
    pred = regr.predict(X)
#     pred = regr.predict(np.array(pd.DataFrame(d['median_income'])))
    actu = y.to_numpy()
    rmse = np.sqrt(np.mean((pred - actu) ** 2))
    print(name+' RMSE: ', rmse)
    
# 1: 60363.56455980125