In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from shapely.geometry import Point

from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor, RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import KBinsDiscretizer
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the California housing CSV into a DataFrame and display it.
housing_csv_path = '../input/california-housing-prices/housing.csv'
df = pd.read_csv(housing_csv_path)
df

In [None]:
# Column groups used by the rest of the notebook.

# Coordinate columns (input to the k-NN step).
GEO_X_FEATURE = [
    'longitude',
    'latitude',
]

# Remaining predictors (input to the linear-model step).
NUM_X_FEATURE = [
    'housing_median_age',
    'median_income',
    'ocean_proximity',
]

# Target column.
Y_FEATURE = 'median_house_value'

In [None]:
# Split the frame into the coordinate block, the other predictors and the target.
geo_x = df.loc[:, GEO_X_FEATURE]
num_x = df.loc[:, NUM_X_FEATURE]
y = df.loc[:, Y_FEATURE]

In [None]:
# Step 0: Visualization

# Load the county basemap, override its CRS to EPSG:3395, then reproject the
# geometries to EPSG:4326 before drawing it.
california_shp = (
    gpd.read_file('../input/california-basemap-shapefile/cnty19_1.shp')
    .set_crs(epsg=3395, allow_override=True)
    .to_crs(epsg=4326)
)
california_shp.plot()
plt.show()

In [None]:
# Build one point geometry per row from the longitude/latitude columns.
# points_from_xy is vectorised, replacing the per-row Point(...) list comprehension.
geom = gpd.points_from_xy(df['longitude'], df['latitude'])
# The coordinates are longitude/latitude degrees, so the layer is declared as
# EPSG:4326, the same CRS the basemap is reprojected to.
geom_df = gpd.GeoDataFrame(df, geometry=geom, crs='EPSG:4326')

In [None]:
# Draw every district over the grey county basemap, coloured by median house value.
fig, ax1 = plt.subplots(figsize=(20, 100))
california_shp.plot(ax=ax1, color='grey', alpha=1)
geom_df.plot(ax=ax1, column='median_house_value', cmap='YlOrRd', markersize=1)


In [None]:
# Summary statistics of the target column.
df['median_house_value'].describe()

In [None]:
# Kernel density estimate of the target column.
sns.kdeplot(df['median_house_value'], cumulative=False)

In [None]:
# Cumulative kernel density estimate of the target column.
sns.kdeplot(df['median_house_value'], cumulative=True)

In [None]:
# One map per quartile of median_house_value, each drawn over the grey basemap.
value_col = 'median_house_value'
house_values = geom_df[value_col]

# Quartile edges are computed from the data rather than copied by hand from the
# describe() output, so the panels stay correct if the data changes.
quartile_edges = house_values.quantile([0, 0.25, 0.5, 0.75, 1]).tolist()
panel_titles = ['lower 25% house', '25% - 50% house', '50% - 75% house', 'top 25% house']

fig, axes = plt.subplots(2, 2, figsize=(40, 40))
# Keep the individual axes names that the original cell defined.
ax1, ax2, ax3, ax4 = axes.flat

for panel_idx, (panel_ax, panel_title) in enumerate(zip(axes.flat, panel_titles)):
    lower_edge = quartile_edges[panel_idx]
    upper_edge = quartile_edges[panel_idx + 1]
    # Intervals are (lower, upper], except the first panel, which also includes
    # its lower edge so that the minimum value is plotted.
    if panel_idx == 0:
        above_lower = house_values >= lower_edge
    else:
        above_lower = house_values > lower_edge
    in_quartile = above_lower & (house_values <= upper_edge)

    california_shp.plot(ax=panel_ax, alpha=1, color='grey')
    geom_df[in_quartile].plot(ax=panel_ax, column=value_col, cmap='YlOrRd', markersize=3)
    panel_ax.set_title(panel_title)



In [None]:
# Draw every district over the grey county basemap, coloured by median income.
fig, ax1 = plt.subplots(figsize=(20, 100))
california_shp.plot(ax=ax1, color='grey', alpha=1)
geom_df.plot(ax=ax1, column='median_income', cmap='YlOrRd', markersize=1)


In [None]:
# Pairwise correlation between median income and the target.
df.loc[:, ['median_income', 'median_house_value']].corr()

In [None]:
# STEP 1: Train Geo feature of X by KNN

# Hyper-parameter grids: k = 5..19 neighbours, bin width = 10000..19750 in steps of 250.
klist = np.arange(5, 20)
blist = np.arange(10000, 20000, 250)

# Hold out 20% of the rows (fixed seed) for scoring the grid.
geo_x_train, geo_x_test, y_train, y_test = train_test_split(
    geo_x,
    y,
    test_size=0.2,
    random_state=42,
)

def train_hyperpara_in_knn_oneinstance(k_value, bin_para, geo_x_train, y_train, geo_x_test, y_test):
    """Fit a k-NN classifier on binned targets and score it on the held-out rows.

    The training target is floored to the nearest lower multiple of ``bin_para``
    and the resulting values are used as class labels. The predicted labels are
    then compared with the raw (unbinned) ``y_test`` values.

    Parameters
    ----------
    k_value : number of neighbours passed to ``KNeighborsClassifier``.
    bin_para : bin width applied to ``y_train``.
    geo_x_train, y_train : training features and target.
    geo_x_test, y_test : held-out features and target.

    Returns
    -------
    Mean squared error between ``y_test`` and the predicted labels.
    """
    # Whole-column arithmetic; same values as flooring each element separately.
    y_train_binned = bin_para * np.floor(y_train / bin_para)
    classifier = KNeighborsClassifier(n_neighbors=k_value)
    classifier.fit(geo_x_train, y_train_binned)
    predicted = classifier.predict(geo_x_test)
    # Arguments given in scikit-learn's (y_true, y_pred) order; MSE is symmetric.
    return mean_squared_error(y_test, predicted)



def train_hyperpara_in_knn(k_value_list, bin_para_list, geo_x_train, y_train, geo_x_test, y_test):
    """Score every (k, bin width) combination of the two grids.

    Parameters
    ----------
    k_value_list : iterable of neighbour counts.
    bin_para_list : iterable of bin widths.
    geo_x_train, y_train, geo_x_test, y_test : forwarded unchanged to
        ``train_hyperpara_in_knn_oneinstance``.

    Returns
    -------
    dict mapping ``(k_value, bin_width)`` to the score returned by
    ``train_hyperpara_in_knn_oneinstance`` for that pair.
    """
    return {
        (k, width): train_hyperpara_in_knn_oneinstance(
            k, width, geo_x_train, y_train, geo_x_test, y_test
        )
        for k in k_value_list
        for width in bin_para_list
    }


In [None]:
# Evaluate the full grid; `ret` maps (k, bin width) -> held-out MSE.
ret = train_hyperpara_in_knn(
    k_value_list=klist,
    bin_para_list=blist,
    geo_x_train=geo_x_train,
    y_train=y_train,
    geo_x_test=geo_x_test,
    y_test=y_test,
)

In [None]:
def plot_by_res_3d(result_dict):
    """Scatter a ``{(a, b): value}`` mapping as points in 3-D.

    Parameters
    ----------
    result_dict : mapping whose keys are 2-tuples (x and y coordinates) and
        whose values are the z coordinates.

    Returns
    -------
    The object returned by the 3-D axes' ``scatter`` call.

    Raises
    ------
    ValueError : if ``result_dict`` is empty.
    """
    if not result_dict:
        # The original failed here too, with an opaque tuple-unpacking ValueError.
        raise ValueError("result_dict is empty; nothing to plot")
    d1, d2 = zip(*result_dict.keys())
    d3 = list(result_dict.values())
    fig = plt.figure()
    # Figure.gca(projection=...) was deprecated in Matplotlib 3.4 and removed in
    # 3.6; add_subplot(..., projection='3d') is the supported way to get 3-D axes.
    pr = fig.add_subplot(111, projection='3d')
    return pr.scatter(d1, d2, d3)

plot_by_res_3d(ret)

In [None]:
def draw_k(k_v, results=None):
    """Plot score against bin width for one fixed number of neighbours.

    Parameters
    ----------
    k_v : neighbour count whose grid entries are drawn.
    results : optional mapping ``{(k, bin_width): score}``. Defaults to the
        notebook-level ``ret``.
    """
    # `ret` is rebound later in the notebook to the linear-model scores, so the
    # grid can be passed explicitly instead of relying on that global.
    if results is None:
        results = ret
    selected = [(b_value, score) for (k_value, b_value), score in results.items() if k_value == k_v]
    x_t = [b_value for b_value, _score in selected]
    y_t = [score for _b_value, score in selected]
    plt.plot(x_t, y_t)
def draw_b(b_v, results=None):
    """Plot score against number of neighbours for one fixed bin width.

    Parameters
    ----------
    b_v : bin width whose grid entries are drawn.
    results : optional mapping ``{(k, bin_width): score}``. Defaults to the
        notebook-level ``ret``.
    """
    # `ret` is rebound later in the notebook to the linear-model scores, so the
    # grid can be passed explicitly instead of relying on that global.
    if results is None:
        results = ret
    selected = [(k_value, score) for (k_value, b_value), score in results.items() if b_value == b_v]
    x_t = [k_value for k_value, _score in selected]
    y_t = [score for _k_value, score in selected]
    plt.plot(x_t, y_t)

In [None]:
# Overlay the score-vs-bin-width curves for k = 8, 9 and 10.
for k_value in (8, 9, 10):
    draw_k(k_value)

In [None]:
# Overlay the score-vs-k curves for bin widths 13750, 14000 and 14250.
for bin_width in (13750, 14000, 14250):
    draw_b(bin_width)

In [None]:
# Optimal k and bin width for the k-NN model.
OPT_K = 9
OPT_B = 14000
opt_knnc = KNeighborsClassifier(n_neighbors=OPT_K)
# Floor every target to the nearest lower multiple of OPT_B (whole-column arithmetic).
y_binned = OPT_B * np.floor(y / OPT_B)
# Out-of-fold predictions: every row is predicted by a model that was not fitted
# on that row. Predicting geo_x with a model fitted on the same geo_x lets each
# point count itself among its own neighbours, which leaks the target into the
# "knn_pred" feature that the linear models are later trained and scored on.
pred_knn = cross_val_predict(opt_knnc, geo_x, y_binned, cv=5)
# cross_val_predict works on clones, so fit the named model on all rows as before.
opt_knnc.fit(geo_x, y_binned)

In [None]:
# STEP 2: Train (result of knn + num feature of x) by linear regression

# Dummy-encode num_x, then append the k-NN prediction as an extra feature column.
num_x_onehot = pd.get_dummies(num_x)
knn_feature = pd.Series(pred_knn, name="knn_pred")
merged_x = pd.concat([num_x_onehot, knn_feature], axis=1)

# Same test_size and random_state as the split used for the k-NN grid.
num_x_train, num_x_test, y_train, y_test = train_test_split(
    merged_x,
    y,
    test_size=0.2,
    random_state=42,
)



In [None]:
# Candidate linear model classes; each is instantiated with default arguments.
lmlist = [
    LinearRegression,
    Ridge,
    Lasso,
    RidgeCV,
]

def trainLinear_oneinstance(lm):
    """Fit one default-constructed model and return its score on the held-out split.

    ``lm`` is called with no arguments to build the estimator. The estimator is
    fitted on the notebook-level ``num_x_train`` / ``y_train`` and its ``score``
    on ``num_x_test`` / ``y_test`` is returned.
    """
    fitted_model = lm().fit(num_x_train, y_train)
    return fitted_model.score(num_x_test, y_test)

def trainLinear(model_list):
    """Score every model class in ``model_list``.

    Returns a dict mapping each entry of ``model_list`` to the value returned by
    ``trainLinear_oneinstance`` for it.
    """
    return {model_cls: trainLinear_oneinstance(model_cls) for model_cls in model_list}

In [None]:
# Note: this rebinds `ret`, which previously held the k-NN grid results.
ret = trainLinear(model_list=lmlist)

In [None]:
# Display the mapping returned by trainLinear (model class -> held-out score).
ret

In [None]:
# Fit the selected linear model on the full feature table, then predict those same rows.
OPT_LM = LinearRegression
opt_lmc = OPT_LM().fit(merged_x, y)
pred_lm = opt_lmc.predict(merged_x)

In [None]:
# Step 3: Integrate the models

# Put inputs, target and both models' predictions side by side.
knn_column = pd.Series(pred_knn, name="knn_pred")
lm_column = pd.Series(pred_lm, name="lm_pred")
overall_df = pd.concat([geo_x, num_x, knn_column, y, lm_column], axis=1)

# Relative error of the linear-model prediction: |actual - predicted| / actual.
absolute_error = (overall_df['median_house_value'] - overall_df['lm_pred']).abs()
overall_df['RelErr'] = absolute_error / overall_df['median_house_value']

In [None]:
# Display the combined frame: inputs, target, both predictions and RelErr.
overall_df

In [None]:
# Summary statistics of the relative error.
overall_df.RelErr.describe()

In [None]:
# What's next?

# 1. Cluster the spatial points first.
# 2. Then fit k-NN.
# 3. Then fit the linear model.
