In [None]:
import json

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

from folium import plugins
from matplotlib import style
from matplotlib.ticker import FuncFormatter
from plotly.subplots import make_subplots

#from mlxtend.evaluate import bias_variance_decomp

from warnings import filterwarnings
filterwarnings('ignore')

%matplotlib inline

In [None]:
# Choropleth Function

# Zipcode choropleth maps with average values per zipcode (King County)
def map_choropleth_zip(df, column, title, column_name, geojson=None):
    """Draw a zipcode choropleth of ``column`` on an OpenStreetMap base and show it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``zipcode`` column (matched against the ``ZCTA5CE10``
        property of each GeoJSON feature) and the column named by ``column``.
    column : str
        Name of the column in ``df`` used to colour the zipcode polygons.
    title : str
        Figure title, centred at the top of the map.
    column_name : str
        Display label used for ``column`` in the figure.
    geojson : dict, optional
        GeoJSON feature collection with the zipcode polygons. When omitted,
        the notebook-level ``KC_zip_json`` is used, so existing four-argument
        calls behave exactly as before.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``; nothing is returned.
    """
    if geojson is None:
        # Looked up at call time: KC_zip_json only has to exist when the
        # function is called, not when this cell is executed.
        geojson = KC_zip_json

    fig = px.choropleth_mapbox(
        data_frame=df,
        geojson=geojson,
        locations='zipcode',
        featureidkey='properties.ZCTA5CE10',
        color=column,
        color_continuous_scale=px.colors.sequential.YlOrRd,
        opacity=0.4,
        mapbox_style='open-street-map',
        center={'lat': 47.403768, 'lon': -122.005863},
        zoom=8.5,
        height=900,
        title=title,
        template="plotly_dark",
        labels={column: column_name},
    )

    # One layout update covering both the fonts and the title placement.
    fig.update_layout(
        font_family="Arial",
        font_size=16,
        font_color="white",
        title_font_family="Arial",
        title_font_color="white",
        title_font_size=20,
        title_x=0.5,
        title_y=0.98,
        title_xanchor='center',
        title_yanchor='top',
    )

    fig.show()


In [None]:
# Importing raw data
df = pd.read_csv('data/kc_house_data.csv')
# recent_renovation_new_str is forced to str rather than left to pandas' type inference.
df_test = pd.read_csv('data/df_test.csv', dtype={'recent_renovation_new_str': str})
df_zipcode_viz = pd.read_csv('data/df_zipcode_vs.csv')

# Use a context manager so the GeoJSON file handle is closed as soon as it is parsed
# (the previous json.load(open(...)) left it open). JSON text is UTF-8, so the encoding
# is stated explicitly instead of relying on the platform default.
with open('data/wa_washington_zip_codes_geo.min.json', 'r', encoding='utf-8') as geojson_file:
    KC_zip_json = json.load(geojson_file)

><b>Displaying DataFrames</b>

In [None]:
# Print a summary of df_test: column names, dtypes and non-null counts.
df_test.info()

In [None]:
# Display the zipcode-level frame that is passed to map_choropleth_zip later in the notebook.
df_zipcode_viz

><b>Visualization</b>

In [None]:
# Regplots of price against all four variables (one panel per variable)

sns.set_style("darkgrid", {"axes.facecolor": ".6"})
fig, axes = plt.subplots(figsize=(20,20), ncols=2, nrows=2)

# One entry per panel: (axes, df_test column, scatter colour, panel title, x-axis label).
# The "_st" columns are presumably scaled to [0, 1] - the x-axis is limited to that range below.
price_panels = [
    (axes[0,0], "sqft_living_st", "#003300", "Price vs Living Space", 'Living Footage, adjusted'),
    (axes[0,1], "distance_st", "#000066", "Price vs Distance from City Center", 'Distance, adjusted'),
    (axes[1,0], "bathrooms_st", "#009900", "Price vs Number of Bathrooms", 'Number of Bathrooms, adjusted'),
    (axes[1,1], "grade_st", "#0000ff", "Price vs Building Grade", 'Building Grade, adjusted'),
]


def _price_in_millions(value, _pos):
    """Format a raw price tick as millions, e.g. 1500000 -> '1.5M'."""
    return '{:,.1f}'.format(value/1000000) + 'M'


for ax, predictor, point_color, panel_title, x_label in price_panels:
    sns.regplot(data=df_test, x=predictor, y="price", color=point_color, fit_reg=True,
                ax=ax, line_kws={"color": "red", "lw":5})
    ax.set_title(panel_title, fontsize=26)
    ax.set_ylabel('Price', fontsize=20)
    ax.set_xlabel(x_label, fontsize=20)
    ax.set_xlim(-0.01, 1.0)
    # A formatter labels whatever ticks are actually drawn; fixing the label strings with
    # set_yticklabels() would leave them wrong if the tick positions changed afterwards.
    ax.yaxis.set_major_formatter(FuncFormatter(_price_in_millions))
    ax.tick_params(axis='y', labelsize=15)
    ax.grid(color='lightgrey')

fig.suptitle("Regression plots of Price vs Four Independent Variables", size=30, color="Blue")
fig.tight_layout(pad=3)


In [None]:
# Regplots of Ln(price) against all four variables (one panel per variable)

sns.set_style("darkgrid", {"axes.facecolor": ".6"})
fig, axes = plt.subplots(figsize=(20,20), ncols=2, nrows=2)

# One entry per panel: (axes, df_test column, scatter colour, panel title, x-axis label).
log_price_panels = [
    (axes[0,0], "sqft_living_st", "#003300", "Ln(Price) vs Living Space", 'Living Footage, adjusted'),
    (axes[0,1], "distance_st", "#000066", "Ln(Price) vs Distance from City Center", 'Distance, adjusted'),
    (axes[1,0], "bathrooms_st", "#009900", "Ln(Price) vs Number of Bathrooms", 'Number of Bathrooms, adjusted'),
    (axes[1,1], "grade_st", "#0000ff", "Ln(Price) vs Building Grade", 'Building Grade, adjusted'),
]


def _one_decimal(value, _pos):
    """Format a tick value with one decimal place, e.g. 13 -> '13.0'."""
    return '{:,.1f}'.format(value)


for ax, predictor, point_color, panel_title, x_label in log_price_panels:
    sns.regplot(data=df_test, x=predictor, y="log_price", color=point_color, fit_reg=True,
                ax=ax, line_kws={"color": "red", "lw":5})
    ax.set_title(panel_title, fontsize=26)
    ax.set_ylabel('Ln(Price)', fontsize=20)
    ax.set_xlabel(x_label, fontsize=20)
    ax.set_xlim(-0.01, 1.0)
    # A formatter labels whatever ticks are actually drawn; fixing the label strings with
    # set_yticklabels() would leave them wrong if the tick positions changed afterwards.
    ax.yaxis.set_major_formatter(FuncFormatter(_one_decimal))
    ax.tick_params(axis='y', labelsize=15)
    ax.grid(color='lightgrey')

fig.suptitle("Regression plots of Ln(Price) vs Four Independent Variables", size=30, color="Blue")
fig.tight_layout(pad=3)

In [None]:
# Draw a 250-row sample of df_test; the fixed random_state makes the same rows come back on every run.
df_test_sample=df_test.sample(n=250, random_state=123)

In [None]:
# Price vs living space on the sampled rows: coloured by grade, marker size by distance,
# with a white OLS trendline.
fig = px.scatter(
    df_test_sample,
    x='sqft_living',
    y='price',
    color='grade',
    size='distance',
    size_max=20,
    trendline='ols',
    trendline_color_override='white',
    color_continuous_scale=px.colors.sequential.Blackbody_r,
    width=1000,
    height=800,
    labels={
        "price": "Price",
        "sqft_living": "Living Space (sq ft)",
        "grade": "Building Grade",
    },
    title="<b>Correlation: Property Price vs Living Space Footage</b><br><i>Sized by Distance</i>",
    template="plotly_dark",
)

# Thin coral outline around every marker.
fig.update_traces(marker_line_color='coral', marker_line_width=0.5)

# Fonts and centred title in a single layout update.
fig.update_layout(
    font_family="Arial",
    font_size=18,
    font_color="white",
    title_font_family="Arial",
    title_font_color="white",
    title_x=0.5,
    title_y=0.95,
    title_xanchor='center',
    title_yanchor='top',
)

fig.show()

In [None]:
# Price vs number of bathrooms on the sampled rows: coloured by grade, marker size by
# living space, with a white OLS trendline.
fig = px.scatter(
    df_test_sample,
    x='bathrooms',
    y='price',
    color='grade',
    size='sqft_living',
    size_max=20,
    trendline='ols',
    trendline_color_override='white',
    color_continuous_scale=px.colors.sequential.Blackbody_r,
    width=1000,
    height=800,
    labels={
        "price": "Price",
        "bathrooms": "Number of bathrooms",
        "grade": "Building Grade",
    },
    title="<b>Correlation: Property Price vs Number of Bathrooms</b><br><i>Sized by Living Space</i>",
    template="plotly_dark",
)

# Thin coral outline around every marker.
fig.update_traces(marker_line_color='coral', marker_line_width=0.5)

# Fonts and centred title in a single layout update.
fig.update_layout(
    font_family="Arial",
    font_size=18,
    font_color="white",
    title_font_family="Arial",
    title_font_color="white",
    title_x=0.5,
    title_y=0.95,
    title_xanchor='center',
    title_yanchor='top',
)

fig.show()

In [None]:
# Price vs distance on the sampled rows: coloured by grade, marker size by number of
# bathrooms, with a white OLS trendline.
fig = px.scatter(
    df_test_sample,
    x='distance',
    y='price',
    color='grade',
    size='bathrooms',
    size_max=20,
    trendline='ols',
    trendline_color_override='white',
    color_continuous_scale=px.colors.sequential.Blackbody_r,
    width=1000,
    height=800,
    labels={
        "price": "Price",
        "distance": "Distance",
        "grade": "Building Grade",
    },
    title="<b>Correlation: Property Price vs Distance from the City Center</b><br><i>Sized by Number of Bathrooms</i>",
    template="plotly_dark",
)

# Thin coral outline around every marker.
fig.update_traces(marker_line_color='coral', marker_line_width=0.5)

# Fonts and centred title in a single layout update.
fig.update_layout(
    font_family="Arial",
    font_size=18,
    font_color="white",
    title_font_family="Arial",
    title_font_color="white",
    title_x=0.5,
    title_y=0.95,
    title_xanchor='center',
    title_yanchor='top',
)

fig.show()

In [None]:
# Price vs living space on the sampled rows, split by the recent_renovation_new_str flag
# (labelled "Newer(1)/Older(0)"), with an OLS trendline.
# NOTE(review): size_max is passed although no `size` column is given - confirm whether a
# size mapping was intended here.
fig = px.scatter(
    df_test_sample,
    x='sqft_living',
    y='price',
    color='recent_renovation_new_str',
    size_max=20,
    trendline='ols',
    width=1000,
    height=800,
    labels={
        "price": "Price",
        "sqft_living": "Living Space (sq ft)",
        "recent_renovation_new_str": "Newer(1)/Older(0)",
    },
    title="Correlation: Property Price vs Living Space Footage of Newer vs Older Properties",
    template="plotly_dark",
)

# Thin coral outline around every marker.
fig.update_traces(marker_line_color='coral', marker_line_width=0.5)

# Fonts and centred title in a single layout update.
fig.update_layout(
    font_family="Arial",
    font_size=18,
    font_color="white",
    title_font_family="Arial",
    title_font_color="white",
    title_x=0.5,
    title_y=0.95,
    title_xanchor='center',
    title_yanchor='top',
)

fig.show()

In [None]:
# 3D scatter of the sampled rows: bathrooms (x), living space (y) and grade (z),
# coloured by price, with marker size by distance.
fig = px.scatter_3d(
    df_test_sample,
    x='bathrooms',
    y='sqft_living',
    z='grade',
    color='price',
    size='distance',
    size_max=50,
    opacity=1,
    width=1000,
    height=800,
    color_continuous_scale=px.colors.sequential.Blackbody_r,
    labels={
        "bathrooms": "Number of Bathrooms",
        "sqft_living": "Living Space (sq ft)",
        "grade": "Grade",
        "price": "Price",
    },
    title="3D plot: Living Space Footage, Number of Bathrooms and Grade of Sold Properties",
    template="plotly_dark",
)

# Thin coral outline around every marker.
fig.update_traces(marker_line_color='coral', marker_line_width=0.5)

# Fonts and centred title in a single layout update.
fig.update_layout(
    font_family="Arial",
    font_size=16,
    font_color="white",
    title_font_family="Arial",
    title_font_color="white",
    title_x=0.5,
    title_y=0.9,
    title_xanchor='center',
    title_yanchor='top',
)

fig.show()

In [None]:
# Keep only sales priced below 1,500,000 as an independent copy of the raw frame.
df_cut_off = df.loc[df['price'] < 1500000].copy()

# One point per remaining sale on an OpenStreetMap base, coloured by price.
fig = px.scatter_mapbox(
    df_cut_off,
    lat="lat",
    lon="long",
    color="price",
    color_continuous_scale=px.colors.sequential.Plasma,
    mapbox_style='open-street-map',
    zoom=10,
    width=900,
    height=900,
    title="Properties Sold in King County in 2014-2015",
    template="plotly_dark",
)

# Fonts and centred title in a single layout update.
fig.update_layout(
    font_family="Arial",
    font_size=20,
    font_color="white",
    title_font_family="Arial",
    title_font_color="white",
    title_x=0.5,
    title_y=0.98,
    title_xanchor='center',
    title_yanchor='top',
)

fig.show()

In [None]:
# Choropleth of the 'price' column per zipcode.
map_choropleth_zip(
    df=df_zipcode_viz,
    column='price',
    title="Average Prices of Sold Properties per Zipcode (King County, 2014-2015)",
    column_name="Price",
)

In [None]:
# Choropleth of the 'sqft_lot' column per zipcode.
map_choropleth_zip(
    df=df_zipcode_viz,
    column='sqft_lot',
    title="Average Lot Size of Sold Properties per Zipcode (King County, 2014-2015)",
    column_name="Lot size (sq ft)",
)

In [None]:
# Choropleth of the 'sqft_living' column per zipcode.
map_choropleth_zip(
    df=df_zipcode_viz,
    column='sqft_living',
    title="Average Living Space of Sold Properties per Zipcode (King County, 2014-2015)",
    column_name="Living Space (sq ft) ",
)

In [None]:
# Choropleth of the 'view' column per zipcode.
map_choropleth_zip(
    df=df_zipcode_viz,
    column='view',
    title="Average View Category of Sold Properties per Zipcode (King County, 2014-2015)",
    column_name="View Category",
)

In [None]:
# Choropleth of the 'yr_built' column per zipcode.
map_choropleth_zip(
    df=df_zipcode_viz,
    column='yr_built',
    title="Average Year Built of Sold Properties per Zipcode (King County, 2014-2015)",
    column_name="Year Built",
)