<a id="1.1"></a>
<h3 style="background-color:orange;font-family:'Times New Roman',Times,serif;font-size:200%;text-align:center;border-radius: 15px 50px;">Libraries And Utilities</h3>


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as ex
import plotly.graph_objs as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
pyo.init_notebook_mode()
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans,DBSCAN
import plotly.graph_objs as go
def RMSE(Y,Y_HAT):
    """Return the root-mean-squared error between ``Y`` and ``Y_HAT``.

    The squared error is symmetric, so the result does not depend on which
    argument holds the ground truth and which holds the predictions.
    """
    mse = mean_squared_error(Y_HAT, Y)
    return np.sqrt(mse)


# Default size (width, height in inches) for every matplotlib figure created below.
plt.rcParams['figure.figsize'] = (20, 11)

def set_seed(seed=31415):
    """Seed NumPy's global RNG and export the seed-related environment variables.

    Parameters
    ----------
    seed : int, default 31415
        Value passed to ``np.random.seed`` and written (as a string) to
        ``PYTHONHASHSEED``.

    Notes
    -----
    ``TF_DETERMINISTIC_OPS`` is always set to ``'1'``.
    NOTE(review): ``PYTHONHASHSEED`` is read at interpreter start-up, so setting
    it here presumably only affects child processes - confirm if hash
    determinism in this kernel is required.
    """
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })
    np.random.seed(seed)


# Seed once at import time so that a fresh Run All is reproducible.
set_seed()

<a id="1.2"></a>
<h3 style="background-color:orange;font-family:'Times New Roman',Times,serif;font-size:200%;text-align:center;border-radius: 15px 50px;">Data Loading And Assessment</h3>


In [None]:
# Location of the California housing CSV inside the Kaggle input directory.
HOUSING_CSV = '/kaggle/input/california-housing-prices/housing.csv'

p_data = pd.read_csv(HOUSING_CSV)
# Snapshot of the raw frame, taken before any transformation is applied to p_data.
pcopy = p_data.copy()

# The preview must be the last expression of the cell; a bare expression in the
# middle of a cell is evaluated but never rendered.
p_data.head(3)

In [None]:
# Per-column count of missing values, shown as a one-column annotated heatmap.
missing_counts = p_data.isna().sum().to_frame()
ax = sns.heatmap(missing_counts, annot=True, fmt='d')
ax.set_title('Amount Of Missing Values', fontsize=20)
plt.show()

In [None]:
# Restrict the reductions to numeric columns: the frame also holds the string
# column 'ocean_proximity', and median/skew/kurt raise on it in pandas >= 2.0
# (older versions silently dropped it).
numeric_data = p_data.select_dtypes(include='number')

# describe() table extended with median, skewness and kurtosis rows.
info = numeric_data.describe()
info.loc['median'] = numeric_data.median()
info.loc['skew'] = numeric_data.skew()
info.loc['kurtosis'] = numeric_data.kurt()

info

<a id="1.3"></a>
<h3 style="background-color:orange;font-family:'Times New Roman',Times,serif;font-size:200%;text-align:center;border-radius: 15px 50px;">Exploratory Data Analysis</h3>


In [None]:
# Two stacked KDE panels on ONE figure. Calling plt.show() between the panels
# (as the pyplot state-machine version did) finishes the first figure early, so
# each panel ended up alone on its own half-empty figure.
fig, axes = plt.subplots(2, 1)

sns.kdeplot(p_data['housing_median_age'], color='teal', ax=axes[0])
axes[0].set_title('Distribution Of Median age of a house within a block; a lower number is a newer building', fontsize=20)

sns.kdeplot(p_data['median_house_value'], color='teal', ax=axes[1])
axes[1].set_title('Distribution Of Median house value for households within a block (measured in US Dollars)', fontsize=20)

# Keep the long titles from overlapping the neighbouring panel.
fig.tight_layout()
plt.show()

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>We can clearly see that both features follow a multimodal distribution, meaning we have underlying groups in our data.</span></p>
<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>We have no categories in our data set which may indicate the different groups, so later in our analysis we will use clustering to try and divide the house value distribution into what will represent the potential underlying groups.</span></p>

In [None]:
# Two stacked KDE panels on ONE figure; a plt.show() between the panels would
# split them across two separate, half-empty figures.
fig, axes = plt.subplots(2, 1)

sns.kdeplot(p_data['total_rooms'], ax=axes[0])
axes[0].set_title('Distribution Of Room Numbers', fontsize=20)

sns.kdeplot(p_data['total_bedrooms'], ax=axes[1])
axes[1].set_title('Distribution Of Total Bedrooms', fontsize=20)

fig.tight_layout()
plt.show()

In [None]:
# Two stacked KDE panels on ONE figure; a plt.show() between the panels would
# split them across two separate, half-empty figures.
fig, axes = plt.subplots(2, 1)

sns.kdeplot(p_data['population'], color='green', ax=axes[0])
axes[0].set_title('Distribution Of The Total number of people residing within a block', fontsize=20)

sns.kdeplot(p_data['households'], color='green', ax=axes[1])
axes[1].set_title('Distribution Of Total number of households, a group of people residing within a home unit, for a block', fontsize=20)

fig.tight_layout()
plt.show()

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>The 4 features above have a strong positive skew which most likely can be derived from the large gaps between house prices in comparison to the average.</span></p>
<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>We will later transform the data in order to normalize our distribution.</span></p>

In [None]:
# Single-panel KDE of the median_income column.
income_ax = sns.kdeplot(p_data['median_income'], color='teal')
income_ax.set_title(
    'Distribution Of The income for households within a block of houses (measured in tens of thousands of US Dollars)',
    fontsize=20,
)
plt.show()

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>Compared with the 4 features we saw before it, the median income feature (the plot above) has only a slight positive skew and its distribution is fairly normal, which is surprising considering we are dealing with median incomes.</span></p>

In [None]:
# Share of rows per ocean_proximity category; the figure is the cell's last
# expression so the notebook renders it.
ocean_pie = ex.pie(
    p_data,
    names='ocean_proximity',
    title='Proportion of Locations of the house w.r.t ocean/sea',
)
ocean_pie

<a id="1.4"></a>
<h3 style="background-color:orange;font-family:'Times New Roman',Times,serif;font-size:200%;text-align:center;border-radius: 15px 50px;">Data Preprocessing</h3>


In [None]:
# Log-transform the two skewed count columns and plot the result.
#
# The transform reads from `pcopy` (the untouched copy taken right after
# loading) instead of from p_data itself, so re-running this cell does not
# apply the log a second time.
log_panels = {
    'population': 'Normalized Distribution Of The Total number of people residing within a block',
    'households': 'Normalized Distribution Of Total number of households, a group of people residing within a home unit, for a block',
}

# One figure with one panel per column; a plt.show() between panels would split
# them across separate half-empty figures.
fig, axes = plt.subplots(len(log_panels), 1)
for panel, (column, title) in zip(axes, log_panels.items()):
    p_data[column] = np.log(pcopy[column])
    sns.kdeplot(p_data[column], color='green', ax=panel)
    panel.set_title(title, fontsize=20)

fig.tight_layout()
plt.show()

In [None]:
# Log-transform the two skewed room-count columns and plot the result.
#
# The transform reads from `pcopy` (the untouched copy taken right after
# loading) instead of from p_data itself, so re-running this cell does not
# apply the log a second time.
log_panels = {
    'total_rooms': 'Normalized Distribution Of Room Numbers',
    'total_bedrooms': 'Normalized Distribution Of Total Bedrooms',
}

# One figure with one panel per column; a plt.show() between panels would split
# them across separate half-empty figures.
fig, axes = plt.subplots(len(log_panels), 1)
for panel, (column, title) in zip(axes, log_panels.items()):
    p_data[column] = np.log(pcopy[column])
    sns.kdeplot(p_data[column], ax=panel)
    panel.set_title(title, fontsize=20)

fig.tight_layout()
plt.show()

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>Now that we have applied a log transformation to the 4 features we saw above with the high right skewness, we see that we are left with a fairly normal distribution.</span></p>

In [None]:
# One-hot encode ocean_proximity (as floats, independent of the pandas default
# dummy dtype) and drop one category as the reference level.
ocean_prox_vec = pd.get_dummies(p_data['ocean_proximity'], dtype=float).drop(columns=['NEAR BAY'])

N_COMPONENTS = 2

# NOTE: despite the variable name this is a truncated SVD, not an SVM.
# random_state is fixed so that re-running only this cell gives the same
# components, not just a fresh Run All.
SVM_T = TruncatedSVD(n_components=N_COMPONENTS, random_state=31415)
dc_mat = SVM_T.fit_transform(ocean_prox_vec)

desc_ex_var = np.cumsum(SVM_T.explained_variance_ratio_)
component_idx = np.arange(0, len(desc_ex_var))

tr1 = go.Scatter(x=component_idx, y=desc_ex_var, name='Cumulative EV')
tr2 = go.Scatter(x=component_idx, y=SVM_T.explained_variance_ratio_, name='Individual Component Variance')
fig = go.Figure(
    data=[tr1, tr2],
    layout=dict(
        title='Ocean Proximity Explained Variance Ratio Using {} Components'.format(N_COMPONENTS),
        xaxis_title='# Components',
        yaxis_title='Total Variance Explained',
    ),
)

fig.show()

# Column names follow N_COMPONENTS ('Ocean_1', 'Ocean_2', ...) instead of being
# hard-coded, and the frame reuses p_data's index so concat aligns row-for-row.
ocean_cols = ['Ocean_{}'.format(i + 1) for i in range(N_COMPONENTS)]
op_vct = pd.DataFrame(dc_mat, columns=ocean_cols, index=p_data.index)

# Drop any Ocean_* columns left by a previous run of this cell so that
# re-running does not append duplicate columns.
p_data = pd.concat([p_data.drop(columns=ocean_cols, errors='ignore'), op_vct], axis=1)

<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>The ocean proximity feature was encoded into one-hot vectors and reduced to only 2 features which explain 80% of the variance in the feature.</span></p>
<p style="text-align: center;"><span style='font-family: "Times New Roman", Times, serif; font-size: 24px;'>We strive to use the minimum amount of features in our model and it is redundant to use 4 one-hot vectors when only 2 can be used.</span></p>

<a id="1.5"></a>
<h3 style="background-color:orange;font-family:'Times New Roman',Times,serif;font-size:200%;text-align:center;border-radius: 15px 50px;">Correlation Assessment</h3>


In [None]:
# Pearson (row 1) and Spearman (row 2) correlation matrices as stacked heatmaps.
fig = make_subplots(rows=2, cols=1, shared_xaxes=True,
                    subplot_titles=('Pearson Correlation', 'Spearman Correlation'))

# corr() must only see numeric columns: p_data still contains the string column
# 'ocean_proximity', which makes DataFrame.corr raise on pandas >= 2.0.
numeric_data = p_data.select_dtypes(include='number')

for row, method in enumerate(('pearson', 'spearman'), start=1):
    corr = numeric_data.corr(method)
    fig.add_trace(
        go.Heatmap(
            x=corr.columns,
            y=corr.index,
            z=corr.values,
            name=method,
            # Both heatmaps share the fixed [-1, 1] range so the single colour
            # bar (drawn for the second heatmap only) is valid for both.
            zmin=-1,
            zmax=1,
            showscale=(row == 2),
        ),
        row=row, col=1,
    )

fig.update_layout(height=700, width=900, title_text="Pearson And Spearman Correlations Between Features")
fig.show()

In [None]:
# Scatter + fitted regression line of median_house_value against two features,
# stacked on ONE figure. A plt.show() between the panels would split them
# across two separate, half-empty figures.
fig, axes = plt.subplots(2, 1)

for panel, feature in zip(axes, ('median_income', 'Ocean_2')):
    ax = sns.regplot(
        data=p_data,
        x=feature,
        y='median_house_value',
        line_kws=dict(color='red', label='Regression Line'),
        ax=panel,
    )
    ax.legend()

fig.tight_layout()
plt.show()


<a id="1.6"></a>
<h3 style="background-color:orange;font-family:'Times New Roman',Times,serif;font-size:200%;text-align:center;border-radius: 15px 50px;">House Value Clustering</h3>


In [None]:
# Number of KMeans clusters. NOTE: the feature-building cell further down
# assumes exactly three labels (0, 1, 2), so change both together.
N_CLUSTERS = 3
CLUSTER_FEATURES = ['median_income', 'population', 'households', 'total_rooms']

# random_state makes the label assignment reproducible when only this cell is
# re-run; n_init is passed explicitly because its default differs between
# scikit-learn versions.
km_model = KMeans(N_CLUSTERS, n_init=10, random_state=31415)
cp_data = p_data.copy()
km_model.fit(cp_data[CLUSTER_FEATURES])
cp_data['label'] = km_model.labels_

# Layout: a 3-D scatter spanning the full left column, one histogram per
# cluster stacked in the right column.
specs = [[{"type": "scatter3d", "rowspan": N_CLUSTERS}, {"type": "histogram"}]]
specs += [[None, {"type": "histogram"}] for _ in range(N_CLUSTERS - 1)]
fig = make_subplots(rows=N_CLUSTERS, cols=2, specs=specs)

# Distribution of median_house_value inside each cluster.
for label in range(N_CLUSTERS):
    fig.add_trace(
        go.Histogram(
            x=cp_data.loc[cp_data['label'] == label, 'median_house_value'],
            name='label {} median_house_price'.format(label),
        ),
        row=label + 1, col=2,
    )

# Cluster membership in (median_income, population, total_rooms) space.
fig.add_trace(
    go.Scatter3d(
        x=cp_data['median_income'],
        y=cp_data['population'],
        z=cp_data['total_rooms'],
        mode='markers',
        name='Clusters',
        marker=dict(
            color=km_model.labels_,
            colorscale='Viridis',
            opacity=0.8,
        ),
    ),
    row=1, col=1,
)

fig.update_layout(
    scene=dict(
        xaxis_title='Median Income',
        yaxis_title='Population',
        zaxis_title='Total Rooms',
    ),
    title='Clustering Of Location And Distribution Of Cluster Median House Price',
)
fig.show()

<a id="1.7"></a>
<h3 style="background-color:orange;font-family:'Times New Roman',Times,serif;font-size:200%;text-align:center;border-radius: 15px 50px;">XGB Model Evaluation</h3>


In [None]:
# One-hot encode the cluster label with STRING column names ('cluster_0', ...).
# Un-prefixed dummies are named with the integers 0/1/2, and a frame that mixes
# int and str column names is rejected by scikit-learn >= 1.2 estimators.
# The last cluster is dropped as the reference level.
cluster_dummies = pd.get_dummies(cp_data['label'], prefix='cluster', dtype=float).drop(columns=['cluster_2'])

# Remove dummy columns left by a previous run of this cell before concatenating,
# so re-running does not create duplicate columns.
cp_data = pd.concat(
    [cp_data.drop(columns=cluster_dummies.columns, errors='ignore'), cluster_dummies],
    axis=1,
)

feature_cols = ['total_rooms', 'median_income', 'Ocean_2', 'Ocean_1', 'housing_median_age',
                *cluster_dummies.columns]
X = cp_data[feature_cols].copy()
y = cp_data['median_house_value']

train_x, test_x, train_y, test_y = train_test_split(X, y, random_state=42)

In [None]:
def evaluate_xgb(lr,train_x,test_x,train_y,test_y):
    """Fit one scaled XGBoost pipeline per learning rate and score it on the test split.

    Parameters
    ----------
    lr : iterable of float
        Learning rates to try, one model per value.
    train_x, train_y
        Features and target used to fit each pipeline.
    test_x, test_y
        Features and target used to compute the score.

    Returns
    -------
    list of float
        Test-set RMSE for each entry of ``lr``, in the same order.
    """
    scores = []
    for rate in lr:
        xgb_model = Pipeline(steps=[
            ('scale', StandardScaler()),
            # `verbosity=0` is the XGBoost constructor switch for silencing
            # output; `verbose` is not a model parameter and only triggers an
            # "unused parameter" warning.
            ('xgb', XGBRegressor(random_state=42, verbosity=0, learning_rate=rate)),
        ])
        xgb_model.fit(train_x, train_y)
        # Argument order follows RMSE(Y, Y_HAT): ground truth first.
        scores.append(RMSE(test_y, xgb_model.predict(test_x)))
    return scores


In [None]:
# Test-set RMSE for each candidate learning rate, in the order listed.
xgb_scores = evaluate_xgb(
    lr=[0.09, 0.08, 0.07, 0.05, 0.03],
    train_x=train_x,
    test_x=test_x,
    train_y=train_y,
    test_y=test_y,
)

In [None]:
# Tick labels for the x axis: the learning rates in the order they were scored.
learning_rates = [0.09, 0.08, 0.07, 0.05, 0.03]
# Guard against the labels drifting out of sync with the scores being plotted.
assert len(learning_rates) == len(xgb_scores), 'one tick label is needed per score'

# Scores are plotted at evenly spaced positions (derived from the data, not a
# hard-coded count) and then relabelled with the actual learning rates.
positions = np.arange(len(xgb_scores))

ax = sns.lineplot(x=positions, y=xgb_scores)
ax.set_title('Different Learning Rates XGB Model RMSE', fontsize=20)
ax.set_xlabel('Learning Rate')
ax.set_ylabel("RMSE")
ax.set_xticks(positions)
ax.set_xticklabels(learning_rates)
plt.show()

In [None]:
# Final pipeline: standardise the features, then XGBoost with learning_rate=0.09.
xgb_model = Pipeline(steps=[
    ('scale', StandardScaler()),
    # `verbosity=0` is the XGBoost constructor switch for silencing output;
    # `verbose` is not a model parameter and only triggers an
    # "unused parameter" warning.
    ('xgb', XGBRegressor(random_state=42, verbosity=0, learning_rate=0.09)),
])

# NOTE(review): the model is fitted on ALL rows and then predicts those same
# rows, so `xgb_prediction` / `output` are in-sample values - keep that in mind
# when reading the residual plots built from them.
xgb_model.fit(X, y)
xgb_prediction = xgb_model.predict(X)
output = pd.DataFrame({'Actual': y, 'Prediction': xgb_prediction})

In [None]:
# NOTE(review): seaborn's residplot appears to fit its own regression of y on x
# and plot those residuals, rather than plotting (y - x) directly - confirm
# this is the intended diagnostic.
resid_ax = sns.residplot(x=xgb_prediction, y=y)
resid_ax.set_title('Current Model Residual Plot', fontsize=20)
plt.show()

<p style="text-align: center;"><span style='font-size: 24px; font-family: "Times New Roman", Times, serif;'>With the current layout, besides having high values of RMSE, when looking at the residual plot of our predictions we can observe heteroskedasticity.</span></p>
<p style="text-align: center;"><span style='font-size: 24px; font-family: "Times New Roman", Times, serif;'>The variance of our residuals isn&apos;t randomly distributed and we can see a certain behavior.</span></p>

In [None]:
# Residuals (prediction minus actual), computed once and reused below.
residuals = output["Prediction"] - output["Actual"]
mean_residual = residuals.mean()
sample_idx = np.arange(0, len(output))

# Layout: a table spanning the full left column; actual values, predictions and
# residuals as three stacked scatter panels in the right column.
fig = make_subplots(
    rows=3, cols=2, subplot_titles=('', 'Actual', 'Predictions', 'Residuals'),
    vertical_spacing=0.09,
    specs=[[{"type": "table", "rowspan": 3}, {"type": "scatter"}],
           [None,                            {"type": "scatter"}],
           [None,                            {"type": "scatter"}]],
)

for row, series in enumerate((output["Actual"], output["Prediction"], residuals), start=1):
    fig.add_trace(
        go.Scatter(x=sample_idx, y=series, mode="markers"),
        row=row, col=2,
    )

fig.add_trace(
    go.Table(
        header=dict(
            # Header labels come from the same column order as the cell values;
            # a hand-written ['Prediction', 'Actual'] header mislabels the
            # columns because `output` is ordered Actual, Prediction.
            values=list(output.columns),
            font=dict(size=10),
            align="left",
        ),
        cells=dict(
            values=[output[k].tolist() for k in output.columns],
            align="left",
        ),
    ),
    row=1, col=1,
)

# Horizontal line at the mean residual, drawn on the third scatter panel
# (axes x3 / y3).
fig.add_shape(
    type="line",
    x0=0, y0=mean_residual, x1=len(output), y1=mean_residual,
    line=dict(
        color="Red",
        width=2,
        dash="dashdot",
    ),
    name='Mean',
    xref='x3',
    yref='y3',
)

fig.update_layout(
    height=800,
    showlegend=False,
    title_text="Prediction Evaluation",
)

fig.show()

<a id="1.8"></a>
<h3 style="background-color:orange;font-family:'Times New Roman',Times,serif;font-size:200%;text-align:center;border-radius: 15px 50px;">Future Directions</h3>


<ol>
    <li><span style='font-size: 24px; font-family: "Times New Roman", Times, serif;'> Evaluate different models (XGB was used in the following kernel by the task requirement)</span></li>
    <li><span style="font-family: 'Times New Roman', Times, serif;"><span style="font-size: 24px;">In-depth clustering (it is clear that there are underlying groups in our data; can we successfully extract a proxy for those groups?)</span></span></li>
    <li><span style='font-size: 24px; font-family: "Times New Roman", Times, serif;'><span style="color: rgb(0, 0, 0); font-style: normal; font-variant-ligatures: normal; font-variant-caps: normal; font-weight: 400; letter-spacing: normal; orphans: 2; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: 2; word-spacing: 0px; -webkit-text-stroke-width: 0px; text-decoration-style: initial; text-decoration-color: initial; float: none; display: inline !important;">Consider model blending as a way to cover the different behaviors in our data and create an overall stable predictor (minimizing the patterns seen in the residual plot [ heteroskedasticity])</span> </span></li>
</ol>