# Predicting median house value

## Load

In [None]:
from sklearn import datasets
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

In [None]:
# Notebook-wide plot defaults: ggplot style sheet and a 12 x 8 figure size.
plt.style.use('ggplot')
mpl.rcParams.update({'figure.figsize': (12, 8)})

In [None]:
# Load the California housing data. With as_frame=True the returned bunch
# carries a pandas DataFrame under "frame", which is what the rest of the
# notebook works on.
ca_housing = datasets.fetch_california_housing(as_frame=True)
df = ca_housing.frame

In [None]:
# Quick look at the first five rows.
df.head(n=5)

## Clean

In [None]:
# Distribution of HouseAge before any filtering.
sns.histplot(df['HouseAge'])

In [None]:
# For this walkthrough we only care about newer homes, so keep the rows
# whose HouseAge is at most 30.
df = df.loc[df['HouseAge'] <= 30]

In [None]:
# Distribution of AveBedrms after the HouseAge filter.
sns.histplot(x=df['AveBedrms'])

In [None]:
# Same column as a box plot, which makes the extreme values easier to spot.
sns.boxplot(x=df['AveBedrms'])

In [None]:
# Also drop the big houses: keep only rows with AveBedrms of at most 4.
df = df.loc[df['AveBedrms'] <= 4]

In [None]:
# Distribution of the target variable (MedHouseVal) on the filtered data.
sns.histplot(df['MedHouseVal'])

## Train/test split

In [None]:
from sklearn.model_selection import train_test_split  # noqa

In [None]:
# The target is MedHouseVal; the features are every remaining column.
y = df['MedHouseVal']
X = df.drop(columns='MedHouseVal')

In [None]:
# Hold out 33% of the rows for testing. The fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Linear regression

In [None]:
from sklearn.linear_model import LinearRegression  # noqa

In [None]:
# Linear regression model, constructed with its default settings.
lr = LinearRegression()

In [None]:
# Fit the linear model on the training portion only.
lr.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out test rows.
y_pred = lr.predict(X=X_test)

In [None]:
# Actual test values (x) against the model's predictions (y); a perfect model
# would put every point on the diagonal.
sns.scatterplot(x=y_test, y=y_pred)

## Random forest regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor  # noqa

In [None]:
# Fit a random forest on the same training split and plot actual test values
# against its predictions.
# random_state is pinned to 42, the same seed used for train_test_split, so
# the fitted forest (and therefore this plot) is reproducible between runs;
# an unseeded forest gives different results on every execution.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
# NOTE: this rebinds y_pred, replacing the linear-regression predictions.
y_pred = rf.predict(X_test)
sns.scatterplot(x=y_test, y=y_pred)