In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression  
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the housing table and drop the 'ocean_proximity' column; the
# remaining columns are the features once the target has been popped off.
X = pd.read_csv('../input/california-housing-prices/housing.csv').drop(
    columns=['ocean_proximity']
)
y = X.pop('median_house_value')

# Percentage of missing entries in each feature column.
miss_value = {
    column: int(X[column].isna().sum()) / len(X) * 100
    for column in X.columns
}
miss_value

There are missing values in "total_bedrooms", amounting to about 1% of the column. We can therefore replace the missing values with the column's median value.

In [None]:
# Fill the gaps in 'total_bedrooms' with that column's median.
bedrooms = X['total_bedrooms']
X['total_bedrooms'] = bedrooms.fillna(bedrooms.median())

In [None]:
# Split first, then standardize. The mean/std must come from the training
# partition only; computing them on the full dataset lets test-set
# statistics leak into the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_mean, X_std = X_train.mean(axis=0), X_train.std(axis=0)
y_mean, y_std = y_train.mean(axis=0), y_train.std(axis=0)

# Apply the training statistics to both partitions. The target is scaled as
# well, so errors computed on y_test are in units of the training std.
# X and y themselves are left untouched, which keeps this cell idempotent
# when it is re-run.
X_train, X_test = (X_train - X_mean) / X_std, (X_test - X_mean) / X_std
y_train, y_test = (y_train - y_mean) / y_std, (y_test - y_mean) / y_std


In [None]:
def validate(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training split and print its test-set metrics.

    Parameters
    ----------
    X_train, y_train
        Features and target passed to ``model.fit``.
    X_test, y_test
        Features passed to ``model.predict`` and the true targets the
        predictions are scored against.
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place.

    Prints the model's class name, the mean squared error and the R^2 score.
    Returns ``None``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(type(model).__name__ + ': ')
    # scikit-learn metrics take (y_true, y_pred). MSE is symmetric, but R^2
    # is not: with the arguments swapped it is normalized by the variance of
    # the predictions instead of the variance of the true targets.
    print('MSE =', mean_squared_error(y_test, y_pred))
    print('R^2 score =', r2_score(y_test, y_pred))

In [None]:
# The tree-based estimators are stochastic; seed them so the printed scores
# are reproducible from run to run. LinearRegression and SVR take no seed.
lm = LinearRegression()
dtm = DecisionTreeRegressor(random_state=42)
rfm = RandomForestRegressor(random_state=42)
svr = SVR()

# Fit and score every model on the same train/test partition.
models = [rfm, dtm, lm, svr]
for model in models:
    validate(X_train, X_test, y_train, y_test, model)
    print('*'*8)