# Housing Dataset: regression models

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, GridSearchCV, StratifiedKFold
from datetime import datetime
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides library deprecation notices — consider narrowing the filter.
warnings.filterwarnings('ignore')

## About the data

The data pertains to the houses found in a given California district and some summary stats about them based on the 1990 census data.

We can see that all variables are numerical, continuous and on different scales, except for the ocean_proximity variable, which is a string and therefore categorical.

In [None]:
# Read the housing table from the Kaggle input directory and preview the first rows.
csv_path = '../input/california-housing-prices/housing.csv'
df = pd.read_csv(csv_path)
df.head()

## Data types and null values

Only total_bedrooms has null values. We could simply drop these rows or fill them with the mean or median value, but in order to be more precise, I will fill these values using a regression model.

In [None]:
# Summarise the frame (per-column dtype and non-null count) to spot columns with missing values.
df.info()

Let's check the ocean_proximity variable

In [None]:
# Count how many rows fall into each ocean_proximity category.
df['ocean_proximity'].value_counts()

Let's transform this column into five dummy variables, then drop the ISLAND column, because it is 0 for all but 5 of the 20640 rows.

In [None]:
# One indicator column per ocean_proximity category, appended in place of the original string column.
proximity_dummies = pd.get_dummies(df['ocean_proximity'])
df = pd.concat([df.drop(columns='ocean_proximity'), proximity_dummies], axis=1)

# ISLAND is set for only a handful of rows, so the indicator is dropped.
df = df.drop(columns='ISLAND')

I decided to handle the null values in the total_bedrooms column by predicting them from the other features. First, we take the rows where total_bedrooms is null; these are the rows whose value we need to predict.

In [None]:
# Rows whose total_bedrooms is missing, without that column: the inputs for the imputation model.
missing_bedrooms = df['total_bedrooms'].isnull()
to_predict = df.loc[missing_bedrooms].drop(columns='total_bedrooms')

In [None]:
# Complete rows only: these are the examples the imputation model learns from.
train = df.dropna()

# Target is the column we want to impute; every other column is a feature.
y = train['total_bedrooms']

# Standardise the features; the fitted scaler is kept so the same transform can be reused.
scaler = StandardScaler()
X = scaler.fit_transform(train.drop(columns='total_bedrooms'))

# 70/30 hold-out split used to compare candidate imputation models.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=44
)

Trying three algorithms, with default parameters, Linear Regression, Gradient Boosting Regression and Random Forest Regression, we can see that Gradient Boosting has a lower Root Mean Squared Error and a better R^2 score, so we will predict the missing total_bedrooms values through this model.

In [None]:
# Candidate models for imputing total_bedrooms, as (label, estimator) pairs.
# The stochastic ensembles are seeded so the comparison below is reproducible
# across runs (the seed matches the one used for the hold-out split).
models = [('Linear Regresion', LinearRegression()),
          ('Gradient Boosting', GradientBoostingRegressor(random_state=44)),
          ('Random Forest', RandomForestRegressor(random_state=44))]

# Fit each candidate on the training split and report R2 / RMSE on the hold-out split.
for name, reg in models:
    reg.fit(X_train, y_train)
    pred = reg.predict(X_test)
    print(name)
    print('R2: ', r2_score(y_test, pred))
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, pred)))
    print('-'*30)

Now we can impute the predicted values to fill the null ones

In [None]:
# Refit the selected model on every complete row (X is the standardised feature matrix).
reg = GradientBoostingRegressor(random_state=44)  # seeded so the imputed values are reproducible
reg.fit(X, y)

# The model was trained on features standardised by `scaler`, so the rows to impute
# must go through the same fitted transform before prediction; feeding the raw
# frame would put every feature on the wrong scale.
filled_values = reg.predict(scaler.transform(to_predict))

# Attach the predictions, then stitch the imputed rows back in with the complete ones
# and restore the original row order.
to_predict['total_bedrooms'] = filled_values
df = pd.concat([df.dropna(), to_predict]).reindex(df.index)
df.info()

Now we can go ahead in order to find the best model for predicting median_house_value, so we will try: Linear Regression, KNN Regressor, Support Vector Regressor, Decision Tree and Random Forest.

I will evaluate the model using these metrics:
- R2 score
- Root Mean Squared Error
- Time for computation

In [None]:
# Target is the median house value; every remaining column is a feature.
y = df['median_house_value']
X = df.drop(columns='median_house_value')

# 70/30 hold-out split shared by the scaled models below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=22
)

# Shuffled 10-fold cross-validation splitter.
kfold = KFold(n_splits=10, shuffle=True, random_state=99)

# Fit the scaler on the training split only, then apply the same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Linear Regression

In [None]:
reg = LinearRegression()

# Time fitting and prediction together.
start = datetime.now()
pred = reg.fit(X_train, y_train).predict(X_test)
stop = datetime.now()
delta = stop - start
seconds = delta.seconds + delta.microseconds/1E6

# Hold-out metrics.
r2 = r2_score(y_test, pred)
err = np.sqrt(mean_squared_error(y_test, pred))

print('Linear Regression\n')
print('R2: ', r2)
print('Root Mean Squared Error: ', err)
print('Time to compute: ', seconds, 'seconds')

# Row for the final comparison table: (model, R2, RMSE, seconds).
linear_reg = ('Linear Regression', r2, err, seconds)

## K-Nearest Neighbors Regressor

In [None]:
# Search grid for KNN. Each list holds only the selected value; the wider grid
# that was explored is given in the trailing comment.
params = dict(
    n_neighbors=[9],       # explored [3,4,5,6,7,8,9,10,11,12]: number of neighbor points to consider
    weights=['distance'],  # explored ['uniform', 'distance']: weight function used in prediction
    p=[1],                 # explored [1,2]: p=1 is manhattan distance, p=2 is euclidean distance
)

# 10-fold grid search scored by negative mean squared error.
rs = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=params,
    cv=10,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
)
rs.fit(X_train, y_train)
knn = rs.best_estimator_
print(knn)

# Time fitting and prediction of the selected estimator together.
start = datetime.now()
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
stop = datetime.now()
delta = stop - start
seconds = delta.seconds + delta.microseconds/1E6

# Hold-out metrics.
r2 = r2_score(y_test, pred)
err = np.sqrt(mean_squared_error(y_test, pred))

print('-'*30)
print('R2: ', r2)
print('Root Mean Squared Error: ', err)
print('Time to compute: ', seconds, 'seconds')

# Row for the final comparison table: (model, R2, RMSE, seconds).
knn_reg = ('KNN', r2, err, seconds)

## Support Vector Regressor

Due to the high computation time, I decided to run the model on a balanced subsample of the dataset in order to find the best hyperparameters. I have commented that code out so that the notebook runs faster. The hyperparameters used below are the ones obtained with the commented-out lines of code.

In [None]:
# Hyperparameter search, kept for reference and disabled to keep the notebook fast.
# It runs on roughly one third of the rows (one stratified fold).
# NOTE(review): the earlier version of this snippet assigned the scaled test fold to
# X_train_fold; it is written here with the test fold transformed by the scaler
# fitted on the training fold.
#
#skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=22)  #Stratified k-fold object
#
#train_index, test_index = next(skf.split(X, y))  #indexes of 1/3 of the original dataset rows
#
#X_fold = X.iloc[test_index]
#y_fold = y.iloc[test_index]
#
#scaler_fold = StandardScaler()
#
#X_train_fold, X_test_fold, y_train_fold, y_test_fold = train_test_split(X_fold, y_fold, test_size=0.3, random_state=22)
#
#X_train_fold = scaler_fold.fit_transform(X_train_fold)
#X_test_fold = scaler_fold.transform(X_test_fold)
#
#params = {'C': [100],         #'C': [0.1,1,10,100]
#         'gamma': [1],     #'gamma': [0.01,0.1,1,10]
#         'kernel': ['linear'],
#        }
#
#svr = SVR()
#rs = GridSearchCV(estimator=svr, param_grid=params, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
#rs.fit(X_train_fold, y_train_fold)
#print(rs.best_estimator_)
#svr = rs.best_estimator_

# Hyperparameters taken from the search above.
# NOTE(review): gamma is presumably ignored by the linear kernel — confirm against the SVR docs.
svr = SVR(kernel='linear', C=100, gamma=1)

# Time fitting and prediction together.
start = datetime.now()
svr.fit(X_train, y_train)
pred = svr.predict(X_test)
stop = datetime.now()
delta = stop - start
seconds = delta.seconds + delta.microseconds/1E6

# Hold-out metrics.
r2 = r2_score(y_test, pred)
err = np.sqrt(mean_squared_error(y_test, pred))

print('-'*30)
print('R2: ', r2)
print('Root Mean Squared Error: ', err)
print('Time to compute: ', seconds, 'seconds')

# Row for the final comparison table: (model, R2, RMSE, seconds).
support_vector_reg = ('SVR', r2, err, seconds)

## Decision Tree Regressor

Tree-based algorithms don't need scaling, so we will build the training and test sets without scaling them.

In [None]:
# Same 70/30 split as the scaled models (same seed), but on the unscaled features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=22) #getting not scaled data

# Search grid for the tree. Each list holds the selected value; the wider grid
# that was explored is given in the trailing comment.
# - max_features: None (all features) replaces the legacy 'auto' alias, which meant
#   the same thing for regression trees but is rejected by current scikit-learn.
# - criterion is left at its default (mean squared error); the legacy name 'mse'
#   is rejected by current scikit-learn, while the default works on every version.
params = {'max_depth': [7], #np.linspace(1, 10, 10) The maximum depth of the tree
          'max_features': [None, 'sqrt'], #The number of features to consider when looking for the best split
          'min_samples_leaf': [7], #[3,4,5,6,7,8] The minimum number of samples required to be at a leaf node
          'min_samples_split': [0.1], #np.linspace(0.1, 1.0, 10) The minimum number of samples required to split an internal node
         }

# Seeded so that feature sub-sampling and tie-breaking between splits are reproducible.
tree = DecisionTreeRegressor(random_state=22)
rs = GridSearchCV(estimator=tree, param_grid=params, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
rs.fit(X_train, y_train)
print(rs.best_estimator_)

# Time fitting and prediction of the selected estimator together.
tree = rs.best_estimator_
start = datetime.now()
tree.fit(X_train, y_train)
pred = tree.predict(X_test)
stop = datetime.now()
delta = stop - start

print('-'*30)
r2 = r2_score(y_test, pred)
print('R2: ', r2)
err = np.sqrt(mean_squared_error(y_test, pred))
print('Root Mean Squared Error: ', err)
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

# Row for the final comparison table: (model, R2, RMSE, seconds).
decision_tree = ('Tree', r2, err, seconds)

## Random Forest Regressor

Just as done for SVR, due to high computation time I decided to run the model on a balanced subsample of the dataset in order to find the best hyperparameters

In [None]:
# Take one stratified fold (about a third of the rows) as a cheaper subsample
# for the hyperparameter search.
skf = StratifiedKFold(n_splits=3,shuffle=True, random_state=22)
train_index, test_index = next(skf.split(X, y))  #obtaining indexes of 1/3 of original dataset rows

X_fold = X.iloc[test_index]
y_fold = y.iloc[test_index]

X_train_fold, X_test_fold, y_train_fold, y_test_fold = train_test_split(X_fold, y_fold, test_size=0.3, random_state=22)

# Evaluation split: seeded with the same random_state as every other model in this
# notebook, so the forest is scored on the same hold-out rows and the results table
# compares like with like (and is reproducible).
# NOTE(review): the search subsample is drawn from all of X, so it overlaps these
# hold-out rows — consider drawing it from X_train only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=22)

# Search grid. None (all features) replaces the legacy 'auto' alias, which meant the
# same thing for regression forests but is rejected by current scikit-learn.
params = {
          'n_estimators': [1000],
          'max_depth': [8],                  #'max_depth': [4,5,6,7,8,9]
          'max_features': [None, 'sqrt'],    #'max_features': [None, 'sqrt']
          'min_samples_leaf': [4],           #'min_samples_leaf': [2,3,4,5,6,7]
          'min_samples_split' : [0.01],      #'min_samples_split' : [0.01]
         }

# Seeded so that bootstrap sampling and feature sub-sampling are reproducible.
rf = RandomForestRegressor(random_state=22)
rs = GridSearchCV(estimator=rf, param_grid=params, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
rs.fit(X_train_fold, y_train_fold)
print(rs.best_estimator_)

# Refit the selected configuration on the full training split, timing fit + predict.
rf = rs.best_estimator_
start = datetime.now()
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
stop = datetime.now()
delta = stop - start

print('-'*30)
r2 = r2_score(y_test, pred)
print('R2: ', r2)
err = np.sqrt(mean_squared_error(y_test, pred))
print('Root Mean Squared Error: ', err)
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

# Row for the final comparison table: (model, R2, RMSE, seconds).
random_forest = ('Random Forest', r2, err, seconds)

## Results: which is the best model?

As can be seen from the table below, K-nearest neighbors turned out to be the best model for this dataset because of:
- highest R^2 score
- lowest root mean squared error

Considering computation time, we can see that the K-nearest neighbors algorithm is 15 times slower than the Decision tree and 50 times slower than Linear regression, but its performance in terms of R2 and error is considerably better than that of these two models. In situations with really high-dimensional data, simpler models may be preferred.

In [None]:
# Collect one (model, R2, RMSE, seconds) row per model and show them best R2 first.
df_results = pd.DataFrame(
    [
        linear_reg,
        knn_reg,
        support_vector_reg,
        decision_tree,
        random_forest,
    ],
    columns=['model', 'R2', 'RMSE', 'comp_time'],
)
df_results.sort_values(by='R2', ascending=False)