# Predict housing price in Brazil
## *Comparing 8 regression algorithms*
*This dataset contains 10962 houses to rent with 13 different features.*

![housing brazil](https://i.imgur.com/wOKxor1.png)




# Table of contents

[<h3>1. Data Analysis & Data Preprocessing</h3>](#1)

[<h3>2. Model comparison</h3>](#2)

[<h3>3. Prediction metrics of the best model using the test set</h3>](#3)

[<h3>4. Visualization of the result</h3>](#4)



# Load the libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from time import perf_counter
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display

def printmd(string):
    """Render `string` as Markdown in the cell output instead of plain text."""
    rendered = Markdown(string)
    display(rendered)

import warnings
# Silence every warning raised from here on to keep the cell outputs clean.
# NOTE(review): this hides all warning categories, deprecation warnings
# included, so problems with newer library versions will not be reported.
warnings.filterwarnings(action='ignore')


# 1. Data Analysis & Data Preprocessing<a class="anchor" id="1"></a>

In [None]:
# Load the rental listings and preview the first rows
csv_path = '../input/brasilian-houses-to-rent/houses_to_rent_v2.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Box plot of the total price over the whole dataset, outliers included.
# A single Series is plotted, so no grouping argument is passed.
df['total (R$)'].plot.box(color='#ff8c8e')
plt.title('Display all the price\nInclusive outliers')
plt.show()

In [None]:
# The box plot shows strong outliers. Filter them out and keep only the
# listings whose total price is lower than or equal to 20,000 R$.
price_mask = df['total (R$)'].le(20000)
df = df[price_mask]

In [None]:
# Box plot of the total price for the listings currently in `df`.
# A single Series is plotted, so no grouping argument is passed.
df['total (R$)'].plot.box(color='#ff8c8e')
plt.title('Display the price\n(max 20.000 R$)')
plt.show()

In [None]:
# Three views of the total price distribution, one per subplot:
# histogram, box plot and violin plot.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

# A single Series is plotted, so no grouping argument is passed.
df['total (R$)'].plot.hist(ax=axes[0], color='#ff8c8e')
axes[0].set_title('total (R$)\'s histogram\n', fontsize=15)

df['total (R$)'].plot.box(ax=axes[1])
axes[1].set_title('total (R$)\'s Boxplot\n', fontsize=15)

sns.violinplot(ax=axes[2], y='total (R$)', data=df, color='#ff8c8e')
axes[2].set_title('total (R$)\'s distribution\n(violinplot)', fontsize=15)

plt.show()

In [None]:
# Box plot of the area over the listings currently in `df`, outliers included.
# A single Series is plotted, so no grouping argument is passed.
df['area'].plot.box(color='#ff8c8e')
plt.title('Display all the areas\nInclusive outliers')
plt.show()

In [None]:
# The box plot shows strong outliers. Filter them out and keep only the
# listings whose area is lower than or equal to 500.
area_mask = df['area'].le(500)
df = df[area_mask]

In [None]:
# Box plot of the area for the listings currently in `df`.
# A single Series is plotted, so no grouping argument is passed.
df['area'].plot.box(color='#ff8c8e')
plt.title('Display the areas\n(max 500)')
plt.show()

In [None]:
# Three views of the area distribution, one per subplot:
# histogram, box plot and violin plot.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

# A single Series is plotted, so no grouping argument is passed.
df['area'].plot.hist(ax=axes[0])
axes[0].set_title('Area\'s histogram\n', fontsize=15)

df['area'].plot.box(ax=axes[1])
axes[1].set_title('Area\'s Boxplot\n', fontsize=15)

sns.violinplot(ax=axes[2], y='area', data=df)
axes[2].set_title('Area\'s distribution\n(violinplot)', fontsize=15)

plt.show()

In [None]:
# Report how many listings are currently in `df`
n_rows = len(df)
printmd(f'### Number of rows in the dataset: {n_rows}')

In [None]:
# Columns to display: every column except 'area' and 'floor'
cols = [name for name in df.columns if name not in ('area', 'floor')]

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 10),
                        subplot_kw={'xticks': [], 'yticks': []})

# One box plot per subplot: the total price grouped by the values of the
# column, taking the columns in order until the six subplots are filled.
for ax, col in zip(axes.flat, cols):
    sns.boxplot(x=col, y='total (R$)', data=df, ax=ax)
    ax.set_title(f"Boxplot: Price distribution depending on the {col.capitalize()}")
plt.tight_layout()
plt.show()

In [None]:
# List the column names currently present in the DataFrame
df.columns

In [None]:
# Only the total price is kept as a price column, so drop the
# individual price components.
cols_drop = ['hoa (R$)', 'rent amount (R$)', 'property tax (R$)', 'fire insurance (R$)']
df = df.drop(columns=cols_drop)

# One-hot encode the columns that contain strings
cols_dummies = ['city', 'animal', 'furniture']
df = pd.get_dummies(df, columns=cols_dummies)

# Show the first five rows of the result
df.head()

In [None]:
# A missing floor is written as "-" in the dataset: replace it with 0
no_floor = df['floor'] == '-'
df.loc[no_floor, 'floor'] = 0

# Convert the "floor" column to the integer data type
df = df.astype({'floor': 'int64'})

In [None]:
def preprocessing(df, target='total (R$)'):
    """Shuffle the dataset and split it into features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset holding the feature columns and the `target` column.
        It is not modified.
    target : str, default 'total (R$)'
        Name of the column to predict.

    Returns
    -------
    X : pandas.DataFrame
        Every column of `df` except `target`, with the rows shuffled and
        the index reset to 0..n-1.
    y : pandas.Series
        The `target` column, in the same shuffled row order as `X`.

    Raises
    ------
    KeyError
        If `target` is not a column of `df`.
    """
    # Shuffle with a fixed seed so the row order is reproducible.
    # sample() returns a new frame, so the caller's `df` stays untouched.
    shuffled = df.sample(frac=1.0, random_state=0).reset_index(drop=True)

    X = shuffled.drop(columns=target)
    y = shuffled[target]

    return X, y

# Build the shuffled feature matrix and the target vector
X, y = preprocessing(df)

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Standardise the features. The scaler is fitted on the training set only
# and the same transformation is then applied to both sets.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Show the first two training rows and their targets
X_train[:2], y_train[:2]

# 2. Model comparison<a class="anchor" id="2"></a>

In [None]:
# Candidate regressors, each stored under its name. The estimators that use
# randomness get a fixed seed so the comparison is reproducible between runs.
models = {
    "LinearRegression": {"model": LinearRegression()},
    "Lasso": {"model": Lasso()},
    "Ridge": {"model": Ridge()},
    "DecisionTreeRegressor": {"model": DecisionTreeRegressor(random_state=0)},
    "RandomForestRegressor": {"model": RandomForestRegressor(random_state=0)},
    "MLPRegressor": {"model": MLPRegressor(random_state=0)},
    "GradientBoostingRegressor": {"model": GradientBoostingRegressor(random_state=0)},
    "AdaBoostRegressor": {"model": AdaBoostRegressor(random_state=0)}
}

# Use K-fold cross validation on the training set for each model
# to get the mean validation RMSE and the mean training time
k = 5
for name, m in models.items():
    # Cross validation of the model
    model = m['model']
    result = cross_validate(model, X_train, y_train, cv=k, scoring='neg_mean_squared_error')

    # The scores are negated mean squared errors: flip the sign and take the
    # square root of each fold to get its RMSE, then average over the folds.
    fold_rmse = np.sqrt(-result['test_score'])
    mean_RMSE = int(fold_rmse.mean())
    mean_fit_time = round(result['fit_time'].mean(), 4)

    # Add the result to the dictionary with the models
    m['mean_RMSE'] = mean_RMSE
    m['Training time (sec)'] = mean_fit_time

    # Display the result
    print(f"{name:27} mean RMSE for {k}-fold CV: {mean_RMSE} - mean training time {mean_fit_time} sec")

In [None]:
# Collect one row per model: name, mean RMSE and mean training time
models_result = [
    [name, v['mean_RMSE'], v['Training time (sec)']]
    for name, v in models.items()
]

# Build the table and order it from the lowest to the highest RMSE
df_results = (
    pd.DataFrame(models_result, columns=['model', 'RMSE', 'Training time (sec)'])
    .sort_values(by='RMSE', ascending=True)
    .reset_index(drop=True)
)
df_results

In [None]:
# Bar chart of the mean cross-validated RMSE, one bar per model
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(x='model', y='RMSE', data=df_results, ax=ax)
ax.set_title(f'{k}-fold mean RMSE for each Model\nSmaller is better', fontsize=15)
ax.set_xlabel('Model', fontsize=15)
ax.set_ylabel('RMSE', fontsize=15)
# Rotate the model names so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=90, fontsize=12)
plt.show()

In [None]:
# Bar chart of the mean training time, one bar per model
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(x='model', y='Training time (sec)', data=df_results, ax=ax)
ax.set_title('Training time for each Model in sec\nSmaller is better', fontsize=15)
# Rotate the model names so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=90, fontsize=12)
ax.set_xlabel('Model', fontsize=15)
ax.set_ylabel('Training time (sec)', fontsize=15)
plt.show()

# 3. Prediction metrics of the best model using the test set<a class="anchor" id="3"></a>

In [None]:
# Get the model with the lowest mean validation RMSE: the first row of
# the results table, which is sorted by RMSE in ascending order.
best_model = df_results.iloc[0]

# Read the row by column label; position-based access on a labelled
# Series is deprecated in recent pandas versions.
best_name = best_model['model']
best_fit_time = best_model['Training time (sec)']

# Fit the model on the whole training set
model = models[best_name]['model']
model.fit(X_train, y_train)

# Predict the prices of the test set
pred = model.predict(X_test)

# Root mean squared error on the test set, truncated to an integer
RMSE = mean_squared_error(y_test, pred)**0.5
RMSE = int(RMSE)

# Display the results. The reported time is the mean fit time measured
# during cross validation, not the duration of the fit above.
printmd(f'### Best Model: {best_name} with a RMSE of {RMSE} on the test set')
printmd(f'### Trained in: {best_fit_time} sec')

In [None]:
# Put the true values of the test set next to the predictions.
# The column labels say "Rating", but the values are the 'total (R$)' prices.
y_test_s = y_test.reset_index(drop=True)
pred_s = pd.Series(pred)

df_result = pd.concat([y_test_s, pred_s.round(0)], axis=1)
df_result.columns = ['Real Rating', 'Predicted Rating']

# The predictions were rounded above, so they can be stored as integers
df_result = df_result.astype({'Predicted Rating': 'int64'})
df_result.head(10)

In [None]:
# Distribution of the real values next to the distribution of the predictions
box_ax = df_result.plot.box()
box_ax.set_title('Boxplot Real Rating VS Predicted Rating', fontsize=12)
plt.show()

# Each test-set row as a point; the low alpha makes dense regions visible
scatter_ax = df_result.plot.scatter(x='Real Rating', y='Predicted Rating', alpha=0.1)
scatter_ax.set_title('Scatterplot Real Rating VS Predicted Rating', fontsize=12)
plt.show()

![Brazil Rio de Janeiro](https://i.imgur.com/phixAyv.png)