In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

#### Loading dataset

In [None]:
# Read the Kaggle CSV of rental listings into a DataFrame.
df = pd.read_csv('/kaggle/input/brasilian-houses-to-rent/houses_to_rent_v2.csv')

# Map the original headers (with spaces and the "(R$)" suffix) to
# snake_case names that are easier to use in code and formulas.
new_column_names = {
    'hoa (R$)': 'hoa',
    'rent amount (R$)': 'rent_amount',
    'property tax (R$)': 'property_tax',
    'fire insurance (R$)': 'fire_insurance',
    'total (R$)': 'total',
    'parking spaces': 'parking_spaces',
}

df = df.rename(columns=new_column_names)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# Per-column dtypes of the loaded table.
df.dtypes

In [None]:
# Keep `df` untouched for the exploratory plots; all cleaning happens on a copy.
df_copy = df.copy()

# Replace the '-' placeholder in 'floor' with 0 and cast the column to int
# (presumably '-' marks listings without a floor number -- confirm against
# the dataset description).
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on `df_copy['floor']`: that form is chained
# assignment, which warns on pandas 2.x and does not modify `df_copy` at all
# when Copy-on-Write is enabled.
df_copy['floor'] = df_copy['floor'].replace('-', 0).astype('int')

#### What is the most popular city among Brazilians?

In [None]:
# Fraction of listings per city. The denominator is taken from the frame
# itself instead of a hard-coded row count, so the shares stay correct if the
# dataset is filtered or updated.
city_share = df.groupby('city')['city'].count() / len(df)
city_share.plot(kind='pie', autopct='%.2f', fontsize=11);

#### What is the average area of a house in each city

In [None]:
# Mean listing area per city, one bar per city.
avg_area_by_city = df.groupby('city')['area'].mean()
avg_area_by_city.plot(
    kind='bar',
    color=['r', 'g', 'b', 'y', 'c'],
    ylabel='Average area',
    title='Average area in a city',
    fontsize=12,
);

#### Maximum number of rooms of a house in each city

In [None]:
# Largest room count observed in each city, as horizontal bars.
max_rooms_by_city = df.groupby('city')['rooms'].max()
ax = max_rooms_by_city.plot(
    kind='barh',
    color=['r', 'g', 'b', 'y', 'c'],
    title='Maximum number of rooms of a house in each city',
    fontsize=12,
)
# Label both axes explicitly: how `xlabel=` is applied to a barh plot differs
# between pandas versions, so passing 'City' there can end up on the value axis.
ax.set(xlabel='Maximum number of rooms', ylabel='City');

#### Average number of rooms of a house in each city

In [None]:
# Mean room count per city, as horizontal bars.
avg_rooms_by_city = df.groupby('city')['rooms'].mean()
ax = avg_rooms_by_city.plot(
    kind='barh',
    color=['r', 'g', 'b', 'y', 'c'],
    title='Average number of rooms of a house in each city',
    fontsize=12,
)
# Label both axes explicitly: how `xlabel=` is applied to a barh plot differs
# between pandas versions, so passing 'City' there can end up on the value axis.
ax.set(xlabel='Average number of rooms', ylabel='City');

#### Maximum number of bathrooms of a house in each city

In [None]:
# Largest bathroom count observed in each city, as horizontal bars.
max_bathrooms_by_city = df.groupby('city')['bathroom'].max()
ax = max_bathrooms_by_city.plot(
    kind='barh',
    color=['r', 'g', 'b', 'y', 'c'],
    title='Maximum number of bathrooms of a house in each city',
    fontsize=12,
)
# Label both axes explicitly: how `xlabel=` is applied to a barh plot differs
# between pandas versions, so passing 'City' there can end up on the value axis.
ax.set(xlabel='Maximum number of bathrooms', ylabel='City');

#### Average number of bathrooms of a house in each city

In [None]:
# Mean bathroom count per city, as horizontal bars.
avg_bathrooms_by_city = df.groupby('city')['bathroom'].mean()
ax = avg_bathrooms_by_city.plot(
    kind='barh',
    color=['r', 'g', 'b', 'y', 'c'],
    title='Average number of bathrooms of a house in each city',
    fontsize=12,
)
# Label both axes explicitly: how `xlabel=` is applied to a barh plot differs
# between pandas versions, so passing 'City' there can end up on the value axis.
ax.set(xlabel='Average number of bathrooms', ylabel='City');

#### Average number of floors of a house in each city

In [None]:
# Mean of the cleaned integer 'floor' column per city, as horizontal bars.
# Uses df_copy because 'floor' in df still holds the raw, non-numeric values.
avg_floor_by_city = df_copy.groupby('city')['floor'].mean()
ax = avg_floor_by_city.plot(
    kind='barh',
    color=['r', 'g', 'b', 'y', 'c'],
    title='Average number of floors of a house in each city',
    fontsize=12,
)
# Label both axes explicitly: how `xlabel=` is applied to a barh plot differs
# between pandas versions, so passing 'City' there can end up on the value axis.
ax.set(xlabel='Average floor', ylabel='City');

#### It is evident that most of the houses accept animals and most of the houses are not furnished

In [None]:
# Side-by-side listing counts per city, split by pet policy (left) and by
# furnishing (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

count_panels = [
    ('animal', sns.color_palette()),
    ('furniture', 'Set1'),
]
for ax, (hue_column, palette) in zip(axes, count_panels):
    sns.countplot(x='city', hue=hue_column, data=df, palette=palette, ax=ax)

#### It is also evident that 60 percent of all houses are not furnished and do accept animals

In [None]:
# Contingency table: rows are 'animal' values, columns are 'furniture' values,
# cells are listing counts.
animal_furniture_counts = df.groupby(['animal', 'furniture']).size().unstack()
sns.heatmap(animal_furniture_counts, annot=True, fmt="d");

#### fire insurance relationship with rent amount

In [None]:
# Fire insurance against rent amount, coloured by furnishing (left) and by
# pet policy (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, hue_column in zip(axes, ("furniture", "animal")):
    sns.scatterplot(data=df, y='fire_insurance', x='rent_amount', hue=hue_column, ax=ax)

#### rent amount relationship with area -> log-transformed dependency
- It can be seen from the plots that there are several outliers in our dataset

In [None]:
# log(area) against log(rent_amount), coloured by furnishing (left) and by
# pet policy (right). The logs are computed once and shared by both panels.
fig, axes = plt.subplots(1, 2, figsize=(16, 5))

log_area = np.log(df['area'])
log_rent = np.log(df['rent_amount'])

for ax, hue_column in zip(axes, ("furniture", "animal")):
    sns.scatterplot(data=df, y=log_area, x=log_rent, hue=hue_column, ax=ax)

#### 46335 square meters !!!!

In [None]:
# The ten largest 'area' values, in ascending order.
areas_ascending = df_copy['area'].sort_values()
areas_ascending.tail(10).values

### Does rental housing differ from city to city
#### Hoa, Fire Insurance, and Property Tax impact on Rent Amount

In [None]:
# 2x2 grid of per-city averages; the layout of `list_of_metrics` mirrors the
# layout of the axes grid.
fig, axes = plt.subplots(2, 2, figsize=(18, 8), sharex=True)
list_of_metrics = [['rent_amount', 'hoa'], ['property_tax', 'fire_insurance']]


def bar_plot_func(ax, metric):
    """Draw the per-city mean of column `metric` of `df` as a bar chart on `ax`."""
    metric_name = str(metric)
    label = 'Average ' + metric_name
    city_means = df.groupby('city')[metric_name].mean()
    city_means.plot(
        kind='bar',
        color=['r', 'g', 'b', 'y', 'c'],
        ylabel=label,
        title=label + ' in a city',
        fontsize=10,
        ax=ax,
    )


for row_idx, row_metrics in enumerate(list_of_metrics):
    for col_idx, metric in enumerate(row_metrics):
        bar_plot_func(axes[row_idx, col_idx], metric)

#### Boxplot of Rent amount by (Parking spaces, Bathroom, Room, City)
- Does housing rent increase when the # of bathrooms increases?
- Does housing rent increase when the # of rooms increases?
- Does housing rent increase when the # of parking spaces increases?

In [None]:
# Grouping columns for the 2x2 grid; the nesting mirrors the axes layout.
list_of_metrics = [['parking_spaces', 'rooms'], ['bathroom', 'city']]


def box_plot_func(ax, metric):
    """Draw boxplots of `rent_amount` grouped by column `metric` of `df` on `ax`."""
    group_name = str(metric)
    df.boxplot(column='rent_amount', by=group_name, fontsize=12, ax=ax)
    ax.set(
        title="Boxplot of Rent amount by " + group_name,
        xlabel=group_name,
        ylabel='Rent amount',
    )
    # DataFrame.boxplot(by=...) adds a figure-level title; blank it out.
    plt.suptitle("")


fig, axes = plt.subplots(2, 2, figsize=(16, 13))

for row_idx, row_metrics in enumerate(list_of_metrics):
    for col_idx, metric in enumerate(row_metrics):
        box_plot_func(axes[row_idx, col_idx], metric)

#### Data types

In [None]:
# Per-column dtypes of the cleaned copy.
df_copy.dtypes

#### Check for Nan values

In [None]:
# Number of missing values in each column of the cleaned copy.
df_copy.isna().sum()

#### Train/Test split

In [None]:
from sklearn.model_selection import train_test_split

# Drop 'total' from the modelling frame (NOTE(review): presumably because it
# is a sum that includes the rent and would leak the target -- confirm).
# `errors='ignore'` keeps the cell re-runnable: a plain `del df_copy['total']`
# raises KeyError the second time the cell is executed.
df_copy = df_copy.drop(columns='total', errors='ignore')

# Features are every remaining column except the target; the target is the rent.
x, y = df_copy.drop('rent_amount', axis=1), df_copy['rent_amount']

# A fixed seed makes the split, and every score computed from it further down,
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(f'X_train shape: {x_train.shape}, X_test shape: {x_test.shape}')

#### Data Transformation

In [None]:
from sklearn.preprocessing import LabelEncoder

# Text columns that are turned into integer codes.
categorical_columns = ['city', 'animal', 'furniture']

# Take explicit copies so the assignments below cannot write into (or warn
# about) frames derived from `x`.
x_train = x_train.copy()
x_test = x_test.copy()

# One fitted encoder per column, kept so codes can be mapped back to labels.
label_encoders = {}

for column in categorical_columns:
    # Fit on the training split only, then reuse the same mapping on the test
    # split so both share identical codes.
    le = LabelEncoder()
    # Assign with `frame[column] = ...` rather than `frame.loc[:, column] = ...`:
    # the `.loc` form writes into the existing column in place, which on
    # pandas 2.x leaves the column with object dtype instead of integers.
    x_train[column] = le.fit_transform(x_train[column])
    x_test[column] = le.transform(x_test[column])
    label_encoders[column] = le

In [None]:
# Pairwise scatter plots (and per-column distributions) of the training features.
sns.pairplot(x_train);

In [None]:
# Regress the rent on every feature column.
# The response cannot be taken from `x_train.columns`: the target
# (rent_amount) was split off into `y_train`, so the last column of x_train is
# just another feature. `y_train` is referenced by variable name instead.
# NOTE(review): this relies on the formula machinery resolving names that are
# not columns of `data` in the calling namespace -- confirm for the installed
# statsmodels version.
formula_str = 'y_train ~ ' + ' + '.join(x_train.columns)
formula_str

In [None]:
import statsmodels.formula.api as sm

# Build an ordinary least squares model from the formula string, using the
# training features as the data source for the formula's column names.
lm = sm.ols(formula=formula_str,data=x_train)

# Estimate the coefficients; `fitted` exposes summary(), fittedvalues and
# residuals used in the following cells.
fitted = lm.fit()

In [None]:
# Text report of the fitted OLS model.
print(fitted.summary())

The Durbin Watson (DW) statistic is a test for autocorrelation in the residuals from a statistical regression analysis. A value of <b> 2.0 means that there is no autocorrelation detected in the sample</b>.

Not all p-values are significant, so we should remove the non-significant features from the model and test it again. A non-significant p-value indicates that there is insufficient evidence in the sample to conclude that the corresponding coefficient is different from zero.

In [None]:
# Residuals against fitted values: a structureless band around zero is what a
# well-specified linear model with constant variance would produce.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(x=fitted.fittedvalues, y=fitted.resid, edgecolor='k')

# Zero reference line across the full width of the axes. Scaling the data
# limits (min * 0.9, max * 1.1) shortens the line instead of extending it
# whenever a fitted value is negative, so axhline is used instead.
ax.axhline(y=0, color='red', linestyle='--', lw=3)

ax.set_xlabel("Fitted values", fontsize=15)
ax.set_ylabel("Residuals", fontsize=15)
ax.set_title("Fitted vs. residuals plot", fontsize=18)
ax.grid(True)
plt.show()

<b>violation of the constant variance assumption - Heteroscedasticity </b>

In [None]:
# Distribution of the Pearson (normalized) residuals of the OLS fit.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(fitted.resid_pearson, bins=20, edgecolor='k')
ax.set_ylabel('Count', fontsize=15)
ax.set_xlabel('Normalized residuals', fontsize=15)
ax.set_title("Histogram of normalized residuals", fontsize=18)
plt.show()

#### It can be inferred from the above results that we need to change our baseline model

### Regression models are sensitive to feature scaling

In [None]:
from numpy import mean, absolute
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import HuberRegressor, LinearRegression
from sklearn.preprocessing import PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.compose import TransformedTargetRegressor

# Inputs are power-transformed before being passed to the Huber regressor.
pipeline = Pipeline(steps=[('power', PowerTransformer()), ('model', HuberRegressor())])
# The target is power-transformed for fitting and mapped back for predictions.
model = TransformedTargetRegressor(regressor=pipeline, transformer=PowerTransformer())

# 10-fold cross-validation on the training split only.
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_validate(model, x_train, y_train, scoring=['neg_root_mean_squared_error', 'r2'], cv=cv, n_jobs=-1)

# The scorer returns the *negated* RMSE, so flip the sign to report it.
mean_rmse = -mean(scores['test_neg_root_mean_squared_error'])
# Kept for backward compatibility with the old variable name; this value has
# always been an RMSE, not an MAE.
mean_mae = mean_rmse
# R2 can legitimately be negative (worse than predicting the mean); taking the
# absolute value would make such a fold look good, so average the raw scores.
mean_r2 = mean(scores['test_r2'])

print(f'Mean RMSE: {mean_rmse:.3f}, R2: {mean_r2:.3f}')

### Automatic Outlier Detection
Our baseline model: decision tree regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def dart(*args, random_state=0):
    """Fit a decision tree regressor and print its hold-out metrics.

    Parameters
    ----------
    *args
        Exactly four positional values, in order:
        ``x_train, x_test, y_train, y_test``.
    random_state : int, default 0
        Seed for the tree. Fixing it means that differences between runs on
        differently filtered training sets come from the data, not from the
        tree's internal randomness.

    Raises
    ------
    ValueError
        If the number of positional arguments is not four.
    """
    x_train, x_test, y_train, y_test = args
    model = DecisionTreeRegressor(random_state=random_state)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Report MAE and RMSE as separate, standard metrics rather than the square
    # root of the MAE, which is not on a comparable scale to either.
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred) ** .5
    print(f'MAE: {mae:.3f}, RMSE: {rmse:.3f}, R-squared: {r2_score(y_test, y_pred):.3f}')

dart(x_train, x_test, y_train, y_test)

#### Isolation Forest -> resulted in better performance

In [None]:
from sklearn.ensemble import IsolationForest

# Flag outliers in the training features. The seed is fixed so the same rows
# are flagged on every run; without it the filtered training set, and the
# score printed below, change between executions.
iso = IsolationForest(contamination=0.0001, random_state=42)
yhat = iso.fit_predict(x_train)
# select all rows that are not outliers (fit_predict labels outliers as -1)
mask = yhat != -1
X_train, Y_train = x_train[mask], y_train[mask]
# Evaluate on the untouched test split.
dart(X_train, x_test, Y_train, y_test)

#### Minimum Covariance Determinant -> resulted in even better performance:)
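- MCD (scikit-learn's `EllipticEnvelope`) assumes the inputs are approximately Gaussian. The central limit theorem applies to sample means, not to the raw features, so this is an assumption we make here and it should be checked against the feature distributions.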

In [None]:
from sklearn.covariance import EllipticEnvelope

# Minimum Covariance Determinant outlier detection. This section is about
# EllipticEnvelope, so that is the estimator that must be instantiated here
# (not a second IsolationForest). The seed is fixed so the same rows are
# flagged on every run.
ee = EllipticEnvelope(contamination=0.001, random_state=42)
yhat = ee.fit_predict(x_train)
# select all rows that are not outliers (fit_predict labels outliers as -1)
mask = yhat != -1
X_train, Y_train = x_train[mask], y_train[mask]
# Evaluate on the untouched test split.
dart(X_train, x_test, Y_train, y_test)

#### Local Outlier Factor: resulted in a better performance

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Density-based outlier detection on the training features.
lof = LocalOutlierFactor(contamination=0.001)
yhat = lof.fit_predict(x_train)

# Rows labelled -1 are outliers; keep everything else.
mask = yhat != -1
X_train = x_train[mask]
Y_train = y_train[mask]

# Score the baseline tree trained on the filtered rows against the test split.
dart(X_train, x_test, Y_train, y_test)

#### I stick with Local Outlier Factor because it has a better performance overall.