In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# Read the rental listings CSV into the working DataFrame.
df = pd.read_csv('/kaggle/input/brasilian-houses-to-rent/houses_to_rent_v2.csv')
# Any results you write to the current directory are saved as output.

In [None]:
# Summarise the freshly loaded frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# List the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Correlation of every numeric column with the rent target.  The frame still
# contains text columns at this point, and pandas >= 2.0 raises when corr()
# is handed non-numeric data, so restrict the frame to numeric dtypes first.
df.select_dtypes(include='number').corr()['rent amount (R$)']

In [None]:
# Show the HOA fee next to the total amount for a side-by-side look.
df.loc[:, ['hoa (R$)','total (R$)']]

In [None]:
# Optional pairwise scatter-plot overview of the frame; left disabled in this run.
# sns.pairplot(df)

# 1 Data Cleaning and Feature Engineering

In [None]:
# Re-inspect column dtypes and non-null counts before cleaning starts.
df.info()

In [None]:
# Heatmap of pairwise correlations between the numeric columns.  Text columns
# are excluded explicitly because pandas >= 2.0 no longer drops them silently
# inside corr() and raises instead.
plt.figure(figsize=(15,10))
sns.heatmap(df.select_dtypes(include='number').corr(),cmap='coolwarm')

In [None]:
# Rank the numeric columns by their correlation with rent, strongest first.
# Only numeric dtypes are passed to corr(); pandas >= 2.0 raises on text columns.
df.select_dtypes(include='number').corr()['rent amount (R$)'].sort_values(ascending=False)

> Fire insurance has a very strong positive correlation with rent amount, which suggests that its value is computed in proportion to the rent. As such, we should consider removing this feature from the data set.
On another note, it also makes sense to exclude total, property tax, and hoa from the data set: like fire insurance, they are computed at a point when the rent is already known. Retaining these features would only cause data leakage.

In [None]:
# Remove the fee columns that are only known once the rent is set, so they
# cannot leak the target into the features.
df = df.drop(columns=['total (R$)','property tax (R$)','hoa (R$)','fire insurance (R$)'])

In [None]:
# Print the distinct values held by every text-typed column.
for col in df.select_dtypes(include='object').columns:
    print(col, df[col].unique())

> Based on the observed unique values of the feature `floor`, '1' and '-' seem to indicate the same thing. However, a better inference would be that `floor` indicates on which floor of an apartment/condominium building the unit is located. As such, it would make sense to add another column identifying whether the property being rented is an apartment unit or not. Another interpretation could be to treat '-' as a null value, which would then be imputed and paired with an is-null indicator column. For the sake of having a baseline, let's first encode the '-' value as 0, observe performance, and revisit the above-mentioned ideas later on.

In [None]:
def _floor_placeholder_to_zero(value):
    """Return 0 for the '-' placeholder and pass any other value through."""
    if value == '-':
        return 0
    return value


# Baseline encoding: treat the '-' marker as floor 0, then cast the column to int.
df['floor'] = df['floor'].map(_floor_placeholder_to_zero).astype(int)

In [None]:
# Display the frame after the cleaning steps above.
df

# 2 Feature Engineering

> Below are some feature engineering ideas that would also be interesting to explore apart from the usual OneHotEncoding.

In [None]:
# Average area per "division": bathrooms + rooms + 1 (the extra one stands for
# the common/living area).
division_count = df['bathroom'] + df['rooms'] + 1
df['avg_area_per_division'] = df['area'].div(division_count)

# Bathrooms per room.
# NOTE(review): this yields inf/NaN if any listing has rooms == 0 — confirm the data never does.
df['bathroom_room_ratio'] = df['bathroom'].div(df['rooms'])

# Count how many of three extras a listing offers: parking, furniture, pets allowed.
has_parking = df['parking spaces'].gt(0).astype(int)
is_furnished = df['furniture'].eq('furnished').astype(int)
accepts_animals = df['animal'].eq('acept').astype(int)
df['unique_inclusions_cnt'] = has_parking + is_furnished + accepts_animals

In [None]:
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


# Dense one-hot encoder that tolerates categories unseen during fit.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the new spelling first and fall back to the
# old one on releases that do not know it yet.
try:
    onehot = OneHotEncoder(sparse_output=False,handle_unknown='ignore')
except TypeError:
    onehot = OneHotEncoder(sparse=False,handle_unknown='ignore')
scaler = PowerTransformer()
sd = StandardScaler()


# Separate the predictors from the rent target.
X = df.drop(columns=['rent amount (R$)'])
y = df['rent amount (R$)']

# Hold out 30% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=100)

In [None]:
# Names of the text-typed (categorical) predictor columns.
cat_cols = list(X_train.select_dtypes(include='object').columns)

In [None]:
# Names of the numeric predictor columns: integer-typed first, then float-typed.
num_cols = [
    *X_train.select_dtypes(include='int').columns,
    *X_train.select_dtypes(include='float').columns,
]

In [None]:
def _build_column_transformer(numeric_transformer):
    """One-hot encode cat_cols and apply numeric_transformer to num_cols."""
    return ColumnTransformer([
        ('onehot', onehot, cat_cols),
        ('scaler', numeric_transformer, num_cols),
    ])


# Two preprocessing variants that differ only in how numeric columns are scaled.
col_transform_sd = _build_column_transformer(sd)
col_transform_pt = _build_column_transformer(scaler)

# 3 Baseline Model & Scaler Comparison

In [None]:
# Baseline: StandardScaler-based preprocessing followed by ordinary least squares.
baseline = Pipeline([
    ('preprocess', col_transform_sd),
    ('model', LinearRegression()),
])

In [None]:
# 5-fold cross-validated R² of the baseline pipeline on the training split,
# keeping the in-fold training scores for comparison.
base_score = cross_validate(
    baseline, X_train, y_train,
    cv=5, scoring='r2', return_train_score=True,
)
for label, key in (("Train Score: ", 'train_score'), ("Test Score: ", 'test_score')):
    print(label, np.mean(base_score[key]))

In [None]:
# Same linear model, but with PowerTransformer-based numeric preprocessing.
power_tranform = Pipeline([
    ('preprocess', col_transform_pt),
    ('model', LinearRegression()),
])

In [None]:
# 5-fold cross-validated R² of the PowerTransformer pipeline on the training split.
pt_score = cross_validate(
    power_tranform, X_train, y_train,
    cv=5, scoring='r2', return_train_score=True,
)
mean_train_r2 = np.mean(pt_score['train_score'])
mean_test_r2 = np.mean(pt_score['test_score'])
print("Train Score: ", mean_train_r2)
print("Test Score: ", mean_test_r2)

> It is clear that applying a power transformation (PowerTransformer) brings the numerical features closer to normal distributions, which are highly compatible with LinearRegression models. Apart from this, we can also observe that with StandardScaler the model overfits. As such, we shall be using PowerTransformer moving forward.

# 4 Feature Interactions & Improvements

In [None]:
# Build one combined-label column ("<a>_<b>") for every unordered pair of
# distinct categorical features.  combinations() yields each pair exactly once
# and never pairs a column with itself; appending after a nested loop instead
# would only ever pair each column with the last categorical column.  The
# comprehension also keeps its loop variables local, so the target `y` is not
# overwritten.
from itertools import combinations

add_cat = [
    pd.Series(df[left] + '_' + df[right], name=left + '_' + right)
    for left, right in combinations(cat_cols, 2)
]

# pd.concat raises on an empty list, so fall back to an empty frame that
# shares df's index when fewer than two categorical columns exist.
if add_cat:
    cat_interactions = pd.concat(add_cat,axis=1)
else:
    cat_interactions = pd.DataFrame(index=df.index)

In [None]:
# Append the categorical interaction columns to the working frame.
df_interactions = pd.concat((df, cat_interactions), axis='columns')

# Target and predictors from the extended frame.
y = df_interactions['rent amount (R$)']
X = df_interactions.drop('rent amount (R$)', axis=1)

# Same 70/30 split and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=100
)

In [None]:
# Refresh the column lists and rebuild the ColumnTransformer for the new split.
num_cols = [
    *X_train.select_dtypes(include='int').columns,
    *X_train.select_dtypes(include='float').columns,
]
cat_cols = list(X_train.select_dtypes(include='object').columns)

col_transform_pt = ColumnTransformer([
    ('onehot', onehot, cat_cols),
    ('scaler', scaler, num_cols),
])

power_tranform = Pipeline([
    ('preprocess', col_transform_pt),
    ('model', LinearRegression()),
])

In [None]:
# Cross-validate the pipeline that includes the categorical interaction columns.
pt_score = cross_validate(
    power_tranform, X_train, y_train,
    cv=5, scoring='r2', return_train_score=True,
)
for label, key in (("Train Score: ", 'train_score'), ("Test Score: ", 'test_score')):
    print(label, np.mean(pt_score[key]))

> Slight Improvement on Train and Test scores. However, the model overfits slightly as well.

> Now let's try feature interactions on numerical features via PolynomialFeatures to see if there is improvement on the model

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Interaction-only polynomial expansion for the numeric columns.
num_inter = PolynomialFeatures(interaction_only=True)

# Go back to the frame without categorical interaction columns.
y = df['rent amount (R$)']
X = df.drop('rent amount (R$)', axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=100
)

In [None]:
# Column lists for the current split: integer-typed then float-typed, and text-typed.
num_cols = [
    *X_train.select_dtypes(include='int').columns,
    *X_train.select_dtypes(include='float').columns,
]
cat_cols = list(X_train.select_dtypes(include='object').columns)

# Numeric branch: interaction terms first, then the power transform.
num_preprocess = Pipeline([
    ('interactions', num_inter),
    ('scaler', scaler),
])

col_transform_pt = ColumnTransformer([
    ('onehot', onehot, cat_cols),
    ('num_preprocess', num_preprocess, num_cols),
])

power_tranform = Pipeline([
    ('preprocess', col_transform_pt),
    ('model', LinearRegression()),
])

In [None]:
# Cross-validate the pipeline with numeric interaction terms.
pt_score = cross_validate(
    power_tranform, X_train, y_train,
    cv=5, scoring='r2', return_train_score=True,
)
mean_train_r2 = np.mean(pt_score['train_score'])
mean_test_r2 = np.mean(pt_score['test_score'])
print("Train Score: ", mean_train_r2)
print("Test Score: ", mean_test_r2)

> There is a sizeable increase on performance with Numerical Feature Interactions. Let's see what happens when we incorporate both numerical and categorical.

In [None]:
# Switch back to the frame that carries the categorical interaction columns.
y = df_interactions['rent amount (R$)']
X = df_interactions.drop('rent amount (R$)', axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=100
)

In [None]:
# Column lists for the split that includes categorical interaction columns.
num_cols = [
    *X_train.select_dtypes(include='int').columns,
    *X_train.select_dtypes(include='float').columns,
]
cat_cols = list(X_train.select_dtypes(include='object').columns)

# Numeric branch: interaction terms followed by the power transform.
num_preprocess = Pipeline([
    ('interactions', num_inter),
    ('scaler', scaler),
])

# Full preprocessing: one-hot for text columns, numeric branch for the rest.
col_transform_pt = ColumnTransformer([
    ('onehot', onehot, cat_cols),
    ('num_preprocess', num_preprocess, num_cols),
])

power_tranform = Pipeline([
    ('preprocess', col_transform_pt),
    ('model', LinearRegression()),
])

In [None]:
# Cross-validate the pipeline that combines numeric and categorical interactions.
pt_score = cross_validate(
    power_tranform, X_train, y_train,
    cv=5, scoring='r2', return_train_score=True,
)
for label, key in (("Train Score: ", 'train_score'), ("Test Score: ", 'test_score')):
    print(label, np.mean(pt_score[key]))

> Very minute improvement. We can get away with not including Categorical Feature Interactions but for now let's just stick with it as if we are fighting for every point of improvement.

# 5 Validation

In [None]:
# Fit the final pipeline on the full training split, then score it on both the
# training rows and the held-out test rows.
power_tranform.fit(X_train, y_train)
train_preds, test_preds = (
    power_tranform.predict(features) for features in (X_train, X_test)
)

train_r2 = r2_score(y_train, train_preds)
test_r2 = r2_score(y_test, test_preds)
print("Train Score :", train_r2)
print("Test Score :", test_r2)