In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load both CSV files shipped in the "brasilian-houses-to-rent" input folder.
# NOTE(review): only data2 (the *_v2 file) is used by the cells that follow in this
# view; data1 appears to be unused — confirm before removing.
data1=pd.read_csv("../input/brasilian-houses-to-rent/houses_to_rent.csv")
data2=pd.read_csv("../input/brasilian-houses-to-rent/houses_to_rent_v2.csv")

In [None]:
# Work on a copy so later edits to `data` never touch the frame read from disk (data2).
data=data2.copy()

In [None]:
 # Show the working frame as the cell output (pandas truncates long frames in its repr).
 data

# Exploratory Data Analysis (EDA)

1. Shape

In [None]:
# Unpack (rows, columns) once and report both in a single sentence.
n_rows, n_cols = data.shape
print("the data has {} rows and {} columns in dataset".format(n_rows, n_cols))

In [None]:
# Number of columns in the dataset
len(data.columns)

Basic data types in the dataset

In [None]:
# Column names, non-null counts and dtypes of the working frame.
data.info()


Basic description

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

Checking for null values in the dataset

In [None]:
# True when at least one column contains a missing value.
has_nulls = data.isnull().sum().any()
print("checking null values in the dataset if any:{}".format(has_nulls))

In [None]:
# Missing-value count per column (`isna` is the same method as `isnull`).
data.isna().sum()

City

In [None]:
# Distinct city names present in the dataset
data['city'].unique()

In [None]:
# Number of listings per city.
# The column is passed by keyword: seaborn >= 0.12 interprets the first positional
# argument as `data` rather than `x`, and 0.11 warns about positional use.
sns.countplot(x=data['city'])

Histogram

In [None]:
# City labels in order of first appearance; these feed the `cities` list used below.
pd.unique(data['city'])

In [None]:
# Cities to iterate over in the per-city plots, one name per line.
cities = [
    'São Paulo',
    'Porto Alegre',
    'Rio de Janeiro',
    'Campinas',
    'Belo Horizonte',
]

In [None]:
# One rent-amount distribution per city on a 2x3 grid; 'São Paulo' is skipped here.
plt.figure(figsize=(18, 8))

rent_col = 'rent amount (R$)'
other_cities = [name for name in cities if name != 'São Paulo']

for panel, city in enumerate(other_cities, start=1):
    plt.subplot(2, 3, panel)
    plt.title(city)
    city_name = data.loc[data['city'] == city]
    city_rent = city_name[rent_col]
    sns.distplot(city_rent)
    # Ticks every 2000 starting at this city's lowest rent
    plt.xticks(np.arange(city_rent.min(), city_rent.max(), step=2000))

plt.tight_layout()
plt.show()


Rent amount (R$) - Analysis

Histogram

In [None]:
# Distribution of the rent amount over the whole dataset.
plt.figure(figsize=(12, 6))
rent = data['rent amount (R$)']
sns.distplot(rent)
# Ticks every 3000 from the lowest rent; the trailing ';' hides the xticks return value
tick_positions = np.arange(rent.min(), rent.max(), step=3000)
plt.xticks(tick_positions);

Boxplot

In [None]:
# Horizontal boxplot of the rent amount over the whole dataset.
plt.figure(figsize=(10, 7))

rent = data['rent amount (R$)']
# Keyword `x=` keeps the box horizontal on every seaborn version; seaborn >= 0.12
# would read a positional Series as `data` instead.
sns.boxplot(x=rent)
plt.xticks(np.arange(rent.min(), rent.max(), step=3000))

plt.show()

Boxplot

In [None]:
# One horizontal rent boxplot per city on a 2x3 grid.
plt.figure(figsize=(16, 8))

for i, city in enumerate(cities, start=1):
    # The first city in `cities` gets a 5000 tick spacing, every later panel 2000.
    # NOTE(review): presumably because the first city has the widest rent range — confirm.
    step = 5000 if i == 1 else 2000

    plt.subplot(2, 3, i)
    plt.title(city)
    city_name = data.loc[data['city'] == city]
    city_rent = city_name['rent amount (R$)']
    # Keyword `x=`: seaborn >= 0.12 reads a positional Series as `data`, not `x`.
    sns.boxplot(x=city_rent)
    plt.xticks(np.arange(city_rent.min(), city_rent.max(), step=step))

plt.tight_layout()
plt.show()


Correlations

In [None]:
# Pairwise correlation matrix of the numeric columns, drawn as an annotated heatmap.
# `select_dtypes` is the public way to pick numeric columns (instead of the private
# `_get_numeric_data`).
numData = data.select_dtypes(include='number')
var_num_corr = numData.corr()

# Colour scale fixed to [-1, 1] so that 0 correlation sits at the centre of the
# diverging palette. `linewidths` is heatmap's documented name for the cell-border width.
sns.heatmap(var_num_corr, vmin=-1, vmax=1, annot=True, linewidths=0.01, linecolor='black', cmap='RdBu_r')

In [None]:
# Correlation of every numeric column with the rent amount, rounded to 3 decimals.
var_num_corr.loc[:, 'rent amount (R$)'].round(3)

Analysis of important features

rooms

In [None]:
# Mean rent amount for each room count
sns.barplot(data=data, x='rooms', y='rent amount (R$)')

In [None]:
# Spread of the room count across listings
sns.boxplot(data=data, x='rooms')

Parking spaces

In [None]:
# Mean rent amount for each number of parking spaces
sns.barplot(data=data, x='parking spaces', y='rent amount (R$)')

In [None]:
# Spread of the parking-space count across listings
sns.boxplot(data=data, x='parking spaces')

Fire insurance

In [None]:
# Scatter of fire insurance against rent with a fitted regression line drawn in red
sns.regplot(
    data=data,
    x='fire insurance (R$)',
    y='rent amount (R$)',
    line_kws={'color': 'r'},
)

Furniture

In [None]:
# Listings per furniture category, shown as a one-column table
furniture = data['furniture'].value_counts()
furniture.to_frame()

# ML models

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Rent amount (R$) with outliers

In [None]:
# Rent distribution per city, outliers included.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally, so two
# positional Series would fail there.
sns.boxplot(x=data['city'], y=data['rent amount (R$)'])

Select quantiles

In [None]:
# Rent amounts grouped by city; the per-city quantiles below are computed from this.
city_group = data.groupby('city')['rent amount (R$)']

In [None]:
# First and third quartile of the rent amount, one value per city
Q1, Q3 = (city_group.quantile(q) for q in (.25, .75))

In [None]:
# Interquartile range per city
IQR = Q3 - Q1

# Limits: anything further than 1.5 * IQR outside the quartiles counts as an outlier
fence = 1.5 * IQR
lower = Q1 - fence
upper = Q3 + fence

Remove outliers

In [None]:
# Keep, for every city, only the listings whose rent lies inside that city's
# [lower, upper] limits. The per-city pieces are collected in a list and concatenated
# once (instead of growing a DataFrame inside the loop); rows stay grouped by city
# in the order of `city_group.groups`.
rent = data['rent amount (R$)']

kept_per_city = []
for city in city_group.groups.keys():
    is_city = data['city'] == city
    # `between` is inclusive on both ends, i.e. lower <= rent <= upper
    accepted_limit = rent.between(lower[city], upper[city])
    kept_per_city.append(data[is_city & accepted_limit])

# pd.concat raises on an empty list, so fall back to an empty frame when there are no groups
new_data = pd.concat(kept_per_city) if kept_per_city else pd.DataFrame()

new_data.head()



With vs. without outliers

In [None]:
# Side-by-side rent boxplots per city: raw data on the left, filtered data on the right.
plt.figure(figsize=(15, 6))

# x/y are passed by keyword; seaborn >= 0.12 rejects two positional data arguments.
plt.subplot(1, 2, 1)
plt.title('With outliers')
sns.boxplot(x=data['city'], y=data['rent amount (R$)'])

plt.subplot(1, 2, 2)
plt.title('Without outliers')
sns.boxplot(x=new_data['city'], y=new_data['rent amount (R$)'])

plt.tight_layout(pad=5.0)
plt.show()

In [None]:
# Show the filtered frame as the cell output.
new_data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the text labels of three columns with integer codes.
# The single encoder is refit for every column, so after the loop it only holds
# the classes of the last one ('animal').
lb_make = LabelEncoder()
for column in ('city', 'furniture', 'animal'):
    new_data[column] = lb_make.fit_transform(new_data[column])
new_data

In [None]:
# Turn the 'floor' column into integers: the '-' placeholder becomes 0, then the
# column is cast to int.
# NOTE(review): presumably '-' marks listings with no floor number — confirm.
# The converted Series itself is assigned; a chained `... .inplace = True` would
# store the literal True in the column instead.
new_data['floor'] = new_data['floor'].replace('-', '0').astype(int)

In [None]:
# Show the frame again after the 'floor' conversion.
new_data

In [None]:
# Columns used as model inputs
cols = [
    'city',
    'rooms',
    'bathroom',
    'parking spaces',
    'fire insurance (R$)',
    'furniture',
]

# Feature matrix: every row of new_data, restricted to the selected columns
X = new_data.loc[:, cols]
X.shape

In [None]:
# Target vector: the rent amount of every retained listing
y = new_data.loc[:, 'rent amount (R$)']
y.shape

In [None]:
# Split the model inputs into nominal and numeric groups for the ColumnTransformer.
# 'city' and 'furniture' are category labels/codes, not quantities, so they are named
# explicitly: selecting by the 'category' dtype would match nothing once the columns
# have been label-encoded to integers, leaving the one-hot branch unused.
nominal_cols = ['city', 'furniture']
catFeatures = X.columns[X.columns.isin(nominal_cols)]
catFeatures

# Remaining int/float columns, in their original order (sort=False keeps X's order)
numFeatures = X.select_dtypes(include=['int64', 'float64']).columns.difference(catFeatures, sort=False)
numFeatures

In [None]:
from sklearn.compose import ColumnTransformer
# Numeric columns are standardised
numTransformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical columns are one-hot encoded; categories unseen during fit are ignored
catTransformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group through its own transformer
column_steps = [
    ('numeric', numTransformer, numFeatures),
    ('categoric', catTransformer, catFeatures),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
# Standardise the features with statistics learned from the training split only.
# The test split is transformed with those same statistics (never refit on it), so
# both splits share one scale and no test information leaks into the fit.
ss = StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [None]:
# Candidate models, evaluated one after another.
# The stochastic estimators get a fixed seed (same value as the train/test split)
# so that re-running the notebook reproduces the reported metrics.
regressors = [
    DecisionTreeRegressor(random_state=123),
    RandomForestRegressor(random_state=123),
    SVR(),
    LinearRegression(),
    XGBRegressor(random_state=123)
]

In [None]:
# Fit every candidate behind the shared preprocessor and report its test-set errors.
for reg in regressors:
    estimator = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', reg)])
    estimator.fit(X_train, y_train)
    preds = estimator.predict(X_test)

    print(reg)

    # (label, value) pairs printed in a fixed order: MAE, RMSE, R2
    metrics = (
        ('MAE:', mean_absolute_error(y_test, preds)),
        ('RMSE:', np.sqrt(mean_squared_error(y_test, preds))),
        ('R2:', r2_score(y_test, preds)),
    )
    for label, value in metrics:
        print(label, value)
    print('-' * 40)
