## If you **LIKE** this notebook, please **UPVOTE** it.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in walk order.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the rental listings CSV into a DataFrame and preview the first rows.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/brasilian-houses-to-rent/houses_to_rent_v2.csv'
)
data.head()

In [None]:
# Drop the "(R$)" currency suffix from the monetary column names so they are
# easier to reference; columns not listed here keep their original names.
currency_renames = {
    'hoa (R$)': 'hoa',
    'rent amount (R$)': 'rent amount',
    'property tax (R$)': 'property tax',
    'fire insurance (R$)': 'fireinsurance',
    'total (R$)': 'total',
}
data = data.rename(columns=currency_renames)
data.head()

We have to perform the following for creating a model:
1. EDA (exploratory data analysis)
2. Feature engineering 
3. Feature selection
4. Regression models

## EDA

We have to predict the **rent amount** using the input columns shown above. Let's analyse each column to gain insight into the dataset.

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# List the distinct values of 'floor' to see how the column is encoded.
data['floor'].unique()

The 'floor' column contains '-' as a value. Since '-' means a house with only a ground floor, we replace it with 0 and then change the column's dtype to 'int64'.

In [None]:
# Map the '-' placeholder to 0, then cast the whole column to int64
# in a single chained expression.
data['floor'] = data['floor'].replace('-', 0).astype('int64')

In [None]:
# Summary statistics for the columns of the frame.
data.describe()

### Rent amount analysis

In [None]:
# Distribution of the rent amount across all listings.
# The column is passed as x= explicitly: seaborn 0.12+ accepts only `data`
# positionally, so a positional column name collides with data=data and
# raises "got multiple values for argument 'data'".
sns.boxplot(x='rent amount', data=data);

Rent amount column is right skewed.

In [None]:
# Rent amount distribution per city, one box per city.
sns.boxplot(data=data, x='city', y='rent amount');

There are a lot of outliers in the rent amount column for each city. Values above 15000 are outliers in all cities and should be removed.

### Correlations

In [None]:
# Pairwise correlations over the numeric columns only. The frame also holds
# text columns ('city', 'animal', 'furniture'); on pandas >= 2.0
# DataFrame.corr() no longer drops them silently and raises instead, so they
# are excluded explicitly (this form also works on older pandas versions).
corr = data.select_dtypes(include='number').corr()
plt.figure(figsize=(12,6))
sns.heatmap(corr,annot=True,fmt='.2f',cmap=plt.cm.Blues);

Observations:
1. Rent amount is positively correlated with the rooms, bathroom and parking spaces columns: the rent amount increases as they increase.
2. Rent amount is highly correlated with the fireinsurance column.
3. total and hoa are also highly correlated with each other, but do not correlate much with the rent amount.
4. floor and area are also not very useful for predicting the rent amount.
    

In [None]:
# Rooms: distribution of the room count (top) and rent amount per room count
# (bottom; bar height is seaborn's default estimator, the mean).
# The Series is passed as x= explicitly because seaborn 0.12+ only accepts
# `data` positionally, so a bare positional Series is no longer read as x.
fig, (ax_dist, ax_rent) = plt.subplots(2, 1)
sns.boxplot(x=data['rooms'], ax=ax_dist)
sns.barplot(x='rooms', y='rent amount', data=data, ax=ax_rent);

Most of the houses have fewer than 4 rooms, with some outliers having up to 13 rooms. The rent amount also increases with the number of rooms.

In [None]:
# Parking spaces: distribution of the count (top) and rent amount per number
# of parking spaces (bottom; bar height is seaborn's default estimator, the mean).
# The Series is passed as x= explicitly because seaborn 0.12+ only accepts
# `data` positionally, so a bare positional Series is no longer read as x.
fig, (ax_dist, ax_rent) = plt.subplots(2, 1)
sns.boxplot(x=data['parking spaces'], ax=ax_dist)
sns.barplot(x='parking spaces', y='rent amount', data=data, ax=ax_rent);

The rent amount increases with the number of parking spaces up to 7, after which it starts decreasing.

In [None]:
# Bathrooms: distribution of the bathroom count (top) and rent amount per
# bathroom count (bottom; bar height is seaborn's default estimator, the mean).
# The Series is passed as x= explicitly because seaborn 0.12+ only accepts
# `data` positionally, so a bare positional Series is no longer read as x.
fig, (ax_dist, ax_rent) = plt.subplots(2, 1)
sns.boxplot(x=data['bathroom'], ax=ax_dist)
sns.barplot(x='bathroom', y='rent amount', data=data, ax=ax_rent);

Also, the rent amount increases with the number of bathrooms in the house.

In [None]:
# Number of listings per 'furniture' value.
sns.countplot(data=data, x='furniture');

Most houses in the dataset are not furnished. The rent amount is also higher for furnished houses than for unfurnished ones.

In [None]:
# Number of listings per 'animal' value.
sns.countplot(data=data, x='animal');

In [None]:
# Rent amount distribution split by the 'animal' column ('acept' on top,
# 'not acept' below). The panels share one x-axis and carry a title, so the
# two groups are drawn on the same scale and can be told apart at a glance.
fig, axes = plt.subplots(2, 1, sharex=True)
for ax, policy in zip(axes, ['acept', 'not acept']):
    sns.boxplot(x='rent amount', data=data[data['animal'] == policy], ax=ax)
    ax.set_title(f'animal: {policy}')
fig.tight_layout();

In [None]:
# Scatter of fire insurance (y) against rent amount (x), one point per listing.
sns.scatterplot(data=data, x='rent amount', y='fireinsurance');

The 'animal' column does not have much effect on the rent amount of houses.

## Feature engineering

### Removing outliers

In [None]:
# Group the rent amounts by city; reused below for the per-city quartiles
# and for iterating over the city names.
city_group = data['rent amount'].groupby(data['city'])

In [None]:
# First and third quartile of the rent amount, computed per city.
Q1, Q3 = (city_group.quantile(q) for q in (.25, .75))

# Interquartile range per city.
IQR = Q3 - Q1

# Outlier fences: 1.5 * IQR below Q1 and above Q3.
whisker = 1.5 * IQR
lower = Q1 - whisker
upper = Q3 + whisker

In [None]:
# Show the per-city lower fences followed by the per-city upper fences.
print(lower, upper, sep='\n')

In [None]:
# Keep, for every city, only the listings whose rent amount lies strictly
# between that city's lower and upper fence.
# The per-city slices are collected in a list and concatenated once: growing
# a DataFrame with pd.concat inside the loop re-copies all previous rows on
# every iteration, and concatenating onto an empty DataFrame() is deprecated
# in recent pandas. Row order (city by city, original order within a city)
# is the same as with the incremental concat.
city_frames = [
    data[(data['city'] == city)
         & (data['rent amount'] > lower[city])
         & (data['rent amount'] < upper[city])]
    for city in city_group.groups.keys()
]

# pd.concat raises on an empty list, so fall back to an empty frame when
# there are no cities at all.
new_data = pd.concat(city_frames) if city_frames else pd.DataFrame()

new_data.head()
    

In [None]:
# Rent amount per city again, this time on the outlier-filtered frame.
sns.boxplot(data=new_data, x='city', y='rent amount');

## Feature selection


In [None]:
# Drop the listings with exactly 10 parking spaces. 'parking spaces' is
# one-hot encoded later on, and encoding can fail when a value is absent from
# the training split.
# NOTE(review): presumably 10 is the only value rare enough to cause this
# after outlier removal - confirm against the data.
new_data = new_data.loc[new_data['parking spaces'].ne(10)]

In [None]:
# Input columns used to predict the rent amount.
features=['city','rooms','bathroom','parking spaces','furniture','fireinsurance']

# Take an explicit copy: X is modified column by column further down, and
# assigning into a plain slice of new_data triggers SettingWithCopyWarning
# and leaves it ambiguous whether new_data is modified as well.
X=new_data[features].copy()

In [None]:
# Prediction target: the rent amount of each remaining listing.
target = 'rent amount'
y = new_data[target]

In [None]:
# Preview the selected feature columns.
X.head()

In [None]:
# Treat every feature except 'fireinsurance' as categorical and keep
# 'fireinsurance' as an int64 number.
# The columns are chosen by name rather than by position (so the result does
# not depend on 'fireinsurance' being the last column), and the cast is done
# in one astype call that returns a new frame instead of assigning column by
# column into what may be a slice of new_data (SettingWithCopyWarning).
categorical_cols = [col for col in X.columns if col != 'fireinsurance']
X = X.astype({**{col: 'category' for col in categorical_cols},
              'fireinsurance': 'int64'})
X.info()

In [None]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.metrics import r2_score,accuracy_score,mean_squared_error,mean_absolute_error


#### Dealing Categorical columns 

In [None]:
# One-hot encode the categorical features.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a
# category that was not present when it was fitted, instead of raising at
# predict time when the test split contains a value the training split lacks.
catTransformer=Pipeline(steps=[('onehot',OneHotEncoder(handle_unknown='ignore'))])

#### Dealing Numerical columns

In [None]:
# Standardise the numerical features with a single scaling step.
numTransformer = Pipeline([('scaler', StandardScaler())])

#### numerical features

In [None]:
# Names of the numeric feature columns.
# 'number' selects every numeric dtype. The 'int' alias resolves to the
# platform's default integer type, which is 32-bit on some platforms (e.g.
# Windows with NumPy < 2) and would then miss the int64 'fireinsurance' column.
numFeatures=X.select_dtypes(include='number').columns
numFeatures

#### categorical features

In [None]:
# Names of the feature columns stored with the 'category' dtype.
categorical_frame = X.select_dtypes(include='category')
catFeatures = categorical_frame.columns
catFeatures

#### Dealing numerical features and categorical features

In [None]:
# Route each group of columns to its own transformer:
# numeric columns -> numTransformer, categorical columns -> catTransformer.
column_steps = [
    ('numeric', numTransformer, numFeatures),
    ('categoric', catTransformer, catFeatures),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=12, test_size=0.2
)

In [None]:
# One default-configured instance of every model to compare, in the order
# they are evaluated below.
regressor_classes = (
    DecisionTreeRegressor,
    LinearRegression,
    SVR,
    RandomForestRegressor,
    XGBRegressor,
)
regressors = [regressor_class() for regressor_class in regressor_classes]

In [None]:
# Seed NumPy's global random generator before any model is fitted.
np.random.seed(123)

# (label, scoring function) pairs, reported in this order for every model.
reported_metrics = (
    ('Mean squared error: ', mean_squared_error),
    ('mean_absolute_error: ', mean_absolute_error),
    ('r2_score: ', r2_score),
)

for regressor in regressors:
    # Same preprocessing for every model; only the final step changes.
    estimator = Pipeline(steps=[('preprocessor', preprocessor),
                                ('regressor', regressor)])
    # Fit on the training split, then predict the held-out split.
    preds = estimator.fit(X_train, y_train).predict(X_test)

    print(regressor)
    for label, metric in reported_metrics:
        print(label, metric(y_test, preds))
    print('-------------------------------------------------------')
    

The XGBoost model performs better than the other models. We can grid-search XGBoost parameters such as learning_rate, n_estimators, max_depth, colsample_bytree and gamma to further improve the accuracy of the model.