# Analyzing rent prices in Brazil

### First, an exploratory analysis is carried out by plotting the data and checking its statistics. A predictive model will be built in a future notebook.

In [None]:
#Importing libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn styling: dark grid background and the reversed "Dark2" palette.
sns.set_style(style="darkgrid")
sns.set_palette(palette="Dark2_r")

### Columns description
**city:** Cidade onde o imóvel está localizada / City where the property is located <br>
**area:** Area do imovel / Property area<br>
**rooms:** Numero de quartos/ Quantity of rooms<br>
**bathroom:** Numero de banheiros / Quantity of bathroom<br>
**parking spaces:** Numero de vagas / Quantity of parking spaces<br>
**floor:** Andar / Floor<br>
**animal:** Aceita animais? / Accepts animals?<br>
**furniture:** Mobiliada? / Furnished?<br>
**hoa:** Valor do condominio / Homeowners association tax<br>
**rent amount:** Valor do Aluguel / Rent amount<br>
**property tax:** IPTU / Property tax<br>
**fire insurance:** Seguro Incendio / Fire Insurance<br>
**total:** Valor total / Total<br>

#### Checking both datasets

In [None]:
# Load both published versions of the dataset.
# The first file stores its row index in the first column; the second does not.
houses_v1_path = '/kaggle/input/brasilian-houses-to-rent/houses_to_rent.csv'
houses_v2_path = '/kaggle/input/brasilian-houses-to-rent/houses_to_rent_v2.csv'

df1 = pd.read_csv(houses_v1_path, index_col=0)
df2 = pd.read_csv(houses_v2_path)

In [None]:
# Show the first rows of each dataset, separated by a ruler line.
print(
    df1.head(),
    '------------------------------------------------------',
    df2.head(),
    sep='\n',
)

#### Since the first dataset only has 0 or 1 values in the city column, I will run my analysis on the second dataset

In [None]:
# Work on a copy so the frame loaded into df2 stays untouched.
df = df2.copy()
print('Printing dataset`s info\n')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()
print('-----------------------------------------------')
print('Checking the quantity of null values\n')
# Per-column count of missing values.
print(df.isna().sum())

#### There are no missing values! However, the floor number is stored as a categorical (text) column, so let's change it to numeric.

In [None]:
# First ten rows, to inspect how the `floor` column is encoded.
df.iloc[:10]

#### There are some "-" values

In [None]:
# Rows whose `floor` text contains a dash.
# A literal (non-regex) match avoids the invalid '\-' escape sequence, and
# na=False makes non-string / missing cells count as "no match" explicitly.
df[df['floor'].str.contains('-', regex=False, na=False)]

In [None]:
# Most frequent value(s) of every column.
print('Checking the mode for each column')
column_modes = df.mode()
column_modes

In [None]:
# How often each distinct `floor` value occurs (including the "-" entries).
floor_counts = df['floor'].value_counts()
print('Checking the values quantity\n')
print(floor_counts)

#### The dataset owner did not specify what the "-" values mean, so I will assume they refer to places with the lowest number of floors, that is, 1 floor

In [None]:
# The dataset does not document what "-" means in `floor`; it is treated here
# as floor 1. Only cells that are exactly "-" are replaced.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the column selection: with pandas Copy-on-Write that selection is a temporary
# object, so the in-place update would never reach `df`. The replacement is the
# string '1' so the column keeps a single type until it is cast to integer.
df['floor'] = df['floor'].replace(to_replace='-', value='1')
df.head(10)

In [None]:
# Sanity check: no row should still contain a dash in `floor` (expects an empty frame).
# A literal (non-regex) match avoids the invalid '\-' escape sequence, and
# na=False makes non-string cells count as "no match" explicitly.
df[df['floor'].str.contains('-', regex=False, na=False)]

#### It worked!

In [None]:
# Convert the `floor` column to 64-bit integers; every other column is untouched.
df = df.assign(floor=df['floor'].astype('int64'))
df.head(10)

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals.
summary_stats = df.describe()
summary_stats.round(decimals=2)

### Plotting

In [None]:
# Horizontal box plot of the property area.
# The column is passed as `x=` by keyword: recent seaborn versions treat the
# first positional argument as `data`, which would no longer guarantee that the
# values lie on the axis labelled below.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x=df['area'], ax=ax)
ax.set_title('Area', fontsize=20)
ax.set_xlabel('Area (m²)', fontsize=16);

In [None]:
# Distribution of the property area.
# NOTE: distplot is deprecated since seaborn 0.11 (histplot/displot replace it).
fig, ax = plt.subplots(figsize=(20, 6))
sns.distplot(df['area'], ax=ax)
ax.set_title('Area', fontsize=20)
ax.set_xlabel('Area (m²)', fontsize=16);

#### We can see that there are some outliers in the area column

In [None]:
# Horizontal box plot of the number of rooms.
# The column is passed as `x=` by keyword: recent seaborn versions treat the
# first positional argument as `data`, which would no longer guarantee that the
# values lie on the axis labelled below.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x=df['rooms'], ax=ax)
ax.set_title('Rooms', fontsize=20)
ax.set_xlabel('Number of rooms', fontsize=16);

In [None]:
# Distribution of the number of rooms.
# NOTE: distplot is deprecated since seaborn 0.11 (histplot/displot replace it).
fig, ax = plt.subplots(figsize=(20, 6))
sns.distplot(df['rooms'], ax=ax)
ax.set_title('Rooms', fontsize=20)
ax.set_xlabel('Number of rooms', fontsize=16);

In [None]:
# Horizontal box plot of the number of bathrooms.
# The column is passed as `x=` by keyword: recent seaborn versions treat the
# first positional argument as `data`, which would no longer guarantee that the
# values lie on the axis labelled below.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x=df['bathroom'], ax=ax)
ax.set_title('Bathrooms', fontsize=20)
ax.set_xlabel('Number of bathroom', fontsize=16);

In [None]:
# Distribution of the number of bathrooms.
# NOTE: distplot is deprecated since seaborn 0.11 (histplot/displot replace it).
fig, ax = plt.subplots(figsize=(20, 6))
sns.distplot(df['bathroom'], ax=ax)
ax.set_title('Bathrooms', fontsize=20)
ax.set_xlabel('Number of bathroom', fontsize=16);

In [None]:

# Horizontal box plot of the number of parking spaces.
# The column is passed as `x=` by keyword: recent seaborn versions treat the
# first positional argument as `data`, which would no longer guarantee that the
# values lie on the axis labelled below.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x=df['parking spaces'], ax=ax)
ax.set_title('Parking spaces', fontsize=20)
ax.set_xlabel('Number of parking spaces', fontsize=16);

In [None]:
# Distribution of the number of parking spaces.
# NOTE: distplot is deprecated since seaborn 0.11 (histplot/displot replace it).
fig, ax = plt.subplots(figsize=(20, 6))
sns.distplot(df['parking spaces'], ax=ax)
ax.set_title('Parking spaces', fontsize=20)
ax.set_xlabel('Number of parking spaces', fontsize=16);

In [None]:
# Horizontal box plot of the total monthly price.
# The column is passed as `x=` by keyword: recent seaborn versions treat the
# first positional argument as `data`, which would no longer guarantee that the
# values lie on the axis labelled below.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x=df['total (R$)'], ax=ax)
ax.set_title('Final price', fontsize=20)
ax.set_xlabel('Price (R$)', fontsize=16);

In [None]:
# Distribution of the total monthly price.
# NOTE: distplot is deprecated since seaborn 0.11 (histplot/displot replace it).
fig, ax = plt.subplots(figsize=(20, 6))
sns.distplot(df['total (R$)'], ax=ax)
ax.set_title('Final price', fontsize=20)
ax.set_xlabel('Price (R$)', fontsize=16);

In [None]:
# Pairwise relationships between all numeric columns of the dataset.
sns.pairplot(data=df);

#### Let's take a closer look at the last row

In [None]:
# Total price against each numeric feature, with a regression fit per panel.
# (`ax` holds the grid object returned by pairplot, not a single Axes.)
feature_columns = ['area', 'rooms', 'bathroom', 'parking spaces', 'floor']
ax = sns.pairplot(
    data=df,
    x_vars=feature_columns,
    y_vars='total (R$)',
    kind='reg',
    height=5,
)

#### There are some outliers, mostly in the area column, which make it harder to get a good visualization for the analysis.

-----------------------------------------------------------------------------------------------------

#### Checking outliers

In [None]:
# Area outliers: the 20 largest distinct area values, in ascending order.
# np.unique returns the distinct values already sorted.
np.unique(df['area'])[-20:]

In [None]:
# Listings whose area exceeds 1000 (candidates to be dropped as outliers).
is_huge_area = df['area'].gt(1000)
df.loc[is_huge_area]

In [None]:
# Remove, in place, every listing whose area exceeds 1000.
area_outlier_index = df.index[df['area'] > 1000]
df.drop(index=area_outlier_index, inplace=True)

In [None]:
# Horizontal box plot of the property area after the outlier rows were dropped.
# The column is passed as `x=` by keyword: recent seaborn versions treat the
# first positional argument as `data`, which would no longer guarantee that the
# values lie on the axis labelled below.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x=df['area'], ax=ax)
ax.set_title('Area', fontsize=20)
ax.set_xlabel('Area (m²)', fontsize=16);

In [None]:
# Distribution of the property area after the outlier rows were dropped.
# NOTE: distplot is deprecated since seaborn 0.11 (histplot/displot replace it).
fig, ax = plt.subplots(figsize=(20, 6))
sns.distplot(df['area'], ax=ax)
ax.set_title('Area', fontsize=20)
ax.set_xlabel('Area (m²)', fontsize=16);

In [None]:
# Distribution of the number of rooms after the area outliers were dropped.
# NOTE: distplot is deprecated since seaborn 0.11 (histplot/displot replace it).
fig, ax = plt.subplots(figsize=(20, 6))
sns.distplot(df['rooms'], ax=ax)
ax.set_title('rooms', fontsize=20)
ax.set_xlabel('rooms', fontsize=16);

In [None]:
# Total outliers: the 50 largest distinct totals plus summary statistics.
# np.unique returns the distinct values already sorted in ascending order.
highest_totals = np.unique(df['total (R$)'])[-50:]
total_summary = df['total (R$)'].describe().round(2)

print('Highest prices')
print(highest_totals)
print('---------------------------------------------------')
print('Total price describe')
print(total_summary)

In [None]:
# Remove, in place, every listing whose total price exceeds 20000.
total_outlier_index = df.index[df['total (R$)'] > 20000]
df.drop(index=total_outlier_index, inplace=True)

In [None]:
# Distribution of the total price after the outlier rows were dropped.
# NOTE: distplot is deprecated since seaborn 0.11 (histplot/displot replace it).
fig, ax = plt.subplots(figsize=(20, 6))
sns.distplot(df['total (R$)'], ax=ax)
ax.set_title('Final price', fontsize=20)
ax.set_xlabel('Price (R$)', fontsize=16);

In [None]:
# Horizontal box plot of the total price after the outlier rows were dropped.
# The column is passed as `x=` by keyword: recent seaborn versions treat the
# first positional argument as `data`, which would no longer guarantee that the
# values lie on the axis labelled below.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x=df['total (R$)'], ax=ax)
ax.set_title('Final price', fontsize=20)
ax.set_xlabel('Price (R$)', fontsize=16);

#### The visualization is much better now

In [None]:
# Re-draw the total-price row of the pair plot now that outliers are removed.
# (`ax` holds the grid object returned by pairplot, not a single Axes.)
feature_columns = ['area', 'rooms', 'bathroom', 'parking spaces', 'floor']
ax = sns.pairplot(
    data=df,
    x_vars=feature_columns,
    y_vars='total (R$)',
    kind='reg',
    height=5,
)

----------------------------------------------------------------------------

#### First, let's look at the correlation between columns

In [None]:
# Heat map of the pairwise correlation between the numeric columns.
# Only numeric columns are correlated: the frame also holds text columns (such
# as `city`), and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
# select_dtypes is used because it behaves the same on older pandas versions.
numeric_corr = df.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(numeric_corr, ax=ax)
ax.set_title('Correlation', fontsize=20);

#### *Although the correlation between the hoa column and the total column is 0.955, this doesn't mean the hoa column is a good way to estimate the total column. The total column is the sum of the hoa, rent amount, property tax and fire insurance values. All of them are already calculated based on the remaining columns, so we will only have the hoa column at the same time as we have the total column. However, it is possible to try to estimate the columns that make up the total from the remaining columns. <p><br> Usually, the rent amount is the main column used to calculate the other fees, and it's easier to see a correlation between the rent amount and the non-price columns.*

In [None]:
# Distinct city names present in the dataset.
df['city'].unique()

In [None]:
# Let's see what city is the most expensive: one-hot encode the non-numeric
# columns so they can take part in the correlation matrix, then show its
# first 15 rows.
city_test = pd.get_dummies(data=df)
city_corr = city_test.corr()
city_corr.iloc[:15]

In [None]:
# Vertical box plots of the total price, one box per city.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(x='city', y='total (R$)', data=df, orient='v', ax=ax)
ax.set_title('Final price per city', fontsize=20)
ax.set_xlabel('City', fontsize=16)
ax.set_ylabel('Price (R$)', fontsize=16);

#### *It's easy to see that São Paulo is the most expensive city to live in.*

### See you in the next notebook, where a predictive model will be built for this dataset

____________________________________________________________________________________________________