# Exploratory Data Analysis
## Python Introduction Course 
## Final task group members: Camila Cancio, Erick Garcia & Others



# 0. Loading libraries and dataset


## 0.1 Loading libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings

## 0.2 Loading dataset

In [None]:
# Read the rental listings CSV from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/brasilian-houses-to-rent/houses_to_rent_v2.csv')
# Keep an untouched copy of the raw data before any cleaning is applied to `df`.
df_backup = df.copy()
# Silence every warning from here on (this also hides deprecation warnings
# emitted by pandas and seaborn).
warnings.filterwarnings("ignore")

# 1. First approach

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.info()

### This dataset is related to five major metropolitan cities in Brazil and contains 10692 properties for rent with 13 different characteristics (variables).

1. city: city where the property is located (categorical)
2. area: area of the property in m² (continuous)
3. rooms: number of rooms (discrete)
4. bathroom: number of bathrooms (discrete)
5. parking spaces: parking spaces (discrete)
6. floor: floor (discrete)
7. animal: whether animals are accepted (categorical)
8. furniture: furnished? (categorical)
9. hoa: condominium fee (continuous)
10. rent amount: rent amount (continuous)
11. property tax: property tax (continuous)
12. fire insurance (R $): fire insurance (continuous)
13. total: sum of all fees (continuous)

# 2. Cleaning dataset

## 2.1 Renaming variables

In [None]:
# Map the raw CSV headers to short snake_case names without the "(R$)" suffix.
column_names = {
    'hoa (R$)': 'hoa',
    'parking spaces': 'parking_spaces',
    'floor': 'floors',
    'rent amount (R$)': 'rent_amount',
    'property tax (R$)': 'property_tax',
    'fire insurance (R$)': 'fire_insurance',
    'total (R$)': 'total',
}
# Rename in place so `df` keeps referring to the same DataFrame object.
df.rename(columns=column_names, inplace=True)

## 2.2 Handling missing data 

In [None]:
def m_data(data, top=13):
    """Summarise the missing values of each column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. The summary is computed from this argument, not
        from any global frame.
    top : int, default 13
        Number of rows to return, taken from the end of the table, i.e. the
        columns with the most missing values.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns ``'Qty DA'`` (number of
        null cells) and ``'% DA'`` (percentage of rows that are null,
        rounded to 2 decimals), sorted by ascending ``'Qty DA'``.
    """
    missing = data.isnull().sum()  # number of nulls per column
    n_rows = len(data.index)
    if n_rows:
        percent = (missing / n_rows * 100).round(2)  # share of rows, in %
    else:
        # An empty frame has no missing cells; avoid a 0/0 division.
        percent = missing * 0.0
    m_table = pd.DataFrame({'Qty DA': missing, '% DA': percent})
    # One stable sort on the count keeps both columns aligned row by row.
    m_table = m_table.sort_values('Qty DA', ascending=True, kind='mergesort')
    return m_table.tail(top)

In [None]:
m_data(df)

We can see that we have no missing data (DA) in our dataset.

# 3. Analyzing variables distribution 

## 3.1 Numerics variables

In [None]:
df.describe().T

In [None]:
# Draw one boxplot per numeric variable on a 3x3 grid to spot outliers.
sns.set_style("darkgrid")
fig = plt.figure(1, figsize=(16,10))
grid = plt.GridSpec(ncols=3, nrows=3, figure=fig)
# (column, y-axis unit) pairs, listed row by row in grid order.
panels = [
    ('area', "m²"), ('rooms', "un"), ('bathroom', "un"),
    ('parking_spaces', "un"), ('hoa', "R$"), ('rent_amount', "R$"),
    ('property_tax', "R$"), ('fire_insurance', "R$"), ('total', "R$"),
]
for position, (column, unit) in enumerate(panels):
    grid_row, grid_col = divmod(position, 3)
    axis = fig.add_subplot(grid[grid_row, grid_col])
    axis.boxplot(df[column])
    axis.set_xlabel(column, fontsize=10)
    axis.set_ylabel(unit, fontsize=10)
plt.show()

According to the measures of central tendency and dispersion, many variables have values that need to be verified, such as the maximum values of the area, the fire insurance and the condominium fee (hoa), which are very far from the quartiles. Besides that, the boxplots reveal outliers in several other variables.
We will analyze each variable individually, including the categorical ones.

## 3.2 City

The variable "city" has no missing data, it is nominal qualitative type and the values are typed correctly. Therefore, they do not need any pre-treatment.

In [None]:
df.city.unique()

In [None]:
# Share of listings per city, drawn as a pie chart.
labels = ['São Paulo', 'Porto Alegre', 'Rio de Janeiro', 'Campinas', 'Belo Horizonte']
# Count the listings once and read the cities in the wedge order of `labels`;
# a city with no listings gets a zero-sized wedge.
cities = df.city.value_counts().reindex(labels, fill_value=0).tolist()
# A single `subplots` call creates the figure (no extra empty figure).
fig, ax = plt.subplots(figsize=(8,8))
ax.set_title("Properties Distribution by City",fontsize=20)
ax.pie(cities, labels=labels, autopct='%1.1f%%',startangle=90)
plt.show()

It is noticed that the largest amount of real estate offers is in the city of São Paulo, with 55% of the offers.

## 3.3 Area

In [None]:
# Area: boxplot (left) and distribution plot (right), before any cleaning.
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.area)
plt.subplot(1, 2, 2)
sns.distplot(df.area)
plt.show()

The "area" variable is heavily skewed: the observations above 10,000 m² are outliers that bias the mean.

In [None]:
df.area.describe()

We have a maximum value of 46335 m², a minimum value of 11 m² and a standard deviation of 537 m², which shows a very large dispersion of the data.

In [None]:
df.query('area>=10000')

The dataset has 3 values above 10000 m². We will remove these values because they are causing a very large variation in the data.

In [None]:
# Keep only the listings with an area below 10000 m².
df = df.query('area<10000')
# Display the (rows, columns) shape of the filtered frame.
df.shape

In [None]:
# Area: boxplot (left) and distribution plot (right) for the current `df`.
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.area,color='red')
plt.subplot(1, 2, 2)
sns.distplot(df.area,color='orange')
plt.show()

In [None]:
df.area.describe()

After rerunning the analysis without the areas above 10000 m², the standard deviation dropped to 134 m². However, there are still outliers above 1000 m² that lie far from the other observations and keep the dispersion high.

In [None]:
maior_1000 = df.query('area>1000')
maior_1000

In [None]:
maior_1000.shape

We found 6 values greater than 1000 m² that continue as outliers. We will not remove the value of 1020 m² because it is closer to the others observations.

In [None]:
# Keep only the listings with an area below 1100 m².
df = df.query('area<1100')
# Display the (rows, columns) shape of the filtered frame.
df.shape

We have a new dataset with 10684 values.

In [None]:
df.area.describe()

Before we had a standard deviation of 537 m², now we have 130 m². The data are less dispersed.

In [None]:
# Area: boxplot (left) and distribution plot (right) for the current `df`.
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.area,color='yellow')
plt.subplot(1, 2, 2)
sns.distplot(df.area,color='green')
plt.show()

## 3.4 Rooms

In [None]:
df.rooms.describe()

The maximum value of this variable is very high, far from the average. Let's take a closer look.

In [None]:
# Rooms: boxplot (left) and 10-bin distribution plot (right).
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.rooms)
plt.subplot(1, 2, 2)
sns.distplot(df.rooms,bins = 10)
plt.show()

In the boxplot we can see values above 4 rooms away from the others observations.

In [None]:
df.query('rooms>4')

Searching in the dataset we found 406 properties with a number of rooms greater than 4. As it is a considerable volume of data, we will remain with them in the dataset.

In [None]:
df.rooms.value_counts()

In [None]:
# Share of listings by number of rooms; listings with more than 4 rooms are
# pooled into a single wedge.
room_counts = df.rooms.value_counts()
# Sum the counts of every category above 4. (Writing `count + [6] + [7] ...`
# would add the literal numbers 6, 7, ... to one count instead of adding the
# counts of the other categories.)
r_5 = room_counts[room_counts.index > 4].sum()
rooms = [room_counts.get(n, 0) for n in (1, 2, 3, 4)] + [r_5]
# A single `subplots` call creates the figure (no extra empty figure).
fig, ax1 = plt.subplots(figsize=(8,8))
ax1.set_title("Properties Distribution by Rooms",fontsize=20)
labels = ['1 Room', '2 Rooms', '3 Rooms', '4 Rooms', 'More than 4 Rooms']
ax1.pie(rooms,labels=labels, autopct='%1.1f%%',startangle=90)
plt.show()

It is noticed that the largest number of real estate offers are properties with up to 3 bedrooms, with 4 out of 5 properties offered having this amount of bedrooms.

## 3.5 Bathrooms

In [None]:
df.bathroom.describe()

The maximum value for this variable is high. Let's take a closer look.

In [None]:
# Bathrooms: boxplot (left) and 10-bin distribution plot (right).
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.bathroom)
plt.subplot(1, 2, 2)
sns.distplot(df.bathroom,bins = 10)
plt.show()

The graph shows us that the number of bathrooms above 6 is far from the others observations.

In [None]:
df.query(' bathroom>6')

Searching in the dataset we found 102 properties with a number of bathrooms greater than 6. As it is a considerable volume of data, we will remain with them in the dataset.

In [None]:
df. bathroom.value_counts()

In [None]:
# Share of listings by number of bathrooms; listings with more than 4
# bathrooms are pooled into a single wedge.
bathroom_counts = df.bathroom.value_counts()
# Sum the counts of every category above 4. (Writing `count + [6] + [7] ...`
# would add the literal numbers 6, 7, ... to one count instead of adding the
# counts of the other categories.)
broom_5 = bathroom_counts[bathroom_counts.index > 4].sum()
brooms = [bathroom_counts.get(n, 0) for n in (1, 2, 3, 4)] + [broom_5]
# A single `subplots` call creates the figure (no extra empty figure).
fig, ax1 = plt.subplots(figsize=(8,8))
ax1.set_title("Properties Distribution by Bathrooms",fontsize=20)
labels = ['1 Bathroom', '2 Bathrooms', '3 Bathrooms', '4 Bathrooms', 'More than 4 Bathrooms']
ax1.pie(brooms,labels=labels, autopct='%1.1f%%',startangle=90)
plt.show()

As with the number of rooms, most real estate offers have up to 2 bathrooms: 7 out of 10 properties offered fall in this range.

## 3.6 Parking Spaces

In [None]:
df['parking_spaces'].describe()

The observations are concentrated between 1 and 2 parking spaces (the median and the 3rd quartile), with a maximum value of 10 parking spaces and a standard deviation of 1.5 parking spaces.

In [None]:
# Parking spaces: boxplot (left) and 10-bin distribution plot (right).
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.parking_spaces)
plt.subplot(1, 2, 2)
sns.distplot(df.parking_spaces,bins = 10)
plt.show()

The graph shows us that properties with more than 5 parking spaces are far from the other observations.

In [None]:
df.query('parking_spaces> 5')

Searching in the dataset we found 319 properties with a parking space greater than 5. As it is a considerable volume of data, we will remain with them in the dataset.

In [None]:
df.parking_spaces.value_counts()

In [None]:
# Share of listings by number of parking spaces; listings with more than 4
# parking spaces are pooled into a single wedge.
parking_counts = df.parking_spaces.value_counts()
# Sum the counts of every category above 4. (Writing `count + [6] + [7] ...`
# would add the literal numbers 6, 7, ... to one count instead of adding the
# counts of the other categories.)
ps_5 = parking_counts[parking_counts.index > 4].sum()
pspace = [parking_counts.get(n, 0) for n in (0, 1, 2, 3, 4)] + [ps_5]
# A single `subplots` call creates the figure (no extra empty figure).
fig, ax1 = plt.subplots(figsize=(8,8))
ax1.set_title("Properties Distribution by Parking Spaces",fontsize=20)
labels = ['No Parking Spaces', '1 Parking Space', '2 Parking Spaces',
          '3 Parking Spaces', '4 Parking Spaces', 'More than 4 Parking Spaces']
ax1.pie(pspace,labels=labels, autopct='%1.1f%%',startangle=90)
plt.show()

The parking spaces variable shows a very interesting distribution, which requires further analysis with another variable, such as rent amount and city. About 1 in 4 properties do not have a garage, and the largest offer of properties is with 1 parking space followed by properties without a parking space. Properties with more than 1 parking space are less than half of the properties offered.

## 3.7 Floors

In [None]:
df.floors.unique()

We realized that the variable "floors" has the symbol "-", which according to the creator of the dataset are values to designate the house type property. So that, to allow a better analysis on the variable, we will replace the symbol "-" with the value 0.

In [None]:
df.floors = df.floors.replace('-','0')

In [None]:
df.floors.unique()

The output above shows that the "floors" attribute has dtype "object", i.e. it is stored as a categorical (text) attribute. So we will convert it to numeric values ("dtype = int64").

In [None]:
df.floors = df.floors.astype(int)

In [None]:
df.info()

The variable "floors" was defined as qualitative and we transformed it into quantitative to facilitate data analysis.

In [None]:
df.floors.describe()

The observations are distributed between 1 and 8 floors (evaluating the 2nd and 3rd quartiles), with a maximum value of 301 floors and standard deviation of 6 floors. This maximum value is too high and needs to be better evaluated.

In [None]:
# Floors: boxplot (left) and 10-bin distribution plot (right).
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.floors)
plt.subplot(1, 2, 2)
sns.distplot(df.floors,bins = 10)
plt.show()

In [None]:
df.query('floors>300')

According to the creator of the dataset, it refers to residential properties. Conducting a survey we found that the tallest residential building in Brazil is located in Balneário Camboriú, on the northern coast of Santa Catarina, and has 81 floors. Floor 301 is a property located in Belo Horizonte. We will replace the value 301 by 3 because we believe that the number of the apartment was mistakenly entered instead of the floor.

In [None]:
df.floors = df.floors.replace(301,3)

In [None]:
df.floors.value_counts()

In [None]:
# Floors: boxplot (left) and 10-bin distribution plot (right) for the
# current `df`.
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.floors,color='yellow')
plt.subplot(1, 2, 2)
sns.distplot(df.floors,bins = 10,color='green')
plt.show()

The distribution improved after replacing the value 301. The data still range from 0 to a little over 50 floors, but we will keep these observations because they can bring us important insights.

In [None]:
len(df)

In [None]:
# Houses vs. apartments: listings on floor 0 are counted as houses and every
# other listing as an apartment.
house = int((df.floors == 0).sum())
apart = len(df)-house
tp = [house,apart]
# A single `subplots` call creates the figure (no extra empty figure).
fig, ax1 = plt.subplots(figsize=(8,8))
ax1.set_title("Types of Properties Offered",fontsize=20)
labels='Houses','Apartments'
ax1.pie(tp,labels=labels, autopct='%1.1f%%',startangle=90)
plt.show()

We have the majority of offers in apartment type properties (77%).

## 3.8 Animal & Furniture

In [None]:
df.animal.unique()

In [None]:
df.furniture.unique()

They are two categorical variables. We will make the following substitutions:

1. 'acept' and 'not acept' for true and false, respectively.
2. 'furnished' and 'not furnished' for true and false, respectively.


In [None]:
# Recode the acceptance flag: 'acept' -> 'true', 'not acept' -> 'false'.
animal_codes = {'acept': 'true', 'not acept': 'false'}
df.animal = df.animal.replace(animal_codes)
# Show the distinct values left after recoding.
df.animal.unique()

In [None]:
# Recode the furnishing flag: 'furnished' -> 'true', 'not furnished' -> 'false'.
furniture_codes = {'furnished': 'true', 'not furnished': 'false'}
df.furniture = df.furniture.replace(furniture_codes)
# Show the distinct values left after recoding.
df.furniture.unique()

## 3.9 Others variables

Now let's look at the variables "hoa", "rent_amount", "property_tax" and "fire_insurance".

In [None]:
df[['hoa','rent_amount','property_tax','fire_insurance']]

In [None]:
df[['hoa','rent_amount','property_tax','fire_insurance']].describe().T

### 3.9.1 Hoa

In [None]:
# Hoa: boxplot (left) and distribution plot (right).
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.hoa)
plt.subplot(1, 2, 2)
sns.distplot(df.hoa)
plt.show()

We noted values above R$ 10,000 that are impacting the distribution.

In [None]:
df.query('hoa>10000')

Inspecting the data we found 9 observations above R$ 10,000. We are going to remove them because they are very far from the rest of the data and cause a large variation.

In [None]:
# Remove only the listings with hoa above R$ 10,000; `<=` keeps a fee of
# exactly 10000, which is not an "above 10,000" outlier.
df = df.query('hoa<=10000')

In [None]:
df.shape

In [None]:
# Hoa: boxplot (left) and distribution plot (right) for the current `df`.
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.hoa,color='yellow')
plt.subplot(1, 2, 2)
sns.distplot(df.hoa,color='green')
plt.show()

After removing the amounts above R$ 10,000, the data distribution was improved.

### 3.9.2 Rent amount

In [None]:
# Rent amount: boxplot (left) and distribution plot (right).
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.rent_amount)
plt.subplot(1, 2, 2)
sns.distplot(df.rent_amount)
plt.show()

We noted values above R$ 20,000 that are impacting the data distribution.

In [None]:
df.query('rent_amount>20000')

Inspecting the data we found 4 observations above R$ 20,000. We are going to remove them because they are very far from the other observations and cause a large variation.

In [None]:
# Remove only the listings with rent above R$ 20,000; `<=` keeps a rent of
# exactly 20000, which is not an "above 20,000" outlier.
df = df.query('rent_amount<=20000')

In [None]:
df.shape

In [None]:
# Rent amount: boxplot (left) and distribution plot (right) for the
# current `df`.
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.rent_amount,color='yellow')
plt.subplot(1, 2, 2)
sns.distplot(df.rent_amount,color='green')
plt.show()

After removing the amounts above R$ 20,000, the data distribution was improved.

### 3.9.3 Property tax

In [None]:
# Property tax: boxplot (left) and distribution plot (right).
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.property_tax)
plt.subplot(1, 2, 2)
sns.distplot(df.property_tax)
plt.show()

We noted values above R$ 300,000 that are impacting the distribution of data.

In [None]:
df.query('property_tax>300000')

Inspecting the data we found 1 observation above R$ 300,000. We are going to remove it because it is very far from the other observations and causes a large variation.

In [None]:
# Remove only the listings with property tax above R$ 300,000; `<=` keeps a
# value of exactly 300000.
df = df.query('property_tax<= 300000')

In [None]:
df.shape

In [None]:
# Property tax: boxplot (left) and distribution plot (right) for the
# current `df`.
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.property_tax)
plt.subplot(1, 2, 2)
sns.distplot(df.property_tax)
plt.show()

After removing the amount above R$ 300,000, we still have amounts that strongly impact the distribution of data.

Let's check observations with values above R$ 20,000.

In [None]:
df.query('property_tax> 20000')

Eliminating amounts above R$ 20,000.

In [None]:
# Remove only the listings with property tax above R$ 20,000; `<=` keeps a
# value of exactly 20000.
df = df.query('property_tax<= 20000')

In [None]:
df.shape

In [None]:
# Property tax: boxplot (left) and distribution plot (right) for the
# current `df`.
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.property_tax,color='yellow')
plt.subplot(1, 2, 2)
sns.distplot(df.property_tax,color='green')
plt.show()

After removing the observations above R$ 20,000 the data distribution improved. Some remaining observations still affect the variation, but we will keep them in the dataset as they can provide us with interesting insights.

### 3.9.4 Fire insurance

In [None]:
# Fire insurance: boxplot (left) and distribution plot (right).
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.fire_insurance,color='yellow')
plt.subplot(1, 2, 2)
sns.distplot(df.fire_insurance,color='green')
plt.show()

We noted that the fire insurance variable has a good data distribution.

### 3.9.5 Total

In [None]:
# Total: boxplot (left) and distribution plot (right).
plt.figure(1, figsize=(15,6))
plt.subplot(1, 2, 1)
# Pass the values as `x=`: a positional Series is read as `data` from
# seaborn 0.12 on, which changes the orientation of the box.
sns.boxplot(x=df.total,color='yellow')
plt.subplot(1, 2, 2)
sns.distplot(df.total,color='green')
plt.show()

Although we have values above R$ 25,000 away from the others observations, we will remain with them in the dataset, as this variable is a sum of the values of hoa, rent_amount, property_tax and fire_insurance; and these rates have already undergone an analysis and pre-treatment.

# 4. BIVARIATE ANALYSIS

## 4.1 Correlations

In [None]:
# Correlation heatmap of the numeric variables.
plt.figure(figsize=(12,8))
# Select the numeric (and boolean) columns explicitly: `df` also holds text
# columns such as `city`, and pandas >= 2.0 raises on them instead of
# silently skipping them.
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), annot=True, cmap="Spectral")
plt.show()

From the heatmap above, we can identify the following most significant information:

1. The floors variable has very little influence on the other variables;
2. The variable rent amount and fire insurance are those that have the highest correlation with the total variable, indicating a possible greater contribution;
3. The value of the fire insurance is almost entirely proportional to the value of the rent amount;
4. Hoa value has little correlation with the characteristics of the property. 


## 4.2 What are the most expensive cities to live in?

In [None]:
df.groupby('city')['rent_amount'].mean().sort_values(ascending = False)

In [None]:
# Median total cost per city, with the value written above each bar.
plt.figure(figsize=(10,4))
city_order = ["São Paulo", "Rio de Janeiro", "Belo Horizonte", "Campinas", "Porto Alegre"]
ax = sns.barplot(x=df.city, y=df.total, ci=False, estimator=np.median, order=city_order)
plt.xticks(rotation=0)
ax.set_title('Cost to Rent in 5 Brazilian Cities', fontsize=18 )
ax.set_xlabel('City',fontsize = 12)
ax.set_ylabel('Total',fontsize=12)
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width()/2
    # Place the label 3 units above the bar so it does not touch it.
    ax.text(bar_center, bar_top + 3, '{:1.2f}'.format(bar_top), ha='center')
plt.show()

Based on the bar chart, we can say that São Paulo, Rio de Janeiro and Belo Horizonte are the cities that have the highest cost of renting real estate.

## 4.3 Which cities have the most expensive m²?

In [None]:
df['cost_m2'] = (df.rent_amount/df.area)
df

In [None]:
df.groupby('city')['cost_m2'].mean().sort_values(ascending = False)

São Paulo followed by Rio de Janeiro have the most expensive m² in the regions, continuing to be the most expensive cities. However, analyzing this calculation of the value per square meter, we realize that the cities of Porto Alegre, Belo Horizonte and Campinas changed in positions, showing, for example, that Porto Alegre is more expensive per m² than Belo Horizonte for renting a property.

In [None]:
# Mean rent per m² for each city, with the value written on top of each bar.
# `cost_m2` holds one scalar per row, so it is plotted directly: exploding
# it changes nothing, and assigning to a new `df.<name>` attribute would not
# create a column.
plt.figure(figsize=(10,4))
ax = sns.barplot(x= df.city, y= df.cost_m2, ci= False,
            estimator= np.mean,order=["São Paulo", "Rio de Janeiro",
                                      "Porto Alegre","Belo Horizonte","Campinas"])
plt.xticks(rotation=0)
ax.set_title('Most expensive m² of 5 Brazilian Cities', fontsize=18 )
ax.set_xlabel('City',fontsize = 12)
# "R$/m²": price per square metre (a backslash here is an invalid escape).
ax.set_ylabel('R$/m²',fontsize=12)
plt.yticks(range(0,40,5))
plt.ylim((0,40))
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2,height,'{:1.2f}'.format((height)),ha='center')
plt.show()

## 4.4 Does having animals impact prices?

Yes. Properties that accept animals are on average R$ 500.00 more expensive.

In [None]:
animal = df.groupby('animal')['rent_amount']
display(animal.agg(['mean', 'median']))

Analyzing by city we can say that in Porto Alegre and Rio de Janeiro the presence of animals does not increase the rental price.

In [None]:
# Median rent per city, split by the `animal` flag.
plt.figure(figsize= (15,5))
city_order = ["São Paulo", "Rio de Janeiro", "Belo Horizonte", "Campinas", "Porto Alegre"]
sns.barplot(x=df.city, y=df.rent_amount, hue=df.animal, ci=False,
            estimator=np.median, order=city_order)
plt.show()

In [None]:
# Number of listings per `animal` flag, split by city.
# `x=` is passed by keyword (a positional Series is read as `data` from
# seaborn 0.12 on) and `order` pins the category order so that the manual
# 'Yes'/'No' tick labels below always match 'true'/'false'.
ax = sns.countplot(x=df['animal'], hue=df['city'], order=['true', 'false'])
ax.figure.set_size_inches(12, 8)
ax.set_xlabel('Accept Animals?', fontsize=13)
ax.set_ylabel('Qty Properties', fontsize=13)
ax.set_xticklabels(['Yes','No'], fontsize=13)
plt.show()

It was noted more properties for rent that accept animals in all cities.

## 4.5 What is the relationship between furnished properties and animals acceptance?

In [None]:
sns.heatmap(df.groupby(['furniture','animal']).size().unstack(), annot=True, fmt="d", cmap="YlGnBu");


We noticed that 60% of the properties are not furnished and accept animals, which is a strong indication of a relationship between the acceptance of animals and the property being unfurnished.

## 4.6 Properties with a higher hoa rate have the highest rent amount?

In [None]:
# One hoa-vs-rent scatter plot with a red regression line per city.
city_names = ['São Paulo', 'Rio de Janeiro', 'Porto Alegre', 'Campinas', 'Belo Horizonte']
# One sub-frame per city, in the same order as `city_names`.
df1, df2, df3, df4, df5 = [df[df['city'] == name] for name in city_names]

f, axes = plt.subplots(1, 5,figsize=(15,8))
sns.despine(left=True)
for axis, name, city_df in zip(axes, city_names, (df1, df2, df3, df4, df5)):
    panel = sns.regplot(x=city_df['hoa'], y=city_df['rent_amount'],
                        line_kws={'color': 'r'}, ax=axis)
    panel.set(title=name)
plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25, wspace=0.35)

plt.show()

The scatter plots with the fitted regression lines show that Porto Alegre has the strongest linear trend between the increase of the hoa value and the increase of the rent value. However, due to the dispersion, there is no clear linear correlation, so we cannot assert that properties with a higher hoa rate have the highest rent amount.

## 4.7 Have furnished properties a higher rent value?


Yes, furnished properties are on average R$ 1,300 more expensive. Evaluating by cities, we noticed in the graph that the pattern is the same: furnished properties are more expensive.

In [None]:
df.groupby('furniture')['rent_amount'].mean().sort_values(ascending = False)

In [None]:
# Mean rent per city, one panel per `furniture` value.
# `catplot` is figure-level and creates its own figure, so no `plt.figure`
# call is made first (it would only leave an empty figure behind).
sns.catplot(x ='city',y ='rent_amount',col = 'furniture',kind= 'bar',ci= False, data = df)
plt.show()

## 4.8 Does the number of parking spaces impact the rent amount?

Yes, there is an influence on the rent value up to a limit of 7 parking spaces. However, when we have 8 parking spaces the rent value is a little less than 7 and with 10 parking spaces it is less than 2 parking spaces. That is, from 8 parking spaces the amount of parking spaces no longer interferes with the rent amount.

In [None]:
# Mean rent for each number of parking spaces.
fig, ax = plt.subplots(figsize=(10,4))
sns.barplot(x='parking_spaces', y='rent_amount', data=df, ci=False, ax=ax)
plt.show()

## 4.9 What is the influence of the number of rooms on area size?

In [None]:
# Mean area for each number of rooms.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='rooms', y='area', data=df, palette='GnBu_d', ax=ax)
ax.set_xlabel('Rooms', fontsize=13)
ax.set_ylabel('Area', fontsize=13)
plt.show()

Up to 5 rooms there is an influence on the area, however above 5 rooms there is a tendency of stabilization in the size of the area.

## 4.10 What is the influence of the number of bathrooms on the area?

In [None]:
# Mean area for each number of bathrooms.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='bathroom', y='area', data=df, palette='GnBu_d', ax=ax)
ax.set_xlabel('Bathroom', fontsize=13)
ax.set_ylabel('Area', fontsize=13)
plt.show()

There is a strong correlation between the number of bathrooms and the size of the area.

In [None]:
# Correlation heatmap restricted to bathroom, rooms and area.
size_columns = ['bathroom', 'rooms', 'area']
dfcor = df[size_columns]
plt.figure(figsize=(6,6))
sns.set(font_scale=1)
size_corr = dfcor.corr()
sns.heatmap(size_corr, annot=True, cmap="flare")
plt.show()

The correlation matrix above shows that the variable "bathroom" actually has a slightly stronger correlation with the area than the variable "rooms".

## 4.11 Rental Classes

Using the descriptive statistics of the rent value variable, we will create a new target variable: rental class (rent_class). It categorizes the rent value into cheap, medium, expensive and very expensive.

In [None]:
# Bin the rent amount into four labelled classes; `include_lowest=True`
# makes the first bin include its lower edge (450).
rent_bins = [450, 1530, 2661, 5000, 20000]
rent_labels = ['cheap', 'medium', 'expensive', 'very expensive']
df['rent_class'] = pd.cut(
    x=df['rent_amount'],
    bins=rent_bins,
    labels=rent_labels,
    include_lowest=True,
)

In [None]:
df.head()

In [None]:
# Horizontal bar chart of the rental classes, annotated with each class's
# share of all listings.
plt.figure(figsize=(12,8))
class_counts = df['rent_class'].value_counts().sort_values()
ax = class_counts.plot(kind="barh")
# Total number of listings = sum of all bar lengths.
total = sum([bar.get_width() for bar in ax.patches])
for bar in ax.patches:
    share = round((bar.get_width()/total)*100, 2)
    ax.text(bar.get_width()+.3, bar.get_y()+.20,
            str(share)+'%',
            fontsize=10, color='black')
ax.grid(axis="x")
plt.suptitle('Rental Classes', fontsize=20)
plt.show()

We noted that each rental class represents approximately 1/4 of the entire dataset.

## 4.12 Available properties by number of rooms

In [None]:
# Number of listings per room count and city; every listing with 5 or more
# rooms is pooled into the value 5.
df['5rooms_more'] = df.rooms.clip(upper=5)
plt.figure(figsize=(15,6))
ax=sns.countplot(x =df['5rooms_more'] , hue = df['city'], data=df)
plt.xlabel("Rooms", fontsize=12)
plt.ylabel("Properties Qty", fontsize=12)
# The last bucket holds 5 rooms *or more*, so the title says so.
plt.title("Properties by Rooms: 1, 2, 3, 4 and 5 or more rooms", fontsize=20)
plt.show()

According to the bar chart, properties with 1 to 3 rooms are the most commonly offered for rent. São Paulo has the majority of the properties.

## 4.13 Contribution values to the total rent price

In [None]:
# Correlation heatmap of the individual fees and the total.
fee_columns = ['hoa', 'rent_amount', 'property_tax', 'fire_insurance', 'total']
dfcor2 = df[fee_columns]
plt.figure(figsize=(10, 10))
fee_corr = dfcor2.corr()
sns.heatmap(fee_corr, annot=True, cmap="Oranges")
plt.show()

In addition to the rent amount, fire insurance showed a strong correlation with the total amount. This may be due to the fact that the total amount of the rent may be strongly related to the value of the apartment to be insured.