In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the rental listings CSV into the notebook-wide `data` frame.
csv_path = "../input/brasilian-houses-to-rent/houses_to_rent_v2.csv"
data = pd.read_csv(csv_path)

In [None]:
# Preview the last rows of the loaded table (tail() shows 5 rows by default).
data.tail()

In [None]:
# Dimensions of the loaded table as a (rows, columns) tuple.
data.shape

In [None]:
# Per-column dtypes, to see which columns are numeric and which are object/text.
data.dtypes

### Formatting the column names

In [None]:
# Normalise the headers: trim surrounding whitespace, drop the " (R$)" currency
# suffix and replace remaining spaces with underscores.
cleaned_names = []
for raw_name in data.columns:
    tidy_name = raw_name.strip()
    tidy_name = tidy_name.replace(" (R$)", '')
    tidy_name = tidy_name.replace(' ', '_')
    cleaned_names.append(tidy_name)

data.columns = cleaned_names
data.columns

### No missing values found

In [None]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

### Amount of samples for each city

In [None]:
# Number of listings per city, most frequent first.
counts = data['city'].value_counts()
counts

### Percentage of samples for each city

In [None]:
# Pie chart of each city's share of the listings, labelled with percentages.
plt.figure(figsize=(16, 6))
city_counts = data['city'].value_counts()
city_counts.plot.pie(autopct='%.2f%%')

## **Q1**: Which city is the most pet-friendly?

### First, we get the number of samples that accept animals

In [None]:
# Per-city count of listings whose `animal` value is 'acept'.
accepts_pets = data['animal'] == 'acept'
pet_accept_count = data.loc[accepts_pets, 'city'].value_counts()
pet_accept_count

### Then we calculate the percentage of pet-accepting samples for each city

In [None]:
# Share (in %) of each city's listings that accept animals:
# pet-accepting listings divided by all listings of that city.
pet_dict = {
    city: (accepted / counts[city]) * 100
    for city, accepted in pet_accept_count.items()
}

pet_series = pd.Series(pet_dict)
ranked_cities = pet_series.sort_values(ascending=False)
print(ranked_cities)
ranked_cities.plot(kind='bar')

### So, we can conclude that **Porto Alegre** is the most pet-friendly city

## **Q2**: What is the most common number of rooms, bathrooms and parking spaces?

In [None]:
# Bar chart of how many listings have each number of rooms.
data.rooms.value_counts().plot.bar()

In [None]:
# Bar chart of how many listings have each number of bathrooms.
data.bathroom.value_counts().plot.bar()

In [None]:
# Bar chart of how many listings have each number of parking spaces.
data.parking_spaces.value_counts().plot.bar()

### So, places with 3 rooms, 1 bathroom and 1 parking space are the most common.

## **Q3**: Based on the previous analysis, what is the average rent for these places in each city?

In [None]:
# Keep only the listings with 3 rooms, 1 bathroom and 1 parking space.
has_three_rooms = data['rooms'] == 3
has_one_bathroom = data['bathroom'] == 1
has_one_parking = data['parking_spaces'] == 1
common_places = data.loc[has_three_rooms & has_one_bathroom & has_one_parking]
common_places

In [None]:
# Mean rent of the "common" listings per city: printed, then drawn as a bar chart.
avg_rent_by_city = common_places.groupby('city')['rent_amount'].mean()

plt.title("Avarage rent of common places for each city")
plt.ylabel("Rent")
print(avg_rent_by_city)
avg_rent_by_city.plot(kind='bar')

## **Q4**: What's the mean price of hoa, rent_amount, property_tax, fire_insurance and total columns?

In [None]:
sb.set_style('darkgrid')


def mean_prices_plot(feature):
    """Draw a per-city bar plot of one column of the global `data` frame.

    A new 18x6 figure is opened for every call, so successive calls do not
    draw over each other.

    Parameters
    ----------
    feature : str
        Name of the `data` column to put on the y axis.
    """
    plt.figure(figsize=(18, 6))
    sb.barplot(x=data['city'], y=data[feature])

In [None]:
# Cost-related columns to plot, one figure per column.
price_columns = ['hoa', 'rent_amount', 'property_tax', 'fire_insurance', 'total']
feature_prices = data[price_columns]

for feature in feature_prices.columns:
    mean_prices_plot(feature)


### Looking at the plots above we can see that there are some outliers

#### Let's try to find the Belo Horizonte outliers for the 'hoa' column

In [None]:
# All Belo Horizonte listings, plus the mean and standard deviation of their hoa.
in_belo_horizonte = data['city'] == 'Belo Horizonte'
bh_hoa = data[in_belo_horizonte]
mean_bh_hoa = bh_hoa['hoa'].mean()
std_bh_hoa = bh_hoa['hoa'].std()

summary_template = "Belo Horizonte hoa mean and std -> {} | {}"
print(summary_template.format(mean_bh_hoa, std_bh_hoa))

### We get the samples that are higher than mean+std

In [None]:
# Threshold = mean + one standard deviation of hoa; show the rows above it.
bh_hoa_values = bh_hoa['hoa']
out_hoa = bh_hoa_values.mean() + bh_hoa_values.std()
bh_hoa[bh_hoa_values > out_hoa]

### Those two samples are certainly outliers and are also duplicates of each other, so let's remove them

In [None]:
# Row labels of the Belo Horizonte listings whose hoa exceeds the mean + std threshold.
outliers_index = bh_hoa[bh_hoa.hoa>out_hoa].index

print("Deleted samples: ", outliers_index)
# DataFrame.drop returns a new frame instead of modifying `data` in place, so the
# result has to be assigned back or nothing is removed. All labels are dropped in
# one call; errors='ignore' keeps the cell safe to re-run after the rows are gone.
data = data.drop(index=outliers_index, errors='ignore')

### We'll do the same with the 'property_tax' column for São Paulo samples

In [None]:
# All São Paulo listings, with summary statistics of their numeric columns.
in_sao_paulo = data['city'] == 'São Paulo'
sp_prop_tax = data[in_sao_paulo]
sp_prop_tax.describe()

In [None]:
# Mean and standard deviation of property_tax across the São Paulo listings.
sp_tax_values = sp_prop_tax['property_tax']
mean_sp_tax = sp_tax_values.mean()
std_sp_tax = sp_tax_values.std()

tax_template = "São Paulo property tax mean and std -> {} | {}"
print(tax_template.format(mean_sp_tax, std_sp_tax))

In [None]:
# Threshold = mean + one standard deviation of property_tax; show the rows above it.
out_tax = mean_sp_tax + std_sp_tax
above_tax_threshold = sp_prop_tax['property_tax'] > out_tax
sp_prop_tax[above_tax_threshold]

In [None]:
# Row labels of the São Paulo listings whose property_tax exceeds the mean + std threshold.
outliers_index = sp_prop_tax[sp_prop_tax.property_tax>out_tax].index
print("Deleted samples: ", outliers_index)
# DataFrame.drop returns a new frame instead of modifying `data` in place, so the
# result has to be assigned back or nothing is removed. All labels are dropped in
# one call; errors='ignore' keeps the cell safe to re-run after the rows are gone.
data = data.drop(index=outliers_index, errors='ignore')

## **Q5**: What is the correlation between the features?

In [None]:
# Pearson correlation is only defined for numeric columns. Selecting them explicitly
# works on every pandas version and avoids the ValueError that pandas >= 2.0 raises
# when corr() is given text columns such as 'city'.
numeric_data = data.select_dtypes(include=['number', 'bool'])
correlations = numeric_data.corr(method='pearson')
plt.figure(figsize = (16, 8))
# Pearson coefficients lie in [-1, 1]; use the full range so negative correlations
# are not clipped to the colour used for 0.
sb.heatmap(correlations, vmin=-1, vmax=1, annot=True, cmap = plt.cm.RdYlBu_r, linewidths=.7)

## **Q6**: What is the relationship between 'hoa', 'rent_amount', 'property_tax', 'fire_insurance' and the 'total' column?

In [None]:
# hoa against total, one colour and marker per city.
plt.figure(figsize=(10, 6))
sb.scatterplot(data=data, x='hoa', y='total', hue='city', style='city')

In [None]:
# rent_amount against total, one colour and marker per city.
plt.figure(figsize=(10, 6))
sb.scatterplot(data=data, x='rent_amount', y='total', hue='city', style='city')

In [None]:
# property_tax against total, one colour and marker per city.
plt.figure(figsize=(10, 6))
sb.scatterplot(data=data, x='property_tax', y='total', hue='city', style='city')

In [None]:
# fire_insurance against total, one colour and marker per city.
plt.figure(figsize=(10, 6))
sb.scatterplot(data=data, x='fire_insurance', y='total', hue='city', style='city')