# Importing Libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

### The task is to identify the best investment in the city by comparing sale and rent prices: which kind of property and which district offer the best return rate. The return rate is defined as the rent value divided by the sale value, both normalized by size (price per square meter), i.e. how much rent is returned for each Brazilian Real invested in the purchase.


# Loading the CSV file

In [None]:
# Location of the listings CSV, relative to the notebook.
DATA_PATH = '../input/sao-paulo-real-estate-sale-rent-april-2019/sao-paulo-properties-april-2019.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the raw listings.
df.head(n=5)

# Data Wrangling

### Verifying missing values

In [None]:
# Number of missing entries in each column.
df.isna().sum()

### Verifying types of variables

In [None]:
# Column dtypes and non-null counts for the whole frame.
df.info()

### Verifying dummy values in discrete, categorical and boolean variables

In [None]:
# List the column names so each one can be inspected below.
df.columns

In [None]:
# Distinct values of each discrete, categorical and boolean column, listed in one
# cell instead of one copy-pasted cell per column.
columns_to_inspect = [
    'Rooms', 'Toilets', 'Suites', 'Parking', 'Elevator', 'Furnished',
    'Swimming Pool', 'New', 'Property Type', 'District',
]
for column in columns_to_inspect:
    print(f"{column}: {df[column].unique()}\n")

### Splitting districts and cities

In [None]:
# 'District' values are expected to look like '<district>/<city>'.
# Split on the LAST '/' only, so a district name that itself contains a slash cannot
# produce extra columns, and reindex to exactly two columns so the assignment never
# fails on a shape mismatch (a value without any '/' simply gets a missing City).
district_city = df['District'].str.rsplit('/', n=1, expand=True).reindex(columns=[0, 1])
df[['District', 'City']] = district_city

In [None]:
# Check the first five rows after separating district and city.
df.head(n=5)

In [None]:
# Distinct city names found after the split.
pd.unique(df['City'])

There is only one city and one type of property, so these variables can be removed.

In [None]:
# Remove the two constant columns.
df = df.drop(columns=['City', 'Property Type'])

### Verifying the coordinates

In [None]:
# Plot every listing by its coordinates to spot implausible positions.
sns.scatterplot(x='Longitude', y='Latitude', data=df)

The city of Sao Paulo is located close to -23° latitude / -46° longitude, so the plot shows several inconsistent values.

In [None]:
# Listings whose longitude is greater than -10.
df.query('Longitude > -10')

Records without coordinates were filled with zero.

In [None]:
# Listings whose latitude is lower than -30.
df.query('Latitude < -30')

Here we see records whose latitude and longitude values are swapped.

In [None]:
# Listings where at least one of the two coordinates is exactly zero.
has_zero_coordinate = df[['Longitude', 'Latitude']].eq(0).any(axis=1)
df[has_zero_coordinate]

There is a total of 881 missing coordinate values and several others with wrong values. However, as the data is already located by district, we consider that enough to cluster these data. So we can drop the coordinates in order to avoid mistakes in the future predictive model, assuming that the district is enough to categorize the properties.

In [None]:
# Discard both coordinate columns.
df = df.drop(columns=['Latitude', 'Longitude'])

In [None]:
# Check the first five rows once the coordinates are gone.
df.head(n=5)

### Evaluating numerical variables by Negotiation Type

In [None]:
# Summary statistics of price, condo fee and size for rent listings.
df.loc[df['Negotiation Type'].eq('rent'), ['Price', 'Condo', 'Size']].describe()

In [None]:
# Summary statistics of price, condo fee and size for sale listings.
df.loc[df['Negotiation Type'].eq('sale'), ['Price', 'Condo', 'Size']].describe()

### Creating two new data frames based on negotiation type

In [None]:
# One frame per negotiation type: dfr holds the rent listings, dfs the sale listings.
negotiation = df['Negotiation Type']
dfr = df.loc[negotiation.eq('rent')]
dfs = df.loc[negotiation.eq('sale')]

In [None]:
# Distribution of rent prices; titled so it can be told apart from the sale histogram.
fig, ax = plt.subplots(figsize=(12, 8), dpi=150)
sns.histplot(data=dfr, x='Price', bins=50, kde=True, ax=ax)
ax.set_title("Price - Rent");

In [None]:
# Distribution of sale prices; titled so it can be told apart from the rent histogram.
fig, ax = plt.subplots(figsize=(12, 8), dpi=150)
sns.histplot(data=dfs, x='Price', bins=50, kde=True, ax=ax)
ax.set_title("Price - Sale");

### Creating a new variable - Price per square meter for rent and sale

In [None]:
# Price per square meter for each listing.
# `assign` returns new frames instead of writing a column into `dfr`/`dfs`, which are
# filtered selections of `df`; the in-place column write is what makes pandas emit
# SettingWithCopyWarning.
dfr = dfr.assign(**{'Price m2': dfr['Price'] / dfr['Size']})
dfs = dfs.assign(**{'Price m2': dfs['Price'] / dfs['Size']})

In [None]:
# Ten rent listings with the highest price per square meter.
dfr.sort_values(by='Price m2', ascending=False).head(10)

In [None]:
# Ten sale listings with the highest price per square meter.
dfs.sort_values(by='Price m2', ascending=False).head(10)

### Describing the Price m2 variable

In [None]:
# Summary statistics of the rent price per square meter.
dfr.loc[:, 'Price m2'].describe()

In [None]:
# Summary statistics of the sale price per square meter.
dfs.loc[:, 'Price m2'].describe()

In [None]:
# Box plot of the rent price per square meter; titled so it is not confused with the sale plot.
dfr['Price m2'].plot.box(title="Price m2 - Rent")

In [None]:
# Box plot of the sale price per square meter; titled so it is not confused with the rent plot.
dfs['Price m2'].plot.box(title="Price m2 - Sale")

In [None]:
# Mean rent price per square meter of each district, highest first.
fig, ax = plt.subplots(figsize=(18, 8), dpi=200)
rent_m2_ranked = dfr.groupby('District')['Price m2'].mean().sort_values(ascending=False)
rent_m2_ranked.plot.bar(ax=ax)
ax.set_title("Price m2 - Rent")
ax.set_ylabel("Price m2");

In [None]:
# Mean sale price per square meter of each district, highest first.
fig, ax = plt.subplots(figsize=(18, 8), dpi=200)
sale_m2_ranked = dfs.groupby('District')['Price m2'].mean().sort_values(ascending=False)
sale_m2_ranked.plot.bar(ax=ax)
ax.set_title("Price m2 - Sale")
ax.set_ylabel("Price m2");

We can see there are several outliers, but removing them based on the entire sample would introduce a mistake, because some districts traditionally have high values. So, in order to remove the outliers, it is necessary to consider two things: the value per square meter and the values grouped by district.

In [None]:
# Distinct district names present in the rent listings.
lista_district = pd.unique(dfr['District'])

In [None]:
# Display the array of district names collected from the rent listings.
lista_district

In [None]:
# Per-district outlier removal for the rent listings, done in one vectorized pass
# instead of re-filtering the whole frame several times for every district.
# A listing is dropped when its 'Price m2' lies more than IQR_FACTOR interquartile
# ranges above or below the mean 'Price m2' of its own district.
IQR_FACTOR = 3
rent_by_district = dfr.groupby('District')['Price m2']
district_mean = dfr['District'].map(rent_by_district.mean())
district_iqr = dfr['District'].map(rent_by_district.quantile(.75) - rent_by_district.quantile(.25))
upper_fence = district_mean + district_iqr * IQR_FACTOR
lower_fence = district_mean - district_iqr * IQR_FACTOR
# Comparisons against NaN evaluate to False, so rows with a missing price or a
# missing fence are kept, exactly as the per-district loop kept them.
is_outlier = (dfr['Price m2'] > upper_fence) | (dfr['Price m2'] < lower_fence)
dfr = dfr[~is_outlier]

In [None]:
# Box plot of the rent price per square meter after the per-district outlier removal.
dfr['Price m2'].plot.box(title="Price m2 - Rent (outliers removed)")

In [None]:
# Per-district outlier removal for the sale listings, done in one vectorized pass.
# Grouping `dfs` by its own 'District' column covers every district that has sale
# listings; iterating over the district list taken from the rent frame would leave
# districts that only appear in the sale data unfiltered.
# A listing is dropped when its 'Price m2' lies more than IQR_FACTOR interquartile
# ranges above or below the mean 'Price m2' of its own district.
IQR_FACTOR = 3
sale_by_district = dfs.groupby('District')['Price m2']
district_mean = dfs['District'].map(sale_by_district.mean())
district_iqr = dfs['District'].map(sale_by_district.quantile(.75) - sale_by_district.quantile(.25))
upper_fence = district_mean + district_iqr * IQR_FACTOR
lower_fence = district_mean - district_iqr * IQR_FACTOR
# Comparisons against NaN evaluate to False, so rows with a missing price or a
# missing fence are kept rather than dropped.
is_outlier = (dfs['Price m2'] > upper_fence) | (dfs['Price m2'] < lower_fence)
dfs = dfs[~is_outlier]

In [None]:
# Box plot of the sale price per square meter after the per-district outlier removal.
dfs['Price m2'].plot.box(title="Price m2 - Sale (outliers removed)")

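Instead of considering 1.5 times the interquartile range, we considered 3 times in order to remove just the extreme values. Note that the cut-offs are centered on each district's mean price per square meter (mean ± 3 × IQR), not on the quartiles.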

# Defining the return rate based on districts

In [None]:
# Return rate per district: mean rent price per m2 as a percentage of the mean sale price per m2.
rent_m2_by_district = dfr.groupby('District')['Price m2'].mean()
sale_m2_by_district = dfs.groupby('District')['Price m2'].mean()
rrd = rent_m2_by_district.div(sale_m2_by_district).mul(100)

In [None]:
# First fifty districts when ranked by return rate, highest first.
rrd.sort_values(ascending=False).head(50)

Some peripheral districts have a good return rate because the sale price per m2 is so low.

In [None]:
# Districts ranked after the first fifty, leaving out those without a computable rate.
rrd.sort_values(ascending=False).iloc[50:].dropna()

In [None]:
# Return rate of every district with a computable value, highest first.
fig, ax = plt.subplots(figsize=(18, 8), dpi=200)
rrd_ranked = rrd.sort_values(ascending=False).dropna()
rrd_ranked.plot.bar(ax=ax)
ax.set_title("Return Rate by Districts")
ax.set_ylabel("Return Rate");

# Defining the return rate based on rooms

In [None]:
# Return rate per room count: mean rent price per m2 as a percentage of the mean sale price per m2.
rent_m2_by_rooms = dfr.groupby('Rooms')['Price m2'].mean()
sale_m2_by_rooms = dfs.groupby('Rooms')['Price m2'].mean()
rrr = rent_m2_by_rooms.div(sale_m2_by_rooms).mul(100)

In [None]:
# Return rate by room count, highest first, without the undefined entries.
(
    rrr
    .sort_values(ascending=False)
    .dropna()
)

One bedroom has the best return rate

In [None]:
# Return rate for each room count, highest first, with horizontal tick labels.
fig, ax = plt.subplots(figsize=(8, 4), dpi=100)
rrr_ranked = rrr.sort_values(ascending=False).dropna()
rrr_ranked.plot.bar(ax=ax)
ax.set_title("Return Rate by Bedrooms")
ax.set_ylabel("Return Rate")
plt.xticks(rotation = 0);

# Defining the return rate based on districts and rooms

In [None]:
# Return rate per (district, room count) pair: mean rent price per m2 as a percentage
# of the mean sale price per m2.
group_keys = ['District', 'Rooms']
rent_m2_by_pair = dfr.groupby(group_keys)['Price m2'].mean()
sale_m2_by_pair = dfs.groupby(group_keys)['Price m2'].mean()
rrdr = rent_m2_by_pair.div(sale_m2_by_pair).mul(100)

In [None]:
# Fifty highest return rates by (district, rooms). `nlargest` already returns its
# result ordered from highest to lowest, so sorting the full series first is redundant.
rrdr.nlargest(50)

In [None]:
# Fifty lowest return rates by (district, rooms). `nsmallest` already returns its
# result ordered from lowest to highest, so sorting the full series first is redundant.
rrdr.nsmallest(50)

In [None]:
# Fifty best (district, rooms) combinations by return rate, with vertical tick labels.
fig, ax = plt.subplots(figsize=(8, 4), dpi=100)
rrdr_top = rrdr.sort_values(ascending=False).dropna().head(50)
rrdr_top.plot.bar(ax=ax)
ax.set_title("Return Rate by District and Bedrooms - Top 50")
ax.set_ylabel("Return Rate")
plt.xticks(rotation = 90);

In general, small apartments have better return rates than big ones. Apartments far from the center of the city have the best return rates, because the rent price per square meter is not so different from that of the best places in the city, while the sale price is much lower. A one-bedroom apartment in the Vila Madalena district has the best return rate in the city.