In [None]:
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt

In [None]:
# Only version 2 of the dataset is analysed in this notebook.
df2 = pd.read_csv(
    '../input/brasilian-houses-to-rent/houses_to_rent_v2.csv'
)


In [None]:
# Sum the per-column missing-value counts into one grand total.
nan_total = df2.isna().sum().sum()
print("Number of NaN values in the DataFrame is %s" % nan_total)

In [None]:
# Rows are samples, columns are features.
n_rows, n_cols = df2.shape
print("Shape of the DataFrame is %s samples, %s features" % (n_rows, n_cols))

In [None]:
# Summary of the frame: column names, non-null counts and dtypes.
df2.info()

In [None]:
# How many listings fall into each value of the `animal` column.
df2['animal'].value_counts()

In [None]:
# Describe only the object-typed columns (count / unique / top / freq).
text_columns = df2.select_dtypes(include=['object'])
text_columns.describe()

In [None]:
# Descriptive statistics using pandas' default column selection.
df2.describe()

**Initial Observations:**
* We have no NaN values
* We have 13 features and 10692 instances
* Most columns have an integer data type
* Money amounts are not in USD; they are in Brazil's currency (the Brazilian Real), noted (R$) in the dataset
* Most houses are not furnished
* Most houses allow pets
* More than half of the houses in the dataset are located in São Paulo
* The max value of each column is far from that column's median and mean, which means those values are potential outliers (except for the numeric categorical columns: rooms, bathroom and parking spaces)
* The money here is not in US Dollars but in Brazilian Real (Brazil's currency)

In [None]:
# Pairwise scatter plots / histograms of the frame before any filtering.
sns.pairplot(df2)

**Looks like we have a lot of extremes (you can consider them outliers or typos), like houses with a large area and a low rent price, and vice versa.
I'm going to remove them so we can have more representative plots that generalize to most of the population.**

In [None]:
# Trim extreme values: for every float/int column, except the small discrete
# count columns listed below, keep only the rows whose value is below
# mean + 4 * std of that column (only the upper tail is trimmed).
# Columns are handled one after another on the already-filtered frame, so each
# column's mean/std is computed on the rows that survived the earlier passes.
discrete_cols = ['bathroom', 'rooms', 'parking spaces']
for col in df2.columns:
    dtype_name = str(df2[col].dtype)
    is_numeric = 'float' in dtype_name or 'int' in dtype_name
    # The numeric test is evaluated first and then combined with the exclusion:
    # written inline, `a or b and c` parses as `a or (b and c)`, which would let
    # float columns bypass the exclusion list.
    if is_numeric and col not in discrete_cols:
        upper_bound = df2[col].mean() + df2[col].std() * 4
        df2 = df2[df2[col] < upper_bound]

In [None]:
# Same pairwise plot, drawn again on the current (filtered) state of df2.
sns.pairplot(df2)

**That's better. It looks like we have multicollinearity, since some pairs of variables seem to follow a linear trend; let's check the Pearson correlation table.**

In [None]:
# Pearson correlation heatmap of the numeric columns.
fig, ax = plt.subplots(figsize=(10, 10))
# Restrict to numeric columns explicitly: older pandas dropped non-numeric
# columns silently inside .corr(), pandas >= 2.0 raises on them instead.
corr = df2.select_dtypes(include='number').corr()
sns.heatmap(
    corr,
    center=0,
    vmin=-1,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
    fmt='.2f',
    cmap="coolwarm",
    ax=ax,  # draw on the axes created above instead of relying on pyplot state
)


What is multicollinearity? Multicollinearity is a statistical phenomenon in which two or more variables in a regression model depend on one another in such a way that one can be linearly predicted from the others with a high degree of accuracy.
It is mostly a problem for coefficient-based models such as linear regression (including ones fitted by gradient descent). For example, if we are doing a linear regression and most of the independent variables have a correlation above .9 with each other, the model will have a hard time setting the coefficients, since the highly correlated variables carry nearly the same information about the target variable; this skews the interpretation of the coefficients. Some of the solutions are: ignore it, if your goal is only to make predictions and you don't need to understand the role of each independent variable; perform dimensionality reduction; or remove the variables that give roughly the same information (for example, if you are trying to predict the salary of a lawyer and you have age and years of experience as features, they will be highly correlated most of the time and carry similar information, so go ahead and remove one of them).
The best article I found that explains multicollinearity: https://statisticsbyjim.com/regression/multicollinearity-in-regression-analysis/

In [None]:
# Replace the column labels with short names.
# The assignment is positional, so the list must contain exactly one name per
# existing column, in the existing column order.
renamed_columns = [
    'city',
    'area',
    'rooms',
    'bathroom',
    'parking spaces',
    'floor',
    'animal',
    'furniture',
    'hoa',
    'rent amount',
    'property tax',
    'fire insurance',
    'total',
]
df2.columns = renamed_columns

In [None]:
# One count plot per categorical-like column, bars sorted by frequency.
catc = ['city', 'rooms', 'bathroom', 'parking spaces']
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 11))
for col, axes in zip(catc, ax.flatten()):
    # Pass the data as `x=`: a positional data vector is deprecated in seaborn
    # 0.11 and no longer interpreted as x in later releases.
    sns.countplot(x=df2[col], order=df2[col].value_counts().index, ax=axes)

**Some Observations:**
* The higher the number of bathrooms in a house, the less frequent it is in the dataset
* Houses with 1 to 4 rooms tend to be more common than houses with 5 or more rooms

In [None]:
# Count plot of the floor column, with the '-' placeholder mapped to floor 0.
fig, ax = plt.subplots(figsize=(12, 6))
# Assign the result back to the frame: `df2.floor.replace(..., inplace=True)`
# works on an intermediate Series and is not guaranteed to update df2.
# The column is also converted to a numeric dtype so it no longer mixes str and
# int values. Assumes every value other than '-' is a numeric string;
# pd.to_numeric raises otherwise.
df2 = df2.assign(floor=pd.to_numeric(df2['floor'].replace('-', '0')))
# `x=` keyword: a positional data vector is deprecated/removed in newer seaborn.
sns.countplot(x=df2['floor'], order=df2['floor'].value_counts().index, ax=ax)

**Generally the higher the floor category number the less frequent it is**

In [None]:
# HOA fee (homeowners association fee): an amount of money that must be paid
# monthly by owners of certain types of residential properties.
# The box plot below shows its distribution per city.
fig, ax = plt.subplots(figsize=(13, 7))
sns.boxplot(x='city', y='hoa', data=df2, ax=ax)

**Home owners pay the lowest HOA fee in Porto Alegre**

In [None]:
# Distribution of the total monthly cost per city, plus two spread measures.
fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(data=df2, x='city', y='total', palette='spring_r', ax=ax)

for city in df2.city.unique():
    city_total = df2.loc[df2.city == city, 'total']
    # Mean absolute deviation around the mean, computed by hand because
    # Series.mad() was removed in pandas 2.0.
    mad = (city_total - city_total.mean()).abs().mean()
    print(city+' = MAD : {}, STD : {}'.format(mad, city_total.std()))


**Observations:**
* São Paulo: largest IQR & highest median
* Belo Horizonte: medium IQR & medium median
* Rio de Janeiro: medium IQR & medium median
* Porto Alegre: smallest IQR & lowest median
* Campinas: small IQR & low median
* Houses in São Paulo tend to have a high rent price, and prices tend to vary a lot from the 'average' price compared to other cities
* Houses in Porto Alegre tend to have the lowest rent price, and prices do not vary a lot from the 'average' compared to other cities
* **HOLIDAY TIP**: if you are thinking of going to Brazil and renting a house in one of these 5 cities and you are on a budget, you should go to Campinas or Porto Alegre :)))
* The boxplots of the rent amount, fire insurance and property tax features tend to have the same characteristics as the boxplot of the total column

In [None]:
# Bucket the area column by its deciles and plot, per city, how many listings
# fall into each bucket.
# Bin edges: 0, the 10th..100th percentiles of area (step 10), and a very large
# sentinel standing in for +inf; all truncated to integers.
# NOTE(review): the 100th percentile is the maximum area, so the final
# "(+inf)" bucket looks like it can only be non-empty if truncation lowers that
# edge below the maximum. Confirm this is intended.
bins = np.array(
    [0] + np.percentile(df2.area.values, range(10, 110, 10)).tolist() + [9e10]
).astype(np.int64)
labels = [
    "area({}-{})".format(low, high) for low, high in zip(bins[:-2], bins[1:-1])
] + ['area({}-+inf)'.format(bins[-2])]
df2['AreaLabel'] = pd.cut(df2.area, bins=bins, labels=labels)

# Rows: cities, columns: area buckets, cells: number of listings.
g = pd.crosstab(df2['city'], df2['AreaLabel'])

fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))
ax = ax.flatten()

cities = df2.city.value_counts().index  # most frequent city first
for i, city in enumerate(cities):
    sns.barplot(x=g.columns, y=g.loc[city].values, ax=ax[i])
    ax[i].tick_params(labelrotation=45)
    ax[i].set_title(city)

# The grid may have more panels than there are cities; hide the leftovers
# instead of showing empty frames.
for unused in ax[len(cities):]:
    unused.set_visible(False)

# Lay out once, after everything (including rotated tick labels) is drawn.
fig.tight_layout(pad=1.8)


**Observations:**

*(Keep in mind the assumption that the area varies between [0, 2000].)*
* São Paulo tends to have nearly as many houses with an area between (0-40) as houses with an area over 300, and generally in São Paulo the larger the Area Label of the house you are searching for, the more available it is
* Rio de Janeiro: houses tend to be more available in areas between (0-40) and (50-157)
* Belo Horizonte: the larger the Area Label of the house you are searching for, the more available it is
* Porto Alegre: generally, the smaller the Area Label of the house you are searching for, the more available it is
* Campinas: generally, the smaller the Area Label you want your house to have, the more available it is,
  but there is a shortage of houses with an area between (0-40)


In [None]:
# One count plot of the `animal` column per city, with a fixed bar order so the
# facets are comparable.
g = sns.FacetGrid(df2, col="city")
g.map(sns.countplot, 'animal', order=['acept', 'not acept'])
# Percentage of listings in each city that allow pets.
for city in df2.city.unique():
    city_data = df2[df2.city == city]
    # Mean of a boolean mask gives the share directly and is simply 0 when a city
    # has no 'acept' rows (value_counts()['acept'] would raise KeyError).
    pet_share = (city_data.animal == 'acept').mean()
    # `{:.0%}` multiplies by 100; the previous output showed the raw fraction
    # (e.g. 0.84) followed by a '%' sign.
    print('in city {} {:.0%} of houses allow pets'.format(city, pet_share))

**Observations:**
* Most houses to rent in each city allow pets
* 84% of houses to rent in Porto Alegre allow pets, which makes it the city in this dataset with the highest share of rental houses that allow pets

In [None]:
# Show the current column labels of the frame.
df2.columns

In [None]:
# Count of listings per number of rooms, one facet per city.
g = sns.FacetGrid(df2, col='city')
# Give every facet the same explicit category order: without `order`, each
# facet derives the bar positions from its own subset, so the same bar position
# can stand for different room counts in different cities.
g.map(sns.countplot, 'rooms', order=sorted(df2.rooms.unique()))


**Observations:**
* In São Paulo, houses with 1 to 4 rooms are the most common
* In Porto Alegre, houses with 1 to 3 rooms are the most common
* In Rio de Janeiro, houses with 1 to 3 rooms are the most common
* In Campinas, houses with 1 to 3 rooms are the most common
* In Belo Horizonte, houses with 2 to 4 rooms are the most common

**Now we are going to perform KMeans clustering on our data**

In [None]:
# Prepare the frame for clustering: drop the helper bucket column and encode the
# string-valued columns as integers.
# Built as a new frame rather than mutated in place, and `errors='ignore'`
# keeps the cell re-runnable once AreaLabel has already been dropped.
df2 = (
    df2
    .drop(columns='AreaLabel', errors='ignore')
    .replace(to_replace={'animal': {'acept': 1, 'not acept': 0},
                         'furniture': {'furnished': 1, 'not furnished': 0},
                         'city': {'São Paulo': 0, 'Rio de Janeiro': 1, 'Belo Horizonte': 2,
                                  'Porto Alegre': 3, 'Campinas': 4}})
    # Make the integer dtype explicit instead of relying on replace() to
    # downcast. Assumes the mappings above cover every value in these columns;
    # astype raises if a string is left over.
    .astype({'animal': 'int64', 'furniture': 'int64', 'city': 'int64'})
)
df2.head()

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit KMeans for k = 1 .. max_k - 1 and record the within-cluster
# sum of squared distances (inertia) of each fit.
wcss = []
max_k = 20
k_values = list(range(1, max_k))
# Exclude a previously assigned `cluster` column so re-running the notebook does
# not feed old labels back in as a feature.
elbow_features = df2.drop(columns='cluster', errors='ignore').values
for k in k_values:
    # - `n_jobs` is not passed: it was deprecated in scikit-learn 0.23 and
    #   removed in 1.0.
    # - `n_init=10` pins the historical default number of restarts.
    # - `random_state` makes the curve reproducible between runs.
    # - verbose logging is left off so the output is not flooded.
    kmean = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmean.fit(elbow_features)
    wcss.append(kmean.inertia_)

fig, ax = plt.subplots(figsize=(11, 7))
sns.lineplot(x=k_values, y=wcss, ax=ax, marker='.', markersize=15)
ax.set_xticks(k_values)
ax.set_xlabel('K')
ax.set_ylabel('Total With-in Cluster Sum Squared Distances')
ax.set_title('Elbow Method')

In [None]:
# We are going to choose k = 4, but you could take 5 or 3.
optimal_k = 4
# `n_jobs` is not passed (removed in scikit-learn 1.0); `n_init` and
# `random_state` are pinned so the cluster labels are reproducible.
kmean = KMeans(n_clusters=optimal_k, n_init=10, random_state=0)
# Fit on every column except a previously assigned `cluster` label, so that
# re-running this cell does not use the old labels as a feature.
cluster_features = df2.drop(columns='cluster', errors='ignore')
df2['cluster'] = kmean.fit_predict(cluster_features.values)

In [None]:
# Bar chart of how many rows were assigned to each cluster.
cluster_sizes = df2.cluster.value_counts()
fig, ax = plt.subplots(figsize=(9, 5))
sns.barplot(x=cluster_sizes.keys(), y=cluster_sizes.values, ax=ax)
ax.set_xlabel('Cluster')
ax.set_ylabel('Frequency')

In [None]:
# Per-cluster mean of the area and of each money column.
profile_columns = ['area', 'hoa', 'rent amount', 'property tax', 'fire insurance', 'total']
df2.groupby('cluster')[profile_columns].mean()

<center>**Stay Home and Machine Learn :))**</center>