In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from matplotlib import pyplot
from sklearn.preprocessing import MinMaxScaler
from scipy import stats

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for pasta, _subpastas, arquivos in os.walk('/kaggle/input'):
    caminhos = (os.path.join(pasta, arquivo) for arquivo in arquivos)
    for caminho in caminhos:
        print(caminho)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the data; the first CSV column is used as the index.
base = pd.read_csv('../input/aula-2-ia-dataset/CasasParaAlugar.csv', index_col = 0)
# Keep an independent snapshot of the raw data. A plain `original = base` would only
# alias the same object, so the in-place imputation done later on `base` would also
# change `original` and the before/after comparison at the end would be misleading.
original = base.copy()
base.head()


In [None]:
# Quantitative (numeric) columns and their summary statistics
dados_quantitativos = [
    'area',
    'rooms',
    'bathroom',
    'hoa (R$)',
    'rent amount (R$)',
    'property tax (R$)',
    'fire insurance (R$)',
    'parking spaces',
    'total (R$)',
]

base.loc[:, dados_quantitativos].describe()


In [None]:
# Qualitative (categorical) columns and their summary statistics
dados_qualitativos = [
    'city',
    'floor',
    'animal',
    'furniture',
]

base.loc[:, dados_qualitativos].describe()


In [None]:
# Correlation heat map.
# Only numeric columns are correlated: since pandas 2.0 `DataFrame.corr()` no longer
# drops string columns silently and raises on them instead.
sns.heatmap(base.select_dtypes(include='number').corr())

In [None]:
# Scatter plots.
# Each entry is (x column, y column, title); the axis labels are taken from the column
# names so each figure shows which variable is on which axis.
pares_dispersao = [
    ('total (R$)', 'hoa (R$)', 'Gráfico de Dispersão entre hoa (R$) e total (R$)'),
    ('rent amount (R$)', 'fire insurance (R$)', 'Gráfico de Dispersão entre rent amount (R$) e fire insurance (R$)'),
]
for coluna_x, coluna_y, titulo in pares_dispersao:
    pyplot.scatter(base[coluna_x], base[coluna_y])
    pyplot.title(titulo)
    pyplot.xlabel(coluna_x)
    pyplot.ylabel(coluna_y)
    pyplot.show()

In [None]:
# Distribution plots of the 'rooms' and 'bathroom' columns
sns.displot(data=base, x='rooms')
sns.displot(data=base, x='bathroom')

In [None]:
# Data cleaning
#
# Missing values: count per column, followed by the same figure as a percentage of rows.
faltantes = base.isna().sum()
porcentagem_faltante = 100 * faltantes / len(base)
print(faltantes, "\n", porcentagem_faltante, sep="\n")

In [None]:
# Replace missing quantitative values with the mean of their column.
# A single loop replaces nine copy-pasted stanzas, one of which reused the name
# `totalMedia` for the parking-spaces mean.
colunas_para_media = [
    'area',
    'rooms',
    'bathroom',
    'property tax (R$)',
    'rent amount (R$)',
    'fire insurance (R$)',
    'hoa (R$)',
    'total (R$)',
    'parking spaces',
]

# Column name -> mean used as the fill value (kept so the values can be inspected).
medias = {}
for coluna in colunas_para_media:
    medias[coluna] = base[coluna].mean()
    base[coluna] = base[coluna].fillna(medias[coluna])

# Check the remaining missing values
base.isnull().sum()

In [None]:
# Frequency of each category in the qualitative columns, separated by blank lines
for posicao, coluna in enumerate(('city', 'animal', 'furniture')):
    if posicao > 0:
        print('\n')
    print(base[coluna].value_counts())


In [None]:
# Fill missing qualitative values with a fixed category per column
preenchimento = {
    'furniture': 'not furnished',
    'animal': 'acept',
    'city': 'São Paulo',
}
for coluna, valor in preenchimento.items():
    base[coluna] = base[coluna].fillna(valor)

# Rows without a floor value are discarded instead of being filled
base = base.dropna(subset=['floor'])

# Check the remaining missing values
base.isna().sum()

In [None]:
# Outliers
#
# Skewness of each numeric column.
# `numeric_only=True` is required because the frame still holds string columns, on
# which `DataFrame.skew()` raises in pandas >= 2.0.
assimetria = base.skew(numeric_only=True)
print(assimetria)

In [None]:
# Scatter plot used to look for outliers in hoa (R$) and total (R$).
# Axis labels are set so the figure shows which column is on which axis.
pyplot.scatter(base['total (R$)'], base['hoa (R$)'])
pyplot.title('Gráfico de Dispersão entre hoa e total')
pyplot.xlabel('total (R$)')
pyplot.ylabel('hoa (R$)')
pyplot.show()

In [None]:
# Keep only the rows whose hoa (R$) and total (R$) are both below 20000
hoa_ok = base['hoa (R$)'] < 20000
total_ok = base['total (R$)'] < 20000
base = base.loc[hoa_ok & total_ok]


In [None]:
# Scatter plot used to look for outliers in hoa (R$) and total (R$).
# Axis labels are set so the figure shows which column is on which axis.
pyplot.scatter(base['total (R$)'], base['hoa (R$)'])
pyplot.title('Gráfico de Dispersão entre hoa e total')
pyplot.xlabel('total (R$)')
pyplot.ylabel('hoa (R$)')
pyplot.show()

# Skewness again; `numeric_only=True` skips the string columns, on which
# `DataFrame.skew()` raises in pandas >= 2.0.
assimetria = base.skew(numeric_only=True)
print(assimetria)

In [None]:
# Scatter plot used to look for outliers in rent amount (R$) and fire insurance (R$).
# Axis labels are set so the figure shows which column is on which axis.
pyplot.scatter(base['rent amount (R$)'], base['fire insurance (R$)'])
pyplot.title('Gráfico de Dispersão entre rent amount e fire insurance')
pyplot.xlabel('rent amount (R$)')
pyplot.ylabel('fire insurance (R$)')
pyplot.show()

In [None]:
# Keep only the rows with rent amount (R$) below 17500 and fire insurance (R$) below 300
aluguel_ok = base['rent amount (R$)'] < 17500
seguro_ok = base['fire insurance (R$)'] < 300
base = base.loc[aluguel_ok & seguro_ok]


In [None]:
# Scatter plot used to look for outliers in rent amount (R$) and fire insurance (R$).
# Axis labels are set so the figure shows which column is on which axis.
pyplot.scatter(base['rent amount (R$)'], base['fire insurance (R$)'])
pyplot.title('Gráfico de Dispersão entre rent amount e fire insurance')
pyplot.xlabel('rent amount (R$)')
pyplot.ylabel('fire insurance (R$)')
pyplot.show()

# Skewness again; `numeric_only=True` skips the string columns, on which
# `DataFrame.skew()` raises in pandas >= 2.0.
assimetria = base.skew(numeric_only=True)
print(assimetria)

In [None]:
# Inspect property tax (R$) with a z-test: raw values on the left, absolute z-scores
# on the right, with a red line at the |z| = 3 cut-off.
valores = base['property tax (R$)']
valores_z = np.abs(stats.zscore(valores))

fig, (eixo_real, eixo_z) = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 5))
eixo_real.hist(valores)
eixo_real.set_xlabel("Valores reais de property tax (R$)")
eixo_z.hist(valores_z)
eixo_z.set_xlabel("Valores Z de property tax (R$)")
eixo_z.vlines(x=3, ymin=0, ymax=10000, colors='red')
pyplot.show()

In [None]:
# Keep only the rows whose property tax (R$) has an absolute z-score below 3
z_imposto = np.abs(stats.zscore(base['property tax (R$)']))
base = base[z_imposto < 3]


In [None]:
# Box plot of 'area'
sns.boxplot(data=base, x='area')

In [None]:
# Keep only the rows whose area is below 5000
base = base.loc[base['area'] < 5000]


In [None]:
# Box plot of 'area' followed by the skewness of each numeric column.
sns.boxplot(x = 'area', data = base)

# `numeric_only=True` skips the string columns, on which `DataFrame.skew()` raises in
# pandas >= 2.0.
assimetria = base.skew(numeric_only=True)
print(assimetria)

In [None]:
# Inspect area with a z-test: raw values on the left, absolute z-scores on the right,
# with a red line at the |z| = 3 cut-off.
valores = base['area']
valores_z = np.abs(stats.zscore(valores))

fig, (eixo_real, eixo_z) = pyplot.subplots(nrows=1, ncols=2, figsize=(15, 5))
eixo_real.hist(valores)
eixo_real.set_xlabel("Valores reais de area")
eixo_z.hist(valores_z)
eixo_z.set_xlabel("Valores Z de area")
eixo_z.vlines(x=3, ymin=0, ymax=10000, colors='red')
pyplot.show()

In [None]:
# Keep only the rows whose area has an absolute z-score below 3
z_area = np.abs(stats.zscore(base['area']))
base = base[z_area < 3]


In [None]:
# Skewness of each numeric column.
# `numeric_only=True` skips the string columns, on which `DataFrame.skew()` raises in
# pandas >= 2.0.
print(base.skew(numeric_only=True))

# Correlation heat map, restricted to numeric columns for the same reason.
sns.heatmap(base.select_dtypes(include='number').corr())


In [None]:
# Feature engineering
#
# New column 'sumtaxs': sum of fire insurance, hoa and property tax.
# `assign` returns a new frame, so the column is not written into a frame that came out
# of boolean filtering (which triggers pandas' SettingWithCopyWarning).
base = base.assign(
    sumtaxs=base['fire insurance (R$)'] + base['hoa (R$)'] + base['property tax (R$)']
)
base[['sumtaxs','fire insurance (R$)', 'hoa (R$)', 'property tax (R$)']]


In [None]:
# Quantitative columns, now including the engineered 'sumtaxs' column
dados_quantitativos = [
    'area',
    'rooms',
    'bathroom',
    'hoa (R$)',
    'rent amount (R$)',
    'property tax (R$)',
    'fire insurance (R$)',
    'parking spaces',
    'total (R$)',
    'sumtaxs',
]
base.head(5)

In [None]:
# Feature selection
#
# SelectKBest keeps the k highest-scoring columns with respect to the target.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, mutual_info_classif

selected_columns = ['area', 'rooms', 'bathroom', 'parking spaces', 'hoa (R$)', 'property tax (R$)', 'fire insurance (R$)', 'rent amount (R$)', 'sumtaxs']
x = base[selected_columns]
y = base['total (R$)']

# The selector has its own name so it does not overwrite the imported `f_classif`
# scoring function.
# NOTE(review): `f_classif` is an ANOVA F-test intended for categorical targets, while
# 'total (R$)' looks continuous; `f_regression` is probably the appropriate scorer.
# Confirm before switching, because the next cell hard-codes which column is dropped.
selector = SelectKBest(score_func=f_classif, k=8)
fit = selector.fit(x, y)
features = fit.transform(x)

# Show the selected columns
cols = fit.get_support(indices=True)
x.iloc[:, cols]


In [None]:
# The 'rooms' column was ranked as the least important one, so it is removed
# from the frame and from the list of quantitative columns.
base = base.drop(columns=['rooms'])

dados_quantitativos = [
    'area',
    'bathroom',
    'hoa (R$)',
    'rent amount (R$)',
    'property tax (R$)',
    'fire insurance (R$)',
    'parking spaces',
    'total (R$)',
    'sumtaxs',
]

In [None]:
# Encode the 'animal' and 'furniture' columns as 0/1.
# The replaced columns are assigned back through `assign`. Calling
# `base[col].replace(..., inplace=True)` mutates a column selection (chained
# assignment), which warns in current pandas and leaves `base` untouched under
# Copy-on-Write.
base = base.assign(
    furniture=base['furniture'].replace({'furnished': 1, 'not furnished': 0}),
    animal=base['animal'].replace({'acept': 1, 'not acept': 0}),
)

In [None]:

# Normalise the quantitative columns to the [0, 1] range with MinMaxScaler
scaler = MinMaxScaler()
data = base[dados_quantitativos]
data_scaled = pd.DataFrame(
    scaler.fit_transform(data),
    columns=dados_quantitativos,
    index=base.index,
)

# Swap the raw quantitative columns for the scaled ones (appended as the last columns)
base = pd.concat([base.drop(columns=dados_quantitativos), data_scaled], axis=1)


In [None]:
base.head()

In [None]:
# Remove the 'city' and 'floor' columns
base = base.drop(columns=['city', 'floor'])

In [None]:
# Initial state: first rows of `original`, the reference saved when the CSV was loaded.

original.head()

In [None]:
# Final state: first rows of `base` after the steps above.

base.head()