# Importação das bibliotecas

In [112]:
from scipy.stats import t
import pandas as pd
import seaborn as sns
import numpy as np
import plotly.express as px
from scipy.stats import norm
pd.set_option('display.float_format', lambda x: '{:.2f}'.format(x))
from typing import List
import matplotlib.pyplot as plt
import math


In [113]:
def obter_intervalo_confianca(dataframe: pd.DataFrame):    
    conf_level = 0.95
    media = dataframe['preco'].mean()
    std = dataframe['preco'].std()
    n = dataframe.shape[0]
    grausLiberdade = n - 1
    t_value = t.ppf((1 + conf_level) / 2, grausLiberdade)
    inf = media - t_value * std / np.sqrt(n)
    sup = media + t_value * std / np.sqrt(n)
    print('Média:', round(media), ' Desvio padrão:', round(std))
    print('Intervalo de confiança:', round(inf), '-', round(sup))

In [114]:
def obter_resumo(tipo_imovel: str, bairro: str = None, banheiro: int = None, quartos: int = None) -> pd.DataFrame:
    query = f'  tipo_imovel == "{tipo_imovel}"'
    if bairro is not None:
        query += f' and bairro_teste == "{bairro}"  '
    if banheiro is not None:
        query += f' and banheiro == {banheiro} '
    if quartos is not None:
        query += f' and quarto == {quartos} '
    base_original = pd.read_parquet('../data/processed/base_casa.parquet')
    base_original = base_original.query(query)
    return base_original

In [115]:
def obter_estatistica(bairros: List[str], base_original_completa: pd.DataFrame) -> pd.DataFrame:
    lista_dados = []
    for bairro in bairros:
        dados_temp = base_original_completa.query(
            f'bairro_teste == "{bairro}"')
        lista_dados.append(
            {
                'bairro': bairro,
                'media_preco':  round(dados_temp['preco'].mean(), 2),
                'mediana_preco':  round(dados_temp['preco'].median(), 2),
                'moda': round(dados_temp['preco'].mode()[0], 2),
                'desvio_padrao': round(dados_temp['preco'].std(), 2),
                'total_imoveis': dados_temp.shape[0],
                'coeficiente_variacao': (round(dados_temp['preco'].std(), 2) / round(dados_temp['preco'].mean(), 2)) * 100
            }
        )
    df = pd.DataFrame(lista_dados)
    return df
  

- Preço Geral

In [159]:
base_original = obter_resumo(tipo_imovel='Apartamento')
base_original['quarto'] = base_original['quarto'].str.replace('1 Quarto', '1').astype('int32')
base_original['garagem'] = base_original['garagem'].str.replace('--', '0').astype('int32')
base_original['metragem'] = base_original['metragem'].astype('int32')
base_original

Unnamed: 0,tipo_imovel,nome,preco,metragem,quarto,banheiro,garagem,ID_CASA,bairro_teste
0,Apartamento,"Apartamento com 2 Quartos à Venda, 65m²",310000.00,65,2,2,1,2699716579,Jardim Botânico
2,Apartamento,"Apartamento com 2 Quartos à Venda, 71m²",403500.00,71,2,2,2,2690493538,Bonfim Paulista
3,Apartamento,"Apartamento com 2 Quartos à Venda, 54m²",290000.00,54,2,2,1,2667652105,Nova Aliança
4,Apartamento,"Apartamento com 2 Quartos à Venda, 45m²",166420.00,45,2,1,1,2682840782,Conjunto Habitacional Jardim Das Palmeiras
5,Apartamento,"Apartamento com 2 Quartos à Venda, 64m²",370000.00,64,2,2,2,2688027260,Vila Ana Maria
...,...,...,...,...,...,...,...,...,...
9967,Apartamento,"Apartamento com 3 Quartos à Venda, 81m²",540000.00,81,3,2,2,2696267182,Vila Do Golf
9968,Apartamento,"Apartamento com Quarto à Venda, 61m²",280000.00,61,1,1,1,2476716299,Centro
9969,Apartamento,"Apartamento com 3 Quartos à Venda, 83m²",375000.00,83,3,3,2,2693112449,Jardim São Luiz
9970,Apartamento,"Apartamento com 2 Quartos à Venda, 48m²",175000.00,48,2,1,1,2572455642,Lagoinha


In [160]:
base_original = base_original.drop_duplicates()
base_original.shape

(8395, 9)

In [161]:
base_original.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8395 entries, 0 to 9971
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   tipo_imovel   8395 non-null   string 
 1   nome          8395 non-null   string 
 2   preco         8395 non-null   float64
 3   metragem      8395 non-null   int32  
 4   quarto        8395 non-null   int32  
 5   banheiro      8395 non-null   string 
 6   garagem       8395 non-null   int32  
 7   ID_CASA       8395 non-null   int64  
 8   bairro_teste  8395 non-null   string 
dtypes: float64(1), int32(3), int64(1), string(4)
memory usage: 557.5 KB


In [162]:
base_original['banheiro'].unique()

<StringArray>
[ '2 ',  '1 ',  '3 ',  '5 ',  '4 ',  '9 ',  '7 ',  '6 ', '20 ',  '8 ', '11 ',
 '13 ', '12 ', '10 ']
Length: 14, dtype: string

In [163]:
base_original['garagem'].unique()

array([  1,   2,   0,   4,   3,   5, 192,  10,  19,   6,   8,   7],
      dtype=int32)

In [164]:
base_original[['banheiro', 'garagem']] = base_original[['banheiro', 'garagem']].astype('int32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_original[['banheiro', 'garagem']] = base_original[['banheiro', 'garagem']].astype('int32')


In [165]:
base_original.describe()

Unnamed: 0,preco,metragem,quarto,banheiro,garagem,ID_CASA
count,8395.0,8395.0,8395.0,8395.0,8395.0,8395.0
mean,450640.89,87.23,2.32,2.1,1.46,2635153434.16
std,453436.77,87.68,0.77,1.32,2.23,98351619.28
min,61045.0,10.0,1.0,1.0,0.0,54472404.0
25%,215000.0,49.0,2.0,1.0,1.0,2603212877.5
50%,320000.0,70.0,2.0,2.0,1.0,2658818753.0
75%,520000.0,103.0,3.0,3.0,2.0,2678280998.0
max,8000000.0,4687.0,20.0,20.0,192.0,2700624992.0


- Total Imóveis

In [166]:
base_original.groupby('bairro_teste').size().reset_index(name='count').sort_values(by='count', ascending=False).nlargest(50, 'count')


Unnamed: 0,bairro_teste,count
34,Jardim Botânico,739
9,Centro,736
98,Nova Aliança,661
51,Jardim Irajá,353
74,Jardim Paulista,315
137,Ribeirânia,235
8,Campos Eliseos,178
73,Jardim Palma Travassos,175
69,Jardim Olhos D Agua,172
29,Ipiranga,166


In [167]:
base_original.columns

Index(['tipo_imovel', 'nome', 'preco', 'metragem', 'quarto', 'banheiro',
       'garagem', 'ID_CASA', 'bairro_teste'],
      dtype='object')

- Total de imoveis ,Média, Moda e Mediana: Jardim Botânico, Centro, Nova Aliança, Jardim Irajá, Sumarezinho, Vila Tibério, Vila monte alegre 

In [168]:
bairros = ['Jardim Botânico', 'Centro', 'Nova Aliança', 'Jardim Irajá', 'Sumarezinho', 'Vila Tibério', 'Vila Monte Alegre', 'Bonfim Paulista', 'Jardim Nova Aliança Sul', 'Vila Tibério', 'Vila Virginia', 'Bonfim Paulista']

In [169]:
base_bairros = base_original[base_original['bairro_teste'].isin(bairros)]

In [170]:
base_bairros.head()

Unnamed: 0,tipo_imovel,nome,preco,metragem,quarto,banheiro,garagem,ID_CASA,bairro_teste
0,Apartamento,"Apartamento com 2 Quartos à Venda, 65m²",310000.0,65,2,2,1,2699716579,Jardim Botânico
2,Apartamento,"Apartamento com 2 Quartos à Venda, 71m²",403500.0,71,2,2,2,2690493538,Bonfim Paulista
3,Apartamento,"Apartamento com 2 Quartos à Venda, 54m²",290000.0,54,2,2,1,2667652105,Nova Aliança
9,Apartamento,"Apartamento com 2 Quartos à Venda, 64m²",370000.0,64,2,2,2,2694181987,Jardim Botânico
14,Apartamento,"Apartamento com 2 Quartos à Venda, 84m²",298000.0,84,2,2,1,2586310967,Vila Monte Alegre


In [171]:
base_bairros.groupby('metragem').count()

Unnamed: 0_level_0,tipo_imovel,nome,preco,quarto,banheiro,garagem,ID_CASA,bairro_teste
metragem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10,3,3,3,3,3,3,3,3
18,1,1,1,1,1,1,1,1
20,1,1,1,1,1,1,1,1
23,1,1,1,1,1,1,1,1
24,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...
530,2,2,2,2,2,2,2,2
700,1,1,1,1,1,1,1,1
721,1,1,1,1,1,1,1,1
726,1,1,1,1,1,1,1,1


In [172]:
base_bairros.pivot_table(index='garagem', columns='bairro_teste', aggfunc='size', fill_value=0)

bairro_teste,Bonfim Paulista,Centro,Jardim Botânico,Jardim Irajá,Jardim Nova Aliança Sul,Nova Aliança,Sumarezinho,Vila Monte Alegre,Vila Tibério,Vila Virginia
garagem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,3,91,11,6,4,8,3,0,2,9
1,75,434,227,152,14,317,109,67,42,114
2,46,169,389,142,33,291,5,8,44,17
3,9,27,72,35,2,40,0,2,1,0
4,5,15,28,15,0,5,1,0,0,1
5,0,0,5,2,0,0,0,0,0,0
6,1,0,6,1,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,1,0,0
19,0,0,1,0,0,0,0,0,0,0


In [173]:
base_bairros.pivot_table(index='quarto', columns='bairro_teste', aggfunc='size', fill_value=0)

bairro_teste,Bonfim Paulista,Centro,Jardim Botânico,Jardim Irajá,Jardim Nova Aliança Sul,Nova Aliança,Sumarezinho,Vila Monte Alegre,Vila Tibério,Vila Virginia
quarto,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,6,103,111,35,9,186,0,21,5,1
2,92,165,254,85,20,236,104,44,43,134
3,40,402,328,206,21,229,13,12,41,6
4,1,64,46,26,3,10,1,0,0,0
5,2,2,0,1,0,0,0,0,0,0
20,0,0,0,0,0,0,0,1,0,0


In [174]:
base_banheiro = base_bairros.pivot_table(index='banheiro', columns='bairro_teste', aggfunc='size', fill_value=0).sort_index().reset_index()
base_banheiro


bairro_teste,banheiro,Bonfim Paulista,Centro,Jardim Botânico,Jardim Irajá,Jardim Nova Aliança Sul,Nova Aliança,Sumarezinho,Vila Monte Alegre,Vila Tibério,Vila Virginia
0,1,69,164,151,64,11,245,72,40,31,135
1,2,31,208,251,126,21,166,45,36,48,5
2,3,9,226,107,78,18,137,1,1,10,1
3,4,21,75,89,42,3,55,0,0,0,0
4,5,7,54,119,35,0,52,0,0,0,0
5,6,3,5,13,6,0,3,0,0,0,0
6,7,0,2,5,1,0,2,0,0,0,0
7,8,1,0,0,0,0,1,0,0,0,0
8,9,0,2,2,1,0,0,0,0,0,0
9,12,0,0,1,0,0,0,0,0,0,0


In [175]:
base_banheiro.columns

Index(['banheiro', 'Bonfim Paulista', 'Centro', 'Jardim Botânico',
       'Jardim Irajá', 'Jardim Nova Aliança Sul', 'Nova Aliança',
       'Sumarezinho', 'Vila Monte Alegre', 'Vila Tibério', 'Vila Virginia'],
      dtype='string', name='bairro_teste')

In [176]:

base_bairros['bairro_teste'] = base_bairros['bairro_teste'].str.strip()
dados_estatisticos = []
for bairro in bairros:
    amostras = {}
    base_temp = base_bairros.query(f'bairro_teste == "{bairro}"')
    amostras['bairro'] = bairro
    amostras['total_imoveis'] = base_temp.shape[0]
    amostras['media'] = round(base_temp["preco"].mean(), 2)
    amostras['max_preco'] = round(base_temp["preco"].max(), 2)
    amostras['min_preco'] = round(base_temp["preco"].min(), 2)
    amostras['mediana'] = round(base_temp["preco"].median(), 2)
    amostras['moda'] = round(base_temp["preco"].mode()[0], 2)
    amostras['desvio_padrao'] = round(base_temp["preco"].std(), 2)
    amostras['erro_padrao'] = round(base_temp["preco"].std(), 2) / math.sqrt(base_temp.shape[0])
    amostras['graus_liberdade'] = base_temp.shape[0] - 1
    amostras['ic_limite_inferior'] = norm.interval(0.95,  loc=round(base_temp["preco"].mean(), 2), scale=round(base_temp["preco"].std(), 2) / math.sqrt(base_temp.shape[0]))[0]
    amostras['ic_limite_superior'] = norm.interval(0.95,  loc=round(base_temp["preco"].mean(), 2), scale=round(base_temp["preco"].std(), 2) / math.sqrt(base_temp.shape[0]))[1]
    dados_estatisticos.append(amostras)
base_dados_bairros_selecionados = pd.DataFrame(dados_estatisticos)
base_dados_bairros_selecionados


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  base_bairros['bairro_teste'] = base_bairros['bairro_teste'].str.strip()


Unnamed: 0,bairro,total_imoveis,media,max_preco,min_preco,mediana,moda,desvio_padrao,erro_padrao,graus_liberdade,ic_limite_inferior,ic_limite_superior
0,Jardim Botânico,739,744389.39,8000000.0,180000.0,553000.0,320000.0,720996.16,26522.27,738,692406.69,796372.09
1,Centro,736,402076.73,3100000.0,92000.0,360000.0,450000.0,223344.46,8232.59,735,385941.16,418212.3
2,Nova Aliança,661,518588.2,2290000.0,133000.0,430000.0,350000.0,300496.67,11687.97,660,495680.21,541496.19
3,Jardim Irajá,353,600323.4,2150000.0,150000.0,450000.0,430000.0,354985.06,18893.95,352,563291.94,637354.86
4,Sumarezinho,118,245524.66,590000.0,117000.0,230000.0,170000.0,72993.17,6719.57,117,232354.55,258694.77
5,Vila Tibério,89,338247.07,636000.0,150000.0,370000.0,390000.0,82390.86,8733.41,88,321129.89,355364.25
6,Vila Monte Alegre,78,282407.71,2500000.0,125000.0,260000.0,150000.0,265524.94,30064.78,77,223481.83,341333.59
7,Bonfim Paulista,141,589270.0,7451000.0,104652.0,295000.0,190000.0,911435.88,76756.75,140,438829.54,739710.46
8,Jardim Nova Aliança Sul,53,503976.55,880000.0,220000.0,480000.0,380000.0,175128.32,24055.72,52,456828.2,551124.9
9,Vila Tibério,89,338247.07,636000.0,150000.0,370000.0,390000.0,82390.86,8733.41,88,321129.89,355364.25


# Testes de hipóteses

In [211]:
significancia = 0.05
confianca = 1 - significancia

- Preço por área

In [210]:
bairros = [
    ('Jardim Botânico', 100), 
    ('Centro', 80), 
    ('Nova Aliança', 100), 
    ('Jardim Irajá', 35), 
    ('Sumarezinho', 50), 
    ('Vila Tibério', 60), 
    ('Vila Monte Alegre', 50), 
    ('Bonfim Paulista', 100), 
    ('Jardim Nova Aliança Sul', 40), 
    ('Vila Tibério', 40), 
    ('Vila Virginia', 100), 
    ('Bonfim Paulista', 100)
]
for bairro in bairros:
    try:
        amostras = {}
        print('*' * 20, bairro[0], '*' * 20)
        base_temp = base_bairros.query(f'bairro_teste == "{ bairro[0]}"')
        print(base_temp.shape)
        base_menor_area = base_temp.query('metragem < 60').sample(n=bairro[1], random_state=101).preco
        base_maior_area = base_temp.query('metragem >= 60').sample(n=bairro[1], random_state=101).preco
        print(base_menor_area.shape, base_maior_area.shape)
        display(base_menor_area)
    except Exception as e:
        print(bairro)



******************** Jardim Botânico ********************
(739, 9)
(100,) (100,)


6938   191000.00
3747   430000.00
193    289500.00
5075   293204.00
3217   275000.00
          ...   
9673   290000.00
9831   190000.00
668    315000.00
5626   285000.00
8252   297000.00
Name: preco, Length: 100, dtype: float64

******************** Centro ********************
(736, 9)
(80,) (80,)


6847   165000.00
4289   250000.00
3274   190000.00
1280   189000.00
1750   232000.00
          ...   
4335   235000.00
3738   180000.00
5423   135000.00
3427   130000.00
1887   225000.00
Name: preco, Length: 80, dtype: float64

******************** Nova Aliança ********************
(661, 9)
(100,) (100,)


2502   270000.00
3013   250000.00
453    240000.00
7558   200000.00
1755   250000.00
          ...   
9941   220000.00
2008   190000.00
7700   220000.00
9076   133000.00
6659   200000.00
Name: preco, Length: 100, dtype: float64

******************** Jardim Irajá ********************
(353, 9)
(35,) (35,)


7800   200000.00
5533   185000.00
7878   180000.00
9224   155000.00
8710   150000.00
8311   160000.00
8196   165000.00
8359   170000.00
8925   160000.00
1654   350000.00
8398   220000.00
7854   180000.00
6802   380000.00
5760   185000.00
1746   215000.00
1521   298000.00
7939   230000.00
5771   210000.00
3897   205000.00
6212   180000.00
227    235000.00
7925   350000.00
8187   180000.00
3311   214000.00
5647   250000.00
6051   230000.00
5732   195000.00
6404   200000.00
9139   170000.00
8442   170000.00
8183   170000.00
4071   230000.00
7088   180000.00
5764   227900.00
8802   150000.00
Name: preco, dtype: float64

******************** Sumarezinho ********************
(118, 9)
(50,) (50,)


3005   218000.00
2347   175000.00
411    328490.00
3531   277000.00
8773   160000.00
7587   175000.00
3003   212000.00
5803   175000.00
8031   190000.00
6159   340000.00
3934   170000.00
280    212000.00
3713   175000.00
4233   195000.00
4669   190000.00
4339   165000.00
2330   170000.00
4751   160000.00
4214   200000.00
2509   230000.00
3989   170000.00
2206   180000.00
8680   215000.00
3800   220000.00
3425   200000.00
2990   213000.00
4410   255000.00
1019   212000.00
445    172000.00
4411   170000.00
6786   200000.00
3865   195000.00
5336   160000.00
7940   250000.00
6102   117000.00
8139   152000.00
9549   337000.00
8014   197000.00
1945   200000.00
5065   220000.00
4771   170000.00
4200   240000.00
6036   180000.00
1050   175000.00
2707   212000.00
6651   149000.00
3864   170000.00
2099   180000.00
650    213000.00
5512   195000.00
Name: preco, dtype: float64

******************** Vila Tibério ********************
(89, 9)
('Vila Tibério', 60)
******************** Vila Monte Alegre ********************
(78, 9)
('Vila Monte Alegre', 50)
******************** Bonfim Paulista ********************
(141, 9)
('Bonfim Paulista', 100)
******************** Jardim Nova Aliança Sul ********************
(53, 9)
('Jardim Nova Aliança Sul', 40)
******************** Vila Tibério ********************
(89, 9)
('Vila Tibério', 40)
******************** Vila Virginia ********************
(141, 9)
('Vila Virginia', 100)
******************** Bonfim Paulista ********************
(141, 9)
('Bonfim Paulista', 100)


In [222]:
bairro = 'Jardim Botânico'
base_temp = base_bairros.query(f'bairro_teste == "{bairro}"')
base_menor_area = base_temp.query('metragem <= 60').sample(n=100, random_state=101).preco
base_maior_area = base_temp.query('metragem > 60').sample(n=100, random_state=101).preco
media_menor_area = base_menor_area.mean()
media_maior_area = base_maior_area.mean()


In [223]:
from statsmodels.stats.weightstats import DescrStatsW, CompareMeans

In [224]:
teste_menor_area = DescrStatsW(base_menor_area)
teste_maior_area = DescrStatsW(base_maior_area)

In [226]:
teste_a = teste_menor_area.get_compare(teste_maior_area)

In [227]:
z, p_valor = teste_a.ztest_ind(alternative='smaller', value=0)
z, p_valor

(-6.199641152322223, 2.8296021547606276e-10)

In [228]:
p_valor <= significancia

True