# Análise exploratória de dados

Exemplo (1): Modelando uma regressão múltipla para entender como diferentes variáveis afetam os preços de casas nos EUA.

Dependendo do objetivo, podemos ter mais ou menos etapas. Vamos escolher um desafio mais difícil para cobrir a maioria das possibilidades de modelagem.

No nosso exemplo, vamos modelar uma regressão para explicar os preços das casas nos EUA e usá-la para escolher as casas desvalorizadas.

    -> Identificar como a base foi construída
    -> Quais foram as regras? Essas regras influenciam os dados?
    -> Devemos nos preocupar com os outliers?
    -> Ao analisar como as variáveis estão distribuídas, temos funções conhecidas?
    -> Como as funções se correlacionam, os comportamentos são os previstos?
    -> Para estudar a correlação parcial com regressões, precisamos mudar a forma dos dados?
    -> Transformações logarítmicas
    -> Variáveis dummies, quando usá-las
    -> Mudando a forma funcional com polinômios
    -> Interagindo variáveis
    -> Gerando predições
    
    
Features:
- **price** - The last price the house was sold for
- **num_bed** - The number of bedrooms
- **num_bath** - The number of bathrooms (fractions mean the house has a toilet-only or shower/bathtub-only bathroom)
- **size_house** (includes basement) - The size of the house
- **size_lot** - The size of the lot
- **num_floors** - The number of floors
- **is_waterfront** - Whether or not the house is a waterfront house (0 means it is not a waterfront house whereas 1 means that it is a waterfront house)
- **condition** - How worn out the house is. Ranges from 1 (needs repairs all over the place) to 5 (the house is very well maintained)
- **size_basement** - The size of the basement
- **year_built** - The year the house was built
- **renovation_date** - The year the house was renovated for the last time. 0 means the house has never been renovated
- **zip** - The zip code
- **latitude** - Latitude
- **longitude** - Longitude
- **avg_size_neighbor_houses** - The average house size of the neighbors
- **avg_size_neighbor_lot** - The average lot size of the neighbors

In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import sqlite3


import warnings
# Silence every warning for the rest of the session so library notices do
# not clutter the cell outputs.
warnings.filterwarnings("ignore")

# IPython magics: draw matplotlib figures inline and emit them as SVG.
%matplotlib inline
%config InlineBackend.figure_formats=['svg']

In [None]:
# NOTE(review): the disabled PostgreSQL connection below embeds a plaintext
# password. Rotate that credential and load it from the environment (or a
# secrets store) before re-enabling this cell.
# import pandas.io.sql as sqlio
# import psycopg2
# from sqlalchemy import create_engine
# import io

# host = 'dh-ds-t1-2019.cpvwsnqnnd2w.us-east-1.rds.amazonaws.com'
# port = 5432
# dbname = 'DHds2019'
# username = 'digitalhouse'
# pwd = 'alunosDH19'

# conn = psycopg2.connect(f"host='{host}' port={port}  dbname='{dbname}' user={username} password={pwd}")

In [None]:
# Open the SQLite file that holds the datasets; the path is relative to the
# notebook's working directory.
db_path = r'../../99 Datasets/datasets.db'
db = sqlite3.connect(db_path)

In [None]:
# Read every row and column of the house_sales table into a DataFrame.
query = 'SELECT * FROM house_sales'
df = pd.read_sql_query(sql=query, con=db)

In [None]:
# Show pandas' default summary statistics for the loaded table.
df.describe()

In [None]:
# Heatmap of the features most correlated with price.
# Every column is listed exactly once: selecting a repeated label duplicates
# that column in the correlation matrix and spends one of the ten heatmap
# slots on an identical row.
corr_features = [
    'price', 'size_house', 'num_bath', 'size_lot', 'num_floors',
    'is_waterfront', 'year_built', 'latitude', 'longitude',
    'avg_size_neighbor_houses', 'avg_size_neighbor_lot',
]
corrmat = df[corr_features].corr()

# The ten rows with the largest correlation to price (price itself included).
cols = corrmat.nlargest(10, 'price')['price'].index

# Reuse the matrix computed above rather than recomputing it with
# np.corrcoef, which returns NaN for any column containing a missing value
# (DataFrame.corr drops missing values pairwise instead).
cm = corrmat.loc[cols, cols].values

sns.set(font_scale=1.15)
f, ax = plt.subplots(figsize=(8, 6))
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
    ax=ax,  # draw on the axes created above instead of the implicit current axes
)

In [None]:
from IPython.display import Image
# Render the image file img/houses_tableau.jpg (path relative to the working
# directory) as the cell output.
Image(filename=r'img/houses_tableau.jpg')

In [None]:
# Histogram of raw sale prices, 40 bins.
df['price'].plot(kind='hist', bins=40)

In [None]:
# Histogram of log(1 + price), 40 bins.
np.log1p(df['price']).plot(kind='hist', bins=40)

In [None]:
# e**13 is roughly 442,413: maps the value 13 on the natural-log scale back
# to the original scale.
# NOTE(review): presumably used to read the log-price histogram; that plot
# uses log1p, whose exact inverse is np.expm1 (the difference is 1).
np.exp(13)

In [None]:
# Store log(1 + price) as a new column.
df['log_price'] = np.log1p(df['price'])

In [None]:
# Baseline OLS: price in levels regressed on structural and location
# features. Each regressor is listed exactly once; patsy would silently
# collapse a repeated term, which only hides typos in the specification.
function1 = '''
price ~
 + size_house
 + num_bath
 + size_lot
 + num_floors
 + is_waterfront
 + year_built
 + latitude
 + longitude
 + avg_size_neighbor_houses
 + avg_size_neighbor_lot
'''

model1 = smf.ols(function1, df).fit()
print(model1.summary2())

In [None]:
# Log-linear OLS: natural log of price regressed on the same features in
# levels. Each regressor is listed exactly once; patsy would silently
# collapse a repeated term, which only hides typos in the specification.
function1 = '''
np.log(price) ~
 + size_house
 + num_bath
 + size_lot
 + num_floors
 + is_waterfront
 + year_built
 + latitude
 + longitude
 + avg_size_neighbor_houses
 + avg_size_neighbor_lot
'''

model1 = smf.ols(function1, df).fit()
print(model1.summary2())

In [None]:
# OLS on log(1 + price) that includes house size both in logs and in levels,
# alongside the remaining features in levels.
function1 = '''
np.log1p(price) ~ 
 + np.log1p(size_house)
 + num_bath
 + size_house
 + size_lot
 + num_floors
 + is_waterfront
 + year_built
 + latitude
 + longitude
 + avg_size_neighbor_houses
 + avg_size_neighbor_lot
'''

# Build the specification first, then fit it.
log_spec = smf.ols(formula=function1, data=df)
model1 = log_spec.fit()
print(model1.summary2())

In [None]:
import statsmodels.api as sm

# Simple regression of price on house size, fitted on a random draw of
# 1000 rows (no seed, so each run uses a different sample).
function7 = '''price ~ size_house'''
sample_rows = df.sample(n=1000)
model7 = smf.ols(formula=function7, data=sample_rows).fit()
print(model7.summary2())

# Observed vs. fitted values against regressor 1 (size_house; column 0 is
# the intercept).
fig, ax = plt.subplots()
fig = sm.graphics.plot_fit(model7, 1, ax=ax)
ax.set(ylabel="price", xlabel="size", title="Linear Regression")

plt.show()

In [None]:
import statsmodels.api as sm

# Price on house size plus the waterfront indicator, fitted on a random
# draw of 2000 rows (no seed, so each run uses a different sample).
function7 = '''price ~ size_house + is_waterfront'''
spec7 = smf.ols(formula=function7, data=df.sample(n=2000))
model7 = spec7.fit()
print(model7.summary2())

# Observed vs. fitted values against regressor 1 (size_house).
fig, ax = plt.subplots()
fig = sm.graphics.plot_fit(model7, 1, ax=ax)
ax.set_title("Linear Regression")
ax.set_xlabel("size")
ax.set_ylabel("price")

plt.show()

Correlação de Pearson ao quadrado (size_house) — equivale ao R² da regressão simples de price em size_house.

In [None]:
# Square of the correlation coefficient 0.705586 (about 0.498).
0.705586**2

In [None]:
# Pairwise correlation table for the DataFrame's columns, rounded to 2 decimals.
# NOTE(review): on pandas >= 2.0, DataFrame.corr raises on non-numeric
# columns unless numeric_only=True is passed — confirm every column is numeric.
df.corr().round(2)

In [None]:
import statsmodels.api as sm

# Price on house size, neighbours' average house size, lot size and
# latitude, fitted on a random draw of 1000 rows (no seed).
function7 = '''price ~ size_house + avg_size_neighbor_houses + size_lot + latitude'''
sample_rows = df.sample(n=1000)
model7 = smf.ols(formula=function7, data=sample_rows).fit()
print(model7.summary2())

# Observed vs. fitted values against regressor 1 (size_house).
fig, ax = plt.subplots()
fig = sm.graphics.plot_fit(model7, 1, ax=ax)
ax.set(ylabel="price", xlabel="size", title="Linear Regression")

plt.show()

In [None]:
# Price on house size, the waterfront indicator and their interaction
# (`a*b` expands to a + b + a:b), fitted on a random draw of 3000 rows.
# `sm` is expected to be already imported by an earlier cell.
function7 = '''price ~ size_house*is_waterfront'''
spec7 = smf.ols(formula=function7, data=df.sample(n=3000))
model7 = spec7.fit()
print(model7.summary2())

# Observed vs. fitted values against regressor 1 (size_house).
fig, ax = plt.subplots()
fig = sm.graphics.plot_fit(model7, 1, ax=ax)
ax.set_title("Linear Regression")
ax.set_xlabel("size")
ax.set_ylabel("price")

plt.show()

In [None]:
# Add log(1 + x) versions of price and house size as new columns.
for new_col, source_col in (('log_price', 'price'), ('log_size', 'size_house')):
    df[new_col] = np.log1p(df[source_col])

In [None]:
# Scatter of log price against house size in levels, on 1000 random rows.
df.sample(n=1000).plot(kind='scatter', x='size_house', y='log_price')

In [None]:
# Scatter of log price against log house size, on 1000 random rows.
df.sample(n=1000).plot(kind='scatter', x='log_size', y='log_price')

In [None]:
# Scatter of log price against the number of bathrooms, on 1000 random rows.
df.sample(n=1000).plot(kind='scatter', x='num_bath', y='log_price')