In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [3]:
df = pd.read_csv('./kc_final.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
df.shape

(21613, 22)

In [6]:
df = df[['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront']]
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0
3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0
4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0


In [8]:
df.shape

(21613, 9)

In [9]:
colunas_para_remover = ['id', 'date']
df = df.drop(colunas_para_remover, axis=1)

In [10]:
df.shape

(21613, 7)

In [11]:
x = df.drop('price', axis=1)
y = df['price']

In [13]:
min_max_scaler = StandardScaler()
x = min_max_scaler.fit_transform(x)

In [14]:
x

array([[-0.39873715, -1.44746357, -0.97983502, -0.22832133, -0.915427  ,
        -0.08717263],
       [-0.39873715,  0.1756067 ,  0.53363434, -0.18988538,  0.93650577,
        -0.08717263],
       [-1.47395936, -1.44746357, -1.42625404, -0.12329847, -0.915427  ,
        -0.08717263],
       ...,
       [-1.47395936, -1.77207762, -1.15404732, -0.33213703,  0.93650577,
        -0.08717263],
       [-0.39873715,  0.50022075, -0.52252773, -0.30707641,  0.93650577,
        -0.08717263],
       [-1.47395936, -1.77207762, -1.15404732, -0.33875227,  0.93650577,
        -0.08717263]], shape=(21613, 6))

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=23)

In [17]:
print(f'Exemplos para treinamento: {len(y_train)}. E para o teste: {len(y_test)}')

Exemplos para treinamento: 15129. E para o teste: 6484


# Regressão Linear

In [23]:
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [27]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f'MSE: {mse}\nRMSE: {rmse}\nMAPE: {mape}')

MSE: 58290430743.82151
RMSE: 241434.11263494127
MAPE: 0.346890617216992


# K-NN Regressor

In [29]:
model = KNeighborsRegressor(n_neighbors=7, metric='euclidean')
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [30]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f'MSE: {mse}\nRMSE: {rmse}\nMAPE: {mape}')

MSE: 56929973907.05955
RMSE: 238600.02914304
MAPE: 0.3246391981351605


# SVR Support Vector Machines para Regressão

In [32]:
model = SVR()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [33]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f'MSE: {mse}\nRMSE: {rmse}\nMAPE: {mape}')

MSE: 130199624138.9766
RMSE: 360831.85022802046
MAPE: 0.4214532788664194


In [34]:
model = SVR(kernel='linear', C=100)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f'MSE: {mse}\nRMSE: {rmse}\nMAPE: {mape}')

# Árvore de Regressão

In [36]:
model = DecisionTreeRegressor()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [37]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f'MSE: {mse}\nRMSE: {rmse}\nMAPE: {mape}')

MSE: 98418736358.97878
RMSE: 313717.60607109504
MAPE: 0.4095970871923596


# XGBoost

In [40]:
model = XGBRegressor()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [41]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f'MSE: {mse}\nRMSE: {rmse}\nMAPE: {mape}')

MSE: 54032402381.69808
RMSE: 232448.70914181924
MAPE: 0.3115478188030054


In [42]:
params = {"n_estimators": 100,
          "max_depth": 6,
          "learning_rate": 0.1}

In [43]:
model = XGBRegressor(**params)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [44]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f'MSE: {mse}\nRMSE: {rmse}\nMAPE: {mape}')

MSE: 50099948746.49795
RMSE: 223830.17836408466
MAPE: 0.30671574201583884
