# Polynomial Regression

$Y = B_0 + B_1 X_1 + B_2 X_1^2 + \dots + B_N X_1^N$

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [10]:
from sklearn.linear_model import LinearRegression

In [5]:
from sklearn.preprocessing import PolynomialFeatures

# Health Care Data

In [3]:
# Load the health-care dataset; the relative path is resolved against the
# notebook's working directory. The rendered table shows two data columns,
# `idade` and `custo`.
data_health = pd.read_csv("health_care2.csv")
data_health

Unnamed: 0,idade,custo
0,18,470
1,23,520
2,28,630
3,33,830
4,38,1150
5,43,1530
6,48,2040
7,53,3080
8,58,5100
9,63,10100


In [4]:
# Feature matrix: the first column, kept two-dimensional (n_samples, 1)
# because it is later passed to PolynomialFeatures.fit_transform.
x_health = data_health.iloc[:, [0]].to_numpy()
# Target vector: the second column as a flat 1-D array.
y_health = data_health.iloc[:, 1].to_numpy()
x_health, y_health

(array([[18],
        [23],
        [28],
        [33],
        [38],
        [43],
        [48],
        [53],
        [58],
        [63]], dtype=int64),
 array([  470,   520,   630,   830,  1150,  1530,  2040,  3080,  5100,
        10100], dtype=int64))

In [9]:
# Expand the single feature x into the columns [1, x, x^2, x^3, x^4]
# (degree-4 polynomial basis; the leading column of ones is the bias term).
polyn_regression = PolynomialFeatures(degree=4)
polyn_regression.fit(x_health)
x_polyn_regression = polyn_regression.transform(x_health)
x_polyn_regression

array([[1.0000000e+00, 1.8000000e+01, 3.2400000e+02, 5.8320000e+03,
        1.0497600e+05],
       [1.0000000e+00, 2.3000000e+01, 5.2900000e+02, 1.2167000e+04,
        2.7984100e+05],
       [1.0000000e+00, 2.8000000e+01, 7.8400000e+02, 2.1952000e+04,
        6.1465600e+05],
       [1.0000000e+00, 3.3000000e+01, 1.0890000e+03, 3.5937000e+04,
        1.1859210e+06],
       [1.0000000e+00, 3.8000000e+01, 1.4440000e+03, 5.4872000e+04,
        2.0851360e+06],
       [1.0000000e+00, 4.3000000e+01, 1.8490000e+03, 7.9507000e+04,
        3.4188010e+06],
       [1.0000000e+00, 4.8000000e+01, 2.3040000e+03, 1.1059200e+05,
        5.3084160e+06],
       [1.0000000e+00, 5.3000000e+01, 2.8090000e+03, 1.4887700e+05,
        7.8904810e+06],
       [1.0000000e+00, 5.8000000e+01, 3.3640000e+03, 1.9511200e+05,
        1.1316496e+07],
       [1.0000000e+00, 6.3000000e+01, 3.9690000e+03, 2.5004700e+05,
        1.5752961e+07]])

In [11]:
# Polynomial regression = ordinary linear regression fitted on the
# polynomial-expanded features instead of the raw feature.
polyn_regression_health = LinearRegression()
polyn_regression_health.fit(x_polyn_regression,y_health)

In [12]:
# b0: the intercept term fitted by LinearRegression.
polyn_regression_health.intercept_

16561.974637941225

In [13]:
# One coefficient per column of x_polyn_regression. The first entry belongs
# to the constant column of ones and comes out as 0 (the intercept is
# reported separately in intercept_), so b1..bn are coef_[1:].
polyn_regression_health.coef_

array([ 0.00000000e+00, -2.12242253e+03,  9.90404199e+01, -1.95058276e+00,
        1.40792541e-02])

In [14]:
# In-sample predictions: evaluated on the same expanded rows the model was
# fitted on, so these are fitted values rather than out-of-sample estimates.
prediction = polyn_regression_health.predict(x_polyn_regression)
prediction

array([ 549.65035466,  345.85081577,  616.53845994,  975.83915951,
       1249.06759854, 1472.72727271, 1894.51048941, 2973.29836768,
       5379.16083826, 9993.35664352])

In [17]:
# Re-display the raw feature values that go on the x-axis of the plot below.
x_health

array([[18],
       [23],
       [28],
       [33],
       [38],
       [43],
       [48],
       [53],
       [58],
       [63]], dtype=int64)

In [18]:
# Observed costs versus the fitted degree-4 polynomial curve.
# The fitted values are recomputed here instead of read from the generic
# `prediction` variable, which is rebound to the house-price predictions
# further down; re-running this cell afterwards would otherwise plot
# mismatched data.
health_fit = polyn_regression_health.predict(x_polyn_regression)
plot = px.scatter(x = x_health.ravel(),
                  y = y_health)
plot.add_scatter(x = x_health.ravel(),
                 y = health_fit,
                 name = "Health Care Polynomial Regression")
# Title and axis labels so the figure can be read on its own.
plot.update_layout(title = "Health care cost vs. age (degree-4 polynomial fit)",
                   xaxis_title = "idade (age)",
                   yaxis_title = "custo (cost)")
plot.show()

# House prices data

In [19]:
# Load the house-price dataset (relative path, resolved against the working
# directory). Displaying the frame shows a truncated preview only.
data_house = pd.read_csv("house_prices.csv")
data_house

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.00,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.00,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.00,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3,2.50,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,20150223T000000,400000.0,4,2.50,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,20140623T000000,402101.0,2,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,20150116T000000,400000.0,3,2.50,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


In [23]:
# Features: the 16 columns at positions 3..18. Position 2 is the target
# (price), so the slice starts right after it; per the displayed output the
# last selected columns are zipcode, lat and long, which leaves the two
# trailing columns (sqft_living15, sqft_lot15) out of the model.
x_house = data_house.iloc[:, 3:19].to_numpy()
x_house

array([[ 3.00000e+00,  1.00000e+00,  1.18000e+03, ...,  9.81780e+04,
         4.75112e+01, -1.22257e+02],
       [ 3.00000e+00,  2.25000e+00,  2.57000e+03, ...,  9.81250e+04,
         4.77210e+01, -1.22319e+02],
       [ 2.00000e+00,  1.00000e+00,  7.70000e+02, ...,  9.80280e+04,
         4.77379e+01, -1.22233e+02],
       ...,
       [ 2.00000e+00,  7.50000e-01,  1.02000e+03, ...,  9.81440e+04,
         4.75944e+01, -1.22299e+02],
       [ 3.00000e+00,  2.50000e+00,  1.60000e+03, ...,  9.80270e+04,
         4.75345e+01, -1.22069e+02],
       [ 2.00000e+00,  7.50000e-01,  1.02000e+03, ...,  9.81440e+04,
         4.75941e+01, -1.22299e+02]])

In [22]:
# Target: the price column (position 2) as a 1-D array.
y_house = data_house.iloc[:, 2].to_numpy()
y_house

array([221900., 538000., 180000., ..., 402101., 400000., 325000.])

In [20]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 30% of the rows for evaluation; random_state is fixed so the
# split is the same on every run.
x_house_train, x_house_test, y_house_train, y_house_test = train_test_split(
    x_house,
    y_house,
    test_size=0.3,
    random_state=0,
)

In [25]:
# Sanity check: row counts of features and targets must match per split.
x_house_train.shape, x_house_test.shape, y_house_train.shape, y_house_test.shape

((15129, 16), (6484, 16), (15129,), (6484,))

In [26]:
# Degree-2 expansion of the house features: constant, linear, squared and
# pairwise-interaction terms.
polyn_regression = PolynomialFeatures(degree=2)
# Fit the transformer on the training split only...
x_polyn_regression_house_train = polyn_regression.fit_transform(x_house_train)
# ...and merely apply it to the test split. Preprocessing steps must never be
# re-fitted on held-out data; for PolynomialFeatures the numbers are the
# same, but this keeps the train/test discipline correct if the step is later
# swapped for a stateful one such as a scaler.
x_polyn_regression_house_test = polyn_regression.transform(x_house_test)

In [28]:
# Compare the column count after and before the polynomial expansion.
x_polyn_regression_house_train.shape, x_house_train.shape

((15129, 153), (15129, 16))

In [29]:
# Linear regression on the expanded training features; the test split is
# kept aside for the evaluation cells that follow.
polyn_regression_house = LinearRegression()
polyn_regression_house.fit(x_polyn_regression_house_train,y_house_train)

In [30]:
# b0: the fitted intercept.
# NOTE(review): the displayed value is several orders of magnitude larger
# than any price in the data — possibly a symptom of unscaled, strongly
# correlated polynomial terms; confirm before interpreting coefficients.
polyn_regression_house.intercept_

35640273876.77372

In [35]:
# R^2 (coefficient of determination) on the training split.
polyn_regression_house.score(x_polyn_regression_house_train,y_house_train)

0.8179364886058189

In [36]:
# R^2 (coefficient of determination) on the held-out test split; compare
# with the training score above to gauge overfitting.
polyn_regression_house.score(x_polyn_regression_house_test,y_house_test)

0.8153438434533382

In [38]:
# Predicted prices for the test split.
# NOTE: this rebinds `prediction`, which earlier held the health-care
# fitted values; cells that read `prediction` depend on execution order.
prediction = polyn_regression_house.predict(x_polyn_regression_house_test)
prediction

array([ 386510.41817474, 1991254.08815765,  578786.17447662, ...,
        433374.23888397,  234145.15529633,  123586.6026001 ])

In [39]:
# Actual test-split prices, shown for side-by-side comparison with the
# predictions above.
y_house_test

array([ 297000., 1578000.,  562100., ...,  380000.,  268000.,  206000.])

In [40]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,root_mean_squared_error

In [41]:
# MAE: average absolute difference between actual and predicted test prices,
# in the same units as the price column.
mean_absolute_error(y_house_test,prediction)

101223.30336258693

In [42]:
# MSE: average squared error on the test split (units are price squared).
mean_squared_error(y_house_test,prediction)

25351804114.030533

In [43]:
# RMSE: square root of the MSE, back in the units of the price column.
root_mean_squared_error(y_house_test,prediction)

159222.498768329