# Import libraries

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler

# Load data and take a look on them

In [2]:
data = pd.read_csv('real_estate_price_size_year.csv')

In [3]:
data.head()

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009


In [4]:
data.describe()

Unnamed: 0,price,size,year
count,100.0,100.0,100.0
mean,292289.47016,853.0242,2012.6
std,77051.727525,297.941951,4.729021
min,154282.128,479.75,2006.0
25%,234280.148,643.33,2009.0
50%,280590.716,696.405,2015.0
75%,335723.696,1029.3225,2018.0
max,500681.128,1842.51,2018.0


# Regression

In [5]:
# Dependent and independent variables
x = data[['size', 'year']]
y = data['price']

In [6]:
# Create object
reg = LinearRegression()
# Fit the model
reg.fit(x, y)

In [7]:
# R-squared
reg.score(x, y)

0.7764803683276795

In [8]:
# Intercept
reg.intercept_

-5772267.017463279

In [9]:
# Coefficients
reg.coef_

array([ 227.70085401, 2916.78532684])

## Adjusted R^2

$R^2_{adj.} = 1 - (1-R^2)*\frac{n-1}{n-p-1}$

In [10]:
def adjusted_r2(x, y):
    r2 = reg.score(x, y)
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    return adjusted_r2

In [11]:
adjusted_r2(x, y)

0.7718717161282502

__Adjusted R-squared is only a little smaller than R-squared, what means that we were not penalized a lot for inclusion both independent variables.__

# Make prediction

In [12]:
# Prediction for apartament with 750 sq.ft from year 2009
new_data = pd.DataFrame(data=[[750, 2009]], columns=['size', 'year'])
reg.predict(new_data)

array([258330.34465995])

# Simple feature selection 

In [13]:
# P-values
p_values = f_regression(x, y)[1]
p_values

array([8.12763222e-31, 3.57340758e-01])

# Summary table

In [14]:
regression_summary = pd.DataFrame(data=x.columns.values, columns=['Features'])
regression_summary['Coefficients'] = reg.coef_
regression_summary['P-values'] = p_values.round(3)
regression_summary

Unnamed: 0,Features,Coefficients,P-values
0,size,227.700854,0.0
1,year,2916.785327,0.357


__Year is not useful in our regression as p-value is almost equal to 0 so we should not include Year in our model. After removing second independent variable we would get single linear regression so I will not do this here as single regression is in other task.__

# Standardization

In [15]:
# Instance of StandardScaler
scaler = StandardScaler()
# Fit scaler
scaler.fit(x)
# Scale data
x_scaled = scaler.transform(x)
x_scaled = pd.DataFrame(x_scaled, columns=['size', 'year'])
x_scaled

Unnamed: 0,size,year
0,-0.708164,0.510061
1,-0.663873,-0.765092
2,-1.233719,1.147638
3,2.198445,0.510061
4,1.424989,-0.765092
...,...,...
95,-1.022856,-0.765092
96,0.622084,-0.765092
97,2.198445,-1.402669
98,-0.690623,0.510061


# Regression after standardization

In [16]:
# Create object
reg_scaled = LinearRegression()
# Fit the model with scaled inputs
reg_scaled.fit(x_scaled, y)

In [17]:
# R-squared
reg.score(x_scaled, y)

-6257.450009944134

In [18]:
# Adjusted R-squared
adjusted_r2(x_scaled,y)

-6386.490216334734

In [19]:
# Intercept
reg_scaled.intercept_

292289.4701599997

In [20]:
# Coefficients
reg_scaled.coef_

array([67501.57614152, 13724.39708231])

In [21]:
# Scale new data
new_data_scaled = scaler.transform(new_data)
new_data_scaled = pd.DataFrame(new_data_scaled, columns=['size', 'year'])
new_data_scaled


Unnamed: 0,size,year
0,-0.347528,-0.765092


In [22]:
reg_scaled.predict(new_data_scaled)

array([258330.34465995])

__Result is same as without standardization, it can be caused because this dataset was extremely clean and probably artificially created. Due to this, standardization does not change prediction at all.__