Prepared by: Ömer Coşkun <br>
*Quick note: This notebook is prepared to show how an acceptable solution should look like. Better solutions are always possible.*                         

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
import seaborn as sns

In [2]:
df = pd.read_csv('real_estate.csv')
df.head()

Unnamed: 0,No,transactiondate,houseage,distancetostation,numberofstores,latitude,longitude,housepriceofunitarea
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


# Task 1

In this task, you are expected to:

* Make a linear regression analysis.
* Use the normal equation for your linear regression analysis to verify your results.
* Make a polynomial regression analysis (You can decide on the Nth degree).

You can drop the features that you find unnecessary.
<br>
You can use sklearn module to perform your analysis.

In [3]:
X = df.drop(columns=["housepriceofunitarea"])
y = df["housepriceofunitarea"]
model = LinearRegression()
model.fit(X, y)
model.score(X, y)

0.5833632616324671

In [4]:
X = df.drop(columns=["housepriceofunitarea", "No", "transactiondate"])
y = df["housepriceofunitarea"]
model = LinearRegression()
model.fit(X, y)
model.score(X, y)

0.5711617064827434

## sklearn

In [5]:
X = df.drop(columns=["housepriceofunitarea"])
y = df["housepriceofunitarea"]
model = LinearRegression()
model.fit(X, y)
print("r^2:", model.score(X, y))
print(f"w0: {model.intercept_}")
for i in range(len(model.coef_)):
    print(f"w{i+1}: {model.coef_[i]}")

r^2: 0.5833632616324671
w0: -14038.07978292443
w1: -0.0035935012815118787
w2: 5.079087316403185
w3: -0.2708419593068578
w4: -0.0045207901079944435
w5: 1.1292774867928073
w6: 224.6728922281327
w7: -14.423563462462413


## normal equation

In [6]:
X = df.drop(columns=["housepriceofunitarea"])
y_train = df["housepriceofunitarea"]

X = np.c_[np.ones(X.shape[0]), X]

w = np.linalg.solve(np.dot(X.T,X), np.dot(X.T,y_train))

y_average = np.mean(y_train)
SSR = sum([(y - w.dot(x.T))**2 for x, y in zip(X, y_train)])
SST = sum([(y - y_average)**2 for y in y_train])
r2 = 1 - SSR / SST
print("r^2:", r2)

for i in range(len(w)):
    print(f"w{i+1}: {float(w[i])}")

r^2: 0.5833632616324693
w1: -14038.080688278364
w2: -0.0035935012567931417
w3: 5.079087340387547
w4: -0.2708419591176207
w5: -0.0045207900348465556
w6: 1.1292774883303165
w7: 224.67289280365443
w8: -14.42355652934092


## Polynomial Regression

In [7]:
X = df.drop(columns=["housepriceofunitarea"])
y = df["housepriceofunitarea"]

N_poly_degree=2
poly = PolynomialFeatures(N_poly_degree)
x_transformed=poly.fit_transform(X)
x_transformed

model = LinearRegression()
model.fit(x_transformed, y)
print("r^2:", model.score(x_transformed ,y))

print(model.coef_)
print(model.intercept_)

r^2: 0.7122457326689602
[-5.14341827e+02  5.72631363e+01 -1.89687226e+04  5.47187601e+02
  1.75804297e+01  9.41467510e+03  7.02480238e+05  1.42231813e+06
  2.15773324e-05  6.82565102e-03  5.11086759e-04 -3.38389945e-06
  2.89503128e-04 -2.16886152e-01 -5.39818423e-01  7.20852259e+00
  6.38926828e-02 -2.87688866e-03 -5.57679332e-01 -1.19734808e+02
 -5.80544147e+01  1.93811244e-02  1.26163999e-05  1.04876027e-02
 -7.50098847e+00 -4.02924036e+00  2.13279250e-07 -1.54591840e-03
 -2.92551324e-01 -3.69280833e-02  3.97911684e-03 -1.17185534e+02
 -4.41387833e+01  2.59701073e+03 -4.85369091e+03 -4.87082019e+03]
-76150384.27728881
