In [54]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [55]:
from __future__ import annotations

# Default hyperparameters shared by the descent / regression classes in this notebook.

# Defaults for the `s0` and `p` arguments of GradientDescent, whose step size is
# eta(k) = lambda_ * (s0 / (s0 + k)) ** p for iteration number k.
s0_default: float = 1
p_default: float = 0.2

# NOTE(review): not referenced by any code visible in this notebook — presumably
# the mini-batch size for a stochastic descent variant; confirm before relying on it.
batch_size_default: int = 1

# NOTE(review): not referenced by any code visible in this notebook — presumably
# momentum / adaptive-method constants; confirm their meaning against the intended users.
alpha_default: float = 0.1
eps_default: float = 1e-8

# NOTE(review): not referenced by any code visible in this notebook — meaning unconfirmed.
mu_default: float = 1e-2

# Defaults for the stopping criteria of LinearRegressionCustom
# (`tolerance` and `max_iter` constructor arguments).
tolerance_default: float = 1e-3
max_iter_default: int = 1000

class BaseDescent:
    """
    Base class that defines the interface shared by all descent methods.

    Subclasses must override ``calc_gradient`` and ``update_weights``;
    ``step`` simply chains the two.
    """

    def __init__(self):
        # Current weight vector; subclasses assign an array here.
        self.w = None

    def step(self, X: np.ndarray, y: np.ndarray, iteration: int) -> np.ndarray:
        """
        Descent step: compute the gradient on (X, y) and hand it to update_weights
        :param iteration: iteration number
        :param X: objects' features
        :param y: objects' targets
        :return: whatever the subclass' update_weights returns for this step
        :raises NotImplementedError: if the subclass did not override both hooks
        """
        gradient = self.calc_gradient(X, y)
        return self.update_weights(gradient, iteration)

    def update_weights(self, gradient: np.ndarray, iteration: int) -> np.ndarray:
        """
        Hook: apply one update given the gradient. Must be overridden.
        :param iteration: iteration number
        :param gradient: gradient
        :return: np.ndarray produced by the update
        :raises NotImplementedError: always, in the base class
        """
        # Fail loudly instead of silently returning None from an un-overridden hook.
        raise NotImplementedError(f'{type(self).__name__} update_weights function not implemented')

    def calc_gradient(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """
        Hook: compute the loss gradient at the current weights. Must be overridden.
        :param X: objects' features
        :param y: objects' targets
        :return: gradient: np.ndarray
        :raises NotImplementedError: always, in the base class
        """
        raise NotImplementedError(f'{type(self).__name__} calc_gradient function not implemented')

In [56]:
class GradientDescent(BaseDescent):
    """
    Full gradient descent class.

    Minimises the sum of squared errors ``||X w - y||^2`` with a step size that
    decays with the iteration number k: ``eta(k) = lambda_ * (s0 / (s0 + k)) ** p``.
    """

    def __init__(self, w0: np.ndarray, lambda_: float, s0: float = s0_default, p: float = p_default):
        """
        :param w0: weight initialization (copied, the caller's array is not modified)
        :param lambda_: learning rate parameter (float), base step size
        :param s0: learning rate parameter (float), offset of the decay schedule
        :param p: learning rate parameter (float), power of the decay schedule
        """
        super().__init__()
        self.eta = lambda k: lambda_ * (s0 / (s0 + k)) ** p
        self.w = np.copy(w0)

    def update_weights(self, gradient: np.ndarray, iteration: int) -> np.ndarray:
        """
        Changing weights with respect to gradient: w <- w - eta(iteration) * gradient
        :param iteration: iteration number
        :param gradient: gradient
        :return: the updated weights (not the weight difference): np.ndarray
        """
        new_weights = self.w - self.eta(iteration) * gradient

        # Store the result so the object really tracks the current point and can be
        # stepped repeatedly. Rebinding (instead of an in-place `-=`) keeps arrays
        # returned by earlier steps untouched.
        self.w = new_weights

        return new_weights

    def calc_gradient(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """
        Getting objects, calculating gradient at point w
        :param X: objects' features
        :param y: objects' targets
        :return: gradient: np.ndarray
        """
        residual = X @ self.w - y

        # Gradient of the *sum* (not the mean) of squared errors, so its magnitude
        # grows with the number of rows in X and the learning rate has to shrink accordingly.
        return 2 * X.transpose() @ residual

In [71]:
class LinearRegressionCustom:
    """
    Linear regression class.

    Fits weights ``w`` (no intercept term) so that ``X @ w`` approximates ``y``,
    using the descent class supplied at construction time.
    """

    def __init__(self, descent, lambda_: float, tolerance: float = tolerance_default, max_iter: int = max_iter_default):
        """
        :param descent: Descent class (the class itself, not an instance); it is
            instantiated in fit as ``descent(w0, lambda_=lambda_)``
        :param lambda_: learning rate parameter forwarded to the descent
        :param tolerance: float stopping criterion for the euclidean norm of weight difference
        :param max_iter: stopping criterion for iterations; converted with int(),
            so float spellings such as 1e4 are accepted
        """
        self.descent = descent
        self.tolerance = tolerance
        self.max_iter = int(max_iter)
        self.loss_history = []
        self.lambda_ = lambda_

        # Placeholder until fit is called.
        self.w = np.nan

    def fit(self, X: np.ndarray, y: np.ndarray) -> "LinearRegressionCustom":
        """
        Getting objects, fitting descent weights.

        Starts from zero weights and steps until the weight change drops below
        ``tolerance``, the weights become non-finite, or ``max_iter`` steps were made.
        ``loss_history`` is reset and receives one entry per step.
        :param X: objects' features
        :param y: objects' target
        :return: self
        """
        # A pandas Series target would otherwise leak into every matrix product.
        y = np.asarray(y)

        # Refitting must not append to the history of a previous fit.
        self.loss_history = []
        self.w = np.zeros(X.shape[1])

        # One descent object for the whole run instead of one per iteration.
        descent = self.descent(self.w, lambda_=self.lambda_)

        for iteration in range(1, self.max_iter + 1):
            w_old = self.w
            w_new = descent.step(X, y, iteration=iteration)

            # step() hands back the new weights; keep the descent's current point in
            # sync with them. The copy guards against descents that update in place.
            descent.w = np.copy(w_new)
            self.w = w_new

            self.calc_loss(X, y)

            # Diverged: NaN/inf never satisfies the tolerance test below, so without
            # this check the loop would keep spinning until max_iter.
            if not np.all(np.isfinite(w_new)):
                break

            if np.linalg.norm(w_old - w_new) < self.tolerance:
                break

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Getting objects, predicting targets
        :param X: objects' features
        :return: predicted targets
        """
        return X @ self.w

    def calc_loss(self, X: np.ndarray, y: np.ndarray) -> None:
        """
        Getting objects, calculating loss (sum of squared errors at the current
        weights) and appending it to loss_history
        :param X: objects' features
        :param y: objects' target
        """
        loss = ((X @ self.w - y) ** 2).sum()

        self.loss_history.append(loss)

In [58]:
# Small synthetic regression problem for quick experiments.
num_objects = 100
dimension = 5

# Seeded, local generator: the cell yields the same data on every run and does
# not touch numpy's global random state. Values are uniform on [0, 1).
rng = np.random.default_rng(42)
X = rng.random((num_objects, dimension))
y = rng.random(num_objects)

# Descent settings matching the synthetic problem.
lambda_ = 1e-2
w0 = np.zeros(dimension)

max_iter = 10
tolerance = 0

In [59]:
# Load the used-car listings. The first CSV column appears to be a saved row
# index (it is read as a spurious "Unnamed: 0" column otherwise), so use it as
# the frame's index instead of keeping it as data.
data = pd.read_csv(r'data/autos.csv', index_col=0)
data.head()

Unnamed: 0,brand,model,vehicleType,gearbox,fuelType,notRepairedDamage,powerPS,kilometer,yearOfRegistration,monthOfRegistration,dateCreated,lastSeen,postalCode,price
0,volkswagen,golf,kleinwagen,manuell,benzin,nein,75,150000,2001,6,2016-03-17 00:00:00,2016-03-17 17:40:17,91074,1500
1,skoda,fabia,kleinwagen,manuell,diesel,nein,69,90000,2008,7,2016-03-31 00:00:00,2016-04-06 10:17:21,60437,3600
2,bmw,3er,limousine,manuell,benzin,ja,102,150000,1995,10,2016-04-04 00:00:00,2016-04-06 19:17:07,33775,650
3,peugeot,2_reihe,cabrio,manuell,benzin,nein,109,150000,2004,8,2016-04-01 00:00:00,2016-04-05 18:18:39,67112,2200
4,mazda,3_reihe,limousine,manuell,benzin,nein,105,150000,2004,12,2016-03-26 00:00:00,2016-04-06 10:45:34,96224,2000


In [60]:
# Feature groups consumed by the preprocessing step below.
numerical = ['powerPS', 'kilometer', 'yearOfRegistration']
categorical = [
    'brand', 'model', 'vehicleType', 'gearbox', 'fuelType', 'notRepairedDamage',
]

# Target is the listing price; the date/postal columns and the target itself
# are removed from the feature frame.
y = data['price']
X = data.drop(columns=[
    'monthOfRegistration',
    'dateCreated',
    'lastSeen',
    'postalCode',
    'price',
])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# One-hot encode the categorical columns (categories unseen during fit are
# ignored) and standardise the numerical ones.
column_transformer = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('num', StandardScaler(), numerical),
])

In [61]:
# Baseline: preprocessing followed by a decision tree.
# random_state is fixed because the tree permutes features at every split, so
# ties between equally good splits would otherwise make the fit vary between runs.
pipeline = Pipeline(steps=[
    ('ohe_and_scaling', column_transformer),
    ('estimator', DecisionTreeRegressor(max_depth=100, min_samples_leaf=6, random_state=42)),
])

pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

# Metric arguments in the documented (y_true, y_pred) order.
print(mean_squared_error(y_test, pred))

# y_test keeps the original integer row labels, so take the first 20 rows
# explicitly by position.
print(pd.DataFrame({'pred': pred[:20], 'y_test': y_test.iloc[:20]}))

8395920.392735025
                pred  y_test
117337   9449.666667   11500
192600  11543.600000   12800
209225   1967.166667    1650
30818    7907.636364    7890
220792   7719.454545    9989
85576    1304.104167    1050
122929   1864.000000    1900
256      3635.714286    3600
80889   17371.142857    9600
77520    5671.800000    5450
217088   4694.181818    4545
128403   3456.666667    3350
211817   3916.500000    3900
148783   5799.000000    5200
183904   3703.250000    3900
91101    2481.125000    1600
55771    9031.666667    9395
146394  18297.000000   21000
5298     2468.900000    5600
1499     1606.990654    1800


In [62]:
# Same preprocessing, with the hand-written gradient-descent regression as the
# final estimator.
custom_regressor = LinearRegressionCustom(
    GradientDescent,
    lambda_=3e-6,
    tolerance=1e-6,
    max_iter=1e4,
)
pipeline = Pipeline(steps=[
    ('ohe_and_scaling', column_transformer),
    ('estimator', custom_regressor),
])

pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

print(pred[:6])

# MSE is symmetric in its two arguments, so the value is unchanged by writing
# them in the (y_true, y_pred) order.
mean_squared_error(y_test, pred)

1 25207755977731.07
2 48306207093171.22
3 105444396326566.23
4 231564950667141.06
5 494177907864486.8
6 1016056090567333.0
7 2009683118526653.0
8 3826218413558726.5
9 7020606020133512.0
10 1.2431977788103058e+16
11 2.1274534980311044e+16
12 3.5228791282309144e+16
13 5.6517064546828184e+16
14 8.794159349077346e+16
15 1.3286024925801632e+17
16 1.950752892932076e+17
17 2.7861645305943978e+17
18 3.8741160280441376e+17
19 5.248554700054857e+17
20 6.933077738296472e+17
21 8.935725483990787e+17
22 1.1244266314914193e+18
23 1.3822726600406141e+18
24 1.6609874139704963e+18
25 1.9520188074656369e+18
26 2.244755692515871e+18
27 2.5271579328764114e+18
28 2.786595878237355e+18
29 3.010815151325278e+18
30 3.1889206741840717e+18
31 3.3122673035683215e+18
32 3.375154701117193e+18
33 3.375249419406283e+18
34 3.313693231579689e+18
35 3.194897321742972e+18
36 3.026060467419762e+18
37 2.816479975055079e+18
38 2.5767428835311964e+18
39 2.3178902031163356e+18
40 2.050639471143972e+18
41 1.7847333933133627e+

363 3486711951945.9634
364 3486271079463.003
365 3485832004316.2964
366 3485394714386.1943
367 3484959197662.516
368 3484525442243.349
369 3484093436333.8643
370 3483663168245.152
371 3483234626393.056
372 3482807799297.049
373 3482382675579.094
374 3481959243962.5405
375 3481537493271.026
376 3481117412427.395
377 3480698990452.628
378 3480282216464.7837
379 3479867079677.963
380 3479453569401.2744
381 3479041675037.8184
382 3478631386083.69
383 3478222692126.977
384 3477815582846.7915
385 3477410048012.297
386 3477006077481.757
387 3476603661201.5913
388 3476202789205.446
389 3475803451613.274
390 3475405638630.427
391 3475009340546.754
392 3474614547735.726
393 3474221250653.5493
394 3473829439838.31
395 3473439105909.114
396 3473050239565.249
397 3472662831585.3477
398 3472276872826.564
399 3471892354223.7656
400 3471509266788.724
401 3471127601609.3257
402 3470747349848.789
403 3470368502744.8853
404 3469991051609.1724
405 3469614987826.2495
406 3469240302852.999
407 3468866988217

KeyboardInterrupt: 

In [None]:
# X_train = np.array([[1,2], [3,4], [6,9]])
# y_train = np.array([0, 2, 3])

# X_test = np.array([[1,3], [3,1]])
# y_test = np.array([-1, 5])

# # pipeline = Pipeline(steps=[
# #     ('estimator', LinearRegressionCustom(GradientDescent, 
# #                                          tolerance=1e-3, 
# #                                         max_iter=1e3))])
# pipeline = Pipeline(steps=[
#     ('estimator', LinearRegression())])

# pipeline.fit(X_train, y_train)

# pred = pipeline.predict(X_train)
# print(pred, mean_squared_error(pred, y_train), pipeline.named_steps['estimator'].coef_)

# print(pipeline.predict([[1,1]]))

# pred = pipeline.predict(X_test)

# print(pred, mean_squared_error(pred, y_test))

In [75]:
# Tiny hand-checkable problem for sanity-testing the custom regression.
X_train_smpl = np.array([[1, 2], [3, 4], [6, 9]])
y_train_smpl = np.array([0, 2, 3])

X_test_smpl = np.array([[1, 3], [3, 1]])
y_test_smpl = np.array([-1, 5])

toy_regressor = LinearRegressionCustom(
    GradientDescent,
    lambda_=7e-3,
    tolerance=1e-6,
    max_iter=1e5,
)
pipeline = Pipeline(steps=[('estimator', toy_regressor)])
pipeline.fit(X_train_smpl, y_train_smpl)

# Predictions on the train and test points, then the test MSE.
train_pred = pipeline.predict(X_train_smpl)
pred = pipeline.predict(X_test_smpl)

print(train_pred, pred, mean_squared_error(y_test_smpl, pred))

[5.57793477e-04 1.99951519e+00 3.00010948e+00] [-0.99836311  4.99627791] 8.266678361076879e-06
