In [171]:
#experiment tracking
import mlflow
import os
# This the dockerized method.
# We build two docker containers, one for python/jupyter and another for mlflow.
# The url `mlflow` is resolved into another container within the same composer.
mlflow.set_tracking_uri("http://mlflow:5000")
# In the dockerized way, the user who runs this code will be `root`.
# The MLflow will also log the run user_id as `root`.
# To change that, we need to set this environ["LOGNAME"] to your name.
os.environ["LOGNAME"] = "reza370"
mlflow.create_experiment(name="Reza A5")  #create if you haven't create
mlflow.set_experiment(experiment_name="Reza A5")

<Experiment: artifact_location='mlflow-artifacts:/727757533792890976', creation_time=1694346934716, experiment_id='727757533792890976', last_update_time=1694346934716, lifecycle_stage='active', name='Reza A5', tags={}>

In [172]:
import pandas as pd
df_cars=pd.read_csv('Cars.csv')
owner_coding = {
    'First Owner': 1,
    'Second Owner': 2,
    'Third Owner': 3,
    'Fourth & Above Owner': 4,
    'Test Drive Car': 5
}
df_cars['owner'] = df_cars['owner'].map(owner_coding)
#2
df_cars = df_cars[df_cars['fuel'].isin(['Petrol', 'Diesel'])]
#3
df_cars.mileage = df_cars.mileage.str.split(expand=True)[0].astype(float)
#4
df_cars.engine = df_cars.engine.str.split(expand=True)[0].astype(float)
#5
df_cars.loc[df_cars['max_power'] == 'bph', 'max_power'] = ' bph'
df_cars.max_power = df_cars.max_power.str.split(expand=True)[0].astype(float)
#6
df_cars.name=df_cars.name.str.split(expand=True)[0]
#7
df_cars = df_cars.drop(columns=['torque'])
#8
df_cars = df_cars[df_cars['owner'] != 5]
#9
import numpy as np
df_cars['selling_price'] = np.log(df_cars['selling_price'])
from datetime import datetime
now = datetime.now()
df_cars['car_age'] = int(now.strftime("%Y")) - df_cars['year']


In [173]:
df_cars.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats,car_age
0,Maruti,2014,13.017003,145500,Diesel,Individual,Manual,1,23.4,1248.0,74.0,5.0,9
1,Skoda,2014,12.821258,120000,Diesel,Individual,Manual,2,21.14,1498.0,103.52,5.0,9
2,Honda,2006,11.97035,140000,Petrol,Individual,Manual,3,17.7,1497.0,78.0,5.0,17
3,Hyundai,2010,12.323856,127000,Diesel,Individual,Manual,1,23.0,1396.0,90.0,5.0,13
4,Maruti,2007,11.77529,120000,Petrol,Individual,Manual,1,16.1,1298.0,88.2,5.0,16


In [174]:
X = df_cars[['max_power', 'mileage', 'car_age']]
y = df_cars['selling_price']
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 370)
X_train['max_power'].fillna(X_train['max_power'].median(), inplace=True)
X_train['mileage'].fillna(X_train['mileage'].mean(), inplace=True)
X_test['max_power'].fillna(X_train['max_power'].median(), inplace=True)
X_test['mileage'].fillna(X_train['mileage'].mean(), inplace=True)

In [175]:
X_train

Unnamed: 0,max_power,mileage,car_age
1937,112.00,14.000000,11
3966,67.00,20.630000,5
6823,46.30,19.700000,14
5447,82.00,19.394267,15
4446,88.50,24.520000,6
...,...,...,...
3898,88.70,19.000000,8
6268,81.80,21.210000,4
4143,81.83,18.600000,6
5267,81.80,21.010000,3


In [176]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
from time import time




In [177]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)



In [178]:
X_train = np.insert(X_train, 0, 1, axis=1)
X_test = np.insert(X_test, 0, 1, axis=1)

In [179]:
y_train = np.array(y_train)
y_test=np.array(y_test)

In [180]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5619, 4), (2409, 4), (5619,), (2409,))

In [181]:
type(X_train), type(X_test), type(y_train), type(y_test)

(numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)

In [182]:
from sklearn.model_selection import KFold

class LinearRegression(object):
    
    kfold = KFold(n_splits=3)
            
    def __init__(self, regularization, lr=0.001, method='batch', num_epochs=500, batch_size=50, cv=kfold, initial = 'Xavier'):
        self.lr         = lr
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.method     = method
        self.cv         = cv
        self.initial    = initial
        self.regularization = regularization

    def mse(self, ytrue, ypred):
        return ((ypred - ytrue) ** 2).sum() / ytrue.shape[0]
    
    def r_squared(self, y_true, y_pred):
        ss_residual = ((y_pred - y_true) ** 2).sum()
        ss_total = (((y_true) - np.mean(y_true)) ** 2).sum()
        r2 = 1 - (ss_residual / ss_total)
        return r2
    
    def fit(self, X_train, y_train):
            
        #create a list of kfold scores
        self.kfold_scores = list()
        
        #reset val loss
        self.val_loss_old = np.infty

        #kfold.split in the sklearn.....
        #5 splits
        for fold, (train_idx, val_idx) in enumerate(self.cv.split(X_train)):
            
            X_cross_train = X_train[train_idx]
            y_cross_train = y_train[train_idx]
            X_cross_val   = X_train[val_idx]
            y_cross_val   = y_train[val_idx]
            
            #using Xavier:::::::
            limit = np.sqrt(6.0 / (X.shape[0] + 1))
            self.theta = np.random.uniform(-limit, limit, size=X_cross_train.shape[1])
            #self.theta = np.zeros(X_cross_train.shape[1])
            
            #define X_cross_train as only a subset of the data
            #how big is this subset?  => mini-batch size ==> 50
            
            #one epoch will exhaust the WHOLE training set
            with mlflow.start_run(run_name=f"Fold-{fold}", nested=True):
                
                params = {"method": self.method, "lr": self.lr, "reg": type(self).__name__, "initial":self.initial}
                mlflow.log_params(params=params)
                
                for epoch in range(self.num_epochs):
                
                    #with replacement or no replacement
                    #with replacement means just randomize
                    #with no replacement means 0:50, 51:100, 101:150, ......300:323
                    #shuffle your index
                    perm = np.random.permutation(X_cross_train.shape[0])
                            
                    X_cross_train = X_cross_train[perm]
                    y_cross_train = y_cross_train[perm]
                    
                    if self.method == 'sto':
                        for batch_idx in range(X_cross_train.shape[0]):
                            X_method_train = X_cross_train[batch_idx].reshape(1, -1) #(11,) ==> (1, 11) ==> (m, n)
                            y_method_train = y_cross_train[batch_idx] 
                            train_loss = self._train(X_method_train, y_method_train)
                    elif self.method == 'mini':
                        for batch_idx in range(0, X_cross_train.shape[0], self.batch_size):
                            #batch_idx = 0, 50, 100, 150
                            X_method_train = X_cross_train[batch_idx:batch_idx+self.batch_size, :]
                            y_method_train = y_cross_train[batch_idx:batch_idx+self.batch_size]
                            train_loss = self._train(X_method_train, y_method_train)
                    else:
                        X_method_train = X_cross_train
                        y_method_train = y_cross_train
                        train_loss = self._train(X_method_train, y_method_train)

                    mlflow.log_metric(key="train_loss", value=train_loss, step=epoch)

                    yhat_val = self.predict(X_cross_val)
                    #val_loss_new = self.mse(y_cross_val, yhat_val)
                    val_loss_new = self.r_squared(y_cross_val, yhat_val)
                    mlflow.log_metric(key="val_loss", value=val_loss_new, step=epoch)
                    
                    #record dataset
                    #mlflow_train_data = mlflow.data.from_numpy(features=X_method_train, targets=y_method_train)
                    #mlflow.log_input(mlflow_train_data, context="training")
                    
                    #mlflow_val_data = mlflow.data.from_numpy(features=X_cross_val, targets=y_cross_val)
                    #mlflow.log_input(mlflow_val_data, context="validation")
                    
                    #early stopping
                    if np.allclose(val_loss_new, self.val_loss_old):
                        break
                    self.val_loss_old = val_loss_new
            
                self.kfold_scores.append(val_loss_new)
                print(f"Fold {fold}: {val_loss_new}")
            
                    
    def _train(self, X, y):
        yhat = self.predict(X)
        m    = X.shape[0]        
        grad = (1/m) * X.T @(yhat - y) + self.regularization.derivation(self.theta)
        self.theta = self.theta - self.lr * grad
        #return self.mse(y, yhat)
        return self.r_squared(y, yhat)
    
    def predict(self, X):
        return X @ self.theta  #===>(m, n) @ (n, )
    
    def _coef(self):
        return self.theta[1:]  #remind that theta is (w0, w1, w2, w3, w4.....wn)
                               #w0 is the bias or the intercept
                               #w1....wn are the weights / coefficients / theta
    def _bias(self):
        return self.theta[0]

In [183]:
class LassoPenalty:
    
    def __init__(self, l):
        self.l = l # lambda value
        
    def __call__(self, theta): #__call__ allows us to call class as method
        return self.l * np.sum(np.abs(theta))
        
    def derivation(self, theta):
        return self.l * np.sign(theta)
    
class RidgePenalty:
    
    def __init__(self, l):
        self.l = l
        
    def __call__(self, theta): #__call__ allows us to call class as method
        return self.l * np.sum(np.square(theta))
        
    def derivation(self, theta):
        return self.l * 2 * theta
    
class ElasticPenalty:
    
    def __init__(self, l = 0.1, l_ratio = 0.5):
        self.l = l 
        self.l_ratio = l_ratio

    def __call__(self, theta):  #__call__ allows us to call class as method
        l1_contribution = self.l_ratio * self.l * np.sum(np.abs(theta))
        l2_contribution = (1 - self.l_ratio) * self.l * 0.5 * np.sum(np.square(theta))
        return (l1_contribution + l2_contribution)

    def derivation(self, theta):
        l1_derivation = self.l * self.l_ratio * np.sign(theta)
        l2_derivation = self.l * (1 - self.l_ratio) * theta
        return (l1_derivation + l2_derivation)
    
class Lasso(LinearRegression):
    
    def __init__(self, method, lr, l):
        self.regularization = LassoPenalty(l)
        super().__init__(self.regularization, lr, method)
        
class Ridge(LinearRegression):
    
    def __init__(self, method, lr, l):
        self.regularization = RidgePenalty(l)
        super().__init__(self.regularization, lr, method)
        
class ElasticNet(LinearRegression):
    
    def __init__(self, method, lr, l, l_ratio=0.5):
        self.regularization = ElasticPenalty(l, l_ratio)
        super().__init__(self.regularization, lr, method)


In [184]:
#helper function for looping classnames
import sys

def str_to_class(classname):
    return getattr(sys.modules[__name__], classname)

In [185]:
regs = ["Ridge", "Lasso", "ElasticNet"]

for reg in regs:

    params = {"method": "batch", "lr": 0.1, "l": 0.1}
    mlflow.start_run(run_name=f"method-{params['method']}-lr-{params['lr']}-reg-{reg}-initial-{'Xavier'}", nested=True)
    
    print("="*5, reg, "="*5)

    # #######
    type_of_regression = str_to_class(reg)    #Ridge, Lasso, ElasticNet
    model = type_of_regression(**params)  
    model.fit(X_train, y_train)
    yhat = model.predict(X_test)
    mse  = model.mse(yhat, y_test)
    r_squared = model.r_squared(y_test, yhat)

    print("Test mse: ", mse)
    mlflow.log_metric(key="mse", value=mse)
    print("Test R2: ", r_squared)
    mlflow.log_metric(key="r_squared", value=r_squared)
    signature = mlflow.models.infer_signature(X_train, model.predict(X_train))
    mlflow.sklearn.log_model(model, artifact_path='model', signature=signature)

    # #######

    mlflow.end_run()

===== Ridge =====
Fold 0: -5.832485497597063
Fold 1: -5.855699048130322
Fold 2: -5.8269302229061815
Test mse:  4.794188154382868
Test R2:  -5.863298145466894




===== Lasso =====
Fold 0: 0.7956588394342786
Fold 1: 0.790635534609066
Fold 2: 0.8028494953798337
Test mse:  0.14678743187917795
Test R2:  0.7898609990779777
===== ElasticNet =====
Fold 0: 0.18758993544673486
Fold 1: 0.1856598620289669
Fold 2: 0.20472173790473125
Test mse:  0.5662693215942306
Test R2:  0.18933611706927977
