In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import mlflow
import sys
import time
import warnings

from sklearn.metrics import classification_report
from sklearn.preprocessing import RobustScaler
warnings.filterwarnings('ignore')

In [2]:
# Display the installed versions of NumPy, pandas, seaborn and matplotlib (in that order).
np.__version__, pd.__version__, sns.__version__, matplotlib.__version__

('1.24.4', '1.5.3', '0.12.2', '3.7.2')

### 1. Load Data

In [3]:
# Load the preprocessed data.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a
# saved row index; it is not selected as a feature later, so it is harmless here.

df = pd.read_csv('../dataset/prepocessed_cars.csv')

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,max_power,engine,mileage,selling_price
0,72.4,2499.0,13.58,12.9808
1,62.1,2523.0,15.96,12.89922
2,88.76,1248.0,20.77,13.122363
3,90.0,1248.0,18.8,12.154779
4,90.0,1396.0,23.0,12.278393


In [5]:
# Disabled on purpose: this would replace selling_price with exp(selling_price).
# NOTE(review): presumably selling_price is stored on a log scale upstream - confirm
# before re-enabling, because the bins used below are defined on the current scale.
# df['selling_price'] = np.exp(df['selling_price'])

# df.head()

### 2. Converting selling price into discrete values

In [6]:
# Print the smallest and then the largest selling price to pick bin edges.
for extreme_of in (min, max):
    print(extreme_of(df['selling_price']))

10.308919326755392
16.11809565095832


In [7]:
# bins = [29999, 2500000, 5000000, 7500000, 100000000]
# Four equal-width (1.75) bins spanning the observed selling_price range.
# pd.cut uses right-closed intervals by default: (10, 11.75], (11.75, 13.5], ...
bins = [10, 11.75, 13.5, 15.25, 17]
labels = [0, 1, 2, 3]

# Only bin while the column is still numeric. This cell overwrites the column in
# place, so without the guard a second execution would try to cut the already
# categorical labels instead of the original prices.
if not isinstance(df['selling_price'].dtype, pd.CategoricalDtype):
    df['selling_price'] = pd.cut(df['selling_price'], bins=bins, labels=labels)

df.head()

Unnamed: 0,max_power,engine,mileage,selling_price
0,72.4,2499.0,13.58,1
1,62.1,2523.0,15.96,1
2,88.76,1248.0,20.77,1
3,90.0,1248.0,18.8,1
4,90.0,1396.0,23.0,1


Here, the selling price column has been binned into four categories (0, 1, 2, 3) based on its range. Let's see the number of samples in each category.

In [8]:
# Report how many rows fall into each price class.
price_classes = df['selling_price']
for label in labels:
    rows_in_class = (price_classes == label).sum()
    print(f"Number of category {label}: {rows_in_class}")


Number of category 0: 550
Number of category 1: 5752
Number of category 2: 1606
Number of category 3: 120


### 3. Test Train Split

In [9]:
# Feature matrix: the three numeric predictors; target: the binned selling price.
feature_columns = ['max_power', 'engine', 'mileage']
X = df[feature_columns]

y = df['selling_price']

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### 4. Scaling

In [11]:
# Fit the scaler on the training split only, then apply the same fitted
# transform to both splits so no test-set statistics are used for scaling.
scaler = RobustScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Prepend a column of ones to each split so the first weight row acts as the bias.
# Run this cell once per kernel session: each execution adds another column.
intercept = np.ones((X_train.shape[0], 1))
X_train = np.hstack((intercept, X_train))
intercept = np.ones((X_test.shape[0], 1))
X_test = np.hstack((intercept, X_test))

In [13]:
# One-hot encode the training labels with pandas.
# NOTE(review): the following cell reassigns Y_train_encoded with a manually built
# NumPy array, so this result is overwritten before it is used.
Y_train_encoded = pd.get_dummies(y_train)

In [14]:
# Problem dimensions: k classes, m training rows, n columns (intercept included).
k = len(set(y))
m, n = X_train.shape

# Build the (m, k) one-hot target matrix: row i has a 1 in the column of its class.
Y_train_encoded = np.zeros((m, k))
for each_class in range(k):
    cond = (y_train == each_class).to_numpy()
    Y_train_encoded[cond, each_class] = 1

In [15]:
# Sanity-check the shapes of both splits before fitting any model.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {split_name}: ", split_data.shape)

Shape of X_train:  (5619, 4)
Shape of X_test:  (2409, 4)
Shape of y_train:  (5619,)
Shape of y_test:  (2409,)


In [16]:
# Inspect the one-hot encoded training targets (one row per sample, one column per class).
Y_train_encoded

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.]])

In [17]:
# Inspect the held-out labels; these stay as class ids (not one-hot) for evaluation.
y_test

5867    1
5957    2
3032    1
6445    0
318     1
       ..
5670    2
1003    1
4801    2
7446    0
555     1
Name: selling_price, Length: 2409, dtype: category
Categories (4, int64): [0 < 1 < 2 < 3]

### 5. Modeling

In [18]:
class LogisticRegression:
    """Multinomial (softmax) logistic regression trained by gradient descent.

    Parameters
    ----------
    regularization : callable object
        Penalty object. Calling it with the weight matrix returns the penalty
        added to the loss; its ``derivation(W)`` method returns the term added
        to the gradient.
    method : str
        One of ``"batch"``, ``"minibatch"`` or ``"stochastic"``.
    alpha : float
        Learning rate.
    k : int
        Number of classes (columns of the weight matrix).
    n : int
        Number of input columns (rows of the weight matrix).
    max_iter : int, default 5000
        Number of gradient updates performed by ``fit``.
    """

    def __init__(self, regularization, method, alpha, k, n, max_iter=5000):
        self.regularization = regularization
        self.k = k
        self.n = n
        self.method = method
        self.alpha = alpha
        self.max_iter = max_iter

    def fit(self, X, Y):
        """Train the weights ``self.W`` on ``X`` (m, n) and one-hot ``Y`` (m, k).

        The loss of every update is appended to ``self.losses`` and printed
        every 500 iterations; the elapsed wall-clock time is printed at the end.

        Raises
        ------
        ValueError
            If ``X`` has no rows or ``self.method`` is not a supported method.
        """
        m = X.shape[0]
        if m == 0:
            raise ValueError("X must contain at least one sample.")

        self.W = np.random.rand(self.n, self.k)
        self.losses = []

        if self.method == "batch":
            start_time = time.time()
            for i in range(self.max_iter):
                self._step(X, Y, i)
            print(f"time taken: {time.time() - start_time}")

        elif self.method == "minibatch":
            start_time = time.time()
            # At least one row per batch, even for very small data sets.
            batch_size = max(1, int(0.3 * m))
            for i in range(self.max_iter):
                # Restrict the start offset so every contiguous window holds
                # exactly batch_size rows (no truncated batch at the end).
                ix = np.random.randint(0, m - batch_size + 1)
                self._step(X[ix:ix + batch_size], Y[ix:ix + batch_size], i)
            print(f"time taken: {time.time() - start_time}")

        elif self.method == "stochastic":
            start_time = time.time()
            # Sample without replacement: walk through a random permutation of
            # the rows and reshuffle once every row has been used.
            order = np.random.permutation(m)
            cursor = 0
            for i in range(self.max_iter):
                if cursor == m:
                    order = np.random.permutation(m)
                    cursor = 0
                idx = order[cursor]
                cursor += 1
                sample_X = X[idx, :].reshape(1, -1)
                sample_Y = Y[idx].reshape(1, -1)
                self._step(sample_X, sample_Y, i)
            print(f"time taken: {time.time() - start_time}")

        else:
            raise ValueError('Method must be one of the followings: "batch", "minibatch" or "stochastic".')

    def _step(self, X, Y, i):
        """Run one gradient-descent update on (X, Y) and record/log the loss."""
        loss, grad = self.gradient(X, Y)
        self.losses.append(loss)
        self.W = self.W - self.alpha * grad
        if i % 500 == 0:
            print(f"Loss at iteration {i}", loss)

    def gradient(self, X, Y):
        """Return ``(loss, grad)`` for the current weights on the batch (X, Y).

        The loss is the mean cross-entropy plus the regularization penalty.
        NOTE(review): the gradient is the *summed* data gradient (not divided
        by the batch size) plus the penalty derivative, so the effective step
        size grows with the number of rows in the batch.
        """
        m = X.shape[0]
        h = self.h_theta(X, self.W)
        # Clip before the log so a probability that underflows to 0 cannot
        # turn the loss into nan/inf.
        log_h = np.log(np.clip(h, 1e-15, 1.0))
        loss = - np.sum(Y * log_h) / m + self.regularization(self.W)
        error = h - Y
        grad = self.softmax_grad(X, error) + self.regularization.derivation(self.W)
        return loss, grad

    def softmax(self, theta_t_x):
        """Row-wise softmax of a 2-D array of logits."""
        # Subtracting the row maximum leaves the result unchanged mathematically
        # but prevents exp() from overflowing for large logits.
        shifted = theta_t_x - np.max(theta_t_x, axis=1, keepdims=True)
        exp_shifted = np.exp(shifted)
        return exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)

    def softmax_grad(self, X, error):
        """Data term of the gradient: ``X.T @ error``."""
        return X.T @ error

    def h_theta(self, X, W):
        """Predicted class probabilities for ``X`` under weights ``W``."""
        return self.softmax(X @ W)

    def predict(self, X_test):
        """Return the index of the most probable class for every row."""
        return np.argmax(self.h_theta(X_test, self.W), axis=1)

    def plot(self):
        """Plot the recorded training losses (one point per update)."""
        plt.plot(np.arange(len(self.losses)), self.losses, label="Train Losses")
        plt.title("Losses")
        plt.xlabel("epoch")
        plt.ylabel("losses")
        plt.legend()

    def accuracy(self, y, y_pred):
        """Fraction of positions where ``y`` equals ``y_pred``."""
        correct_predictions = np.sum(y == y_pred)
        total_predictions = y.shape[0]
        return correct_predictions / total_predictions

    def precision(self, y, y_pred, c=0):
        """Precision for class ``c``; 0 when class ``c`` is never predicted."""
        true_positives = np.sum((y == c) & (y_pred == c))
        false_positives = np.sum((y != c) & (y_pred == c))
        if true_positives + false_positives == 0:
            return 0
        else:
            return true_positives / (true_positives + false_positives)

    def recall(self, y, y_pred, c=0):
        """Recall for class ``c``; 0 when class ``c`` does not occur in ``y``."""
        true_positives = np.sum((y == c) & (y_pred == c))
        false_negatives = np.sum((y == c) & (y_pred != c))
        if true_positives + false_negatives == 0:
            return 0
        else:
            return true_positives / (true_positives + false_negatives)

    def f1_score(self, y, y_pred, c=0):
        """Harmonic mean of precision and recall for class ``c`` (0 if both are 0)."""
        precision = self.precision(y, y_pred, c)
        recall = self.recall(y, y_pred, c)
        if precision + recall == 0:
            return 0
        else:
            return 2 * precision * recall / (precision + recall)

    def macro_precision(self, y, y_pred):
        """Unweighted mean of the per-class precisions over classes 0..k-1."""
        precisions = [self.precision(y, y_pred, c) for c in range(self.k)]
        return np.sum(precisions) / self.k

    def macro_recall(self, y, y_pred):
        """Unweighted mean of the per-class recalls over classes 0..k-1."""
        recalls = [self.recall(y, y_pred, c) for c in range(self.k)]
        return np.sum(recalls) / self.k

    def macro_f1(self, y, y_pred):
        """Unweighted mean of the per-class F1 scores over classes 0..k-1."""
        f1s = [self.f1_score(y, y_pred, c) for c in range(self.k)]
        return np.sum(f1s) / self.k

    def weighted_precision(self, y, y_pred):
        """Per-class precision averaged with weights equal to class frequency in ``y``."""
        class_counts = [np.count_nonzero(y == c) for c in range(self.k)]
        precisions = [class_counts[c] / len(y) * self.precision(y, y_pred, c) for c in range(self.k)]
        return np.sum(precisions)

    def weighted_recall(self, y, y_pred):
        """Per-class recall averaged with weights equal to class frequency in ``y``."""
        class_counts = [np.count_nonzero(y == c) for c in range(self.k)]
        recalls = [class_counts[c] / len(y) * self.recall(y, y_pred, c) for c in range(self.k)]
        return np.sum(recalls)

    def weighted_f1(self, y, y_pred):
        """Per-class F1 averaged with weights equal to class frequency in ``y``."""
        class_counts = [np.count_nonzero(y == c) for c in range(self.k)]
        f1s = [class_counts[c] / len(y) * self.f1_score(y, y_pred, c) for c in range(self.k)]
        return np.sum(f1s)

    def classification_report(self, y, y_pred):
        """Return a DataFrame with per-class, accuracy, macro and weighted metrics.

        Rows are the class ids 0..k-1 followed by "accuracy", "macro" and
        "weighted"; columns are "precision", "recall" and "f1-score". The
        accuracy row leaves the first two columns as empty strings.
        """
        cols = ["precision", "recall", "f1-score"]
        idx = list(range(self.k)) + ["accuracy", "macro", "weighted"]

        report = [[self.precision(y, y_pred, c),
                   self.recall(y, y_pred, c),
                   self.f1_score(y, y_pred, c)] for c in range(self.k)]

        report.append(["", "", self.accuracy(y, y_pred)])

        report.append([self.macro_precision(y, y_pred),
                       self.macro_recall(y, y_pred),
                       self.macro_f1(y, y_pred)])

        report.append([self.weighted_precision(y, y_pred),
                       self.weighted_recall(y, y_pred),
                       self.weighted_f1(y, y_pred)])

        return pd.DataFrame(report, index=idx, columns=cols)

In [19]:
class RidgePenalty:
    """Squared-weight penalty scaled by the strength ``l``."""

    def __init__(self, l):
        self.l = l

    def __call__(self, theta):
        """Return ``l`` times the sum of the squared entries of ``theta``."""
        squared_norm = np.square(theta).sum()
        return self.l * squared_norm

    def derivation(self, theta):
        """Return the derivative of the penalty with respect to ``theta``."""
        return 2 * self.l * theta
    
class NoPenalty:
    """Penalty object that contributes nothing to the loss or the gradient."""

    def __init__(self, l):
        # Stored only to keep the same constructor signature as RidgePenalty.
        self.l = l

    def __call__(self, theta):
        """Return a zero penalty regardless of ``theta``."""
        return 0.0

    def derivation(self, theta):
        """Return a zero gradient contribution regardless of ``theta``."""
        return 0.0
    
class Ridge(LogisticRegression):
    """LogisticRegression that uses a RidgePenalty of strength ``l``."""

    def __init__(self, reg, method, alpha, k, n, l):
        # `reg` is accepted but not read here; it lets callers unpack a
        # parameter dict that also carries the class name.
        penalty = RidgePenalty(l)
        self.regularization = penalty
        super().__init__(penalty, method, alpha, k, n)

class Normal(LogisticRegression):
    """LogisticRegression without regularization (uses NoPenalty)."""

    def __init__(self, reg, method, alpha, k, n, l):
        # `reg` is accepted but not read here; `l` is passed on to NoPenalty,
        # which returns zero for both the penalty and its derivative.
        penalty = NoPenalty(l)
        self.regularization = penalty
        super().__init__(penalty, method, alpha, k, n)


In [20]:
import itertools

# Search space: regularization class name x optimisation method x learning rate.
regs = ['Normal', 'Ridge']
methods = ['stochastic', 'minibatch', 'batch']
alphas = [0.01, 0.001, 0.0001]

# Cartesian product of the three lists, one tuple per experiment.
all_combinations = list(itertools.product(regs, methods, alphas))

# One constructor-kwargs dict per combination; k, n and l are shared by all runs.
parameters = [
    {
        "reg": reg_name,
        "method": method_name,
        "alpha": learning_rate,
        "k": k,
        "n": n,
        "l": 0.1,
    }
    for reg_name, method_name, learning_rate in all_combinations
]

In [21]:
def str_to_class(classname):
    """Return the attribute called ``classname`` from the current module.

    Used to turn a class name stored as a string into the class object.
    Raises AttributeError when the module has no attribute with that name.
    """
    current_module = sys.modules[__name__]
    return getattr(current_module, classname)

In [23]:
# Train and evaluate one model per parameter combination, logging each as an MLflow run.
for params in parameters:
    run_name = f"reg-{params['reg']}-method-{params['method']}-alpha={params['alpha']}"

    print(run_name)

    # Use the run as a context manager so it is always closed, even when
    # training or logging raises; otherwise the next iteration would start
    # while the failed run is still active.
    with mlflow.start_run(run_name=run_name, nested=True):

        type_of_regression = str_to_class(params['reg'])

        # Instantiate a regression object based on the regularization class name.
        model = type_of_regression(**params)

        # Train the model on the training set
        model.fit(X_train, Y_train_encoded)

        yhat = model.predict(X_test)

        # precision / recall / f1_score use the methods' default class (c=0),
        # so these three values describe class 0 only.
        accuracy = model.accuracy(y_test, yhat)
        precision = model.precision(y_test, yhat)
        recall = model.recall(y_test, yhat)
        f1_score = model.f1_score(y_test, yhat)

        # f-strings instead of "text" + number, which raises TypeError for floats.
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1_Score: {f1_score}")

        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1_score
        })

        # Log the trained model in mlflow
        signature = mlflow.models.infer_signature(X_train, model.predict(X_train))
        mlflow.sklearn.log_model(model, artifact_path='model', signature=signature)

    print("========================================")


reg-Normal-method-stochastic-alpha=0.01
Loss at iteration 0 2.502923880772256
Loss at iteration 500 0.5717060146710765
Loss at iteration 1000 0.2654845448740318
Loss at iteration 1500 1.95270627686727
Loss at iteration 2000 1.4362431504272961
Loss at iteration 2500 0.24536842860085137
Loss at iteration 3000 0.3213732181768244
Loss at iteration 3500 0.18346973720281684
Loss at iteration 4000 0.19326321075148017
Loss at iteration 4500 1.0859428677440905
time taken: 0.1685619354248047
[1 2 1 ... 2 1 1]
reg-Normal-method-stochastic-alpha=0.001
Loss at iteration 0 1.0740483417771054
Loss at iteration 500 1.7803721946079025
Loss at iteration 1000 0.733874381103748
Loss at iteration 1500 0.8464456280782063
Loss at iteration 2000 0.4876786545719561
Loss at iteration 2500 1.7985672611634838
Loss at iteration 3000 0.6697118262171395
Loss at iteration 3500 0.401820617873111
Loss at iteration 4000 1.9199208692767806
Loss at iteration 4500 0.3287383360730625
time taken: 0.15339183807373047
[1 2 1 .