In [10]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [24]:
class LinRegressionLearner:
    num_coeff = 1      # this doesn't include the bias term
    weights = np.random.random(10)     #learnable weights
    learnt_gamma = np.random.normal(0, 1)  # learnable parameter for thre batch norm
    learnt_beta = np.random.normal(0, 1)  # learnable parameter for thre batch norm
    learning_rate = 0.005
    convergence_limit = 0.01
    converged_batch = 0
    converged_batch_limit = 50    # if this many batches are converged, then we can stop training
    enable_batch_norm = False

    def __init__(self, num_coeff, alpha, enable_batch_norm = False):
        self.num_coeff = num_coeff
        self.learning_rate = alpha
        self.enable_batch_norm = enable_batch_norm
        self.weights = np.random.normal(0, 1, (num_coeff + 1, 1))
    
    def printSummary(self):
        print("The learnt weights are: ", self.weights)
        print("The learnt gamma and beta are:", self.learnt_gamma, " and ", self.learnt_beta)

    # return whether the batch converged or not
    def learnGradient(self, training_batch, training_labels):
        num_rows = training_batch.shape[0]

        # few checks on the input
        if training_batch.shape[1] != self.num_coeff:
            print("Invalid number of columns: ", num_coeff)
            return -1

        # add the first column for bias learning
#         print("training batch: ", training_batch)
        bias_col = np.ones((num_rows, 1))
        mod_train_batch = np.append(bias_col, training_batch, axis=1)
#         print("modified training batch: ", mod_train_batch)

        pred_label = np.matmul(mod_train_batch, self.weights)
#         print("weights are:", self.weights)
#         print("predicted label is:", pred_label)
#         print("actual label is:", training_label)
        
        pred_delta = pred_label - training_labels
#         print("pred delta is:", pred_delta)
        
        max_delta = 0
        for i in range(self.num_coeff + 1):
            col_feature = np.reshape(mod_train_batch[:, i], (num_rows, 1))
#             print("col feature is:", col_feature)
            error_delta = pred_delta * col_feature
#             print("err delta is:", error_delta)
            
            dl_by_dw = np.mean(error_delta)
#             dl_by_dw = min(dl_by_dw, 5)
            self.weights[i] -= self.learning_rate * dl_by_dw
            
#             print("new weights are:", self.weights)
            max_delta = max(max_delta, np.abs(dl_by_dw))
            
        if max_delta <= self.convergence_limit:
            self.converged_batch += 1
            return 1

        return 0

    def learnGradientUsingBatchNorm(self, training_batch, training_labels):
        num_rows = training_batch.shape[0]

        # few checks on the input
        if training_batch.shape[1] != self.num_coeff:
            print("Invalid number of columns: ", num_coeff)
            return -1

#         print("modified training batch: ", mod_train_batch)

        #standardize the input training data
        mod_train_batch = (training_batch - np.mean(training_batch, axis=0))/(np.std(training_batch) + 0.01)

        # add the first column for bias learning
        bias_col = np.ones((num_rows, 1))
        mod_train_batch = np.append(bias_col, mod_train_batch, axis=1)
#         print("standardized training batch: ", mod_train_batch)

        pred_label_1 = np.matmul(mod_train_batch, self.weights)
        pred_label_2 = self.learnt_gamma * pred_label_1 + self.learnt_beta

#         print("weights are:", self.weights)
#         print("predicted label is:", pred_label)
#         print("actual label is:", training_label)

        pred_delta_2 = pred_label_2 - training_labels
#         print("pred delta is:", pred_delta)

        # update the weights
        max_delta = 0
        for i in range(self.num_coeff + 1):
            col_feature = np.reshape(mod_train_batch[:, i], (num_rows, 1))
#             print("col feature is:", col_feature)
            error_delta = pred_delta_2 * col_feature
#             print("err delta is:", error_delta)
            error_delta *= self.learnt_gamma

            dl_by_dw = np.mean(error_delta)
#             dl_by_dw = min(dl_by_dw, 5)
            self.weights[i] -= self.learning_rate * dl_by_dw

#             print("new weights are:", self.weights)
            max_delta = max(max_delta, np.abs(dl_by_dw))

        # update the learnable gamma and betas
        dl_by_dgamma = pred_delta_2 * pred_label_1
        self.learnt_gamma -= self.learning_rate * np.mean(dl_by_dgamma)

        dl_by_dbeta = np.mean(pred_delta_2)
        self.learnt_beta -= self.learning_rate * dl_by_dbeta

        if max_delta <= self.convergence_limit:
            self.converged_batch += 1
            return 1

        return 0



In [25]:

# generate the training data for a simple mathematical equation

def generate_training_batch(num_coeff, batch_size):    # this doesn't include the bias term
    # this is now generating features normalized around 0. 
    # We would play with it to see how batch norm can handle any feature distribution and scaling aspects
    return np.random.normal(100, 50, (batch_size, num_coeff))


def generate_training_labels(training_batch):
    # y = -11.52 + 132.54 * x1 - 89.23 * x2
    coeff = np.array([[132.54]])
#     coeff = np.array([[132.54], [-89.23]])
    label = np.matmul(training_batch, coeff)
    label = label - 11.52
    noise = np.random.normal(0, 5, (training_batch.shape[0], 1))
    label += noise
    return np.reshape(label, (training_batch.shape[0], 1))


num_coeff = 1
converged_batch = 0
batch_size = 100
total_batches = 0
learner = LinRegressionLearner(num_coeff, 0.005)

while(True):
    training_batch = generate_training_batch(num_coeff=num_coeff, batch_size=batch_size)
    training_label = generate_training_labels(training_batch)
    total_batches += 1
#     if learner.learnGradient(training_batch, training_label) == 1:
    if learner.learnGradientUsingBatchNorm(training_batch, training_label) == 1:
        converged_batch += 1
        if converged_batch % 10 == 0:
            print("Total epochs passes: ", total_batches)
            learner.printSummary()
        
    if total_batches % 100000 == 0:
        print("Total epochs passes: ", total_batches)
        learner.printSummary()
        
    if converged_batch > 20: #or total_batches > 500000:
        print("the learner has converged in ", total_batches)
        learner.printSummary()    
        break
#     break



Total epochs passes:  16
The learnt weights are:  [[nan]
 [nan]]
The learnt gamma and beta are: nan  and  nan
Total epochs passes:  26
The learnt weights are:  [[nan]
 [nan]]
The learnt gamma and beta are: nan  and  nan
the learner has converged in  27
The learnt weights are:  [[nan]
 [nan]]
The learnt gamma and beta are: nan  and  nan


  ret = umr_sum(arr, axis, dtype, out, keepdims)


In [22]:
StandardScaler().fit_transform(training_batch)

array([[-0.52206574],
       [-0.23459001],
       [-0.45171382],
       [-0.76298217],
       [ 1.97135175]])

In [23]:
(training_batch - np.mean(training_batch, axis=0))/(np.std(training_batch) + 0.01)

array([[-0.52196138],
       [-0.23454312],
       [-0.45162352],
       [-0.76282964],
       [ 1.97095766]])

In [108]:
np.matmul(training_batch, coeff).shape

(100, 1)

In [110]:
noise = np.random.normal(0, 5, (training_batch.shape[0]))

In [111]:
noise.shape

(100,)

In [114]:
label = generate_training_labels(training_batch)
label.shape

(100, 1)