In [57]:
import numpy as np
import sympy as sp
import pandas as pd
from itertools import combinations
from numpy import linalg as LA
from matplotlib import pyplot as plt
from scipy.spatial.distance import pdist, squareform
from multiprocessing import Process
import warnings
# from sklearn.datasets import make_regression
warnings.filterwarnings("error")


In [58]:
def gradient_descent(w_init, x, y, c, del_c, max_iter=100000):
    eta_max, tau, tolerance = 0.0001, 0.7, 10e-5
    eta, w_prev = eta_max, w_init
    i = 0
    convergence_obtained = False
    while i < max_iter:
        try:
            w_next = w_prev - eta * del_c(w_prev, x, y)
            if np.all(np.abs(w_next - w_prev) < tolerance):
                convergence_obtained = True
                break
            w_prev = w_next
            i += 1
        except:
            break
    if not convergence_obtained:
        eta = 0
        print("WARNING: Convergence not obtained for " + str(c) + "\\\\")
        return w_prev, i, eta
    return w_prev, i, eta

In [59]:
def SSE(w, x, y):
    x = np.insert(x, 0, 1, axis=1)
    delta = (np.dot(x, w) - y)
    return np.dot(delta.T, delta)

In [60]:
def del_SSE(w, x, y):
    x = np.insert(x, 0, 1, axis=1)
    delta = np.dot(x, w) - y
    return 2 * np.dot(x.T, delta)

In [61]:
def SSED(w, x, y):
    return SSE(w, x, y) / np.dot(w.T, w)

In [62]:
def del_SSED(w, x, y):
    w_2 = w
    w_2[0, 0] = 0
    w_sq = np.dot(w.T, w) + 1
    numerator = w_sq * del_SSE(w, x, y) - 2 * SSE(w, x, y) * w_2
    denominator = w_sq**2
    return numerator / denominator

In [63]:
def w_ML_closed_form(x, y):
    x = np.insert(x, 0, 1, axis=1)
    return np.dot(np.dot(np.linalg.inv(np.dot(x.T, x)), x.T), y)

In [64]:
def r_square(w, x, y):
    x = np.insert(x, 0, 1, axis=1)
    delta = np.dot(x, w) - y
    mean_diff = np.mean(y) - y
    return 1 - (np.dot(delta.T, delta) / np.dot(mean_diff.T, mean_diff))

In [65]:
def test(x, w, w_init):
    n, d = x.shape
    x_mod = np.insert(x, 0, 1, axis=1)
    y_act = np.dot(x_mod, w) + 0.5 * np.random.rand(n, 1)
    w_ML = w_ML_closed_form(x, y_act)
    w_SSE = gradient_descent(w_init, x, y_act, SSE, del_SSE)
    w_SSED = gradient_descent(w_init, x, y_act, SSED, del_SSED)
    print("Raw dataset results:\\\\")
    print("Weights from Maximum Likelihood =", w_ML.T[0], "\\\\")
    print("Weights from Sum of Squared Error =", w_SSE[0].T[0], "Number of iterations taken = ", w_SSE[1], "\\\\")
    print("Weights from Sum of Squared Euclidean Distance Error =", w_SSED[0].T[0], "Number of iterations taken = ", w_SSED[1], "\\\\")
    print("$R^2$ for Maximum Likelihood =", r_square(w_ML, x, y_act)[0, 0], "\\\\")
    print("$R^2$ for Sum of Squared Error =", r_square(w_SSE[0], x, y_act)[0, 0], "\\\\")
    print("$R^2$ for Sum of Squared Euclidean Distance Error =", r_square(w_SSED[0], x, y_act)[0, 0], "\\\\")
    print()

In [66]:
def test_normalization(x, w, w_init):
    n, d = x.shape
    x = (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0))
    x_mod = np.insert(x, 0, 1, axis=1)
    y_act = np.dot(x_mod, w) + 0.5 * np.random.rand(n, 1)
    y_act = (y_act - y_act.min(axis=0)) / (y_act.max(axis=0) - y_act.min(axis=0))
    w_ML = w_ML_closed_form(x, y_act)
    w_SSE = gradient_descent(w_init, x, y_act, SSE, del_SSE)
    w_SSED = gradient_descent(w_init, x, y_act, SSED, del_SSED)
    print("Normalized dataset results:\\\\")
    print("Weights from Maximum Likelihood =", w_ML.T[0], "\\\\")
    print("Weights from Sum of Squared Error =", w_SSE[0].T[0], "Number of iterations taken = ", w_SSE[1], "\\\\")
    print("Weights from Sum of Squared Euclidean Distance Error =", w_SSED[0].T[0], "Number of iterations taken = ", w_SSED[1], "\\\\")
    print("$R^2$ for Maximum Likelihood =", r_square(w_ML, x, y_act)[0, 0], "\\\\")
    print("$R^2$ for Sum of Squared Error =", r_square(w_SSE[0], x, y_act)[0, 0], "\\\\")
    print("$R^2$ for Sum of Squared Euclidean Distance Error =", r_square(w_SSED[0], x, y_act)[0, 0], "\\\\")
    print()

In [67]:
n = 1000
x = np.random.rand(n, 1)
w_act = np.array([[1], [2]])
print("Actual weights = ", w_act.T[0], "\\\\")
w_init = np.array([[1], [1]])
test(x, w_act, w_init)
test_normalization(x, w_act, w_init)

Actual weights =  [1 2] \\
Raw dataset results:\\
Weights from Maximum Likelihood = [1.25499004 1.98981147] \\
Weights from Sum of Squared Error = [1.25888602 1.98237399] Number of iterations taken =  336 \\
Weights from Sum of Squared Euclidean Distance Error = [1.79207960e-03 4.21289384e+00] Number of iterations taken =  100000 \\
$R^2$ for Maximum Likelihood = 0.9423719488680293 \\
$R^2$ for Sum of Squared Error = 0.9423585914916401 \\
$R^2$ for Sum of Squared Euclidean Distance Error = -0.3125292809590352 \\

Normalized dataset results:\\
Weights from Maximum Likelihood = [0.08428984 0.82866939] \\
Weights from Sum of Squared Error = [0.080403   0.83608681] Number of iterations taken =  232 \\
Weights from Sum of Squared Euclidean Distance Error = [0.00183914 0.9647496 ] Number of iterations taken =  100000 \\
$R^2$ for Maximum Likelihood = 0.9405070723861864 \\
$R^2$ for Sum of Squared Error = 0.9404306160582103 \\
$R^2$ for Sum of Squared Euclidean Distance Error = 0.911018487729

In [68]:
n = 1000
x = np.random.rand(n, 2)
w_act = np.array([[1], [2], [3]])
print("Actual weights = ", w_act.T[0], "\\\\")
w_init = np.array([[1], [1], [1]])
test(x, w_act, w_init)
test_normalization(x, w_act, w_init)

Actual weights =  [1 2 3] \\
Raw dataset results:\\
Weights from Maximum Likelihood = [1.23021267 2.02393632 3.01567512] \\
Weights from Sum of Squared Error = [1.23897042 2.01566028 3.00716707] Number of iterations taken =  411 \\
Weights from Sum of Squared Euclidean Distance Error = [9.11827650e-04 3.10726850e+00 4.17204299e+00] Number of iterations taken =  100000 \\
$R^2$ for Maximum Likelihood = 0.9810229751608907 \\
$R^2$ for Sum of Squared Error = 0.981012314928364 \\
$R^2$ for Sum of Squared Euclidean Distance Error = 0.7807863494298954 \\

Normalized dataset results:\\
Weights from Maximum Likelihood = [0.02130786 0.40205078 0.5999009 ] \\
Weights from Sum of Squared Error = [0.01296103 0.41063084 0.60728349] Number of iterations taken =  326 \\
Weights from Sum of Squared Euclidean Distance Error = [3.44419144e-04 4.20719822e-01 6.19044100e-01] Number of iterations taken =  100000 \\
$R^2$ for Maximum Likelihood = 0.9815491309587564 \\
$R^2$ for Sum of Squared Error = 0.9813

In [69]:
n = 1000
x = np.random.rand(n, 3)
w_act = np.array([[1], [2], [3], [4]])
print("Actual weights = ", w_act.T[0], "\\\\")
w_init = np.array([[1], [1], [1], [1]])
test(x, w_act, w_init)
test_normalization(x, w_act, w_init)

Actual weights =  [1 2 3 4] \\
Raw dataset results:\\
Weights from Maximum Likelihood = [1.23175493 2.00920529 3.02320381 4.00696197] \\
Weights from Sum of Squared Error = [1.24252118 2.00268153 3.01665139 3.99957899] Number of iterations taken =  544 \\
Weights from Sum of Squared Euclidean Distance Error = [4.19130419e-04 2.71024299e+00 3.76068168e+00 4.83138405e+00] Number of iterations taken =  100000 \\
$R^2$ for Maximum Likelihood = 0.9915521623406103 \\
$R^2$ for Sum of Squared Error = 0.9915472787565527 \\
$R^2$ for Sum of Squared Euclidean Distance Error = 0.9292159669387261 \\

Normalized dataset results:\\
Weights from Maximum Likelihood = [-0.09614229  0.25250589  0.37625801  0.50156219] \\
Weights from Sum of Squared Error = [-0.1068754   0.25926529  0.38285911  0.5086211 ] Number of iterations taken =  410 \\
Weights from Sum of Squared Euclidean Distance Error = [-0.00150743  0.19394194  0.32020437  0.44452522] Number of iterations taken =  100000 \\
$R^2$ for Maximum L

In [70]:
# n_list = [100, 200, 300, 400, 500]
# d_list = [5, 10, 15, 20, 25]
n_list = [300, 500]
d_list = [15, 25]
for n in n_list:
    for d in d_list:
        print("\n\n+++++++++++++START WITH N = " + str(n) + " and D = " + str(d) + "+++++++++++++\\\\")
        x = np.random.randint(100, size=(n, d))
        w_act = np.random.randint(5, size=(d + 1, 1)) + 1
        print("Actual weights = ", w_act.T[0], "\\\\")
        w_init = np.array(np.ones(d + 1).reshape(d + 1, 1))
        test(x, w_act, w_init)
        test_normalization(x, w_act, w_init)
        print("++++++++++++++++++++END+++++++++++++++++++++\\\\")



+++++++++++++START WITH N = 300 and D = 15+++++++++++++\\
Actual weights =  [1 2 4 4 2 4 4 1 4 2 4 5 1 1 2 3] \\
Raw dataset results:\\
Weights from Maximum Likelihood = [1.11379202 2.00029796 3.99977594 4.00013072 2.00049968 4.00002218
 3.99985206 1.0002666  4.00036985 1.9999127  3.99993452 5.00038289
 1.00011637 1.00009822 2.00048834 3.00036932] \\
Weights from Sum of Squared Error = [6.55525175e+303             inf             inf             inf
             inf             inf             inf             inf
             inf             inf             inf             inf
             inf             inf             inf             inf] Number of iterations taken =  91 \\
Weights from Sum of Squared Euclidean Distance Error = [-9.01336498e-06 -1.37630440e+02  4.59005871e+02 -8.54542452e+01
  1.48951236e+03  3.87126499e+02  7.20603366e+02  9.57736517e+02
 -4.30511843e+02  2.89074535e+02 -1.68475555e+03 -3.16648651e+02
 -4.24806071e+02 -6.64258165e+02 -1.77869449e+02  8.49412333e+

Raw dataset results:\\
Weights from Maximum Likelihood = [3.24497527 1.00028751 3.99993721 1.0000767  2.99997367 5.00030765
 5.00010002 4.000008   5.00028374 4.00049966 0.9997872  1.99978382
 3.00006563 4.00001772 5.00016734 1.99997228 5.00055664 0.99983617
 1.99990638 1.00008227 3.00002484 1.99976782 0.99994172 0.99967888
 1.9996288  2.9995631 ] \\
Weights from Sum of Squared Error = [9.45550614e+298 4.95770499e+300 4.96664283e+300 4.81437234e+300
 4.83837582e+300 4.87131164e+300 4.77213033e+300 4.72443157e+300
 4.82788492e+300 4.90858866e+300 4.86871021e+300 4.66732369e+300
 4.97032502e+300 4.67548743e+300 4.64735652e+300 4.74008072e+300
 4.75269017e+300 4.72077171e+300 4.73918214e+300 4.70027626e+300
 4.83761292e+300 4.73605989e+300 4.76102506e+300 4.83220060e+300
 4.95664850e+300 4.76815571e+300] Number of iterations taken =  79 \\
Weights from Sum of Squared Euclidean Distance Error = [-3.44913351e-06 -2.30382376e+03 -1.68883000e+03 -5.33322120e+02
 -2.63620894e+02 -4.55335563e+02