In [68]:
import numpy as np

In [70]:
def mse(y_true, y_pred):
    """Mean squared error between targets and predictions.

    The element-wise difference is squared and then averaged over every
    element, so a single scalar is returned.
    """
    residual = y_true - y_pred
    return np.mean(np.power(residual, 2))

def mse_grad(y_true, y_pred):
    """Gradient of the mean squared error with respect to ``y_pred``.

    Returns an array shaped like ``y_pred - y_true``; each entry is twice the
    prediction error divided by the total number of elements in ``y_true``.
    """
    n_elements = np.size(y_true)
    return 2 * (y_pred - y_true) / n_elements

def binary_cross_entropy(y_true, y_pred, epsilon=1e-15):
    """Mean binary cross-entropy between targets and predicted probabilities.

    Parameters:
    - y_true: array of targets (0 or 1).
    - y_pred: array of predicted probabilities, same shape as ``y_true``.
    - epsilon: predictions are clipped to ``[epsilon, 1 - epsilon]`` before the
      logarithms are taken, so a prediction of exactly 0 or 1 yields a large
      finite loss instead of ``inf``/``nan``.

    Returns:
    - float: the loss averaged over every element.
    """
    # Same guard as cross_entropy: keep log() away from 0.
    y_pred = np.clip(y_pred, epsilon, 1.0 - epsilon)
    return np.mean(-y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred))

def binary_cross_entropy_grad(y_true, y_pred, epsilon=1e-15):  # wrt y_pred
    """Gradient of the mean binary cross-entropy with respect to ``y_pred``.

    Parameters:
    - y_true: array of targets (0 or 1).
    - y_pred: array of predicted probabilities, same shape as ``y_true``.
    - epsilon: predictions are clipped to ``[epsilon, 1 - epsilon]`` so that
      neither ``y_pred`` nor ``1 - y_pred`` is zero when used as a divisor.

    Returns:
    - array shaped like ``y_pred``: the element-wise derivative, divided by the
      total number of elements in ``y_true`` (matching the mean in the loss).
    """
    # Guard both denominators against division by zero.
    y_pred = np.clip(y_pred, epsilon, 1.0 - epsilon)
    return ((1 - y_true) / (1 - y_pred) - y_true / y_pred) / np.size(y_true)

def cross_entropy(y_true, y_pred, epsilon=1e-15):
    """Cross-entropy averaged over every element of the inputs.

    ``y_pred`` is clipped to ``[epsilon, 1 - epsilon]`` before the logarithm so
    that a prediction of 0 cannot produce ``-inf``. The mean is taken over all
    elements of ``-y_true * log(y_pred)``, not only over the samples.
    """
    safe_pred = np.clip(y_pred, epsilon, 1.0 - epsilon)
    per_element = -y_true * np.log(safe_pred)
    return np.mean(per_element)

def cross_entropy_grad(y_true, y_pred, epsilon=1e-15):
    """Gradient of the element-averaged cross-entropy with respect to ``y_pred``.

    Parameters:
    - y_true: array of target values (e.g. one-hot labels).
    - y_pred: array of predicted probabilities, same shape as ``y_true``.
    - epsilon: predictions are clipped to ``[epsilon, 1 - epsilon]`` to avoid
      division by zero.

    Returns:
    - array shaped like ``y_pred``: ``-y_true / y_pred`` divided by the total
      number of elements in ``y_true``.
    """
    y_pred = np.clip(y_pred, epsilon, 1.0 - epsilon)
    # A gradient must have one entry per prediction; reducing it with np.mean
    # would collapse it to a single scalar. The 1/size factor is the derivative
    # of the np.mean taken over all elements in the loss.
    return (-y_true / y_pred) / np.size(y_true)

In [103]:
# Draw a random 4x3 score matrix and turn it into softmax probabilities.
# The sum is taken along axis 0, so each column (not each row) sums to 1.
# No random seed is set here, so the values differ from run to run.
y_pred = np.random.randn(4,3)
y_pred = np.exp(y_pred)/np.sum( np.exp(y_pred),axis=0) 
y_pred

array([[0.50461979, 0.22395939, 0.19356359],
       [0.2932162 , 0.03676562, 0.41132993],
       [0.18079632, 0.50143734, 0.08565319],
       [0.02136769, 0.23783765, 0.30945328]])

In [113]:
# Draw three random class indices from {0, 1, 2, 3} (the upper bound 4 is exclusive).
y = np.array( [np.random.randint(0,4) for i in range(3)] )
y  # not echoed: only the last expression of a cell is displayed
# One-hot encode each index over 4 classes, then transpose to shape (4, 3)
# so that every column is the label vector of one sample.
y = np.array([[1 if val==i else 0 for i in range(4)] for val in y]).T
y

array([[0, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0]])

In [114]:
# Count the columns in which y_pred and y have their maximum in the same row.
np.sum(np.argmax(y_pred,axis=0) == np.argmax(y,axis=0))

1

In [84]:
cross_entropy_grad(y,y_pred)

ValueError: operands could not be broadcast together with shapes (4,) (4,3) 

In [6]:
import numpy as np

def cross_entropy_loss(y_true, y_pred):
    """
    Cross-entropy loss averaged over the samples.

    Parameters:
    - y_true: 2D array of one-hot encoded true labels (shape: [num_samples, num_classes]).
    - y_pred: 2D array of predicted probabilities (shape: [num_samples, num_classes]).

    Returns:
    - float: the summed cross-entropy divided by ``len(y_true)``.
    """
    tiny = 1e-15  # lower/upper margin that keeps log() away from 0

    # Bound the predictions first, then accumulate the log-likelihood terms.
    bounded_pred = np.clip(y_pred, tiny, 1 - tiny)
    summed = -np.sum(y_true * np.log(bounded_pred))

    # Normalise by the length of the first axis of y_true.
    return summed / len(y_true)

def cross_entropy_loss_grad(y_true, y_pred):
    """
    Gradient of the sample-averaged cross-entropy loss with respect to y_pred.

    Parameters:
    - y_true: 2D array of one-hot encoded true labels (shape: [num_samples, num_classes]).
    - y_pred: 2D array of predicted probabilities (shape: [num_samples, num_classes]).

    Returns:
    - 2D array: ``-y_true / y_pred`` divided by ``len(y_true)``
      (shape: [num_samples, num_classes]).
    """
    tiny = 1e-15  # margin that keeps the divisor away from 0

    # Bound the predictions before they are used as a denominator.
    bounded_pred = np.clip(y_pred, tiny, 1 - tiny)

    # Element-wise derivative, then the same 1/len(y_true) factor as the loss.
    unscaled = -y_true / bounded_pred
    return unscaled / len(y_true)

# Example usage
num_classes = 3
num_samples = 5

# Generate random one-hot encoded true labels: pick one class index per
# sample and look up the matching row of the identity matrix.
y_true = np.eye(num_classes)[np.random.choice(num_classes, num_samples)]

# Generate random predictions. Raw uniform draws do not sum to 1 across the
# classes, so normalise every row to make it a valid probability distribution,
# which is what both functions document as their input.
raw_scores = np.random.rand(num_samples, num_classes)
y_pred = raw_scores / raw_scores.sum(axis=1, keepdims=True)

# Calculate cross-entropy loss
loss = cross_entropy_loss(y_true, y_pred)
print(f"Cross-entropy loss: {loss:.4f}")

# Calculate the gradient of cross-entropy loss
grad = cross_entropy_loss_grad(y_true, y_pred)
print("Gradient of cross-entropy loss:")
print(grad)


Cross-entropy loss: 1.1450
Gradient of cross-entropy loss:
[[ -0.28448088  -0.          -0.        ]
 [ -0.          -0.25471642  -0.        ]
 [ -0.         -10.23950689  -0.        ]
 [ -0.          -0.          -0.29424728]
 [ -0.          -0.44913301  -0.        ]]


In [7]:
# np.argmax returns the index of the largest element, not its value:
# the maximum 4 sits at index 2.
y = [2,3, 4]

np.argmax(y)

2

In [8]:
# The integers 0..15 as a 1-D array.
y = np.arange(16)

In [9]:
# Show the values as 2 rows of 8. The result is only displayed, not assigned,
# so the name y keeps referring to the original array.
y.reshape(2,8)

array([[ 0,  1,  2,  3,  4,  5,  6,  7],
       [ 8,  9, 10, 11, 12, 13, 14, 15]])

In [10]:
import numpy as np

# Shuffle two equally long arrays in unison by drawing one random permutation
# of indices and applying it to both, so paired elements stay aligned.

# Example arrays
array1 = np.array([1, 2, 3, 4, 5])
array2 = np.array(['a', 'b', 'c', 'd', 'e'])

# Generate shuffled indices (np.random.shuffle permutes the array in place)
shuffled_indices = np.arange(len(array1))
np.random.shuffle(shuffled_indices)

# Shuffle both arrays using the same indices; indexing with an index array
# builds new arrays, so array1 and array2 themselves are not modified
shuffled_array1 = array1[shuffled_indices]
shuffled_array2 = array2[shuffled_indices]

# Print the original and shuffled arrays
print("Original array1:", array1)
print("Original array2:", array2)
print("\nShuffled array1:", shuffled_array1)
print("Shuffled array2:", shuffled_array2)


Original array1: [1 2 3 4 5]
Original array2: ['a' 'b' 'c' 'd' 'e']

Shuffled array1: [1 2 4 3 5]
Shuffled array2: ['a' 'b' 'd' 'c' 'e']


In [11]:
# Build 0..9 and shuffle it in place (np.random.shuffle returns None).
xx = (np.arange(10))
np.random.shuffle(xx)
xx

array([8, 1, 6, 7, 5, 9, 0, 3, 4, 2])

In [12]:
# First chunk of four elements.
xx[:4]

array([8, 1, 6, 7])

In [13]:
# Second chunk of four elements.
xx[4:8]

array([5, 9, 0, 3])

In [14]:
# The stop index may run past the end of the array: slicing clamps it and
# returns the shorter remainder instead of raising an error.
xx[8:12]

array([4, 2])

In [15]:
# `/` is true division and yields a float.
8/3

2.6666666666666665

In [16]:
# `//` is floor division; with integer operands the result is an integer.
8//3

2

In [17]:
# Ceiling division using only integer arithmetic: the number of chunks of
# batch_size needed to cover 17 items (the last chunk may be partial).
batch_size = 8
(17+batch_size-1)//batch_size

3

In [18]:
# The integers 0..15 arranged row by row in a 4x4 grid.
x = np.arange(16).reshape(4, 4)

In [19]:
# Rebind x to a 3-D arrangement of the same values: 4 blocks of 2x2.
# Requires x to hold exactly 16 elements when this cell runs.
x = x.reshape(4,2,2)
x

array([[[ 0,  1],
        [ 2,  3]],

       [[ 4,  5],
        [ 6,  7]],

       [[ 8,  9],
        [10, 11]],

       [[12, 13],
        [14, 15]]])

In [20]:
# Sum over axis 0, 1 and 2 in turn to see which dimension each reduction
# removes; x needs at least three dimensions for axis=2 to be valid.
for i in range(3):
    print(np.sum(x, axis=i))
    print()

[[24 28]
 [32 36]]

[[ 2  4]
 [10 12]
 [18 20]
 [26 28]]

[[ 1  5]
 [ 9 13]
 [17 21]
 [25 29]]



In [21]:
# 0..15 in two columns; -1 lets NumPy infer the row count (8) from the size.
x = np.arange(16).reshape(-1,2)

In [22]:
# Number of elements along axis 1, i.e. the column count of x.
np.size(x, axis=1)

2

In [23]:
x

array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11],
       [12, 13],
       [14, 15]])

In [24]:
# Mean of each row; keepdims=True keeps the reduced axis with length 1, so
# the result is a column that still broadcasts against x.
np.mean(x, axis=1, keepdims=True)

array([[ 0.5],
       [ 2.5],
       [ 4.5],
       [ 6.5],
       [ 8.5],
       [10.5],
       [12.5],
       [14.5]])

In [25]:
# Shift every column by its own maximum so the largest entry becomes 0 and all
# others are negative (the usual stabilising step before exponentiating).
# This rebinds x, so the earlier unshifted values are no longer available.
x = x-np.max(x,axis=0)
x

array([[-14, -14],
       [-12, -12],
       [-10, -10],
       [ -8,  -8],
       [ -6,  -6],
       [ -4,  -4],
       [ -2,  -2],
       [  0,   0]])

In [26]:
# Softmax along axis 0: exponentiate, then divide by each column's total.
# keepdims=True keeps the totals as a single row so they broadcast over x.
# This rebinds x again, so the cell is not idempotent when re-run.
x = np.exp(x) / np.sum( np.exp(x), axis=0, keepdims=True)
x

array([[7.18993625e-07, 7.18993625e-07],
       [5.31268423e-06, 5.31268423e-06],
       [3.92557218e-05, 3.92557218e-05],
       [2.90062731e-04, 2.90062731e-04],
       [2.14328979e-03, 2.14328979e-03],
       [1.58368885e-02, 1.58368885e-02],
       [1.17019658e-01, 1.17019658e-01],
       [8.64664814e-01, 8.64664814e-01]])

In [27]:
# Sanity check: after normalising along axis 0, each column should sum to 1.
np.sum(x, axis=0,keepdims=True)

array([[1., 1.]])

In [28]:
# The integers 0..11 laid out as 4 rows by 3 columns.
x = np.arange(12).reshape(4,3)

In [29]:
x

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [36]:
# Random standard-normal array of shape (4, 3); no seed is set, so the values
# change on every run. Presumably a stand-in for an upstream gradient.
out_grad = np.random.randn(4,3)
out_grad

array([[ 0.67717148, -0.00180326,  1.11673346],
       [ 1.232062  , -1.17398262,  0.38570624],
       [ 0.72022374,  0.44995566,  0.65589631],
       [ 1.04552205, -0.27868516, -0.98596944]])

In [57]:
# Slicing with 1:2 (rather than indexing with 1) keeps the result
# two-dimensional: a single column instead of a flat 1-D array.
out_grad[:,1:2]

array([[-0.00180326],
       [-1.17398262],
       [ 0.44995566],
       [-0.27868516]])

In [37]:
# Length of x along axis 0 (its row count).
n = np.size(x,axis=0)

In [67]:
# Work through one column: build an n-by-n matrix from a column of x and
# multiply it into the matching column of out_grad.
# NOTE(review): the name `input` shadows the Python builtin of the same name.
index = 2
input = x[:,index:index+1]  # slice keeps a 2-D column rather than a 1-D array
print(f"{input}")
print(f"{out_grad[:,index:index+1]}")
# (I - input.T) * input broadcasts to an n-by-n matrix whose (i, j) entry is
# input[i] * (delta_ij - input[j]). This has the form of a softmax Jacobian,
# but x is presumably not a softmax output here — confirm the intent.
np.dot( (np.identity(n) - input.T )*input, out_grad[:,index:index+1]) 

[[ 2]
 [ 5]
 [ 8]
 [11]]
[[ 1.11673346]
 [ 0.38570624]
 [ 0.65589631]
 [-0.98596944]]


array([[ 5.10645732],
       [ 9.11100722],
       [16.73913216],
       [ 4.95578345]])

In [54]:
# Check that x and out_grad have the same number of columns.
np.size(x, axis=1) == np.size(out_grad, axis=1)

True

In [65]:
# Repeat the single-column product for every column of x and join the results.
# First form: each product is an (n, 1) column, so hstack puts them side by side.
print(np.hstack( [np.dot( (np.identity(n)-x[:,i:i+1].T)*x[:,i:i+1], out_grad[:,i:i+1]) for i in range(np.size(x,axis=1)) ] ))
print()
# Second form: wrapping each product in an extra list adds a leading axis, and
# hstack then concatenates along axis 1, stacking the columns end to end.
print(np.hstack( [[np.dot( (np.identity(n)-x[:,i:i+1].T)*x[:,i:i+1], out_grad[:,i:i+1])] for i in range(np.size(x,axis=1)) ] ))


[[   0.            4.33309249    5.10645732]
 [ -48.58549476   12.64365252    9.11100722]
 [-100.24201908   33.49395985   16.73913216]
 [-147.43534383   40.56210589    4.95578345]]

[[[   0.        ]
  [ -48.58549476]
  [-100.24201908]
  [-147.43534383]
  [   4.33309249]
  [  12.64365252]
  [  33.49395985]
  [  40.56210589]
  [   5.10645732]
  [   9.11100722]
  [  16.73913216]
  [   4.95578345]]]


In [33]:
# Same per-column product written with zip over the rows of x.T and out_grad.T.
# NOTE(review): if x is 2-D, iterating x.T yields 1-D arrays, for which `.T` is
# a no-op; the broadcast then differs from the 2-D column-slice form
# x[:, i:i+1] — confirm which of the two is intended.
# NOTE(review): the loop variable `input` shadows the Python builtin inside the comprehension.
grad = np.hstack([ np.dot( (np.identity(n) - input.T )*input, og.T) for input,og in zip(x.T, out_grad.T)  ])

In [34]:
grad

array([[ -85.04539749, -100.54416528, -118.23041173],
       [ -86.2995104 , -102.925166  , -121.73830024],
       [ -88.21481954, -104.95067451, -123.87400812],
       [ -73.0649232 ,  -87.94137735, -105.00531015]])

In [35]:
# Index 2 along the first axis of x.T; for a 2-D x this is column 2 as a 1-D
# array, unlike the 2-D slice x[:, 2:3].
# NOTE(review): rebinding `input` shadows the Python builtin of the same name.
input = x.T[2]
