In [1]:
import numpy as np

In [2]:
def mse(y_true, y_pred):
    return np.mean(np.power(y_true - y_pred, 2))

def mse_grad(y_true, y_pred):
    return 2 * (y_pred - y_true) / np.size(y_true)

def binary_cross_entropy(y_true, y_pred):
    return np.mean(-y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred))

def binary_cross_entropy_grad(y_true, y_pred): # wrt y_pred
    return ((1 - y_true) / (1 - y_pred) - y_true / y_pred) / np.size(y_true)

def cross_entropy(y_true,y_pred,epsilon=1e-15):
    y_pred = np.clip(y_pred, epsilon, 1.0-epsilon)
    return np.mean(-y_true*np.log(y_pred)) 

def cross_entropy_grad(y_true,y_pred,epsilon=1e-15):
    y_pred = np.clip(y_pred, epsilon, 1.0-epsilon)
    return np.mean(-y_true/y_pred) 

In [3]:
y_pred = np.random.randn(4,3)
y_pred = np.exp(y_pred)/np.sum( np.exp(y_pred),axis=0) 
y_pred

array([[0.14879928, 0.15663419, 0.52594875],
       [0.64177533, 0.0839823 , 0.34586745],
       [0.11229952, 0.30872901, 0.08755595],
       [0.09712587, 0.4506545 , 0.04062785]])

In [4]:
y = np.array( [np.random.randint(0,4) for i in range(3)] )
y
y = np.array([[1 if val==i else 0 for i in range(4)] for val in y]).T
y

array([[0, 0, 0],
       [1, 1, 0],
       [0, 0, 1],
       [0, 0, 0]])

In [5]:
np.sum(np.argmax(y_pred,axis=0) == np.argmax(y,axis=0))

1

In [6]:
cross_entropy_grad(y,y_pred)

-2.0738930048920152

In [7]:
import numpy as np

def cross_entropy_loss(y_true, y_pred):
    """
    Calculate the cross-entropy loss.

    Parameters:
    - y_true: 2D array with one-hot encoded true class labels (shape: [num_samples, num_classes]).
    - y_pred: 2D array with predicted probabilities (shape: [num_samples, num_classes]).

    Returns:
    - float: Cross-entropy loss.
    """
    epsilon = 1e-15  # Small constant to avoid log(0)
    num_samples = len(y_true)

    # Clip predicted probabilities to avoid log(0)
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)

    # Calculate cross-entropy loss
    loss = -np.sum(y_true * np.log(y_pred)) / num_samples
    return loss

def cross_entropy_loss_grad(y_true, y_pred):
    """
    Calculate the gradient of the cross-entropy loss with respect to the predicted probabilities.

    Parameters:
    - y_true: 2D array with one-hot encoded true class labels (shape: [num_samples, num_classes]).
    - y_pred: 2D array with predicted probabilities (shape: [num_samples, num_classes]).

    Returns:
    - 2D array: Gradient of the cross-entropy loss with respect to y_pred (shape: [num_samples, num_classes]).
    """
    epsilon = 1e-15  # Small constant to avoid division by zero
    num_samples = len(y_true)

    # Clip predicted probabilities to avoid division by zero
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)

    # Calculate the gradient of the cross-entropy loss
    grad = (-y_true / y_pred) / num_samples
    return grad

# Example usage
num_classes = 3
num_samples = 5

# Generate random one-hot encoded true labels and predicted probabilities
y_true = np.eye(num_classes)[np.random.choice(num_classes, num_samples)]
y_pred = np.random.rand(num_samples, num_classes)

# Calculate cross-entropy loss
loss = cross_entropy_loss(y_true, y_pred)
print(f"Cross-entropy loss: {loss:.4f}")

# Calculate the gradient of cross-entropy loss
grad = cross_entropy_loss_grad(y_true, y_pred)
print("Gradient of cross-entropy loss:")
print(grad)


Cross-entropy loss: 0.9246
Gradient of cross-entropy loss:
[[-0.         -1.17912289 -0.        ]
 [-0.         -0.         -0.32359368]
 [-0.         -0.32908379 -0.        ]
 [-0.         -0.         -1.1748346 ]
 [-0.         -0.22078801 -0.        ]]


In [8]:
y = [2,3, 4]

np.argmax(y)

2

In [9]:
y = np.array([i for i in range(16)])

In [10]:
y.reshape(2,8)

array([[ 0,  1,  2,  3,  4,  5,  6,  7],
       [ 8,  9, 10, 11, 12, 13, 14, 15]])

In [11]:
import numpy as np

# Example arrays
array1 = np.array([1, 2, 3, 4, 5])
array2 = np.array(['a', 'b', 'c', 'd', 'e'])

# Generate shuffled indices
shuffled_indices = np.arange(len(array1))
np.random.shuffle(shuffled_indices)

# Shuffle both arrays using the same indices
shuffled_array1 = array1[shuffled_indices]
shuffled_array2 = array2[shuffled_indices]

# Print the original and shuffled arrays
print("Original array1:", array1)
print("Original array2:", array2)
print("\nShuffled array1:", shuffled_array1)
print("Shuffled array2:", shuffled_array2)


Original array1: [1 2 3 4 5]
Original array2: ['a' 'b' 'c' 'd' 'e']

Shuffled array1: [4 1 2 3 5]
Shuffled array2: ['d' 'a' 'b' 'c' 'e']


In [12]:
xx = (np.arange(10))
np.random.shuffle(xx)
xx

array([1, 0, 6, 5, 8, 9, 7, 3, 2, 4])

In [13]:
xx[:4]

array([1, 0, 6, 5])

In [14]:
xx[4:8]

array([8, 9, 7, 3])

In [15]:
xx[8:12]

array([2, 4])

In [16]:
8/3

2.6666666666666665

In [17]:
8//3

2

In [18]:
batch_size = 8
(17+batch_size-1)//batch_size

3

In [19]:
x = np.array([_ for _ in range(16)]).reshape(4,4)

In [20]:
x = x.reshape(4,2,2)
x

array([[[ 0,  1],
        [ 2,  3]],

       [[ 4,  5],
        [ 6,  7]],

       [[ 8,  9],
        [10, 11]],

       [[12, 13],
        [14, 15]]])

In [21]:
for i in range(3):
    print(np.sum(x, axis=i))
    print()

[[24 28]
 [32 36]]

[[ 2  4]
 [10 12]
 [18 20]
 [26 28]]

[[ 1  5]
 [ 9 13]
 [17 21]
 [25 29]]



In [22]:
x = np.arange(16).reshape(-1,2)

In [23]:
np.size(x, axis=1)

2

In [24]:
x

array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11],
       [12, 13],
       [14, 15]])

In [25]:
np.mean(x, axis=1, keepdims=True)

array([[ 0.5],
       [ 2.5],
       [ 4.5],
       [ 6.5],
       [ 8.5],
       [10.5],
       [12.5],
       [14.5]])

In [26]:
x = x-np.max(x,axis=0)
x

array([[-14, -14],
       [-12, -12],
       [-10, -10],
       [ -8,  -8],
       [ -6,  -6],
       [ -4,  -4],
       [ -2,  -2],
       [  0,   0]])

In [27]:
x = np.exp(x) / np.sum( np.exp(x), axis=0, keepdims=True)
x

array([[7.18993625e-07, 7.18993625e-07],
       [5.31268423e-06, 5.31268423e-06],
       [3.92557218e-05, 3.92557218e-05],
       [2.90062731e-04, 2.90062731e-04],
       [2.14328979e-03, 2.14328979e-03],
       [1.58368885e-02, 1.58368885e-02],
       [1.17019658e-01, 1.17019658e-01],
       [8.64664814e-01, 8.64664814e-01]])

In [28]:
np.sum(x, axis=0,keepdims=True)

array([[1., 1.]])

In [29]:
x = np.arange(12).reshape(4,3)

In [30]:
x

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [31]:
out_grad = np.random.randn(4,3)
out_grad

array([[ 1.31637246,  0.67385564,  0.49822016],
       [-0.08805314, -1.59005347,  0.28489178],
       [ 0.1105085 ,  0.09189745, -0.9295835 ],
       [ 1.07346238, -0.22969457,  0.45190604]])

In [32]:
out_grad[:,1:2]

array([[ 0.67385564],
       [-1.59005347],
       [ 0.09189745],
       [-0.22969457]])

In [33]:
n = np.size(x,axis=0)

In [34]:
index = 2
input = x[:,index:index+1]
print(f"{input}")
print(f"{out_grad[:,index:index+1]}")
np.dot( (np.identity(n) - input.T )*input, out_grad[:,index:index+1]) 

[[ 2]
 [ 5]
 [ 8]
 [11]]
[[ 0.49822016]
 [ 0.28489178]
 [-0.9295835 ]
 [ 0.45190604]]


array([[ 1.08604489],
       [ 1.6484703 ],
       [-7.07824974],
       [ 5.46379154]])

In [35]:
np.size(x, axis=1) == np.size(out_grad, axis=1)

True

In [36]:
print(np.hstack( [np.dot( (np.identity(n)-x[:,i:i+1].T)*x[:,i:i+1], out_grad[:,i:i+1]) for i in range(np.size(x,axis=1)) ] ))
print()
print(np.hstack( [[np.dot( (np.identity(n)-x[:,i:i+1].T)*x[:,i:i+1], out_grad[:,i:i+1])] for i in range(np.size(x,axis=1)) ] ))


[[  0.           8.01387745   1.08604489]
 [-30.44431852  22.99987334   1.6484703 ]
 [-59.6972672   52.02343479  -7.07824974]
 [-80.87931586  71.1032724    5.46379154]]

[[[  0.        ]
  [-30.44431852]
  [-59.6972672 ]
  [-80.87931586]
  [  8.01387745]
  [ 22.99987334]
  [ 52.02343479]
  [ 71.1032724 ]
  [  1.08604489]
  [  1.6484703 ]
  [ -7.07824974]
  [  5.46379154]]]


In [37]:
grad = np.hstack([ np.dot( (np.identity(n) - input.T )*input, og.T) for input,og in zip(x.T, out_grad.T)  ])

In [38]:
grad

array([-90.13628078, -90.4004402 , -89.47322978, -80.47511934,
        43.90733752,  36.87326798,  43.876764  ,  40.9365362 ,
        -3.30602214,  -2.87800357, -11.73913044,   0.66850399])

In [39]:
input = x.T[2]


In [40]:


z = np.random.randn(4,3)

In [41]:
z

array([[-0.33278326, -0.53682407,  1.26221123],
       [ 1.22063451, -0.82217607,  1.02838569],
       [ 2.12760077, -0.55876496, -0.51955966],
       [-0.79225827, -1.30919617, -1.53427343]])

In [42]:
p = np.zeros(z.shape)

In [43]:
p

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [44]:
z = np.array([_ for _ in range(16)]).reshape(4,4)
z

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [45]:
z*z

array([[  0,   1,   4,   9],
       [ 16,  25,  36,  49],
       [ 64,  81, 100, 121],
       [144, 169, 196, 225]])

In [46]:
eps = 1e-8

In [47]:
np.sqrt(z)+eps

array([[1.00000000e-08, 1.00000001e+00, 1.41421357e+00, 1.73205082e+00],
       [2.00000001e+00, 2.23606799e+00, 2.44948975e+00, 2.64575132e+00],
       [2.82842713e+00, 3.00000001e+00, 3.16227767e+00, 3.31662480e+00],
       [3.46410163e+00, 3.60555129e+00, 3.74165740e+00, 3.87298336e+00]])

In [48]:
func = lambda x,y : x**2 + y

In [49]:
func(4,1)

17

In [50]:
from sklearn.preprocessing import StandardScaler

In [51]:
x = np.arange(16).reshape(4,4)

In [52]:
x

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [53]:
(x-np.mean(x,axis=0,keepdims=True)) / np.std(x,axis=0,keepdims=True)

array([[-1.34164079, -1.34164079, -1.34164079, -1.34164079],
       [-0.4472136 , -0.4472136 , -0.4472136 , -0.4472136 ],
       [ 0.4472136 ,  0.4472136 ,  0.4472136 ,  0.4472136 ],
       [ 1.34164079,  1.34164079,  1.34164079,  1.34164079]])

In [54]:
np.std(x,axis=0,keepdims=True)

array([[4.47213595, 4.47213595, 4.47213595, 4.47213595]])

In [55]:
scaler = StandardScaler()
x[:,3:4] = scaler.fit_transform(x[:,3:4])
x

array([[ 0,  1,  2, -1],
       [ 4,  5,  6,  0],
       [ 8,  9, 10,  0],
       [12, 13, 14,  1]])

In [56]:
x

array([[ 0,  1,  2, -1],
       [ 4,  5,  6,  0],
       [ 8,  9, 10,  0],
       [12, 13, 14,  1]])

In [57]:
scaler = StandardScaler()
scaler.fit_transform(x)

array([[-1.34164079, -1.34164079, -1.34164079, -1.41421356],
       [-0.4472136 , -0.4472136 , -0.4472136 ,  0.        ],
       [ 0.4472136 ,  0.4472136 ,  0.4472136 ,  0.        ],
       [ 1.34164079,  1.34164079,  1.34164079,  1.41421356]])

In [58]:
sdt = "fsdfsd\n\n"
sdt.replace("\n",'_')

'fsdfsd__'

In [59]:
def softmax(input):
    input = input - np.max(input) #,axis=1,keepdims=True)
    input = np.exp(input)
    return input / np.sum(input) #,axis=1,keepdims=True)

def softmax_grad(output,out_grad):
    # batch_size = len(out_grad) 
    output_size = len(out_grad)

    # d_softmax = np.zeros_like(output)

    J_softmax = np.zeros((output_size,output_size)) 
    for i in range(output_size):
        for j in range(output_size):
            J_softmax[i,j] = output[i] * ((i==j) - output[j])

    return np.dot(out_grad, J_softmax) 

In [60]:
x = np.array([[1,2,3],[4,5,6]])
softmax(x)

array([[0.00426978, 0.01160646, 0.03154963],
       [0.08576079, 0.23312201, 0.63369132]])

In [61]:
out_grad = np.array([[i for i in range(3)]*2]).reshape(-1,3)

In [62]:

out_grad

array([[0, 1, 2],
       [0, 1, 2]])

In [63]:
softmax_grad(softmax(x),out_grad)

ValueError: setting an array element with a sequence.

In [None]:
x = np.array([1,2,3])
out_grad = np.array([3,2,1])

In [None]:
input_grad = softmax_grad(softmax(x),out_grad)

In [None]:
np.vstack( [input_grad, input_grad])

array([[ 0.14181709,  0.14077036, -0.28258745],
       [ 0.14181709,  0.14077036, -0.28258745]])

In [64]:
x = np.array([2,3,3,12,3,4234,-23434])
x.clip(-500,500)

array([   2,    3,    3,   12,    3,  500, -500])