In [12]:
"""
Basics
Each value in the array is a weight that determines how much influence a particular input node has on a particular hidden layer node.
After transpose-:
 toes  win %  fans
 hid[0] 0.1   -0.1   0.1
 hid[1] 0.2    0.1   0.4
 hid[2] -0.1   0.9   0.1

 hidden to predict layer
 hid[0]  hid[1]  hid[2]
 hurt?  0.3     0.1     0.0
 win?   1.1     0.2     1.3
 sad?  -0.3     0.0     0.1

 So, hp_wgt[0, 1] = (0.1) is the weight connecting hid[1] to the hurt? prediction, 
 hp_wgt[2, 2] = (0.1) is the weight connecting hid[2] to the sad? prediction, and so on.

How it looks like-:
    Input Layer          Hidden Layer         Output Layer
    (3 nodes)            (3 nodes)           (3 nodes)

      toes  --------> hid[0] --------> hurt?
             \       /  /|\       / \
      win % -----> hid[1] ---  ---  win?
             /     \   |       \ /
      fans --------> hid[2] --------> sad?

      Weights:     ih_wgt          hp_wgt

Everything we’ve done in this chapter is a form of what’s called forward propagation, wherein
a neural network takes input data and makes a prediction. It’s called this because you’re
propagating activations forward through the network. In these examples, activations are all the
numbers that are not weights and are unique for every prediction.
The weights are static for a given trained network, while the activations change with each new input.

Data is presented like this
ih_wgt = np.array(
    [    toes  wl   fans
        [0.1, 0.2, -0.1],  # hid[0]
        [-0.1, 0.1, 0.9],  # hid[1]
        [0.1, 0.4, 0.1],  # hid[2]
    ]
).T

Transpose why?
This is our input to hidden weight layer -> we need to multiply this with input (num_toes, wlrec, n_fans)
So it should multiply like 0.1 * num_toes + wlrec * -0.1 + n_fans * 0.1 hence we transpose the matrix into this-:
hid[0]  hid[1]  hid[2]
toes      0.1     0.2    -0.1
win%     -0.1     0.1     0.9
fans      0.1     0.4     0.1

A weight of 0.9 from 'win%' to 'hid[2]' means that the win/loss record has a strong positive influence on the activation of the third hidden node
A weight of -0.1 from 'toes' to 'hid[2]' means that the number of toes has a small negative influence on that same hidden node
The hidden layer is learning to detect different patterns or combinations of the input features.

The output vector represents the network's prediction scores for each sentiment:

0.21 for "hurt?"
0.145 for "win?"
0.506 for "sad?"
This means that for the input [8.5, 0.65, 1.2], the network predicts the highest score for "sad?" (0.506), 
    suggesting this is the most likely sentiment outcome based on the patterns it has learned.

Forward Pass: Input → Hidden → Output
    We take our input values
    We multiply by weights to get hidden layer activations
    We multiply by more weights to get output predictions

The Backward Pass (Learning)
    Compare output to actual targets (what should have been predicted)
    Calculate error/loss
    Propagate error backward through the network
    Update weights to reduce error
    Repeat many times with training data
"""

  \       /  /|\       / \


'\nBasics\nEach value in the array is a weight that determines how much influence a particular input node has on a particular hidden layer node.\nAfter transpose-:\n toes  win %  fans\n hid[0] 0.1   -0.1   0.1\n hid[1] 0.2    0.1   0.4\n hid[2] -0.1   0.9   0.1\n\n hidden to predict layer\n hid[0]  hid[1]  hid[2]\n hurt?  0.3     0.1     0.0\n win?   1.1     0.2     1.3\n sad?  -0.3     0.0     0.1\n\n So, hp_wgt[0, 1] = (0.1) is the weight connecting hid[1] to the hurt? prediction, \n hp_wgt[2, 2] = (0.1) is the weight connecting hid[2] to the sad? prediction, and so on.\n\nHow it looks like-:\n    Input Layer          Hidden Layer         Output Layer\n    (3 nodes)            (3 nodes)           (3 nodes)\n\n      toes  --------> hid[0] --------> hurt?\n             \\       /  /|\\       /       win % -----> hid[1] ---  ---  win?\n             /     \\   |       \\ /\n      fans --------> hid[2] --------> sad?\n\n      Weights:     ih_wgt          hp_wgt\n\nEverything we’ve done in

In [13]:
# neural network
import numpy as np

weights = np.array([0.1, 0.2, 0])


def neural_network(input, weights):
    pred = input.dot(weights)
    return pred


toes = np.array([8.5, 9.5, 9.9, 9.0])
wlrec = np.array([0.65, 0.8, 0.8, 0.9])
nfans = np.array([1.2, 1.3, 0.5, 1.0])
input = np.array([toes[0], wlrec[0], nfans[0]])
pred = neural_network(input, weights)
print(pred)


# dot product does the same as this
def w_sum(a, b):
    assert len(a) == len(b)
    output = 0
    for i in range(len(a)):
        output += a[i] * b[i] # weights * input
    return output

0.9800000000000001


In [14]:
# we use this neural network to feed output of this into another neural network
# since finding patterns is hard, often a single weight matrix mul is not sufficient.
# input -> hid(pred1) -> pred(final)
import numpy as np

        # toes % win # fans
ih_wgt = np.array(
    [
        [0.1, 0.2, -0.1],  # hid[0]
        [-0.1, 0.1, 0.9],  # hid[1]
        [0.1, 0.4, 0.1],  # hid[2]
    ]
).T

        # hid[0] hid[1] hid[2]
hp_wgt = np.array(
    [
        [0.3, 1.1, -0.3],  # hurt?
        [0.1, 0.2, 0.0],  # win?
        [0.0, 1.3, 0.1],  # sad?
    ]
).T

weights = [ih_wgt, hp_wgt]

def neural_network(input, weights):
    hid = input.dot(weights[0])
    pred = hid.dot(weights[1])
    return pred


toes = np.array([8.5, 9.5, 9.9, 9.0])
wlrec = np.array([0.65, 0.8, 0.8, 0.9])
nfans = np.array([1.2, 1.3, 0.5, 1.0])
input = np.array([toes[0], wlrec[0], nfans[0]])
pred = neural_network(input, weights)
# prediction of what factors num toes, win/loss rec, fans play on actual winning of match
print(pred) 


[0.2135 0.145  0.5065]


In [43]:
# Weight Updates:

# For hidden→prediction weights: Create a gradient using the hidden layer activations and output deltas
# For input→hidden weights: Create a gradient using the input values and hidden deltas
# Subtract a portion (learning_rate) of these gradients from the weights

# Building the entire network
ih_wgt = np.random.randn(3, 3) * 0.1 
hp_wgt = np.random.randn(3, 3) * 0.1
weights = [ih_wgt, hp_wgt]

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

# forward
def neuralnet(input, weights):
    hid_inp = input.dot(weights[0])
    hid = sigmoid(hid_inp)
    pred_inp = hid.dot(weights[1])
    pred = sigmoid(pred_inp)
    return pred, hid, hid_inp, pred_inp

toes = np.array([8.5, 9.5, 9.9, 9.0])
wlrec = np.array([0.65, 0.8, 0.8, 0.9])
nfans = np.array([1.2, 1.3, 0.5, 1.0])

# normalise
toes_norm = (toes - np.mean(toes)) / np.std(toes)
wlrec_norm = (wlrec - np.mean(wlrec)) / np.std(wlrec)
nfans_norm = (nfans - np.mean(nfans)) / np.std(nfans)

# Feature Matrix
X = np.column_stack((toes_norm, wlrec_norm, nfans_norm))

# Target values (example: these would be the actual sentiments for each game)
# Format: [hurt?, win?, sad?]
y = np.array([
    [0.1, 0.9, 0.2],  # Game 1: Likely win, not hurt, not sad
    [0.0, 0.8, 0.3],  # Game 2: Likely win, not hurt, slightly sad
    [0.7, 0.2, 0.5],  # Game 3: Likely hurt, not win, somewhat sad
    [0.3, 0.9, 0.1]   # Game 4: Likely win, slightly hurt, not sad
])

lr, epochs = 0.1, 10000
for epoch in range(epochs):
    total_loss = 0
    for i in range(len(X)):
        input_data = X[i]
        target = y[i]
        pred, hid, hid_inp, pred_inp = neuralnet(input_data, weights)
        loss = np.mean((pred - target) ** 2)
        total_loss += loss

        # backpropagation
        err = pred - target
        err_delta = err * sigmoid_derivative(pred) 
        # hidden_to_pred layer error
        hidden_err = err_delta.dot(weights[1].T)
        hidden_delta = hidden_err * sigmoid_derivative(hid)
        # update weights
        # basically gradient here is the derivative we do in gradient descent. but how do we calculate this in terms of matrix?
        # wasnt it this? gradients = 1 / m * (X.T @ error)
        # If I turn the dial between hid[i] and output[j], how much will the error change?
        hp_pred_gradient = np.outer(hid, err_delta)
        weights[1] -= lr * hp_pred_gradient
        # input-> hidden
        ih_pred_gradient = np.outer(input_data, hidden_delta) # input * hidden me kitna error tha (what does hidden_delta mean here)
        weights[0] -= lr * ih_pred_gradient
    if epoch % 1000 == 0:
        print(f"Epoch {epoch}, Loss : {total_loss/len(X)}")

# Test trained network
for i in range(len(X)):
    pred, _, _, _ = neuralnet(X[i], weights)
    print(f"\nGame {i+1}:")
    print(f"Input: Toes={toes[i]}, Win/Loss={wlrec[i]}, Fans={nfans[i]}")
    print(f"Prediction: Hurt={pred[0]:.3f}, Win={pred[1]:.3f}, Sad={pred[2]:.3f}")
    print(f"Target: Hurt={y[i,0]:.3f}, Win={y[i,1]:.3f}, Sad={y[i,2]:.3f}")


# Make a new prediction (new game data)
new_input = np.array([8.8, 0.12, 0])  # (num_toes, wl, fans) ~ not going to win, going to be pretty sad (actual ans : Hurt=0.139, Win=0.433, Sad=0.684 Checks out)
new_input_norm = (new_input - np.array([np.mean(toes), np.mean(wlrec), np.mean(nfans)])) / np.array([np.std(toes), np.std(wlrec), np.std(nfans)])
new_pred, _, _, _ = neuralnet(new_input_norm, weights)
print(f"\nNew prediction for Toes={new_input[0]}, Win/Loss={new_input[1]}, Fans={new_input[2]}:")
print(f"Hurt={new_pred[0]:.3f}, Win={new_pred[1]:.3f}, Sad={new_pred[2]:.3f}")



Epoch 0, Loss : 0.10274602160572524
Epoch 1000, Loss : 0.017924054333226427
Epoch 2000, Loss : 0.016793841822455455
Epoch 3000, Loss : 0.014440416455399905
Epoch 4000, Loss : 0.012387851717950227
Epoch 5000, Loss : 0.011838216443075502
Epoch 6000, Loss : 0.011668971906495176
Epoch 7000, Loss : 0.011589497419769088
Epoch 8000, Loss : 0.011540781765255272
Epoch 9000, Loss : 0.011506275597289613

Game 1:
Input: Toes=8.5, Win/Loss=0.65, Fans=1.2
Prediction: Hurt=0.044, Win=0.890, Sad=0.194
Target: Hurt=0.100, Win=0.900, Sad=0.200

Game 2:
Input: Toes=9.5, Win/Loss=0.8, Fans=1.3
Prediction: Hurt=0.054, Win=0.807, Sad=0.304
Target: Hurt=0.000, Win=0.800, Sad=0.300

Game 3:
Input: Toes=9.9, Win/Loss=0.8, Fans=0.5
Prediction: Hurt=0.499, Win=0.502, Sad=0.499
Target: Hurt=0.700, Win=0.200, Sad=0.500

Game 4:
Input: Toes=9.0, Win/Loss=0.9, Fans=1.0
Prediction: Hurt=0.302, Win=0.898, Sad=0.105
Target: Hurt=0.300, Win=0.900, Sad=0.100

New prediction for Toes=8.8, Win/Loss=0.12, Fans=0.0:
Hurt=0.2

In [47]:
"""
NumPy version (Regression-like output): 
produced three separate output values, which you interpreted as raw probabilities for "hurt", "win", and "sad" independently. 
For example, [0.053, 0.887, 0.190]. This is more like a multi-output regression or predicting independent probabilities.

PyTorch version (Multi-class Classification): 
This version predicts one single class out of the three possibilities. 
It assumes that for any given input, only one of "Hurt", "Win", or "Sad" is the primary sentiment. 
The output is a single class label (0, 1, or 2).

Predicted classes: tensor([1, 1, 0, 2]) means:


For the 1st input sample, the network predicts Class 1 ("Win").
For the 2nd input sample, the network predicts Class 1 ("Win").
For the 3rd input sample, the network predicts Class 0 ("Hurt").
For the 4th input sample, the network predicts Class 2 ("Sad").

NumPy Version (Predicting individual sentiment scores/probabilities):

Question it answers: "For this game, what is the probability of being hurt? What is the probability of winning? What is the probability of being sad?" (These could be independent or sum to 1, depending on how you train it).
Target format: A vector of scores/probabilities for each sentiment, e.g., [0.1, 0.9, 0.2].
Analogy: Predicting the individual scores a student gets in Math, Science, and English.



PyTorch Version (Predicting a single dominant sentiment class):

Question it answers: "For this game, what is the single most dominant sentiment: Hurt, Win, or Sad?"
Target format: A single integer representing the class label, e.g., 1 (for "Win").
Analogy: Predicting a student's overall grade category (A, B, C) based on their performance.

In your NumPy code, y provided a vector of target scores for each sample.
In your PyTorch code, targets provides a single class label for each sample, chosen from 3 possible classes.

How inp / features looks like-:
       toes  wlrec  nfans
      --------------------
Sample 1: [8.5,  0.65,  1.2 ]
Sample 2: [9.5,  0.8,   1.3 ]
Sample 3: [9.9,  0.8,   0.5 ]
Sample 4: [9.0,  0.9,   1.0 ]

targets: class indices (0=Hurt, 1=Win, 2=Sad) : Predicted classes: tensor([1, 1, 0, 2])
Sample 1 : Win, Sample 2 : Win, Sample 3 : Hurt, Sample 4 : Sad {can verify}
"""


'\nNumPy version (Regression-like output): \nproduced three separate output values, which you interpreted as raw probabilities for "hurt", "win", and "sad" independently. \nFor example, [0.053, 0.887, 0.190]. This is more like a multi-output regression or predicting independent probabilities.\n\nPyTorch version (Multi-class Classification): \nThis version predicts one single class out of the three possibilities. \nIt assumes that for any given input, only one of "Hurt", "Win", or "Sad" is the primary sentiment. \nThe output is a single class label (0, 1, or 2).\n\nPredicted classes: tensor([1, 1, 0, 2]) means:\n\n\nFor the 1st input sample, the network predicts Class 1 ("Win").\nFor the 2nd input sample, the network predicts Class 1 ("Win").\nFor the 3rd input sample, the network predicts Class 0 ("Hurt").\nFor the 4th input sample, the network predicts Class 2 ("Sad").\n\nNumPy Version (Predicting individual sentiment scores/probabilities):\n\nQuestion it answers: "For this game, what

In [None]:
# same in pytorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

toes = torch.tensor([8.5, 9.5, 9.9, 9.0])
wlrec = torch.tensor([0.65, 0.8, 0.8, 0.9])
nfans = torch.tensor([1.2, 1.3, 0.5, 1.0])

# Normalize features
features = torch.stack([toes, wlrec, nfans], dim=1)
features = (features - features.mean(dim=0)) / features.std(dim=0)
# targets: class indices (0=Hurt, 1=Win, 2=Sad)
targets = torch.tensor([1, 1, 0, 2])  
# if you want to it like numpy -> make targets normal not classes and use sigmoid instead of crossentropy and loss = nn.MSELoss()
# targets_multilabel = torch.tensor([
#     [0.1, 0.9, 0.2],  # Game 1
#     [0.0, 0.8, 0.3],  # Game 2
#     [0.7, 0.2, 0.5],  # Game 3
#     [0.3, 0.9, 0.1]   # Game 4
# ], dtype=torch.float32)

dataset = TensorDataset(features, targets)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

class PyTorchNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.inp_to_hidden = nn.Linear(3, 3)
        self.relu = nn.ReLU() # activation function
        self.hidden_to_pred = nn.Linear(3, 3)

    # Takes input
    def forward(self, x):
        x = self.inp_to_hidden(x)
        x = self.relu(x)
        x = self.hidden_to_pred(x)
        return x 
model = PyTorchNet()
loss = nn.CrossEntropyLoss() # target - pred

optimizer = optim.Adam(model.parameters(), lr=0.01) # weights -= learning_rate * gradient
epochs = 2000
for epoch in range(epochs):
    for xb, yb in loader:
        optimizer.zero_grad()
        pred = model(xb)
        final_loss = loss(pred, yb)
        final_loss.backward()
        optimizer.step() 
    if epoch % 500 == 0:
        print(f"Epoch {epoch}, Loss : {final_loss.item():.4f}")

# prediction
with torch.no_grad():
    pred = model(features)
    probs = torch.softmax(pred, dim=1)
    prediction = torch.argmax(probs, dim=1)
    print("\nPredicted class probabilities:\n", probs)
    print("Predicted classes:", prediction)
    print("True classes:", targets)

Epoch 0, Loss : 1.1407
Epoch 500, Loss : 0.0006
Epoch 1000, Loss : 0.0002
Epoch 1500, Loss : 0.0001

Predicted class probabilities:
 tensor([[1.7477e-05, 9.9998e-01, 3.7920e-06],
        [1.1746e-05, 9.9997e-01, 1.3212e-05],
        [9.9997e-01, 5.3250e-06, 2.0332e-05],
        [2.4065e-06, 4.0955e-05, 9.9996e-01]])
Predicted classes: tensor([1, 1, 0, 2])
True classes: tensor([1, 1, 0, 2])
