In [None]:
import random
import math

# Step size that update() reads when adjusting the perceptron weights.
# NOTE: later cells assign learning_rate = 0.001 at module level; re-run this
# cell before repeating the perceptron experiments once those cells have run.
learning_rate = 0.01

def dot_prod(inputs,w):
    """Return the pre-activation ``w[0] + sum(inputs[k] * w[k + 1])``.

    ``w[0]`` is the bias; the remaining weights are paired with ``inputs``
    position by position. The number of inputs is not fixed: pairing stops at
    the shorter of ``inputs`` and ``w[1:]``, so three inputs with four weights
    gives exactly the same result as before.
    """
    z = w[0]  # this is the bias
    # Accumulate left to right so floating-point results match a plain loop.
    for value, weight in zip(inputs, w[1:]):
        z += value * weight
    return z

def ReLU(x,derivative = False):
    """Rectified linear unit, or its slope when ``derivative`` is truthy.

    Any ``x`` that is not ``>= 0`` yields 0 in both modes. Otherwise the value
    is ``x`` itself and the slope is 1 (so the slope at exactly 0 is taken as 1).
    """
    if not x >= 0:
        return 0
    return 1 if derivative else x

def sigmoid(x,derivative = False):
    """Logistic function ``1 / (1 + e**-x)``, or its derivative.

    With ``derivative`` truthy the result is ``s * (1 - s)`` where
    ``s = sigmoid(x)``; the function value is computed once and reused.

    For very negative ``x`` the term ``e**-x`` exceeds the float range and
    ``math.exp`` raises OverflowError; the limiting value 0.0 is returned in
    that case instead of letting the error propagate.
    """
    if derivative:
        s = sigmoid(x)
        return s * (1 - s)
    try:
        return 1/(1 +  math.exp(-x))
    except OverflowError:
        # exp(-x) is too large to represent, so 1 / (1 + exp(-x)) is effectively 0.
        return 0.0

def tanh(x,derivative = False):
    """Hyperbolic tangent of ``x``, or its derivative ``1 - tanh(x)**2``."""
    t = math.tanh(x)
    if derivative:
        return 1 - t**2
    return t

def uniform_0_1():
    """Draw one value from ``random.random()``, i.e. uniformly from [0, 1)."""
    draw = random.random()
    return draw

def uniform_m1_p1():
    """Draw one value uniformly from [-1, 1) by rescaling ``random.random()``."""
    unit = random.random()
    return 2 * unit - 1

def zeros():
    """Weight initialiser that always returns 0."""
    return 0

def forward_and_backward(example, g, w):
    """Evaluate one example and return the squared-error gradient for it.

    Parameters
    ----------
    example : ``[inputs, target]`` pair.
    g : activation, called as ``g(z)`` and ``g(z, derivative=True)``.
    w : weight list with the bias in ``w[0]``.

    Returns
    -------
    ``(answer, err, loss, grad)`` where ``answer = g(dot_prod(inputs, w))``,
    ``err = target - answer``, ``loss = 0.5 * err**2`` and ``grad`` holds the
    bias gradient followed by one entry per input. The gradient length follows
    the number of inputs instead of being fixed at four.
    """
    inputs, target = example[0], example[1]
    dot = dot_prod(inputs,w)
    answer = g(dot)
    err =  target - answer
    loss = .5 * (err **2)
    # Chain rule: d(loss)/d(z) = -(target - answer) * g'(z); each weight's
    # gradient is that factor times the value feeding the weight (1 for the bias).
    delta = -err * g(dot, derivative=True)
    grad = [delta * 1] + [delta * value for value in inputs]
    return answer,err,loss,grad

def update(gradient, w, lr=None):
    """Apply one gradient-descent step to ``w`` in place.

    Every position ``i`` of ``gradient`` updates ``w[i]`` to
    ``w[i] - lr * gradient[i]``, so the number of weights is no longer fixed at
    four. ``lr`` defaults to the module-level ``learning_rate``, looked up at
    call time, which keeps existing two-argument calls unchanged.
    Returns None.
    """
    if lr is None:
        lr = learning_rate
    for i, g in enumerate(gradient):
        w[i] = w[i] - lr * g

def train(g, examples, initial_weight_distribution, epochs=100, print_weights=False):
    """Train a single unit with per-example gradient descent, printing progress.

    Parameters
    ----------
    g : activation passed through to forward_and_backward().
    examples : non-empty sequence of ``[inputs, target]`` pairs.
    initial_weight_distribution : zero-argument callable, called once per weight.
    epochs : number of passes over ``examples``.
    print_weights : when true, also print the weights before each update.

    One weight is created per input plus one for the bias, sized from the first
    example rather than fixed at four. A line is printed for every example
    regardless of ``print_weights``, and the mean loss is printed after each
    epoch. Returns None.

    Raises
    ------
    ValueError : if ``examples`` is empty.

    Note: later cells in this notebook define other functions that are also
    named ``train``; re-run this cell to get this version back.
    """
    if not examples:
        raise ValueError("examples must contain at least one [inputs, target] pair")

    n_weights = len(examples[0][0]) + 1  # bias + one weight per input
    w = [initial_weight_distribution() for _ in range(n_weights)]
    for epoch in range(epochs):
        total_loss = 0
        for example in examples:
            answer,err,loss,grad = forward_and_backward(example, g, w)
            total_loss += loss
            if print_weights:
                print("weights: " + " ".join(f"{value:.4f}" for value in w))
            grad_text = " ".join(f"{value:.4f}" for value in grad)
            print(f"{str(example)}:{answer:.4f}, error={err:.4f}, loss={loss:.4f}, grad={grad_text}")
            update(grad, w)
        print(f"epoch{epoch}, average loss {total_loss/len(examples):.4f} \n")

In [None]:
# Each dataset is a three-input truth table of [[x1, x2, x3], label] rows.

# Label is 1 when at least two of the three inputs are 1.
majority = [
    [[0, 0, 0], 0],
    [[0, 0, 1], 0],
    [[0, 1, 0], 0],
    [[1, 0, 0], 0],
    [[0, 1, 1], 1],
    [[1, 0, 1], 1],
    [[1, 1, 0], 1],
    [[1, 1, 1], 1],
]

# Label is 1 only when exactly one input is 1; note that [1, 1, 1] maps to 0,
# so this table is not three-bit parity.
xor = [
    [[0, 0, 0], 0],
    [[0, 0, 1], 1],
    [[0, 1, 0], 1],
    [[1, 0, 0], 1],
    [[0, 1, 1], 0],
    [[1, 0, 1], 0],
    [[1, 1, 0], 0],
    [[1, 1, 1], 0],
]

# Label is the negation of the first input; the other two inputs do not matter.
one_wire_not = [
    [[0, 0, 0], 1],
    [[0, 0, 1], 1],
    [[0, 1, 0], 1],
    [[0, 1, 1], 1],
    [[1, 0, 0], 0],
    [[1, 0, 1], 0],
    [[1, 1, 0], 0],
    [[1, 1, 1], 0],
]
#1
# Train one tanh unit on the `xor` table for 100 epochs, starting from weights
# drawn by uniform_0_1 and printing the weights before every update.
train(tanh, xor,uniform_0_1,epochs=100,print_weights=True)



weights: 0.5581 0.9734 0.4002 0.0105
[[0, 0, 0], 0]:0.5066, error=-0.5066, loss=0.1283, grad=0.3766 0.0000 0.0000 0.0000
weights: 0.5543 0.9734 0.4002 0.0105
[[0, 0, 1], 1]:0.5115, error=0.4885, loss=0.1193, grad=-0.3607 -0.0000 -0.0000 -0.3607
weights: 0.5579 0.9734 0.4002 0.0141
[[0, 1, 0], 1]:0.7434, error=0.2566, loss=0.0329, grad=-0.1148 -0.0000 -0.1148 -0.0000
weights: 0.5591 0.9734 0.4014 0.0141
[[1, 0, 0], 1]:0.9108, error=0.0892, loss=0.0040, grad=-0.0152 -0.0152 -0.0000 -0.0000
weights: 0.5592 0.9736 0.4014 0.0141
[[0, 1, 1], 0]:0.7507, error=-0.7507, loss=0.2818, grad=0.3276 0.0000 0.3276 0.3276
weights: 0.5559 0.9736 0.3981 0.0108
[[1, 0, 1], 0]:0.9122, error=-0.9122, loss=0.4160, grad=0.1532 0.1532 0.0000 0.1532
weights: 0.5544 0.9720 0.3981 0.0093
[[1, 1, 0], 0]:0.9583, error=-0.9583, loss=0.4592, grad=0.0783 0.0783 0.0783 0.0000
weights: 0.5536 0.9712 0.3973 0.0093
[[1, 1, 1], 0]:0.9589, error=-0.9589, loss=0.4597, grad=0.0773 0.0773 0.0773 0.0773
epoch0, average loss 0.

XOR

Treat the network as having just two hidden nodes, with these values:

* d,e,g,h=1
* b=-1
* k=1
* l=-2
* all remaining values are 0

## Basic Torch

In [None]:
#4
import torch
# A: half-precision tensor of shape (40, 256, 1000) filled by torch.rand.
A = torch.rand(40, 256, 1000, dtype=torch.float16)
# x: 1000 random values rescaled from [0, 1) to [1, 10).
x = 1 + 9 * torch.rand(1000)
# x lines up with A's last axis, so the product keeps A's shape.
(A * x).shape

torch.Size([40, 256, 1000])

## CoLA

In [None]:
import torch
import torch.nn as nn

from pandas import read_csv
from random import shuffle

# Hyperparameters for the CoLA classifier built further down in this cell.
hidden_size = 100
num_classes = 1
num_epochs = 7
# NOTE: batch_size is assigned 100 again further down in this cell.
batch_size = 100
# NOTE: this rebinds the module-level learning_rate that the perceptron
# update() in the first cell also reads.
learning_rate = 0.001


# The device is pinned to the CPU; the commented-out expression would select
# CUDA when it is available.
device = "cpu"#torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("running on", device)

# Read the corpus as tab-separated text, treating the first line as a header.
# The loop that follows takes column 1 as the label and column 3 as the sentence.
# NOTE(review): absolute path, presumably a hosted-notebook upload location -
# adjust when running elsewhere.
df = read_csv('/content/CoLA.tsv', sep='\t', header=0)

# Corpus statistics gathered in one pass over the data frame.
tokens = {}                 # token -> number of occurrences
hot_one_address = {}        # token -> column index in the bag-of-words vector
current_addr = 0            # next unused column index
examples = []               # [sentence, label] pairs
number_grammatical = 0
number_nongrammatical = 0

for _, record in df.iterrows():
    fields = record.values
    gramatical_p, sentence = fields[1], fields[3]
    examples.append([sentence, gramatical_p])

    # Tokens are whatever a split on single spaces produces.
    for token in sentence.split(' '):
        if token not in tokens:
            # First sighting: reserve the next column for this token.
            hot_one_address[token] = current_addr
            current_addr += 1
        tokens[token] = tokens.get(token, 0) + 1

    if gramatical_p:
        number_grammatical += 1
    else:
        number_nongrammatical += 1

# Randomise the example order in place (unseeded) before the train/test split.
shuffle(examples)

# Hold out the first 100 shuffled examples for testing; train on the rest.
test_set = examples[:100]
training_set = examples[100:]

# One input feature per distinct token.
num_tokens = len(tokens)

batch_size = 100
# Identity matrix with one one-hot row per token. It is not referenced again in
# the code shown in this cell, and it costs num_tokens**2 floats of memory.
vocab = torch.eye(num_tokens, dtype=torch.float32)

class NeuralNet(nn.Module):
    """Feed-forward classifier: two ReLU hidden layers, then a sigmoid output."""

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the layers: input -> hidden -> hidden -> num_classes."""
        super().__init__()
        self.l1_1 = nn.Linear(input_size, hidden_size)
        self.l1_2 = nn.Linear(hidden_size, hidden_size)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(hidden_size, num_classes)
        self.sigm = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid outputs for ``x`` (last dimension ``input_size``)."""
        hidden = self.relu(self.l1_1(x))
        hidden = self.relu(self.l1_2(hidden))
        return self.sigm(self.l2(hidden))

# One input feature per vocabulary entry.
input_size = num_tokens

model = NeuralNet(input_size, hidden_size, num_classes)
model = model.to(device)

# Binary cross-entropy on probabilities. Unlike nn.CrossEntropyLoss (tried
# earlier), BCELoss applies no softmax or sigmoid of its own, which is why the
# network ends in a Sigmoid layer.
criterion = nn.BCELoss()

optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

def train(silent=False):
    """Train the module-level ``model`` for ``num_epochs`` epochs.

    Each epoch reshuffles ``training_set`` in place and then runs 80 batches of
    100 sentences, i.e. the first 8000 examples of the shuffled list. Every
    sentence becomes a bag-of-words row with a 1 in the column of each token.
    When ``silent`` is false the loss is printed after every batch.
    Returns None.

    Note: this definition replaces the perceptron ``train`` from the first cell.
    """
    for epoch in range(num_epochs):
        shuffle(training_set)

        for bnum in range(80):
            batch = torch.zeros([100,num_tokens], dtype=torch.float32)
            labels = torch.zeros([100], dtype=torch.float32)
            for i in range(0,100):
                sentence, label = training_set[bnum*100 + i]
                for word in sentence.split(' '):
                    batch[i, hot_one_address[word]] = 1
                labels[i] = label

            # Tensor.to() returns the moved tensor rather than moving in place,
            # so the result has to be kept for a non-CPU device to take effect.
            labels = labels.to(device)
            batch = batch.to(device)

            # Forward pass and loss calculation
            outputs = model(batch).squeeze()
            loss = criterion(outputs,labels)

            # Backward and optimize; gradients are cleared ready for the next batch.
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if not silent:
                print(f'Epoch [{epoch + 1}/{num_epochs}] Batch[{bnum+1}/80], Loss: {loss.item():.4f}')

def evaluate():
    """Print the accuracy of the module-level ``model`` on ``test_set``.

    All of ``test_set`` is encoded as one bag-of-words batch (its actual length
    is used rather than a fixed 100), the model outputs are rounded to 0/1 and
    compared with the labels. Gradients are disabled for the whole evaluation.
    Returns None.
    """
    n_correct = 0
    n_samples = len(test_set)
    with torch.no_grad():
        batch = torch.zeros([n_samples, num_tokens], dtype=torch.float32)
        labels = torch.zeros([n_samples], dtype=torch.int64)
        for i, (sentence, label) in enumerate(test_set):
            for word in sentence.split(' '):
                batch[i, hot_one_address[word]] = 1
            labels[i] = label
        # Tensor.to() returns the moved tensor rather than moving in place,
        # so the result has to be kept for a non-CPU device to take effect.
        batch = batch.to(device)
        labels = labels.to(device)

        outputs = model(batch).squeeze()
        predicted = torch.round(outputs)  # threshold the outputs at 0.5
        n_correct += (predicted == labels).sum().item()

        acc = n_correct / n_samples
        print(f'Accuracy of the network on the {n_samples} test images: {100 * acc:.4f} %')
# Report test accuracy for the freshly initialised model, train it without
# per-batch logging, then report test accuracy again.
print("before")
evaluate()
train(silent=True)
print("after")
evaluate()


running on cpu
before
Accuracy of the network on the 100 test images: 35.0000 %
after
Accuracy of the network on the 100 test images: 64.0000 %


1-layer 500 nodes

* before:
Accuracy of the network on the 100 test images: 54.0000 %
* after:
Accuracy of the network on the 100 test images: 62.0000 %





2-layer 100 nodes
* before:
Accuracy of the network on the 100 test images: 68.0000 %
* after:
Accuracy of the network on the 100 test images: 70.0000 %

BCE
* before:
Accuracy of the network on the 100 test images: 35.0000 %
* after:
Accuracy of the network on the 100 test images: 64.0000 %

In [None]:
import torch
import torch.nn as nn

from pandas import read_csv
from random import shuffle

# Hyperparameters for the recurrent CoLA classifier built further down in this cell.
hidden_size = 100
num_classes = 1
num_epochs = 7
# NOTE: batch_size is assigned 100 again further down in this cell.
batch_size = 100
# NOTE: this rebinds the module-level learning_rate that the perceptron
# update() in the first cell also reads.
learning_rate = 0.001


# The device is pinned to the CPU; the commented-out expression would select
# CUDA when it is available.
device = "cpu"#torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("running on", device)

# Read the corpus as tab-separated text, treating the first line as a header.
# The loop that follows takes column 1 as the label and column 3 as the sentence.
# NOTE(review): absolute path, presumably a hosted-notebook upload location -
# adjust when running elsewhere.
df = read_csv('/content/CoLA.tsv', sep='\t', header=0)

# Corpus statistics gathered in one pass over the data frame.
tokens = {}                 # token -> number of occurrences
hot_one_address = {}        # token -> column index in the bag-of-words vector
current_addr = 0            # next unused column index
examples = []               # [sentence, label] pairs
number_grammatical = 0
number_nongrammatical = 0

for _, record in df.iterrows():
    fields = record.values
    gramatical_p, sentence = fields[1], fields[3]
    examples.append([sentence, gramatical_p])

    # Tokens are whatever a split on single spaces produces.
    for token in sentence.split(' '):
        if token not in tokens:
            # First sighting: reserve the next column for this token.
            hot_one_address[token] = current_addr
            current_addr += 1
        tokens[token] = tokens.get(token, 0) + 1

    if gramatical_p:
        number_grammatical += 1
    else:
        number_nongrammatical += 1

# Randomise the example order in place (unseeded) before the train/test split.
shuffle(examples)

# Hold out the first 100 shuffled examples for testing; train on the rest.
test_set = examples[:100]
training_set = examples[100:]

# One input feature per distinct token.
num_tokens = len(tokens)

batch_size = 100
# Identity matrix with one one-hot row per token. It is not referenced again in
# the code shown in this cell, and it costs num_tokens**2 floats of memory.
vocab = torch.eye(num_tokens, dtype=torch.float32)

class NeuralNet(nn.Module):
    """GRU layer followed by ReLU, a linear layer and a sigmoid output.

    The training and evaluation code in this cell passes a 2-D tensor of shape
    (batch, features) with one bag-of-words row per sentence. ``nn.GRU`` would
    read such a tensor as a single unbatched sequence whose time steps are the
    rows, making each sentence's output depend on the sentences before it in
    the batch. ``forward`` therefore treats every row of a 2-D input as an
    independent sequence of length one. 3-D input is passed to the GRU as is.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the GRU (input_size -> hidden_size) and the output layers."""
        super().__init__()
        self.rnn = nn.GRU(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(hidden_size, num_classes)
        self.sigm = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs; a (batch, features) input gives (batch, num_classes)."""
        rows_are_examples = x.dim() == 2
        if rows_are_examples:
            # (batch, features) -> (seq_len=1, batch, features)
            x = x.unsqueeze(0)
        out, _ = self.rnn(x)
        if rows_are_examples:
            # Drop the length-1 time axis again: (1, batch, hidden) -> (batch, hidden)
            out = out.squeeze(0)
        out = self.relu(out)
        out = self.l2(out)
        out = self.sigm(out)
        return out

# One input feature per vocabulary entry.
input_size = num_tokens

model = NeuralNet(input_size, hidden_size, num_classes)
model = model.to(device)

# Binary cross-entropy on probabilities. Unlike nn.CrossEntropyLoss (tried
# earlier), BCELoss applies no softmax or sigmoid of its own, which is why the
# network ends in a Sigmoid layer.
criterion = nn.BCELoss()

optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

def train(silent=False):
    """Train the module-level ``model`` for ``num_epochs`` epochs.

    Each epoch reshuffles ``training_set`` in place and then runs 80 batches of
    100 sentences, i.e. the first 8000 examples of the shuffled list. Every
    sentence becomes a bag-of-words row with a 1 in the column of each token.
    When ``silent`` is false the loss is printed after every batch.
    Returns None.

    Note: this definition replaces any ``train`` defined by earlier cells.
    """
    for epoch in range(num_epochs):
        shuffle(training_set)

        for bnum in range(80):
            batch = torch.zeros([100,num_tokens], dtype=torch.float32)
            labels = torch.zeros([100], dtype=torch.float32)
            for i in range(0,100):
                sentence, label = training_set[bnum*100 + i]
                for word in sentence.split(' '):
                    batch[i, hot_one_address[word]] = 1
                labels[i] = label

            # Tensor.to() returns the moved tensor rather than moving in place,
            # so the result has to be kept for a non-CPU device to take effect.
            labels = labels.to(device)
            batch = batch.to(device)

            # Forward pass and loss calculation
            outputs = model(batch).squeeze()
            loss = criterion(outputs,labels)

            # Backward and optimize; gradients are cleared ready for the next batch.
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if not silent:
                print(f'Epoch [{epoch + 1}/{num_epochs}] Batch[{bnum+1}/80], Loss: {loss.item():.4f}')

def evaluate():
    """Print the accuracy of the module-level ``model`` on ``test_set``.

    All of ``test_set`` is encoded as one bag-of-words batch (its actual length
    is used rather than a fixed 100), the model outputs are rounded to 0/1 and
    compared with the labels. Gradients are disabled for the whole evaluation.
    Returns None.
    """
    n_correct = 0
    n_samples = len(test_set)
    with torch.no_grad():
        batch = torch.zeros([n_samples, num_tokens], dtype=torch.float32)
        labels = torch.zeros([n_samples], dtype=torch.int64)
        for i, (sentence, label) in enumerate(test_set):
            for word in sentence.split(' '):
                batch[i, hot_one_address[word]] = 1
            labels[i] = label
        # Tensor.to() returns the moved tensor rather than moving in place,
        # so the result has to be kept for a non-CPU device to take effect.
        batch = batch.to(device)
        labels = labels.to(device)

        outputs = model(batch).squeeze()
        predicted = torch.round(outputs)  # threshold the outputs at 0.5
        n_correct += (predicted == labels).sum().item()

        acc = n_correct / n_samples
        print(f'Accuracy of the network on the {n_samples} test images: {100 * acc:.4f} %')
# Report test accuracy for the freshly initialised model, train it without
# per-batch logging, then report test accuracy again.
print("before")
evaluate()
train(silent=True)
print("after")
evaluate()


running on cpu
before
Accuracy of the network on the 100 test images: 27.0000 %
after
Accuracy of the network on the 100 test images: 71.0000 %


**RNN**
before
Accuracy of the network on the 100 test images: 27.0000 %
after
Accuracy of the network on the 100 test images: 70.0000 %

**LSTM**
before
Accuracy of the network on the 100 test images: 69.0000 %
after
Accuracy of the network on the 100 test images: 67.0000 %

**GRU**
before
Accuracy of the network on the 100 test images: 27.0000 %
after
Accuracy of the network on the 100 test images: 71.0000 %