# delta-MNLs with PyTorch

In [None]:
import numpy as np
import pandas as pd
import torch
from torch import Tensor

In [None]:
# Root directory of the MNL-formatted CSVs; joined with train_folder / test_folder
# when the train and test splits are read.
base_folder = "data/mnl_datasets"
train_folder = "train"
test_folder = "test"
# Directory the RUMWT winner-probability CSVs are read from.
rumwt_folder = "data/rumwt_pred"
# Directory the per-slate score CSVs are read from (combined with test_folder).
scores_folder = "data/slates"

## Sushi Dataset

In [None]:
ds_sushi_train = "sushi_10_4310_3_1000_0.5_train.csv"
ds_sushi_test = "sushi_10_431_3_10000_0.5_test.csv"

In [None]:
sushi_train_df = pd.read_csv(f"{base_folder}/{train_folder}/{ds_sushi_train}")
# Look at the first 5 rows of the data
sushi_train_df.head()

In [None]:
sushi_test_df = pd.read_csv(f"{base_folder}/{test_folder}/{ds_sushi_test}")
# Look at the first 5 rows of the data
sushi_test_df.head()

In [None]:
# Feature matrices: every column except 'slate_ID', 'no-choice' and 'CHOICE'.
X_train_numpy = sushi_train_df.drop(columns=['slate_ID', 'no-choice', 'CHOICE']).to_numpy()
X_test_numpy = sushi_test_df.drop(columns=['slate_ID', 'no-choice', 'CHOICE']).to_numpy()
X_train = torch.tensor(X_train_numpy, dtype=torch.float32)
X_test = torch.tensor(X_test_numpy, dtype=torch.float32)

# One class per feature column plus one extra class.
num_classes = X_train_numpy.shape[1] + 1

# Labels: CHOICE shifted down by one so it can serve as a 0-based class index
# (presumably CHOICE is 1-based in the CSV -- confirm against the data).
y_train_numpy = sushi_train_df['CHOICE'].to_numpy() - 1
y_test_numpy = sushi_test_df['CHOICE'].to_numpy() - 1
y_train = torch.tensor(y_train_numpy)
y_test = torch.tensor(y_test_numpy)

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
print(y_train.shape)
print(y_test.shape)

In [None]:
def one_hot_encode(vector, n_classes):
    """Return a (len(vector), n_classes) LongTensor with a single 1 per row.

    Row ``r`` carries its 1 in column ``vector[r]``; every other entry is 0.
    """
    n_rows = vector.shape[0]
    # scatter along dim 1 needs the class indices as an (n_rows, 1) column.
    class_index = vector.type(torch.LongTensor).unsqueeze(1)
    encoded = torch.zeros((n_rows, n_classes)).type(torch.LongTensor)
    encoded.scatter_(1, class_index, 1)
    return encoded


y_train_one_hot = one_hot_encode(y_train,num_classes)
y_test_one_hot = one_hot_encode(y_test,num_classes)

In [None]:
y_test_one_hot[:5]

In [None]:
def delta_softmax_activation(z, delta):
    """Map utilities to delta-MNL choice probabilities plus a leftover column.

    For row ``r`` and item ``i`` the probability is

        exp(z_ri) / (exp(z_ri) + sum_{j != i} exp(z_rj + |delta|))

    Every competitor is boosted by ``|delta|``, so the item probabilities of a
    row sum to at most 1; the remaining mass is appended as the last column.

    Args:
        z: 2-D tensor of utilities (rows are reduced along dim 1).
        delta: tensor broadcastable against ``z``; only ``|delta|`` is used.

    Returns:
        Tensor with one more column than ``z``: the item probabilities
        followed by the leftover probability mass, clipped at 0.
    """
    # The formula is invariant to shifting a whole row of z by a constant.
    # Shifting by each row's own maximum (instead of the global maximum) keeps
    # one exponent at exp(0) = 1 in every row, so rows far below the global
    # maximum no longer underflow to 0 / 0 = NaN.
    z = z - torch.max(z, dim=1, keepdim=True).values
    exponentials = torch.exp(z)  # exp(z_ij)
    shifted_exponentials = torch.exp(z + torch.abs(delta))  # exp(z_ij + |delta|)
    shifted_sums = torch.sum(shifted_exponentials, dim=1, keepdim=True)
    # Denominator for item i: its own unshifted term plus all other shifted terms.
    exponentials_norm = shifted_sums - shifted_exponentials + exponentials
    probabilities = exponentials / exponentials_norm
    # ReLU guards against a tiny negative remainder caused by rounding.
    missing_probs = torch.relu(1 - torch.sum(probabilities, dim=1, keepdim=True))
    return torch.cat((probabilities, missing_probs), 1)

def cross_entropy_loss(y_one_hot, activations, eps=1e-12):
    """Mean cross-entropy between one-hot targets and predicted probabilities.

    Args:
        y_one_hot: 2-D tensor of targets, multiplied element-wise with the
            log-probabilities (so it must match ``activations`` in shape).
        activations: 2-D tensor of predicted probabilities.
        eps: lower bound applied to ``activations`` before taking the log.

    Returns:
        Scalar tensor: minus the per-row sum of ``y * log(p)``, averaged over rows.
    """
    # log(0) is -inf and 0 * -inf is NaN, so a single exactly-zero probability
    # (even in a column that is not the target) used to turn the whole loss
    # into NaN. Probabilities above eps are left untouched by the clamp.
    log_probs = torch.log(torch.clamp(activations, min=eps))
    return -torch.mean(torch.sum(y_one_hot * log_probs, dim=1))

In [None]:
# w is square (num_dims x num_dims), so torch.mm(X, w) keeps one output
# column per input column.
num_dims = X_train_numpy.shape[1]
# All parameters start from torch.rand values and are tracked by autograd.
w_autograd = torch.rand((num_dims,num_dims), requires_grad=True)
b_autograd = torch.rand(num_dims, requires_grad=True)
# Single learnable delta passed to delta_softmax_activation during training.
delta_autograd = torch.rand(1, requires_grad=True)

In [None]:
n_iterations = 10000
learning_rate = 0.1
lambda_param = 0.0005
DELTA = 1.1  # NOTE(review): not referenced in this cell; the model uses the learned delta_autograd
for i in range(1, n_iterations + 1):
    # Forward pass: linear utilities, delta-softmax, then L2-regularised loss.
    Z = torch.mm(X_train, w_autograd) + b_autograd
    A = delta_softmax_activation(Z, delta_autograd)
    l2_regularization = torch.sum(w_autograd ** 2)
    data_loss = cross_entropy_loss(y_train_one_hot, A)
    loss = data_loss + lambda_param * l2_regularization

    # Gradients accumulate across backward() calls, so clear the previous ones.
    for param in (w_autograd, b_autograd, delta_autograd):
        if param.grad is not None:
            param.grad.zero_()

    loss.backward()

    # Plain gradient-descent step; no_grad keeps the update out of the graph.
    with torch.no_grad():
        for param in (w_autograd, b_autograd, delta_autograd):
            param -= learning_rate * param.grad

    if i == 1 or i % 100 == 0:
        print(delta_autograd)
        print("Loss at iteration {}: {}".format(i, loss.item()))
        print("Non-regularized Loss at iteration {}: {}".format(i, data_loss.item()))

# Evaluate on the test split. no_grad avoids building an autograd graph and
# replaces the torch.tensor(delta_autograd) copy, which emits a UserWarning.
with torch.no_grad():
    test_predictions = torch.argmax(
        delta_softmax_activation(torch.mm(X_test, w_autograd) + b_autograd, delta_autograd), dim=1
    )
# Tensor.sum() counts the matches in one vectorised call instead of iterating
# over the tensor element by element with the builtin sum().
test_accuracy = (test_predictions == y_test).sum().item() / y_test.shape[0]
print("\nFinal Test Accuracy: {}".format(test_accuracy))

### Test

In [None]:
ds_winner_probs_test = 'sushi_10_431_3_10000_0.5_winner_probs_test.csv'

In [None]:
y_rumwt_probs_test = np.genfromtxt(f'{rumwt_folder}/{ds_winner_probs_test}', delimiter=',')
print(y_rumwt_probs_test[1])

In [None]:
y_dmnl_probs_test = delta_softmax_activation(torch.mm(X_test, w_autograd) + b_autograd, delta_autograd).detach().numpy()

In [None]:
print(y_dmnl_probs_test[1])

In [None]:
def total_var(y, y_pred):
    """Return the per-row L1 distance between two sets of distributions.

    The original body called ``np.linalg(...)``, i.e. the module itself, which
    raises ``TypeError``. This computes the same quantity as the notebook's
    inline metric (``np.linalg.norm(..., ord=1, axis=1)``). Note that the
    textbook total variation distance is half of this L1 distance.

    Args:
        y: 2-D array-like of reference probabilities, one distribution per row.
        y_pred: 2-D array-like of predicted probabilities, same shape as ``y``.

    Returns:
        1-D ``np.ndarray`` with one L1 distance per row.
    """
    difference = np.asarray(y) - np.asarray(y_pred)
    return np.linalg.norm(difference, ord=1, axis=1)

In [None]:
def cross_entropy(y, y_pred):
    """Per-row cross-entropy of ``y_pred`` against ``y``, divided by the column count.

    Returns a 1-D array holding one value per row of ``y_pred``.
    """
    eps = 0.000001  # keeps the log finite when a predicted probability is 0
    num_items = y_pred.shape[1]
    return np.array([
        -np.sum(y[i] * np.log(pred_row + eps)) / num_items
        for i, pred_row in enumerate(y_pred)
    ])

In [None]:
def kl_div(y, y_pred):
    """Per-row KL divergence of ``y_pred`` from ``y``, divided by the column count.

    Each row's value is (cross-entropy of y against y_pred) minus (entropy of y),
    both computed with a small epsilon inside the logarithm.
    Returns a 1-D array holding one value per row of ``y_pred``.
    """
    eps = 0.000001  # keeps the logs finite for zero probabilities
    num_items = y_pred.shape[1]
    losses = []
    for i, pred_row in enumerate(y_pred):
        true_row = y[i]
        cross_term = -np.sum(true_row * np.log(pred_row + eps))
        neg_entropy = np.sum(true_row * np.log(true_row + eps))
        losses.append((cross_term + neg_entropy) / num_items)
    return np.array(losses)

In [None]:
# Per-row L1 distance between the RUMWT probabilities and the delta-MNL ones,
# then its max / mean / median over the test rows.
l1 = np.abs(y_rumwt_probs_test - y_dmnl_probs_test).sum(axis=1)
total_variation = l1.max()
mean_variation = l1.mean()
median_variation = np.median(l1)
print(total_variation, mean_variation, median_variation, sep="\n")

In [None]:
# Per-row KL values (as returned by kl_div) and their max / mean / median.
kl = kl_div(y_rumwt_probs_test, y_dmnl_probs_test)
total_ce = kl.max()
mean_ce = kl.mean()
median_ce = np.median(kl)
print(total_ce, mean_ce, median_ce, sep="\n")

In [None]:
def cast_char(c):
    """Convert a score cell to a number: ``'-'`` becomes 0, anything else ``float(c)``."""
    return 0 if c == '-' else float(c)

In [None]:
def compute_accuracy(y_pred, y_test_scores, delta=0.5):
    """Fraction of rows whose top predicted item scores within ``delta`` of the best.

    For each row the last column of ``y_pred`` is ignored, the arg-max of the
    rest is taken, and the row counts as a hit when that item's score is
    strictly greater than ``max(scores) - delta``.
    """
    hits = sum(
        1.0
        for i, probs in enumerate(y_pred)
        if y_test_scores[i][np.argmax(probs[:-1])] > max(y_test_scores[i]) - delta
    )
    return hits / len(y_pred)

In [None]:
sushi_test_scores_df = pd.read_csv(f"{scores_folder}/{test_folder}/sushi_10_431_3_10000_test.csv")
# Look at the first 5 rows of the data
sushi_test_scores_df.head()

In [None]:
# Convert every cell of the scores table to a number via cast_char
# ('-' becomes 0; presumably it marks a missing score -- confirm against the CSV).
y_test_scores_str = sushi_test_scores_df.to_numpy()
y_test_scores = [[cast_char(x) for x in row] for row in y_test_scores_str]
print(y_test_scores[0])

In [None]:
compute_accuracy(y_dmnl_probs_test, y_test_scores)

## Young People Spending Habits Dataset

In [None]:
ds_ypsh_train = "young_people_spendinghabits_7_1010_2_1000_0.5_train.csv"
ds_ypsh_test = "young_people_spendinghabits_7_101_2_10000_0.5_test.csv"

In [None]:
ypsh_train_df = pd.read_csv(f"{base_folder}/{train_folder}/{ds_ypsh_train}")
# Look at the first 5 rows of the data
ypsh_train_df.head()

In [None]:
ypsh_test_df = pd.read_csv(f"{base_folder}/{test_folder}/{ds_ypsh_test}")
# Look at the first 5 rows of the data
ypsh_test_df.head()

In [None]:
# Feature matrices: every column except 'slate_ID', 'no-choice' and 'CHOICE'.
X_train_numpy = ypsh_train_df.drop(columns=['slate_ID', 'no-choice', 'CHOICE']).to_numpy()
X_test_numpy = ypsh_test_df.drop(columns=['slate_ID', 'no-choice', 'CHOICE']).to_numpy()
X_train = torch.tensor(X_train_numpy, dtype=torch.float32)
X_test = torch.tensor(X_test_numpy, dtype=torch.float32)

# One class per feature column plus one extra class.
num_classes = X_train_numpy.shape[1] + 1

# Labels: CHOICE shifted down by one so it can serve as a 0-based class index
# (presumably CHOICE is 1-based in the CSV -- confirm against the data).
y_train_numpy = ypsh_train_df['CHOICE'].to_numpy() - 1
y_test_numpy = ypsh_test_df['CHOICE'].to_numpy() - 1
y_train = torch.tensor(y_train_numpy)
y_test = torch.tensor(y_test_numpy)

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
print(y_train.shape)
print(y_test.shape)

In [None]:
def one_hot_encode(vector, n_classes):
    """Return a (len(vector), n_classes) LongTensor with a single 1 per row.

    Row ``r`` carries its 1 in column ``vector[r]``; every other entry is 0.
    """
    n_rows = vector.shape[0]
    # scatter along dim 1 needs the class indices as an (n_rows, 1) column.
    class_index = vector.type(torch.LongTensor).unsqueeze(1)
    encoded = torch.zeros((n_rows, n_classes)).type(torch.LongTensor)
    encoded.scatter_(1, class_index, 1)
    return encoded


y_train_one_hot = one_hot_encode(y_train,num_classes)
y_test_one_hot = one_hot_encode(y_test,num_classes)

In [None]:
y_test_one_hot[:5]

In [None]:
num_dims = X_train_numpy.shape[1]
w_autograd = torch.rand((num_dims,num_dims), requires_grad=True)
b_autograd = torch.rand(num_dims, requires_grad=True)
delta_autograd = torch.rand(1, requires_grad=True)

In [None]:
n_iterations = 10000
learning_rate = 0.5
lambda_param = 0.0001
DELTA = 1.1  # NOTE(review): not referenced in this cell; the model uses the learned delta_autograd
for i in range(1, n_iterations + 1):
    # Forward pass: linear utilities, delta-softmax, then L2-regularised loss.
    Z = torch.mm(X_train, w_autograd) + b_autograd
    A = delta_softmax_activation(Z, delta_autograd)
    l2_regularization = torch.sum(w_autograd ** 2)
    data_loss = cross_entropy_loss(y_train_one_hot, A)
    loss = data_loss + lambda_param * l2_regularization

    # Gradients accumulate across backward() calls, so clear the previous ones.
    for param in (w_autograd, b_autograd, delta_autograd):
        if param.grad is not None:
            param.grad.zero_()

    loss.backward()

    # Plain gradient-descent step; no_grad keeps the update out of the graph.
    with torch.no_grad():
        for param in (w_autograd, b_autograd, delta_autograd):
            param -= learning_rate * param.grad

    if i == 1 or i % 100 == 0:
        print(delta_autograd)
        print("Loss at iteration {}: {}".format(i, loss.item()))
        print("Non-regularized Loss at iteration {}: {}".format(i, data_loss.item()))

# Evaluate on the test split. no_grad avoids building an autograd graph and
# replaces the torch.tensor(delta_autograd) copy, which emits a UserWarning.
with torch.no_grad():
    test_predictions = torch.argmax(
        delta_softmax_activation(torch.mm(X_test, w_autograd) + b_autograd, delta_autograd), dim=1
    )
# Tensor.sum() counts the matches in one vectorised call instead of iterating
# over the tensor element by element with the builtin sum().
test_accuracy = (test_predictions == y_test).sum().item() / y_test.shape[0]
print("\nFinal Test Accuracy: {}".format(test_accuracy))

### Test

In [None]:
ds_winner_probs_test = 'young_people_spendinghabits_7_101_2_10000_0.5_winner_probs_test.csv'

In [None]:
y_rumwt_probs_test = np.genfromtxt(f'{rumwt_folder}/{ds_winner_probs_test}', delimiter=',')
print(y_rumwt_probs_test[1])

In [None]:
y_dmnl_probs_test = delta_softmax_activation(torch.mm(X_test, w_autograd) + b_autograd, delta_autograd).detach().numpy()

In [None]:
print(y_dmnl_probs_test[1])

In [None]:
l1 = np.linalg.norm(y_rumwt_probs_test-y_dmnl_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
kl = kl_div(y_rumwt_probs_test,y_dmnl_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
ypsh_test_scores_df = pd.read_csv(f"{scores_folder}/{test_folder}/young_people_spendinghabits_7_101_2_10000_test.csv")
# Look at the first 5 rows of the data
ypsh_test_scores_df.head()

In [None]:
y_test_scores_str = ypsh_test_scores_df.to_numpy()
y_test_scores = []
for row in y_test_scores_str:
    y_test_scores.append([cast_char(x) for x in row])
print(y_test_scores[0])

In [None]:
compute_accuracy(y_dmnl_probs_test, y_test_scores)

## TripAdvisor

In [None]:
ds_trip_train = "tripadvisor_10_980_4_10000_0.5_train.csv"
ds_trip_test = "tripadvisor_10_98_4_10000_0.5_test.csv"

In [None]:
trip_train_df = pd.read_csv(f"{base_folder}/{train_folder}/{ds_trip_train}")
# Look at the first 5 rows of the data
trip_train_df.head()

In [None]:
trip_test_df = pd.read_csv(f"{base_folder}/{test_folder}/{ds_trip_test}")
# Look at the first 5 rows of the data
trip_test_df.head()

In [None]:
# Feature matrices: every column except 'slate_ID', 'no-choice' and 'CHOICE'.
X_train_numpy = trip_train_df.drop(columns=['slate_ID', 'no-choice', 'CHOICE']).to_numpy()
X_test_numpy = trip_test_df.drop(columns=['slate_ID', 'no-choice', 'CHOICE']).to_numpy()
X_train = torch.tensor(X_train_numpy, dtype=torch.float32)
X_test = torch.tensor(X_test_numpy, dtype=torch.float32)

# One class per feature column plus one extra class.
num_classes = X_train_numpy.shape[1] + 1

# Labels: CHOICE shifted down by one so it can serve as a 0-based class index
# (presumably CHOICE is 1-based in the CSV -- confirm against the data).
y_train_numpy = trip_train_df['CHOICE'].to_numpy() - 1
y_test_numpy = trip_test_df['CHOICE'].to_numpy() - 1
y_train = torch.tensor(y_train_numpy)
y_test = torch.tensor(y_test_numpy)

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
print(y_train.shape)
print(y_test.shape)

In [None]:
def one_hot_encode(vector, n_classes):
    """Return a (len(vector), n_classes) LongTensor with a single 1 per row.

    Row ``r`` carries its 1 in column ``vector[r]``; every other entry is 0.
    """
    n_rows = vector.shape[0]
    # scatter along dim 1 needs the class indices as an (n_rows, 1) column.
    class_index = vector.type(torch.LongTensor).unsqueeze(1)
    encoded = torch.zeros((n_rows, n_classes)).type(torch.LongTensor)
    encoded.scatter_(1, class_index, 1)
    return encoded


y_train_one_hot = one_hot_encode(y_train,num_classes)
y_test_one_hot = one_hot_encode(y_test,num_classes)

In [None]:
y_test_one_hot[:5]

In [None]:
num_dims = X_train_numpy.shape[1]
w_autograd = torch.rand((num_dims,num_dims), requires_grad=True)
b_autograd = torch.rand(num_dims, requires_grad=True)
delta_autograd = torch.rand(1, requires_grad=True)

In [None]:
n_iterations = 10000
learning_rate = 0.5
lambda_param = 0.0001
DELTA = 1.1  # NOTE(review): not referenced in this cell; the model uses the learned delta_autograd
for i in range(1, n_iterations + 1):
    # Forward pass: linear utilities, delta-softmax, then L2-regularised loss.
    Z = torch.mm(X_train, w_autograd) + b_autograd
    A = delta_softmax_activation(Z, delta_autograd)
    l2_regularization = torch.sum(w_autograd ** 2)
    data_loss = cross_entropy_loss(y_train_one_hot, A)
    loss = data_loss + lambda_param * l2_regularization

    # Gradients accumulate across backward() calls, so clear the previous ones.
    for param in (w_autograd, b_autograd, delta_autograd):
        if param.grad is not None:
            param.grad.zero_()

    loss.backward()

    # Plain gradient-descent step; no_grad keeps the update out of the graph.
    with torch.no_grad():
        for param in (w_autograd, b_autograd, delta_autograd):
            param -= learning_rate * param.grad

    if i == 1 or i % 100 == 0:
        print(delta_autograd)
        print("Loss at iteration {}: {}".format(i, loss.item()))
        print("Non-regularized Loss at iteration {}: {}".format(i, data_loss.item()))

# Evaluate on the test split. no_grad avoids building an autograd graph and
# replaces the torch.tensor(delta_autograd) copy, which emits a UserWarning.
with torch.no_grad():
    test_predictions = torch.argmax(
        delta_softmax_activation(torch.mm(X_test, w_autograd) + b_autograd, delta_autograd), dim=1
    )
# Tensor.sum() counts the matches in one vectorised call instead of iterating
# over the tensor element by element with the builtin sum().
test_accuracy = (test_predictions == y_test).sum().item() / y_test.shape[0]
print("\nFinal Test Accuracy: {}".format(test_accuracy))

### Test

In [None]:
ds_winner_probs_test = 'tripadvisor_10_98_4_10000_0.5_winner_probs_test.csv'

In [None]:
y_rumwt_probs_test = np.genfromtxt(f'{rumwt_folder}/{ds_winner_probs_test}', delimiter=',')
print(y_rumwt_probs_test[1])

In [None]:
y_dmnl_probs_test = delta_softmax_activation(torch.mm(X_test, w_autograd) + b_autograd, delta_autograd).detach().numpy()

In [None]:
print(y_dmnl_probs_test[1])

In [None]:
l1 = np.linalg.norm(y_rumwt_probs_test-y_dmnl_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
kl = kl_div(y_rumwt_probs_test,y_dmnl_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
trip_test_scores_df = pd.read_csv(f"{scores_folder}/{test_folder}/tripadvisor_10_98_4_10000_test.csv")
# Look at the first 5 rows of the data
trip_test_scores_df.head()

In [None]:
y_test_scores_str = trip_test_scores_df.to_numpy()
y_test_scores = []
for row in y_test_scores_str:
    y_test_scores.append([cast_char(x) for x in row])
print(y_test_scores[0])

In [None]:
compute_accuracy(y_dmnl_probs_test, y_test_scores)

## MovieLens

In [None]:
ds_movie_train = "movies_20_174130_5_100000_0.25_train.csv"
ds_movie_test = "movies_20_17413_5_10000_0.25_test.csv"

In [None]:
movie_train_df = pd.read_csv(f"{base_folder}/{train_folder}/{ds_movie_train}")
# Look at the first 5 rows of the data
movie_train_df.head()

In [None]:
movie_test_df = pd.read_csv(f"{base_folder}/{test_folder}/{ds_movie_test}")
# Look at the first 5 rows of the data
movie_test_df.head()

In [None]:
# Feature matrices: every column except 'slate_ID', 'no-choice' and 'CHOICE'.
X_train_numpy = movie_train_df.drop(columns=['slate_ID', 'no-choice', 'CHOICE']).to_numpy()
X_test_numpy = movie_test_df.drop(columns=['slate_ID', 'no-choice', 'CHOICE']).to_numpy()
X_train = torch.tensor(X_train_numpy, dtype=torch.float32)
X_test = torch.tensor(X_test_numpy, dtype=torch.float32)

# One class per feature column plus one extra class.
num_classes = X_train_numpy.shape[1] + 1

# Labels: CHOICE shifted down by one so it can serve as a 0-based class index
# (presumably CHOICE is 1-based in the CSV -- confirm against the data).
y_train_numpy = movie_train_df['CHOICE'].to_numpy() - 1
y_test_numpy = movie_test_df['CHOICE'].to_numpy() - 1
y_train = torch.tensor(y_train_numpy)
y_test = torch.tensor(y_test_numpy)

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
print(y_train.shape)
print(y_test.shape)

In [None]:
def one_hot_encode(vector, n_classes):
    """Return a (len(vector), n_classes) LongTensor with a single 1 per row.

    Row ``r`` carries its 1 in column ``vector[r]``; every other entry is 0.
    """
    n_rows = vector.shape[0]
    # scatter along dim 1 needs the class indices as an (n_rows, 1) column.
    class_index = vector.type(torch.LongTensor).unsqueeze(1)
    encoded = torch.zeros((n_rows, n_classes)).type(torch.LongTensor)
    encoded.scatter_(1, class_index, 1)
    return encoded


y_train_one_hot = one_hot_encode(y_train,num_classes)
y_test_one_hot = one_hot_encode(y_test,num_classes)

In [None]:
y_test_one_hot[:5]

In [None]:
num_dims = X_train_numpy.shape[1]
w_autograd = torch.rand((num_dims,num_dims), requires_grad=True)
b_autograd = torch.rand(num_dims, requires_grad=True)
delta_autograd = torch.rand(1, requires_grad=True)

In [None]:
n_iterations = 10000
learning_rate = 0.1
lambda_param = 0.0001
DELTA = 1.1  # NOTE(review): not referenced in this cell; the model uses the learned delta_autograd
for i in range(1, n_iterations + 1):
    # Forward pass: linear utilities, delta-softmax, then L2-regularised loss.
    Z = torch.mm(X_train, w_autograd) + b_autograd
    A = delta_softmax_activation(Z, delta_autograd)
    l2_regularization = torch.sum(w_autograd ** 2)
    data_loss = cross_entropy_loss(y_train_one_hot, A)
    loss = data_loss + lambda_param * l2_regularization

    # Gradients accumulate across backward() calls, so clear the previous ones.
    for param in (w_autograd, b_autograd, delta_autograd):
        if param.grad is not None:
            param.grad.zero_()

    loss.backward()

    # Plain gradient-descent step; no_grad keeps the update out of the graph.
    with torch.no_grad():
        for param in (w_autograd, b_autograd, delta_autograd):
            param -= learning_rate * param.grad

    if i == 1 or i % 100 == 0:
        print(delta_autograd)
        print("Loss at iteration {}: {}".format(i, loss.item()))
        print("Non-regularized Loss at iteration {}: {}".format(i, data_loss.item()))

# Evaluate on the test split. no_grad avoids building an autograd graph and
# replaces the torch.tensor(delta_autograd) copy, which emits a UserWarning.
with torch.no_grad():
    test_predictions = torch.argmax(
        delta_softmax_activation(torch.mm(X_test, w_autograd) + b_autograd, delta_autograd), dim=1
    )
# Tensor.sum() counts the matches in one vectorised call instead of iterating
# over the tensor element by element with the builtin sum().
test_accuracy = (test_predictions == y_test).sum().item() / y_test.shape[0]
print("\nFinal Test Accuracy: {}".format(test_accuracy))

### Test

In [None]:
ds_winner_probs_test = 'movies_20_17413_5_10000_0.25_winner_probs_test.csv'

In [None]:
y_rumwt_probs_test = np.genfromtxt(f'{rumwt_folder}/{ds_winner_probs_test}', delimiter=',')
print(y_rumwt_probs_test[1])

In [None]:
y_dmnl_probs_test = delta_softmax_activation(torch.mm(X_test, w_autograd) + b_autograd, delta_autograd).detach().numpy()

In [None]:
print(y_dmnl_probs_test[1])

In [None]:
l1 = np.linalg.norm(y_rumwt_probs_test-y_dmnl_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
kl = kl_div(y_rumwt_probs_test,y_dmnl_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
def compute_accuracy(y_pred, y_test_scores, delta=0.25):
    """Fraction of rows whose top predicted item scores within ``delta`` of the best.

    Same rule as the earlier definition, with a default tolerance of 0.25:
    the last column of ``y_pred`` is ignored, and a row is a hit when the
    arg-max item's score is strictly greater than ``max(scores) - delta``.
    """
    n_rows = len(y_pred)
    hits = 0.0
    for i in range(n_rows):
        row_scores = y_test_scores[i]
        predicted_item = np.argmax(y_pred[i][:-1])
        threshold = max(row_scores) - delta
        hits += 1 if row_scores[predicted_item] > threshold else 0
    return hits / n_rows

In [None]:
movie_test_scores_df = pd.read_csv(f"{scores_folder}/{test_folder}/movies_20_17413_5_10000_test.csv")
# Look at the first 5 rows of the data
movie_test_scores_df.head()

In [None]:
y_test_scores_str = movie_test_scores_df.to_numpy()
y_test_scores = []
for row in y_test_scores_str:
    y_test_scores.append([cast_char(x) for x in row])
print(y_test_scores[0])

In [None]:
compute_accuracy(y_dmnl_probs_test, y_test_scores)

## goodBooks

In [None]:
ds_book_train = "books_30_47211_5_1000000_0.5_train.csv"
ds_book_test = "books_30_4721_5_10000_0.5_test.csv"

In [None]:
book_train_df = pd.read_csv(f"{base_folder}/{train_folder}/{ds_book_train}")
# Look at the first 5 rows of the data
book_train_df.head()

In [None]:
book_test_df = pd.read_csv(f"{base_folder}/{test_folder}/{ds_book_test}")
# Look at the first 5 rows of the data
book_test_df.head()

In [None]:
# Feature matrices: every column except 'slate_ID', 'no-choice' and 'CHOICE'.
X_train_numpy = book_train_df.drop(columns=['slate_ID', 'no-choice', 'CHOICE']).to_numpy()
X_test_numpy = book_test_df.drop(columns=['slate_ID', 'no-choice', 'CHOICE']).to_numpy()
X_train = torch.tensor(X_train_numpy, dtype=torch.float32)
X_test = torch.tensor(X_test_numpy, dtype=torch.float32)

# One class per feature column plus one extra class.
num_classes = X_train_numpy.shape[1] + 1

# Labels: CHOICE shifted down by one so it can serve as a 0-based class index
# (presumably CHOICE is 1-based in the CSV -- confirm against the data).
y_train_numpy = book_train_df['CHOICE'].to_numpy() - 1
y_test_numpy = book_test_df['CHOICE'].to_numpy() - 1
y_train = torch.tensor(y_train_numpy)
y_test = torch.tensor(y_test_numpy)

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
print(y_train.shape)
print(y_test.shape)

In [None]:
def one_hot_encode(vector, n_classes):
    """Return a (len(vector), n_classes) LongTensor with a single 1 per row.

    Row ``r`` carries its 1 in column ``vector[r]``; every other entry is 0.
    """
    n_rows = vector.shape[0]
    # scatter along dim 1 needs the class indices as an (n_rows, 1) column.
    class_index = vector.type(torch.LongTensor).unsqueeze(1)
    encoded = torch.zeros((n_rows, n_classes)).type(torch.LongTensor)
    encoded.scatter_(1, class_index, 1)
    return encoded


y_train_one_hot = one_hot_encode(y_train,num_classes)
y_test_one_hot = one_hot_encode(y_test,num_classes)

In [None]:
y_test_one_hot[:5]

In [None]:
num_dims = X_train_numpy.shape[1]
w_autograd = torch.rand((num_dims,num_dims), requires_grad=True)
b_autograd = torch.rand(num_dims, requires_grad=True)
delta_autograd = torch.rand(1, requires_grad=True)

In [None]:
n_iterations = 10000
learning_rate = 0.1
lambda_param = 0.0001
DELTA = 1.1  # NOTE(review): not referenced in this cell; the model uses the learned delta_autograd
for i in range(1, n_iterations + 1):
    # Forward pass: linear utilities, delta-softmax, then L2-regularised loss.
    Z = torch.mm(X_train, w_autograd) + b_autograd
    A = delta_softmax_activation(Z, delta_autograd)
    l2_regularization = torch.sum(w_autograd ** 2)
    data_loss = cross_entropy_loss(y_train_one_hot, A)
    loss = data_loss + lambda_param * l2_regularization

    # Gradients accumulate across backward() calls, so clear the previous ones.
    for param in (w_autograd, b_autograd, delta_autograd):
        if param.grad is not None:
            param.grad.zero_()

    loss.backward()

    # Plain gradient-descent step; no_grad keeps the update out of the graph.
    with torch.no_grad():
        for param in (w_autograd, b_autograd, delta_autograd):
            param -= learning_rate * param.grad

    if i == 1 or i % 100 == 0:
        print(delta_autograd)
        print("Loss at iteration {}: {}".format(i, loss.item()))
        print("Non-regularized Loss at iteration {}: {}".format(i, data_loss.item()))

# Evaluate on the test split. no_grad avoids building an autograd graph and
# replaces the torch.tensor(delta_autograd) copy, which emits a UserWarning.
with torch.no_grad():
    test_predictions = torch.argmax(
        delta_softmax_activation(torch.mm(X_test, w_autograd) + b_autograd, delta_autograd), dim=1
    )
# Tensor.sum() counts the matches in one vectorised call instead of iterating
# over the tensor element by element with the builtin sum().
test_accuracy = (test_predictions == y_test).sum().item() / y_test.shape[0]
print("\nFinal Test Accuracy: {}".format(test_accuracy))

### Test

In [None]:
ds_winner_probs_test = 'books_30_4721_5_10000_0.5_winner_probs_test.csv'

In [None]:
y_rumwt_probs_test = np.genfromtxt(f'{rumwt_folder}/{ds_winner_probs_test}', delimiter=',')
print(y_rumwt_probs_test[1])

In [None]:
y_dmnl_probs_test = delta_softmax_activation(torch.mm(X_test, w_autograd) + b_autograd, delta_autograd).detach().numpy()

In [None]:
print(y_dmnl_probs_test[1])

In [None]:
l1 = np.linalg.norm(y_rumwt_probs_test-y_dmnl_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
kl = kl_div(y_rumwt_probs_test,y_dmnl_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)