# Experiments

In [None]:
import pandas as pd                    # For file input/output
import numpy as np                     # For vectorized math operations

from sklearn.linear_model import LogisticRegression  # MNL
from sklearn.metrics import accuracy_score

In [None]:
# Directory names used to build every dataset path in this notebook.
# Choice datasets are read from base_folder, per-slate score files from scores_folder;
# train_folder / test_folder select the split sub-directory.
base_folder = "data/mnl_datasets"
scores_folder = "data/slates"
train_folder = "train"
test_folder = "test"

### Sushi Dataset

In [None]:
# File name of the sushi training split, looked up under base_folder/train_folder.
ds_sushi_train = "sushi_10_4310_3_1000_0.5_train.csv"

In [None]:
sushi_train_df = pd.read_csv(f"{base_folder}/{train_folder}/{ds_sushi_train}")
# Look at the first 5 rows of the data
sushi_train_df.head()

In [None]:
# Column names without the first and the last column; the same columns are used as features below.
items = sushi_train_df.columns.tolist()[1:-1]
print(items)

#### Training

In [None]:
# Features: all columns except the first and the last. Target: the last column.
X_train = sushi_train_df.iloc[:, 1:-1].to_numpy()
y_train = sushi_train_df.iloc[:,-1].to_numpy()

In [None]:
# Inspect the first feature row.
X_train[0]

In [None]:
# Inspect the first five targets.
y_train[:5]

In [None]:
# Fit the multinomial logistic regression (the MNL model) and predict the first five training rows.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases — confirm against the installed version.
clf = LogisticRegression(multi_class='multinomial',solver='saga',max_iter=1000).fit(X_train,y_train)
clf.predict(X_train[:5, :])

In [None]:
# Class probabilities for the same five rows.
clf.predict_proba(X_train[:5, :])

In [None]:
# Mean accuracy on the training data.
clf.score(X_train,y_train)

#### Test

In [None]:
# Directories holding the precomputed RUMwT and LP-RUMwT outputs; reused by every dataset section.
rumwt_folder = 'data/rumwt_pred'
lp_rumwt_folder = 'data/lp_rumwt_pred'

In [None]:
# File name of the sushi test split, looked up under base_folder/test_folder.
ds_sushi_test = "sushi_10_431_3_10000_0.5_test.csv"

In [None]:
sushi_test_df = pd.read_csv(f"{base_folder}/{test_folder}/{ds_sushi_test}")
# Look at the first 5 rows of the data
sushi_test_df.head()

In [None]:
# Same column layout as the training split: features are all columns except the first and the last,
# the target is the last column.
X_test = sushi_test_df.iloc[:, 1:-1].to_numpy()
y_test = sushi_test_df.iloc[:,-1].to_numpy()

In [None]:
# File names of the RUMwT outputs that the next cell reads from rumwt_folder.
ds_winner_probs_train = 'sushi_10_431_3_10000_0.5_winner_probs_train.csv'
# The predicted winners live in the sibling "winners_train" file, as in the other
# dataset sections. The previous slice (`[:-9] + 'test.csv'`) produced
# "..._winner_probs_test.csv", i.e. a probability matrix rather than the winners
# vector that accuracy_score is later called with.
ds_winners_train = ds_winner_probs_train.replace('winner_probs_', 'winners_')

In [None]:
# RUMwT probabilities and winners, read as comma-separated values from rumwt_folder.
# NOTE(review): the probabilities file is named "*_train" but is compared against the test split below — confirm.
y_rumwt_probs_test = np.genfromtxt(f'{rumwt_folder}/{ds_winner_probs_train}', delimiter=',')
y_rumwt_test = np.genfromtxt(f'{rumwt_folder}/{ds_winners_train}', delimiter=',')
print(y_rumwt_probs_test[1])

In [None]:
# File names of the LP-RUMwT outputs that the next cell reads from lp_rumwt_folder.
# Dropping the trailing "train.csv" (9 characters) leaves the "..._winner_probs_" prefix.
ds_winner_probs_lp_train = ds_winner_probs_train[:-9] + 'rumwt.csv'
# The winners file uses the "winners_rumwt" suffix, as in the other dataset sections.
# Previously this was a copy of the line above, so the probability matrix was
# loaded a second time in place of the winners vector.
ds_winners_lp_train = ds_winner_probs_lp_train.replace('winner_probs_', 'winners_')

In [None]:
# LP-RUMwT probabilities and winners, read as comma-separated values from lp_rumwt_folder.
y_lp_rumwt_probs_test = np.genfromtxt(f'{lp_rumwt_folder}/{ds_winner_probs_lp_train}', delimiter=',')
y_lp_rumwt_test = np.genfromtxt(f'{lp_rumwt_folder}/{ds_winners_lp_train}', delimiter=',')
print(y_lp_rumwt_probs_test[1])

In [None]:
# Swap the trailing "train.csv" (9 characters) for "test.csv" to get the reference probabilities file.
ds_winner_probs_test = ds_winner_probs_train[:-9] + 'test.csv'

In [None]:
# Reference winner probabilities for the test split; every model is compared against these below.
y_probs_test = np.genfromtxt(f'{rumwt_folder}/{ds_winner_probs_test}', delimiter=',')
print(y_probs_test[1])

In [None]:
# MNL class probabilities for the test features.
y_mnl_probs_test = clf.predict_proba(X_test)

In [None]:
def total_var(y, y_pred):
    """Return the L1 distance between matching rows of ``y`` and ``y_pred``.

    This is the same per-row quantity the notebook computes inline with
    ``np.linalg.norm(..., ord=1, axis=1)``. The total variation distance in
    the strict sense is half of this value.

    Args:
        y: Reference probabilities, one row per sample.
        y_pred: Predicted probabilities with the same shape as ``y``.

    Returns:
        1-D array holding one L1 distance per row.
    """
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # The previous body called the ``np.linalg`` module itself (TypeError) and
    # never used the row index; the norm along axis 1 handles all rows at once.
    return np.linalg.norm(y - y_pred, ord=1, axis=1)

In [None]:
def cross_entropy(y, y_pred):
    """Return one cross-entropy value per row of ``y_pred``.

    For each row the sum of ``y * log(y_pred + eps)`` is negated and divided
    by the number of columns of ``y_pred``.
    """
    eps = 0.000001  # keeps log() finite when a predicted probability is 0
    n_rows = y_pred.shape[0]
    num_items = y_pred.shape[1]
    per_row = [
        -np.sum(y[idx] * np.log(y_pred[idx] + eps)) / num_items
        for idx in range(n_rows)
    ]
    return np.array(per_row)

In [None]:
def kl_div(y, y_pred):
    """Return one KL-style divergence of ``y_pred`` from ``y`` per row.

    Each row value is the cross term ``-sum(y * log(y_pred + eps))`` plus
    ``sum(y * log(y + eps))``, divided by the number of columns of ``y_pred``.
    """
    eps = 0.000001  # keeps log() finite when a probability is 0
    n_rows = y_pred.shape[0]
    num_items = y_pred.shape[1]
    losses = []
    for idx in range(n_rows):
        target = y[idx]
        cross_term = -np.sum(target * np.log(y_pred[idx] + eps))
        self_term = np.sum(target * np.log(target + eps))
        losses.append((cross_term + self_term) / num_items)
    return np.array(losses)

In [None]:
# L1 distance per test row between the reference probabilities and the MNL probabilities,
# summarised as max / mean / median.
l1 = np.linalg.norm(y_probs_test-y_mnl_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
# Same summary for the RUMwT probabilities.
l1 = np.linalg.norm(y_probs_test-y_rumwt_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
# Same summary for the LP-RUMwT probabilities.
l1 = np.linalg.norm(y_probs_test-y_lp_rumwt_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
# Per-row kl_div of the MNL probabilities against the reference;
# the *_ce names hold its max / mean / median.
kl = kl_div(y_probs_test, y_mnl_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
# Same summary for the RUMwT probabilities.
kl = kl_div(y_probs_test, y_rumwt_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
# Same summary for the LP-RUMwT probabilities.
kl = kl_div(y_probs_test, y_lp_rumwt_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
# Hard MNL predictions for the test features.
y_mnl_test = clf.predict(X_test)

In [None]:
# Accuracy of the MNL predictions against the test targets.
accuracy_score(y_test, y_mnl_test)

In [None]:
# Accuracy of the RUMwT winners.
accuracy_score(y_test, y_rumwt_test)

In [None]:
# Accuracy of the LP-RUMwT winners.
accuracy_score(y_test, y_lp_rumwt_test)

In [None]:
# Constant baseline that always predicts the largest label found in y_test
# (presumably the no-choice option, going by the variable name).
y_no_choice = np.max(y_test)*np.ones(y_test.shape)
accuracy_score(y_test, y_no_choice)

#### Top-ranked item Accuracy

In [None]:
def cast_char(c):
    """Convert one score cell to an int; the placeholder '-' becomes 0."""
    return 0 if c == '-' else int(c)

In [None]:
def compute_accuracy(y_pred, y_test_scores, delta=0.5):
    """Return the share of rows whose top predicted item scores near the row maximum.

    For every row the item with the highest predicted probability is picked,
    ignoring the last column of ``y_pred``. The row counts as a hit when that
    item's score is strictly greater than the row's best score minus ``delta``.
    """
    n_rows = len(y_pred)
    hits = 0.0
    for row_idx, probs in enumerate(y_pred):
        scores = y_test_scores[row_idx]
        best_item = np.argmax(probs[:-1])
        if scores[best_item] > max(scores) - delta:
            hits += 1
    return hits / n_rows

In [None]:
# Per-slate item scores for the sushi test split, read from scores_folder/test_folder.
sushi_test_scores_df = pd.read_csv(f"{scores_folder}/{test_folder}/sushi_10_431_3_10000_test.csv")
# Look at the first 5 rows of the data
sushi_test_scores_df.head()

In [None]:
# Convert every cell with cast_char, which maps '-' to 0 and any other value to a number.
y_test_scores_str = sushi_test_scores_df.to_numpy()
y_test_scores = []
for row in y_test_scores_str:
    y_test_scores.append([cast_char(x) for x in row])
print(y_test_scores[0])

In [None]:
# Top-ranked-item accuracy of the MNL probabilities.
compute_accuracy(y_mnl_probs_test, y_test_scores)

In [None]:
# Same metric for the RUMwT probabilities.
compute_accuracy(y_rumwt_probs_test, y_test_scores)

In [None]:
# Same metric for the LP-RUMwT probabilities.
compute_accuracy(y_lp_rumwt_probs_test, y_test_scores)

In [None]:
# Same metric for the reference probabilities themselves.
compute_accuracy(y_probs_test, y_test_scores)

In [None]:
import random

# Random baseline for the top-ranked-item accuracy: pick one item uniformly
# among those with a positive score and count the row as a hit when its score
# is within 0.5 of the row maximum.
len_slate = len(y_test_scores[0])
random_baseline_score = 0
for row in y_test_scores:
    # Fall back to item 0 when no item has a positive score.
    slate = [i for i in range(len_slate) if row[i] > 0] or [0]
    random_choice = random.choice(slate)
    random_baseline_score += int(row[random_choice] > max(row) - 0.5)
print(random_baseline_score/len(y_test_scores))

### Tripadvisor Dataset

In [None]:
# File name of the tripadvisor training split, looked up under base_folder/train_folder.
ds_trip_train = "tripadvisor_10_980_4_10000_0.5_train.csv"

In [None]:
trip_train_df = pd.read_csv(f"{base_folder}/{train_folder}/{ds_trip_train}")
# Look at the first 5 rows of the data
trip_train_df.head()

In [None]:
# Column names without the first and the last column; the same columns are used as features below.
items = trip_train_df.columns.tolist()[1:-1]
print(items)

#### Training

In [None]:
# Features: all columns except the first and the last. Target: the last column.
X_train = trip_train_df.iloc[:, 1:-1].to_numpy()
y_train = trip_train_df.iloc[:,-1].to_numpy()

In [None]:
# Inspect the first feature row.
X_train[0]

In [None]:
# Inspect the first five targets.
y_train[:5]

In [None]:
# Refit the MNL model on the tripadvisor data; this rebinds the global `clf`.
clf = LogisticRegression(multi_class='multinomial',solver='saga',max_iter=1000).fit(X_train,y_train)
clf.predict(X_train[:5, :])

In [None]:
# Class probabilities for the same five rows.
clf.predict_proba(X_train[:5, :])

In [None]:
# Mean accuracy on the training data.
clf.score(X_train,y_train)

#### Test

In [None]:
# File name of the tripadvisor test split, looked up under base_folder/test_folder.
ds_trip_test = "tripadvisor_10_98_4_10000_0.5_test.csv"

In [None]:
trip_test_df = pd.read_csv(f"{base_folder}/{test_folder}/{ds_trip_test}")
# Look at the first 5 rows of the data
trip_test_df.head()

In [None]:
# Same column layout as the training split: features are all columns except the first and the last,
# the target is the last column.
X_test = trip_test_df.iloc[:, 1:-1].to_numpy()
y_test = trip_test_df.iloc[:,-1].to_numpy()

In [None]:
# Strip the trailing "test.csv" (8 characters) and append the RUMwT output suffixes.
ds_winner_probs_train = ds_trip_test[:-8] + 'winner_probs_train.csv'
ds_winners_train = ds_trip_test[:-8] + 'winners_train.csv'

In [None]:
# RUMwT probabilities and winners, read as comma-separated values from rumwt_folder.
# NOTE(review): the files are named "*_train" but are compared against the test split below — confirm.
y_rumwt_probs_test = np.genfromtxt(f'{rumwt_folder}/{ds_winner_probs_train}', delimiter=',')
y_rumwt_test = np.genfromtxt(f'{rumwt_folder}/{ds_winners_train}', delimiter=',')

In [None]:
# Same prefix with the LP-RUMwT output suffixes.
ds_winner_probs_lp_train = ds_trip_test[:-8] + 'winner_probs_rumwt.csv'
ds_winners_lp_train = ds_trip_test[:-8] + 'winners_rumwt.csv'

In [None]:
# LP-RUMwT probabilities and winners, read from lp_rumwt_folder.
y_lp_rumwt_probs_test = np.genfromtxt(f'{lp_rumwt_folder}/{ds_winner_probs_lp_train}', delimiter=',')
y_lp_rumwt_test = np.genfromtxt(f'{lp_rumwt_folder}/{ds_winners_lp_train}', delimiter=',')
print(y_lp_rumwt_probs_test[1])

In [None]:
ds_winner_probs_test = ds_trip_test[:-8] + 'winner_probs_test.csv'

In [None]:
# Reference winner probabilities for the test split; every model is compared against these below.
y_probs_test = np.genfromtxt(f'{rumwt_folder}/{ds_winner_probs_test}', delimiter=',')
print(y_probs_test[0])
print(y_rumwt_probs_test[0])

In [None]:
# MNL class probabilities for the test features.
y_mnl_probs_test = clf.predict_proba(X_test)

In [None]:
# L1 distance per test row between the reference probabilities and the MNL probabilities,
# summarised as max / mean / median.
l1 = np.linalg.norm(y_probs_test-y_mnl_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
# Same summary for the RUMwT probabilities.
l1 = np.linalg.norm(y_probs_test-y_rumwt_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
# Same summary for the LP-RUMwT probabilities.
l1 = np.linalg.norm(y_probs_test-y_lp_rumwt_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
# Per-row kl_div of the MNL probabilities against the reference;
# the *_ce names hold its max / mean / median.
kl = kl_div(y_probs_test, y_mnl_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
# Same summary for the RUMwT probabilities.
kl = kl_div(y_probs_test, y_rumwt_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
# Same summary for the LP-RUMwT probabilities.
kl = kl_div(y_probs_test, y_lp_rumwt_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
# Hard MNL predictions for the test features.
y_mnl_test = clf.predict(X_test)

In [None]:
# Accuracy of the MNL predictions against the test targets.
accuracy_score(y_test, y_mnl_test)

In [None]:
# Accuracy of the RUMwT winners.
accuracy_score(y_test, y_rumwt_test)

In [None]:
# Accuracy of the LP-RUMwT winners.
accuracy_score(y_test, y_lp_rumwt_test)

In [None]:
# Constant baseline that always predicts the largest label found in y_test
# (presumably the no-choice option, going by the variable name).
y_no_choice = np.max(y_test)*np.ones(y_test.shape)
accuracy_score(y_test, y_no_choice)

#### Top-ranked item Accuracy

In [None]:
def cast_char(c):
    """Convert one score cell to a float; the placeholder '-' becomes 0."""
    return 0 if c == '-' else float(c)

In [None]:
def compute_accuracy(y_pred, y_test_scores, delta=0.25):
    """Return the share of rows whose top predicted item scores near the row maximum.

    For every row the item with the highest predicted probability is picked,
    ignoring the last column of ``y_pred``. The row counts as a hit when that
    item's score is strictly greater than the row's best score minus ``delta``.
    """
    n_rows = len(y_pred)
    hits = 0.0
    for row_idx, probs in enumerate(y_pred):
        scores = y_test_scores[row_idx]
        best_item = np.argmax(probs[:-1])
        if scores[best_item] > max(scores) - delta:
            hits += 1
    return hits / n_rows

In [None]:
# Per-slate item scores for the tripadvisor test split, read from scores_folder/test_folder.
trip_test_scores_df = pd.read_csv(f"{scores_folder}/{test_folder}/tripadvisor_10_98_4_10000_test.csv")
# Look at the first 5 rows of the data
trip_test_scores_df.head()

In [None]:
# Convert every cell with cast_char, which maps '-' to 0 and any other value to a number.
y_test_scores_str = trip_test_scores_df.to_numpy()
y_test_scores = []
for row in y_test_scores_str:
    y_test_scores.append([cast_char(x) for x in row])
print(y_test_scores[0])

In [None]:
# Top-ranked-item accuracy of the MNL probabilities.
compute_accuracy(y_mnl_probs_test, y_test_scores)

In [None]:
# Same metric for the RUMwT probabilities.
compute_accuracy(y_rumwt_probs_test, y_test_scores)

In [None]:
# Same metric for the LP-RUMwT probabilities.
compute_accuracy(y_lp_rumwt_probs_test, y_test_scores)

In [None]:
# Same metric for the reference probabilities themselves.
compute_accuracy(y_probs_test, y_test_scores)

In [None]:
import random

# Random baseline for the top-ranked-item accuracy: pick one item uniformly
# among those with a positive score and count the row as a hit when its score
# is close enough to the row maximum.
len_slate = len(y_test_scores[0])
random_baseline_score = 0
for row in y_test_scores:
    slate = [i for i in range(len_slate) if row[i] > 0]
    if not slate:
        # No item has a positive score: fall back to item 0.
        slate = [0]
    random_choice = random.choice(slate)
    # Use the same tolerance as this section's compute_accuracy default (0.25);
    # the previous 0.5 made the baseline incomparable with the model accuracies.
    # NOTE(review): confirm 0.25 (not 0.5) is the intended tolerance for this dataset.
    if row[random_choice] > max(row) - 0.25:
        random_baseline_score += 1
print(random_baseline_score/len(y_test_scores))

### Young People Spending Habits Dataset

In [None]:
# File name of the young-people training split, looked up under base_folder/train_folder.
ds_young_train = "young_people_spendinghabits_7_1010_2_1000_0.5_train.csv"

In [None]:
young_train_df = pd.read_csv(f"{base_folder}/{train_folder}/{ds_young_train}")
# Look at the first 5 rows of the data
young_train_df.head()

In [None]:
# Column names without the first and the last column; the same columns are used as features below.
items = young_train_df.columns.tolist()[1:-1]
print(items)

#### Training

In [None]:
# Features: all columns except the first and the last. Target: the last column.
X_train = young_train_df.iloc[:, 1:-1].to_numpy()
y_train = young_train_df.iloc[:,-1].to_numpy()

In [None]:
# Inspect the first feature row.
X_train[0]

In [None]:
# Inspect the first five targets.
y_train[:5]

In [None]:
# Refit the MNL model on the young-people data; this rebinds the global `clf`.
clf = LogisticRegression(multi_class='multinomial',solver='saga',max_iter=1000).fit(X_train,y_train)
clf.predict(X_train[:5, :])

In [None]:
# Class probabilities for the same five rows.
clf.predict_proba(X_train[:5, :])

In [None]:
# Mean accuracy on the training data.
clf.score(X_train,y_train)

#### Test

In [None]:
# File name of the young-people test split, looked up under base_folder/test_folder.
ds_young_test = "young_people_spendinghabits_7_101_2_10000_0.5_test.csv"

In [None]:
young_test_df = pd.read_csv(f"{base_folder}/{test_folder}/{ds_young_test}")
# Look at the first 5 rows of the data
young_test_df.head()

In [None]:
# Same column layout as the training split: features are all columns except the first and the last,
# the target is the last column.
X_test = young_test_df.iloc[:, 1:-1].to_numpy()
y_test = young_test_df.iloc[:,-1].to_numpy()

In [None]:
# Strip the trailing "test.csv" (8 characters) and append the RUMwT output suffixes.
ds_winner_probs_train = ds_young_test[:-8] + 'winner_probs_train.csv'
ds_winners_train = ds_young_test[:-8] + 'winners_train.csv'

In [None]:
# RUMwT probabilities and winners, read as comma-separated values from rumwt_folder.
# NOTE(review): the files are named "*_train" but are compared against the test split below — confirm.
y_rumwt_probs_test = np.genfromtxt(f'{rumwt_folder}/{ds_winner_probs_train}', delimiter=',')
y_rumwt_test = np.genfromtxt(f'{rumwt_folder}/{ds_winners_train}', delimiter=',')
print(y_rumwt_probs_test[1])

In [None]:
# Same prefix with the LP-RUMwT output suffixes.
ds_winner_probs_lp_train = ds_young_test[:-8] + 'winner_probs_rumwt.csv'
ds_winners_lp_train = ds_young_test[:-8] + 'winners_rumwt.csv'

In [None]:
# LP-RUMwT probabilities and winners, read from lp_rumwt_folder.
y_lp_rumwt_probs_test = np.genfromtxt(f'{lp_rumwt_folder}/{ds_winner_probs_lp_train}', delimiter=',')
y_lp_rumwt_test = np.genfromtxt(f'{lp_rumwt_folder}/{ds_winners_lp_train}', delimiter=',')
print(y_lp_rumwt_probs_test[1])

In [None]:
ds_winner_probs_test = ds_young_test[:-8] + 'winner_probs_test.csv'

In [None]:
# Reference winner probabilities for the test split; every model is compared against these below.
y_probs_test = np.genfromtxt(f'{rumwt_folder}/{ds_winner_probs_test}', delimiter=',')

In [None]:
# MNL class probabilities for the test features.
y_mnl_probs_test = clf.predict_proba(X_test)

In [None]:
# L1 distance per test row between the reference probabilities and the MNL probabilities,
# summarised as max / mean / median.
l1 = np.linalg.norm(y_probs_test-y_mnl_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
# Same summary for the RUMwT probabilities.
l1 = np.linalg.norm(y_probs_test-y_rumwt_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
# Same summary for the LP-RUMwT probabilities.
l1 = np.linalg.norm(y_probs_test-y_lp_rumwt_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
# Per-row kl_div of the MNL probabilities against the reference;
# the *_ce names hold its max / mean / median.
kl = kl_div(y_probs_test, y_mnl_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
# Same summary for the RUMwT probabilities.
kl = kl_div(y_probs_test, y_rumwt_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
# Same summary for the LP-RUMwT probabilities.
kl = kl_div(y_probs_test, y_lp_rumwt_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
# Hard MNL predictions for the test features.
y_mnl_test = clf.predict(X_test)

In [None]:
# Accuracy of the MNL predictions against the test targets.
accuracy_score(y_test, y_mnl_test)

In [None]:
# Accuracy of the RUMwT winners.
accuracy_score(y_test, y_rumwt_test)

In [None]:
# Accuracy of the LP-RUMwT winners.
accuracy_score(y_test, y_lp_rumwt_test)

In [None]:
# Constant baseline that always predicts the largest label found in y_test
# (presumably the no-choice option, going by the variable name).
y_no_choice = np.max(y_test)*np.ones(y_test.shape)
accuracy_score(y_test, y_no_choice)

#### Top-ranked item Accuracy

In [None]:
def cast_char(c):
    """Convert one score cell to a float; the placeholder '-' becomes 0."""
    return 0 if c == '-' else float(c)

In [None]:
def compute_accuracy(y_pred, y_test_scores, delta=0.5):
    """Return the share of rows whose top predicted item scores near the row maximum.

    For every row the item with the highest predicted probability is picked,
    ignoring the last column of ``y_pred``. The row counts as a hit when that
    item's score is strictly greater than the row's best score minus ``delta``.
    """
    n_rows = len(y_pred)
    hits = 0.0
    for row_idx, probs in enumerate(y_pred):
        scores = y_test_scores[row_idx]
        best_item = np.argmax(probs[:-1])
        if scores[best_item] > max(scores) - delta:
            hits += 1
    return hits / n_rows

In [None]:
# Per-slate item scores for the young-people test split, read from scores_folder/test_folder.
ypsh_test_scores_df = pd.read_csv(f"{scores_folder}/{test_folder}/young_people_spendinghabits_7_101_2_10000_test.csv")
# Look at the first 5 rows of the data
ypsh_test_scores_df.head()

In [None]:
# Convert every cell with cast_char, which maps '-' to 0 and any other value to a number.
y_test_scores_str = ypsh_test_scores_df.to_numpy()
y_test_scores = []
for row in y_test_scores_str:
    y_test_scores.append([cast_char(x) for x in row])
print(y_test_scores[0])

In [None]:
# Top-ranked-item accuracy of the MNL probabilities.
compute_accuracy(y_mnl_probs_test, y_test_scores)

In [None]:
# Same metric for the RUMwT probabilities.
compute_accuracy(y_rumwt_probs_test, y_test_scores)

In [None]:
# Same metric for the LP-RUMwT probabilities.
compute_accuracy(y_lp_rumwt_probs_test, y_test_scores)

In [None]:
# Same metric for the reference probabilities themselves.
compute_accuracy(y_probs_test, y_test_scores)

In [None]:
import random

# Random baseline for the top-ranked-item accuracy: pick one item uniformly
# among those with a positive score and count the row as a hit when its score
# is within 0.5 of the row maximum.
len_slate = len(y_test_scores[0])
random_baseline_score = 0
for row in y_test_scores:
    # Fall back to item 0 when no item has a positive score.
    slate = [i for i in range(len_slate) if row[i] > 0] or [0]
    random_choice = random.choice(slate)
    random_baseline_score += int(row[random_choice] > max(row) - 0.5)
print(random_baseline_score/len(y_test_scores))

### Movie(lens) Dataset

In [None]:
# File name of the movies training split, looked up under base_folder/train_folder.
ds_movie_train = "movies_20_174130_5_100000_0.25_train.csv"

In [None]:
movie_train_df = pd.read_csv(f"{base_folder}/{train_folder}/{ds_movie_train}")
# Look at the first 5 rows of the data
movie_train_df.head()

In [None]:
# Column names without the first and the last column; the same columns are used as features below.
items = movie_train_df.columns.tolist()[1:-1]
print(items)

#### Training

In [None]:
# Features: all columns except the first and the last. Target: the last column.
X_train = movie_train_df.iloc[:, 1:-1].to_numpy()
y_train = movie_train_df.iloc[:,-1].to_numpy()

In [None]:
# Inspect the first feature row.
X_train[0]

In [None]:
# Inspect the first five targets.
y_train[:5]

In [None]:
# Refit the MNL model on the movies data; this rebinds the global `clf`.
clf = LogisticRegression(multi_class='multinomial',solver='saga',max_iter=1000).fit(X_train,y_train)
clf.predict(X_train[:5, :])

In [None]:
# Class probabilities for the same five rows.
clf.predict_proba(X_train[:5, :])

In [None]:
# Mean accuracy on the training data.
clf.score(X_train,y_train)

#### Test

In [None]:
# File name of the movies test split, looked up under base_folder/test_folder.
ds_movie_test = "movies_20_17413_5_10000_0.25_test.csv"

In [None]:
movie_test_df = pd.read_csv(f"{base_folder}/{test_folder}/{ds_movie_test}")
# Look at the first 5 rows of the data
movie_test_df.head()

In [None]:
# Same column layout as the training split: features are all columns except the first and the last,
# the target is the last column.
X_test = movie_test_df.iloc[:, 1:-1].to_numpy()
y_test = movie_test_df.iloc[:,-1].to_numpy()

In [None]:
# Strip the trailing "test.csv" (8 characters) and append the RUMwT output suffixes.
ds_winner_probs_train = ds_movie_test[:-8] + 'winner_probs_train.csv'
ds_winners_train = ds_movie_test[:-8] + 'winners_train.csv'

In [None]:
# RUMwT probabilities and winners, read as comma-separated values from rumwt_folder.
# NOTE(review): the files are named "*_train" but are compared against the test split below — confirm.
y_rumwt_probs_test = np.genfromtxt(f'{rumwt_folder}/{ds_winner_probs_train}', delimiter=',')
y_rumwt_test = np.genfromtxt(f'{rumwt_folder}/{ds_winners_train}', delimiter=',')
print(y_rumwt_probs_test[1])

In [None]:
# Same prefix with the LP-RUMwT output suffixes.
ds_winner_probs_lp_train = ds_movie_test[:-8] + 'winner_probs_rumwt.csv'
ds_winners_lp_train = ds_movie_test[:-8] + 'winners_rumwt.csv'

In [None]:
# LP-RUMwT probabilities and winners, read from lp_rumwt_folder.
y_lp_rumwt_probs_test = np.genfromtxt(f'{lp_rumwt_folder}/{ds_winner_probs_lp_train}', delimiter=',')
y_lp_rumwt_test = np.genfromtxt(f'{lp_rumwt_folder}/{ds_winners_lp_train}', delimiter=',')
print(y_lp_rumwt_probs_test[1])

In [None]:
ds_winner_probs_test = ds_movie_test[:-8] + 'winner_probs_test.csv'

In [None]:
# Reference winner probabilities for the test split; every model is compared against these below.
y_probs_test = np.genfromtxt(f'{rumwt_folder}/{ds_winner_probs_test}', delimiter=',')

In [None]:
# MNL class probabilities for the test features.
y_mnl_probs_test = clf.predict_proba(X_test)

In [None]:
# L1 distance per test row between the reference probabilities and the MNL probabilities,
# summarised as max / mean / median.
l1 = np.linalg.norm(y_probs_test-y_mnl_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
# Same summary for the RUMwT probabilities.
l1 = np.linalg.norm(y_probs_test-y_rumwt_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
# Same summary for the LP-RUMwT probabilities.
l1 = np.linalg.norm(y_probs_test-y_lp_rumwt_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
# Per-row kl_div of the MNL probabilities against the reference;
# the *_ce names hold its max / mean / median.
kl = kl_div(y_probs_test, y_mnl_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
# Same summary for the RUMwT probabilities.
# NOTE(review): unlike the other dataset sections, no kl_div cell for the LP-RUMwT probabilities follows — confirm this is intended.
kl = kl_div(y_probs_test, y_rumwt_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
# Hard MNL predictions for the test features.
y_mnl_test = clf.predict(X_test)

In [None]:
# Accuracy of the MNL predictions against the test targets.
accuracy_score(y_test, y_mnl_test)

In [None]:
# Accuracy of the RUMwT winners.
accuracy_score(y_test, y_rumwt_test)

In [None]:
# Accuracy of the LP-RUMwT winners.
accuracy_score(y_test, y_lp_rumwt_test)

In [None]:
# Constant baseline that always predicts the largest label found in y_test
# (presumably the no-choice option, going by the variable name).
y_no_choice = np.max(y_test)*np.ones(y_test.shape)
accuracy_score(y_test, y_no_choice)

#### Top-ranked item Accuracy

In [None]:
def cast_char(c):
    """Convert one score cell to a float; the placeholder '-' becomes 0."""
    return 0 if c == '-' else float(c)

In [None]:
def compute_accuracy(y_pred, y_test_scores, delta=0.25):
    """Return the share of rows whose top predicted item scores near the row maximum.

    For every row the item with the highest predicted probability is picked,
    ignoring the last column of ``y_pred``. The row counts as a hit when that
    item's score is strictly greater than the row's best score minus ``delta``.
    """
    n_rows = len(y_pred)
    hits = 0.0
    for row_idx, probs in enumerate(y_pred):
        scores = y_test_scores[row_idx]
        best_item = np.argmax(probs[:-1])
        if scores[best_item] > max(scores) - delta:
            hits += 1
    return hits / n_rows

In [None]:
# Per-slate item scores for the movies test split, read from scores_folder/test_folder.
movie_test_scores_df = pd.read_csv(f"{scores_folder}/{test_folder}/movies_20_17413_5_10000_test.csv")
# Look at the first 5 rows of the data
movie_test_scores_df.head()

In [None]:
# Convert every cell with cast_char, which maps '-' to 0 and any other value to a number.
y_test_scores_str = movie_test_scores_df.to_numpy()
y_test_scores = []
for row in y_test_scores_str:
    y_test_scores.append([cast_char(x) for x in row])
print(y_test_scores[0])

In [None]:
# Top-ranked-item accuracy of the MNL probabilities.
compute_accuracy(y_mnl_probs_test, y_test_scores)

In [None]:
# Same metric for the RUMwT probabilities.
compute_accuracy(y_rumwt_probs_test, y_test_scores)

In [None]:
# Same metric for the LP-RUMwT probabilities.
compute_accuracy(y_lp_rumwt_probs_test, y_test_scores)

In [None]:
# Same metric for the reference probabilities themselves.
compute_accuracy(y_probs_test, y_test_scores)

In [None]:
import random

# Random baseline for the top-ranked-item accuracy: pick one item uniformly
# among those with a positive score and count the row as a hit when its score
# is within 0.25 of the row maximum.
len_slate = len(y_test_scores[0])
random_baseline_score = 0
for row in y_test_scores:
    # Fall back to item 0 when no item has a positive score.
    slate = [i for i in range(len_slate) if row[i] > 0] or [0]
    random_choice = random.choice(slate)
    random_baseline_score += int(row[random_choice] > max(row) - 0.25)
print(random_baseline_score/len(y_test_scores))

### (good)Books Dataset

In [None]:
# File name of the books training split, looked up under base_folder/train_folder.
ds_book_train = "books_30_47211_5_1000000_0.5_train.csv"

In [None]:
book_train_df = pd.read_csv(f"{base_folder}/{train_folder}/{ds_book_train}")
# Look at the first 5 rows of the data
book_train_df.head()

In [None]:
# Column names without the first and the last column; the same columns are used as features below.
items = book_train_df.columns.tolist()[1:-1]
print(items)

#### Training

In [None]:
# Features: all columns except the first and the last. Target: the last column.
X_train = book_train_df.iloc[:, 1:-1].to_numpy()
y_train = book_train_df.iloc[:,-1].to_numpy()

In [None]:
# Inspect the first feature row.
X_train[0]

In [None]:
# Inspect the first five targets.
y_train[:5]

In [None]:
# Refit the MNL model on the books data; this rebinds the global `clf`.
clf = LogisticRegression(multi_class='multinomial',solver='saga',max_iter=1000).fit(X_train,y_train)
clf.predict(X_train[:5, :])

In [None]:
# Class probabilities for the same five rows.
clf.predict_proba(X_train[:5, :])

In [None]:
# Mean accuracy on the training data.
clf.score(X_train,y_train)

#### Test

In [None]:
# File name of the books test split, looked up under base_folder/test_folder.
ds_book_test = "books_30_4721_5_10000_0.5_test.csv"

In [None]:
book_test_df = pd.read_csv(f"{base_folder}/{test_folder}/{ds_book_test}")
# Look at the first 5 rows of the data
book_test_df.head()

In [None]:
# Same column layout as the training split: features are all columns except the first and the last,
# the target is the last column.
X_test = book_test_df.iloc[:, 1:-1].to_numpy()
y_test = book_test_df.iloc[:,-1].to_numpy()

In [None]:
# Strip the trailing "test.csv" (8 characters) and append the RUMwT output suffixes.
ds_winner_probs_train = ds_book_test[:-8] + 'winner_probs_train.csv'
ds_winners_train = ds_book_test[:-8] + 'winners_train.csv'

In [None]:
# File names of the LP-RUMwT outputs for the books test split. They were
# previously derived from ds_young_test (copied from the young-people section),
# which made this section load the young-people files instead of the books ones.
ds_winner_probs_lp_train = ds_book_test[:-8] + 'winner_probs_rumwt.csv'
ds_winners_lp_train = ds_book_test[:-8] + 'winners_rumwt.csv'

In [None]:
# LP-RUMwT probabilities and winners, read as comma-separated values from lp_rumwt_folder.
y_lp_rumwt_probs_test = np.genfromtxt(f'{lp_rumwt_folder}/{ds_winner_probs_lp_train}', delimiter=',')
y_lp_rumwt_test = np.genfromtxt(f'{lp_rumwt_folder}/{ds_winners_lp_train}', delimiter=',')
print(y_lp_rumwt_probs_test[1])

In [None]:
# RUMwT probabilities and winners, read from rumwt_folder.
# NOTE(review): the files are named "*_train" but are compared against the test split below — confirm.
y_rumwt_probs_test = np.genfromtxt(f'{rumwt_folder}/{ds_winner_probs_train}', delimiter=',')
y_rumwt_test = np.genfromtxt(f'{rumwt_folder}/{ds_winners_train}', delimiter=',')

In [None]:
ds_winner_probs_test = ds_book_test[:-8] + 'winner_probs_test.csv'

In [None]:
# Reference winner probabilities for the test split; every model is compared against these below.
y_probs_test = np.genfromtxt(f'{rumwt_folder}/{ds_winner_probs_test}', delimiter=',')

In [None]:
# MNL class probabilities for the test features.
y_mnl_probs_test = clf.predict_proba(X_test)

In [None]:
# L1 distance per test row between the reference probabilities and the MNL probabilities,
# summarised as max / mean / median.
l1 = np.linalg.norm(y_probs_test-y_mnl_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
# Same summary for the RUMwT probabilities.
l1 = np.linalg.norm(y_probs_test-y_rumwt_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
# Same summary for the LP-RUMwT probabilities.
l1 = np.linalg.norm(y_probs_test-y_lp_rumwt_probs_test,ord=1,axis=1)
total_variation = np.amax(l1)
mean_variation = np.mean(l1)
median_variation = np.median(l1)
print(total_variation)
print(mean_variation)
print(median_variation)

In [None]:
# Per-row kl_div of the MNL probabilities against the reference;
# the *_ce names hold its max / mean / median.
kl = kl_div(y_probs_test, y_mnl_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
# Same summary for the RUMwT probabilities.
kl = kl_div(y_probs_test, y_rumwt_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
# Same summary for the LP-RUMwT probabilities.
kl = kl_div(y_probs_test, y_lp_rumwt_probs_test)
total_ce = np.amax(kl)
mean_ce = np.mean(kl)
median_ce = np.median(kl)
print(total_ce)
print(mean_ce)
print(median_ce)

In [None]:
# Hard MNL predictions for the test features.
y_mnl_test = clf.predict(X_test)

In [None]:
# Accuracy of the MNL predictions against the test targets.
accuracy_score(y_test, y_mnl_test)

In [None]:
# Accuracy of the RUMwT winners.
# NOTE(review): unlike the other dataset sections, no accuracy cell for y_lp_rumwt_test follows — confirm this is intended.
accuracy_score(y_test, y_rumwt_test)

In [None]:
# Constant baseline that always predicts the largest label found in y_test
# (presumably the no-choice option, going by the variable name).
y_no_choice = np.max(y_test)*np.ones(y_test.shape)
accuracy_score(y_test, y_no_choice)

#### Top-ranked item Accuracy

In [None]:
def cast_char(c):
    """Convert one score cell to a float; the placeholder '-' becomes 0."""
    return 0 if c == '-' else float(c)

In [None]:
def compute_accuracy(y_pred, y_test_scores, delta=0.5):
    """Return the share of rows whose top predicted item scores near the row maximum.

    For every row the item with the highest predicted probability is picked,
    ignoring the last column of ``y_pred``. The row counts as a hit when that
    item's score is strictly greater than the row's best score minus ``delta``.
    """
    n_rows = len(y_pred)
    hits = 0.0
    for row_idx, probs in enumerate(y_pred):
        scores = y_test_scores[row_idx]
        best_item = np.argmax(probs[:-1])
        if scores[best_item] > max(scores) - delta:
            hits += 1
    return hits / n_rows

In [None]:
# Per-slate item scores for the books test split, read from scores_folder/test_folder.
book_test_scores_df = pd.read_csv(f"{scores_folder}/{test_folder}/books_30_4721_5_10000_test.csv")
# Look at the first 5 rows of the data
book_test_scores_df.head()

In [None]:
# Convert every cell with cast_char, which maps '-' to 0 and any other value to a number.
y_test_scores_str = book_test_scores_df.to_numpy()
y_test_scores = []
for row in y_test_scores_str:
    y_test_scores.append([cast_char(x) for x in row])
print(y_test_scores[0])

In [None]:
# Top-ranked-item accuracy of the MNL probabilities.
compute_accuracy(y_mnl_probs_test, y_test_scores)

In [None]:
# Same metric for the RUMwT probabilities.
compute_accuracy(y_rumwt_probs_test, y_test_scores)

In [None]:
# Same metric for the LP-RUMwT probabilities.
compute_accuracy(y_lp_rumwt_probs_test, y_test_scores)

In [None]:
# Same metric for the reference probabilities themselves.
compute_accuracy(y_probs_test, y_test_scores)

In [None]:
import random

# Random baseline for the top-ranked-item accuracy: pick one item uniformly
# among those with a positive score and count the row as a hit when its score
# is close enough to the row maximum.
len_slate = len(y_test_scores[0])
random_baseline_score = 0
for row in y_test_scores:
    slate = [i for i in range(len_slate) if row[i] > 0]
    if not slate:
        # No item has a positive score: fall back to item 0.
        slate = [0]
    random_choice = random.choice(slate)
    # Use the same tolerance as this section's compute_accuracy default (0.5);
    # the previous 0.25 was carried over from the movies section and made the
    # baseline incomparable with the model accuracies.
    if row[random_choice] > max(row) - 0.5:
        random_baseline_score += 1
print(random_baseline_score/len(y_test_scores))