# Text Classification on Amazon Reviews

This notebook plots the results of various optimization algorithms on the Amazon Reviews distribution-shift benchmark.

In [1]:
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import os
import pickle
import numpy as np
import pandas as pd

sys.path.append("..")
from src.utils.io import var_to_str, get_path, load_results
from src.utils.data import load_dataset

In [7]:
import matplotlib as mpl

# Global Matplotlib styling for every figure drawn later in the notebook,
# applied in a single update instead of one assignment per key.
mpl.rcParams.update(
    {
        # Line weight and font sizes.
        "lines.linewidth": 5,
        "xtick.labelsize": 24,
        "ytick.labelsize": 24,
        "axes.labelsize": 34,
        "legend.fontsize": 28,
        "axes.titlesize": 32,
        # Render text through LaTeX.
        "text.usetex": True,
        # Font type used when saving PDF / PostScript output.
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
    }
)

In [8]:
def get_suboptimality(
    dataset, model_cfg, train_loss, eps=1e-9, out_path="../results/"
):
    """Convert a training-loss trajectory into relative suboptimality.

    The reference minimum is unpickled from ``lbfgs_min_loss.p`` inside the
    results directory that ``get_path`` resolves for ``dataset`` and
    ``model_cfg``.

    Args:
        dataset: Dataset name, used as the first path component.
        model_cfg: Model configuration; stringified with ``var_to_str`` to
            form the second path component.
        train_loss: Loss trajectory. It is indexed at 0 and used in
            elementwise arithmetic; callers in this file pass a
            ``torch.Tensor``.
        eps: Small constant added to the numerator (presumably to keep the
            values strictly positive for log-scale plots -- TODO confirm).
        out_path: Root directory of the saved results.

    Returns:
        ``(train_loss - min_loss + eps) / (train_loss[0] - min_loss)``.
    """
    init_loss = train_loss[0]
    path = get_path([dataset, var_to_str(model_cfg)], out_path=out_path)
    # Use a context manager so the file handle is closed deterministically.
    with open(os.path.join(path, "lbfgs_min_loss.p"), "rb") as f:
        min_loss = pickle.load(f)
    return (train_loss - min_loss + eps) / (init_loss - min_loss)

def plot_traj(
    ax,
    dataset,
    model_cfg,
    plot_cfg,
    seeds,
    out_path="../results/",
    verbose=False,
    n_points=16,
    markersize=8,
    n_epochs=None,
):
    """Plot one optimizer's best suboptimality trajectory on ``ax``.

    Loads ``best_traj.p`` and ``best_cfg.p`` from the results directory of
    the optimizer named in ``plot_cfg``, converts the average training loss
    to suboptimality via ``get_suboptimality``, rescales the x-axis
    according to the optimizer, and draws a downsampled curve.

    Args:
        ax: Matplotlib axes to draw on.
        dataset: Dataset name, passed to ``load_dataset`` and ``get_path``.
        model_cfg: Model configuration used to locate the results.
        plot_cfg: Dict with keys ``"optimizer"``, ``"label"``, ``"color"``,
            ``"linestyle"`` and ``"marker"``.
        seeds: Accepted for interface compatibility; not used here.
        out_path: Root directory of the saved results.
        verbose: If true, print the best configuration that was loaded.
        n_points: Target number of points (markers) drawn on the curve.
        markersize: Marker size passed to ``ax.plot``.
        n_epochs: Optional upper bound on the x-axis values that are drawn.
    """
    filename = plot_cfg["optimizer"]  # "code" name (e.g. "lsvrg")
    label = plot_cfg["label"]  # display name
    color = plot_cfg["color"]
    linestyle = plot_cfg["linestyle"]

    # Only the number of training examples is needed, for x-axis rescaling.
    n = len(load_dataset(dataset, data_path="../data/")[0])

    path = get_path([dataset, var_to_str(model_cfg), filename], out_path=out_path)

    # Context managers close the pickle files as soon as they are read.
    with open(os.path.join(path, "best_traj.p"), "rb") as f:
        df = pickle.load(f)
    with open(os.path.join(path, "best_cfg.p"), "rb") as f:
        opt = pickle.load(f)
    if verbose:
        print(f"{filename} best config:", opt)
    avg_train_loss = torch.tensor(df["average_train_loss"])
    epoch_len = opt["epoch_len"]

    epochs = torch.arange(len(avg_train_loss))
    subopt = get_suboptimality(
        dataset, model_cfg, avg_train_loss, out_path=out_path
    )
    # rescale algorithms that make multiple gradient evaluations per iteration
    if filename == "lsvrg":
        if epoch_len:
            x = epochs * (epoch_len + n) / n
        else:
            x = epochs * 2
    elif filename == "moreau":
        if epoch_len:
            x = epochs * 2 * epoch_len / n
        else:
            x = epochs * 2
    else:
        if epoch_len:
            # NOTE(review): 64 is presumably the mini-batch size -- confirm.
            x = epochs * min(epoch_len * 64, n) / n
        else:
            x = epochs
    if n_epochs:
        idx = x < min(len(subopt), n_epochs)
    else:
        idx = x < len(subopt)
    # Clamp the stride to 1: with fewer than n_points selected points the
    # integer division yields 0, and a zero slice step raises an error.
    downsample = max(1, torch.sum(idx).item() // n_points)
    ax.plot(
        x[idx][::downsample],
        subopt[idx][::downsample],
        color=color,
        label=label,
        linestyle=linestyle,
        marker=plot_cfg["marker"],
        markersize=markersize,
    )

In [9]:
# Benchmark identity: dataset name, loss name and number of classes.
dataset, loss, n_class = "amazon", "multinomial_cross_entropy", 5

# Hyperparameters that enter the model configuration (both set to 1.0).
l2_reg = shift_cost = 1.0

# Root directory of the saved optimization results.
result_dir = "../results"

In [10]:
# One plotting configuration per optimizer, expanded from compact
# (code name, legend label, color, marker) rows; every curve is solid.
plot_cfgs = [
    {
        "optimizer": code_name,
        "label": display_name,
        "color": curve_color,
        "linestyle": "solid",
        "marker": curve_marker,
    }
    for code_name, display_name, curve_color, curve_marker in (
        ("sgd", "SGD", "black", "."),
        ("lsvrg", "LSVRG", "cadetblue", "o"),
        ("saddlesaga", "SaddleSAGA", "goldenrod", "s"),
        ("prospect", "Prospect (Ours)", "tab:red", "^"),
    )
]

In [13]:
def plot_ax(ax, seeds, objective, plot_cfg, dataset, epoch_len=None, n=4000, downsample=1, max_epoch=32, markersize=8, out_path="../results/"):
    """Plot an optimizer's suboptimality curve on ``ax``, once per seed.

    The trajectory is read from ``best_traj.p`` in the results directory of
    the optimizer named in ``plot_cfg``. The path does not depend on the
    seed, so the data is loaded once and the same curve is drawn
    ``len(seeds)`` times, matching the number of ``ax.plot`` calls made
    previously.

    Args:
        ax: Matplotlib axes to draw on.
        seeds: Iterable of seeds; only its length affects the result.
        objective: Value stored under ``"objective"`` in the model config.
        plot_cfg: Dict with keys ``"optimizer"``, ``"label"``, ``"color"``
            and ``"marker"``.
        dataset: Dataset name used to locate the results.
        epoch_len: Accepted for interface compatibility; not used here.
        n: Accepted for interface compatibility; not used here.
        downsample: Stride used when subsampling the plotted points.
        max_epoch: Largest x value (inclusive) that is plotted.
        markersize: Marker size passed to ``ax.plot``.
        out_path: Root directory used to locate ``best_traj.p``.
    """
    optimizer = plot_cfg['optimizer']
    # NOTE(review): the loss is hard-coded to binary_cross_entropy /
    # squared_error and n_class to None, while get_iterates uses
    # multinomial_cross_entropy with 5 classes -- confirm the intended
    # configuration for the dataset being plotted.
    model_cfg = {
        "objective": objective,
        "l2_reg": l2_reg,
        "loss": "binary_cross_entropy" if dataset == "diabetes" else "squared_error",
        "n_class": None,
        "shift_cost": shift_cost
    }
    seeds = list(seeds)
    if not seeds:
        # Nothing to draw; skip touching the filesystem, as before.
        return

    path = get_path([dataset, var_to_str(model_cfg), optimizer], out_path=out_path)
    # Context manager closes the pickle file as soon as it is read.
    with open(os.path.join(path, "best_traj.p"), "rb") as f:
        df = pickle.load(f)
    train_loss = torch.tensor(df["average_train_loss"])
    epochs = torch.arange(len(train_loss))
    # NOTE(review): the reference loss is looked up under the module-level
    # result_dir rather than out_path -- confirm this is intended.
    subopt = get_suboptimality(
        dataset, model_cfg, train_loss, out_path=result_dir
    )
    # Only lsvrg has its x-axis doubled; every other optimizer is plotted
    # against the raw epoch index.
    x = epochs * 2 if optimizer == "lsvrg" else epochs
    idx = (x <= max_epoch)
    xs = x[idx][::downsample]
    ys = subopt[idx][::downsample]
    for _ in seeds:
        ax.plot(
            xs,
            ys,
            label=plot_cfg['label'],
            color=plot_cfg['color'],
            marker=plot_cfg['marker'],
            markersize=markersize
        )

def get_iterates(objective, optimizer, seed=1, dataset="amazon"):
    """Load the saved iterates of one optimizer run.

    Reads ``iterates_{seed}.p`` from the results directory (rooted at the
    module-level ``result_dir``) for the given dataset, model configuration
    and optimizer.

    NOTE(review): relies on module-level names ``L2_REG``, ``SM_MEDIUM``,
    ``n_class`` and ``result_dir``. ``L2_REG`` and ``SM_MEDIUM`` are not
    defined in the notebook cells visible here (only lowercase ``l2_reg``
    is) -- confirm they exist before calling.

    Args:
        objective: Value stored under ``"objective"`` in the model config.
        optimizer: Optimizer code name, used as the last path component.
        seed: Seed that selects which iterates file to load.
        dataset: Dataset name used to locate the results.

    Returns:
        A list with one detached tensor per stored iterate, each reshaped
        with ``view(-1, n_class)``.
    """
    model_cfg = {
        "objective": objective,
        "l2_reg": L2_REG,
        "loss": "multinomial_cross_entropy",
        "n_class": 5,
        "sm_coef": SM_MEDIUM
    }

    path = get_path([dataset, var_to_str(model_cfg), optimizer], out_path=result_dir)
    # Context manager closes the pickle file as soon as it is read.
    with open(os.path.join(path, f"iterates_{seed}.p"), "rb") as f:
        iterates = pickle.load(f)
    return [iterate.view(-1, n_class).detach() for iterate in iterates]

In [14]:
# Load the Amazon train/test splits and the z_test array saved beside them.
X_train, y_train, X_test, y_test = load_dataset("amazon", data_path="../data/")
z_test = torch.tensor(np.load("../data/amazon/z_test.npy"))

# Show the shapes of the three test arrays, one per line.
print(X_test.shape, y_test.shape, z_test.shape, sep="\n")

FileNotFoundError: Could not find data files in 'data/amazon'. Did you run 'scripts/download_amazon'?