In [68]:
import numpy as np
import pickle
from rlace import solve_adv_game
from sklearn.utils import shuffle
import torch
import os
from sklearn.linear_model import SGDClassifier
from collections import defaultdict
from sklearn.decomposition import PCA

## Load data

In [93]:
def load_data(split="train", lang="de"):
    

    with open("data/{}_words_{}.pkl".format(split,lang), "rb") as f:
        train = pickle.load(f)
        
    train_x,train_y = train["embedding"].to_list(), train["is_dative"].tolist()
    train_x = np.array([x.numpy() for x in train_x])
    train_y = np.array(train_y)
    return train_x, train_y


def balance(train_x, train_y):

    y_1 = train_y == 1
    num_1 = y_1.sum()
    num_0 = len(train_x) - num_1
    
    n = min(num_0, num_1)
    train_x_1, train_x_0 = train_x[y_1], train_x[~y_1]
    train_x_1 = shuffle(train_x_1, random_state=0)
    train_x_0 = shuffle(train_x_0, random_state=0)
    
    train_x_1 = train_x_1[:n]
    train_x_0 = train_x_0[:n]
    train_y_1, trian_y_0 = np.ones(n), np.zeros(n)
    
    train_x_balanced = np.concatenate([train_x_1, train_x_0], axis=0)
    train_y_balanced = np.concatenate([train_y_1, trian_y_0], axis=0)
    
    train_x_balanced, train_y_balanced = shuffle(train_x_balanced, train_y_balanced, random_state=0)
    
    return train_x_balanced, train_y_balanced


def balance_two_languages(x1, y1, x2, y2):
    """
    assuming both languages are already balanced wrt dative, creates a fully balanced dataset.
    """
    n = min(len(x1), len(x2))
    x1, y1 = x1[:n], y1[:n]
    x2, y2 = x2[:n], y2[:n]
    
    x = np.concatenate([x1, x2], axis=0)
    y = np.concatenate([y1, y2], axis=0)
    
    x,y = shuffle(x, y, random_state=0)
    
    return x, y

In [89]:
train_x_de,train_y_de = load_data("train", "de")
train_x_de, train_y_de = balance(train_x_de,train_y_de)

train_x_pl,train_y_pl = load_data("train", "pl")
train_x_pl, train_y_pl = balance(train_x_pl,train_y_pl)

In [90]:
dev_x_de,dev_y_de = load_data("dev", "de")
dev_x_de, dev_y_de = balance(dev_x_de,dev_y_de)

dev_x_pl,dev_y_pl = load_data("dev", "pl")
dev_x_pl, dev_y_pl = balance(dev_x_pl,dev_y_pl)

In [94]:
train_x_both, train_y_both = balance_two_languages(train_x_de,train_y_de,train_x_pl,train_y_pl)
dev_x_both, dev_y_both = balance_two_languages(dev_x_de,dev_y_de,dev_x_pl,dev_y_pl)

### PCA

In [85]:
# dim=512
# pca = PCA(n_components=dim, random_state=0)
# pca.fit(train_x_both)
# train_x_both = pca.transform(train_x_both)
# train_x_de = pca.transform(train_x_de)
# train_x_de = pca.transform(train_x_pl)

## Run the adversarial game

In [98]:
num_iters = 50000
rank=1
optimizer_class = torch.optim.SGD
optimizer_params_P = {"lr": 0.0025, "weight_decay": 1e-4}
optimizer_params_predictor = {"lr": 0.0025,"weight_decay": 1e-4}
epsilon = 0.005 # stop 0.5% from majority acc
batch_size = 256

output_de = solve_adv_game(train_x_de,train_y_de, train_x_de,train_y_de, rank=rank, device="cpu", out_iters=num_iters,
                       optimizer_class=optimizer_class, optimizer_params_P =optimizer_params_P,
                       optimizer_params_predictor=optimizer_params_predictor, epsilon=epsilon,batch_size=batch_size)


48000/50000. Acc post-projection: 49.609%; best so-far: 49.609%; Maj: 50.000%; G


In [99]:
output_pl = solve_adv_game(train_x_pl,train_y_pl, train_x_pl,train_y_pl, rank=rank, device="cpu", out_iters=num_iters,
                       optimizer_class=optimizer_class, optimizer_params_P =optimizer_params_P,
                       optimizer_params_predictor=optimizer_params_predictor, epsilon=epsilon,batch_size=batch_size)


13000/50000. Acc post-projection: 50.136%; best so-far: 50.136%; Maj: 50.000%; G


In [100]:
output_both = solve_adv_game(train_x_both, train_y_both,train_x_both, train_y_both, rank=rank, device="cpu", out_iters=num_iters,
                       optimizer_class=optimizer_class, optimizer_params_P =optimizer_params_P,
                       optimizer_params_predictor=optimizer_params_predictor, epsilon=epsilon,batch_size=batch_size)


20000/50000. Acc post-projection: 49.796%; best so-far: 49.796%; Maj: 50.000%; G


In [101]:
dir_name = "projections"
if not os.path.exists(dir_name):
    os.mkdir(dir_name)
    

np.save("{}/projection_de_fully_balance.npy".format(dir_name), output_de["P"])
np.save("{}/projection_pl_fully_balanced.npy".format(dir_name), output_pl["P"])
np.save("{}/projection_both_fully_balanced.npy".format(dir_name), output_both["P"])

In [102]:
lang2P = {"pl": output_pl["P"], "de": output_de["P"], "both": output_both["P"]}
lang2data = {"pl": {"train": {"x": train_x_pl, "y": train_y_pl}, "dev": {"x": dev_x_pl, "y": dev_y_pl}}, 
             "de": {"train": {"x": train_x_de, "y": train_y_de}, "dev": {"x": dev_x_de, "y": dev_y_de}},
            "both": {"train": {"x": train_x_both, "y": train_y_both}, "dev": {"x": dev_x_both, "y": dev_y_both}}}

## Eval

In [103]:
clf = SGDClassifier(max_iter=200000, tol=1e-4) # linear hinge-loss classifier
train_test_acc = defaultdict(dict) #train_test_acc[lang1][lang2] stores the accuracy in gender prediction when using a projection trained on lang1, on examples belonging to lang2
for lang1 in ["pl", "de", "both"]:
    for lang2 in ["pl", "de", "both"]:
        P = lang2P[lang1]
        x_train,y_train = lang2data[lang2]["train"]["x"], lang2data[lang2]["train"]["y"]
        x_dev,y_dev = lang2data[lang2]["dev"]["x"], lang2data[lang2]["dev"]["y"]
        clf.fit(x_train@P, y_train)
        train_test_acc[lang1][lang2] = clf.score(x_dev@P,y_dev)

In [104]:
train_test_acc

defaultdict(dict,
            {'pl': {'pl': 0.4948717948717949,
              'de': 0.9076923076923077,
              'both': 0.7474358974358974},
             'de': {'pl': 0.9333333333333333,
              'de': 0.5564102564102564,
              'both': 0.7410256410256411},
             'both': {'pl': 0.9025641025641026,
              'de': 0.8435897435897436,
              'both': 0.4935897435897436}})