In [None]:
# Maximum number of whitespace-separated tokens kept per text when truncating
# below; also passed as `max_text_length` in the experiment configurations.
MAX_LEN = 510
# Number of examples requested per sample from the GPT-2 style jsonl files.
DEFAULT_NUM_EXAMPLES = 2500

In [None]:
import random
import mauve 
import copy
import itertools
import pandas as pd
import numpy as np
import json


In [None]:
from sklearn.preprocessing import normalize
from datasets import load_dataset
from collections import Counter
from nltk.tokenize import sent_tokenize

import sys
import os
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from util import util
from h01_data.get_clusters import get_clusters
from utils import clean, load_gpt2_dataset, get_representations


In [None]:
from nltk.corpus import stopwords
# English stopwords as a set; used below for membership tests when stripping
# stopwords and when computing the stopword fraction of a text.
stops = set(stopwords.words('english'))

## Datasets

### WebText

In [None]:
# Two human-written samples (p_text, p_text2) and two model-generated samples
# (q_text, q_text2). The "*2" variants request twice as many examples and keep
# only the second half.
# NOTE(review): this assumes load_gpt2_dataset returns examples in a stable
# order so that [DEFAULT_NUM_EXAMPLES:] is disjoint from the first sample — confirm.
# NOTE(review): the section header says WebText but the paths point at
# amazon*.jsonl files — confirm which corpus is intended.
p_text = load_gpt2_dataset('data/amazon.valid.jsonl', num_examples=DEFAULT_NUM_EXAMPLES) # human
p_text2 = load_gpt2_dataset('data/amazon.valid.jsonl', num_examples=DEFAULT_NUM_EXAMPLES*2)[DEFAULT_NUM_EXAMPLES:]
q_text = load_gpt2_dataset('data/amazon-xl-1542M.valid.jsonl', num_examples=DEFAULT_NUM_EXAMPLES)
q_text2 = load_gpt2_dataset('data/amazon-xl-1542M.valid.jsonl', num_examples=DEFAULT_NUM_EXAMPLES*2)[DEFAULT_NUM_EXAMPLES:]


### Yelp Polarity

In [None]:
# Sizes of the Yelp Polarity subsets taken from the train and test splits.
num_train = 10000
num_test = 5000
# Shuffle with a fixed seed, then keep the first num_train / num_test rows.
# The sliced result is accessed by column name ('text', 'label') in the next cell.
sent_dataset = load_dataset('yelp_polarity', split='train').shuffle(seed=0)[:num_train]
sent_dataset_test = load_dataset('yelp_polarity', split='test').shuffle(seed=0)[:num_test]

In [None]:
# Replace newlines with spaces and drop empty reviews, keeping each text paired
# with its label so the two tuples stay aligned.
sent_text, sent_labels = zip(*(
    (review.replace('\n', ' '), label)
    for review, label in zip(sent_dataset['text'], sent_dataset['label'])
    if review
))
sent_test_text, sent_test_labels = zip(*(
    (review.replace('\n', ' '), label)
    for review, label in zip(sent_dataset_test['text'], sent_dataset_test['label'])
    if review
))

# Split each cleaned sample at the nominal midpoint (num_train // 2 and
# num_test // 2) into a first and a second half.
sent_p_text, sent_p_text2 = sent_text[:num_train // 2], sent_text[num_train // 2:]
sent_test_p_text, sent_test_p_text2 = sent_test_text[:num_test // 2], sent_test_text[num_test // 2:]


### News Category

In [None]:
author_dataset = load_dataset('Fraser/news-category-dataset', split='train')
author_dataset_test = load_dataset('Fraser/news-category-dataset', split='test')

# The label of an article is the first name in its comma-separated author field.
author_labels = [entry.split(",")[0] for entry in author_dataset['authors']]
author_labels_test = [entry.split(",")[0] for entry in author_dataset_test['authors']]

# (author, count) pairs by decreasing count; keep authors with at least 400
# training articles.
counts = Counter(author_labels).most_common()
authors = {name for name, freq in counts if freq >= 400}

# Labels restricted to the retained (non-empty) authors, in dataset order.
author_labels_filtered = [name for name in author_labels if name and name in authors]
author_labels_test_filtered = [name for name in author_labels_test if name and name in authors]

# Article bodies are read from text files, one article per line; the asserts
# check that they line up one-to-one with the filtered labels.
with open('articles_full.txt') as train_file:
    articles = train_file.read().splitlines()

with open('articles_full_test.txt') as test_file:
    articles_test = test_file.read().splitlines()

assert len(articles) == len(author_labels_filtered)
assert len(articles_test) == len(author_labels_test_filtered)

# Drop empty articles together with their labels.
articles, author_labels_filtered = zip(*(
    (body, name) for body, name in zip(articles, author_labels_filtered) if body
))
articles_test, author_labels_test_filtered = zip(*(
    (body, name) for body, name in zip(articles_test, author_labels_test_filtered) if body
))

# Split the train and test articles into halves.
num = len(articles) // 2
authors_p_text, authors_p_text2 = articles[:num], articles[num:]

num = len(articles_test) // 2
authors_test_p_text, authors_test_p_text2 = articles_test[:num], articles_test[num:]


### 20 Newsgroups

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the train and test subsets, asking scikit-learn to remove headers,
# footers and quoted text from each post.
newsgroups_train = fetch_20newsgroups(subset='train',remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',remove=('headers', 'footers', 'quotes'))

In [None]:
# Half the size of the raw training subset (computed before empty posts are dropped).
num = len(newsgroups_train.data) // 2

# Replace newlines with spaces and drop empty posts, keeping text/label pairs aligned.
data, labels = zip(*(
    (post.replace('\n', ' '), target)
    for post, target in zip(newsgroups_train['data'], newsgroups_train['target'])
    if post
))
news_p_text, news_p_text2 = data[:num], data[num:num * 2]
news_labels = labels[:num * 2]

test_news_data, test_news_labels = zip(*(
    (post.replace('\n', ' '), target)
    for post, target in zip(newsgroups_test['data'], newsgroups_test['target'])
    if post
))
# The test posts are split at the midpoint of the cleaned tuple.
test_news_p_text = test_news_data[:len(test_news_data) // 2]
test_news_p_text2 = test_news_data[len(test_news_data) // 2:]


## Experiments

In [None]:
# Load the rpy2 IPython extension (provides the %R / %%R magics used below)
# and activate the pandas <-> R conversion.
%load_ext rpy2.ipython
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()

In [None]:
%%R
library(lme4)
library(ggplot2)
library(dplyr)

### Base Tests

In [None]:
# Lower-case English articles that are removed to build `p_text2_remove` in the
# next cell (the membership test there is case-sensitive).
# NOTE(review): this rebinds `articles`, which held the News Category article
# texts in an earlier cell; the halves sliced from it were already stored.
articles = ['the', 'a', 'an']
# Article swap table (lower-case and capitalised forms).
# NOTE(review): `mapping` is not referenced anywhere in the visible cells.
mapping = {'the':'a', 'a':'the', 'an':'the','The':'A', 'A':'The', 'An':'The'}

In [None]:
# Truncate every text to its first MAX_LEN whitespace-separated tokens. Tuples
# are used so each sample is hashable and can serve as a key of `text_names`.
p_text_trunc = tuple(' '.join(text.split()[:MAX_LEN]) for text in p_text)
q_text_trunc = tuple(' '.join(text.split()[:MAX_LEN]) for text in q_text)
p_text2_trunc = tuple(' '.join(text.split()[:MAX_LEN]) for text in p_text2)

# All perturbations below are derived from the truncated second human sample.
base_text = p_text2_trunc

# Shortened: keep only the first MAX_LEN // 3 tokens.
p_text2_short = tuple(' '.join(text.split()[:MAX_LEN // 3]) for text in base_text)

# Articles removed (case-sensitive match against `articles`).
p_text2_remove = tuple(
    ' '.join(word for word in text.split() if word not in articles)
    for text in base_text
)

# Word order shuffled, with the final token kept in place. A fresh
# random.Random(0) is created per text, so the permutation depends only on the
# number of tokens.
shuffled_tokens = []
for text in base_text:
    tokens = text.split()
    if tokens:  # an empty text has no final token to hold back; leave it as is
        last = tokens.pop()
        random.Random(0).shuffle(tokens)
        tokens.append(last)
    shuffled_tokens.append(tokens)
p_text2_rand = tuple(' '.join(tokens) for tokens in shuffled_tokens)

# Stopwords removed (case-insensitive match against `stops`).
p_text2_nostop = tuple(
    ' '.join(word for word in text.split() if word.lower() not in stops)
    for text in base_text
)

# Sentence swap: the first half of the sentences of text i is followed by the
# second half of the sentences of a partner text chosen by a seeded permutation.
sentences = [sent_tokenize(text) for text in base_text]
new_inds = list(range(len(sentences)))
random.Random(0).shuffle(new_inds)
p_text2_sent_swap = tuple(
    ' '.join(sentences[i][:len(sentences[i]) // 2] + sentences[j][len(sentences[j]) // 2:])
    for i, j in enumerate(new_inds)
)

# Reverse lookup from a sample (tuple of strings) to a readable name.
text_names = {p_text_trunc: 'p_text', p_text2_trunc: 'p_text2', q_text_trunc: 'q_text',
              p_text2_remove: 'p_text2_remove', p_text2_rand: 'p_text2_rand',
              p_text2_nostop: 'p_text2_nostop', p_text2_short: 'p_text2_short',
              p_text2_sent_swap: 'p_text2_sent_swap'}

In [None]:
# Comparison samples that are each scored against the reference sample.
q_texts = [p_text2_trunc, p_text2_remove, p_text2_rand, p_text2_nostop, q_text_trunc, p_text2_sent_swap, p_text2_short]
# Baseline keyword arguments; every grid combination from `args` overrides a
# subset of these. Note there is no 'num_buckets' default: it is always
# supplied by the grid below.
default_args = {'p_text':p_text_trunc, 'q_text':p_text2_trunc, 'featurize_model_name':'gpt2','kmeans_num_redo':1, 
                          'kmeans_max_iter':2, 'kmeans_explained_var':0.9, 'is_mean':False, 
                          'device_id':0, 'max_text_length': MAX_LEN, 'verbose':True, 'laplace': 1}
# Grid of values; the search runs over the Cartesian product of these lists.
args = {'featurize_model_name': ['bert-base-cased', 'bert-large-cased','gpt2','gpt2-medium','gpt2-large','gpt2-xl'],
        'q_text': q_texts, 
        'is_mean':[True, False],
        'num_buckets':[10, 20, 50, 100, 250, 500], 
        'laplace': [0, 1]}

def param_search(args, default_args):
    """Run `mauve.compute_mauve` over the Cartesian product of the values in `args`.

    Args:
        args: dict mapping a parameter name to the list of values to try.
        default_args: dict of baseline keyword arguments; must contain
            'p_text' and 'q_text', which are replaced by feature arrays
            before the call.

    Returns:
        pandas.DataFrame with one row per combination: the grid parameters
        (with 'q_text' replaced by its name from `text_names`) plus the
        'mauve', 'forward_kl', 'backward_kl', 'exponentiated_kl' and 'js'
        attributes of the result.
    """
    metric_names = ('mauve', 'forward_kl', 'backward_kl', 'exponentiated_kl', 'js')

    def run_single(overrides):
        # Start from the defaults, apply this combination, then swap the raw
        # texts for their representations.
        settings = copy.deepcopy(default_args)
        settings.update(overrides)
        reference_text = settings.pop('p_text')
        comparison_text = settings.pop('q_text')
        model_name = settings['featurize_model_name']
        mean = settings['is_mean']

        settings['p_features'] = get_representations(reference_text, model_name, mean)
        settings['q_features'] = get_representations(comparison_text, model_name, mean)
        return mauve.compute_mauve(**settings)

    names = list(args.keys())
    rows = []
    for combo in itertools.product(*args.values()):
        row = dict(zip(names, combo))
        result = run_single(row)
        for metric in metric_names:
            row[metric] = getattr(result, metric)
        # Replace the (large) text tuple by its short name.
        row['q_text'] = text_names[row['q_text']]
        rows.append(row)
    return pd.DataFrame(rows)


base_df = param_search(args, default_args)
# 'auc' is defined here as the complement of the mauve score.
base_df['auc'] = 1 - base_df['mauve']
base_df['q_text'] = base_df['q_text'].astype('string')

In [None]:
value_vars=['auc', 'forward_kl', 'backward_kl','exponentiated_kl','js']
id_vars = set(base_df.columns) - set(value_vars)
base_df_long = pd.melt(base_df, value_vars=value_vars, id_vars=id_vars, var_name="metric")
base_df_long.loc[:, "metric"] = base_df_long["metric"] +  base_df_long["laplace"].astype(str)
%R -i base_df_long

In [None]:
%%R
# Fix the legend order of the comparison texts: take one reference
# configuration (metric 'auc0', 500 clusters, is_mean, gpt2-xl) and sort the
# texts by decreasing value.
legend_measure <- 'auc0'
tmp <- filter(base_df_long, metric==legend_measure, num_buckets==500, is_mean, featurize_model_name=="gpt2-xl")
levels <- tmp[order(-tmp$value),]$q_text

In [None]:
%%R
# Metric/laplace labels to plot (values) and their facet titles (names).
metrics <- c('forward_kl1', 'backward_kl1','exponentiated_kl1','js0','auc0' )
names(metrics) <- c('Forward', 'Backward','Exponentiated','JS','AUC' )
# Keep only the selected metrics, the is_mean rows and two embedding models.
df_subset <- filter(base_df_long, metric %in% metrics, is_mean, featurize_model_name %in% c("gpt2-xl", 'bert-large-cased'))#, q_text != 'p_text2_rand', q_text != 'p_text2_nostop')
# Model identifiers (values) and their display names (names) for the facets.
supp.labs <- c("gpt2",  "gpt2-medium", "gpt2-large", "gpt2-xl","bert-base-cased", 'bert-large-cased')
names(supp.labs) <- c("GPT-2 Small",  "GPT-2 Medium", "GPT-2 Large","GPT-2 XL", "BERT Base", "BERT Large")
# Plotmath legend labels, keyed by comparison-text name.
dist_labels = c(p_text2=expression(paste(italic("p"))), p_text2_remove=expression(paste(italic("p")["no art"])), 
                                  p_text2_rand=expression(paste(italic("p")["rand"])), p_text2_nostop=expression(paste(italic("p")["no stop"])), 
                                  q_text=expression(paste(italic("q"))), p_text2_sent_swap=expression(paste(italic("p")["swap"])),
                                  p_text2_short=expression(paste(italic("p")["short"]))
                             )
# Alternative label set with superscripts; not used by the plot below.
dist_labels_2 <- c(p_text2=expression(paste(italic("p")^(2))), p_text2_remove=expression(paste(italic("p")["no art"]^(1))), 
                                  p_text2_rand=expression(paste(italic("p")["rand"]^(1))), p_text2_nostop=expression(paste(italic("p")["no stop"]^(1))), 
                                  q_text=expression(paste(italic("q"))), p_text2_sent_swap=expression(paste(italic("p")["swap"]^(1))),
                                  p_text2_short=expression(paste(italic("p")["short"]^(1)))
                             )
# One panel per (model, metric); x is the number of clusters on a log2 axis.
# Point shapes use the fixed vector 15..21 (one per comparison text) instead of
# an unseeded sample(), so the figure is identical on every run.
ggplot(aes(x = num_buckets, y = value, 
           color=factor(q_text, levels),
           shape=factor(q_text, levels)), data = df_subset) +
    geom_point(size=3) +
    geom_line() +
    labs(x = "Number of Clusters", y= expression(Delta)) +
    scale_color_discrete(name='Comparison\n      Text', labels=dist_labels) +
    scale_shape_manual(name='Comparison\n      Text', labels=dist_labels, values=seq(15,21)) +
    facet_wrap(factor(featurize_model_name, levels=supp.labs, labels=names(supp.labs))~factor(metric, levels=metrics, names(metrics)), scales="free_y", ncol=5) +
    scale_x_continuous(trans='log2', breaks=c(10, 20, 50, 100, 250, 500)) +
    theme_bw() +
    theme(text=element_text(size=13,family="serif"), 
         axis.text.y=element_text(size=10,family="serif"),
         axis.title.y=element_text(size=17,family="serif"),
         legend.title=element_text(size=13,family="serif"),
         axis.text.x=element_text(size=10,family="serif", angle=30),
         aspect.ratio=1)


### Classifying Categories

In [None]:
# Dataset selection for the classification experiment. The first three lines
# pick the Yelp data but are immediately overwritten by the next three, so only
# the 20 Newsgroups selection takes effect when the cell is run as written.
test_text1, test_text2 = sent_test_p_text, sent_test_p_text2
train_text1, train_text2 = sent_p_text, sent_p_text2
trn_labels, tst_labels = sent_labels, sent_test_labels
test_text1, test_text2 = test_news_p_text, test_news_p_text2
train_text1, train_text2 = news_p_text, news_p_text2
trn_labels, tst_labels = news_labels, test_news_labels

# Baseline settings: 'test_text' is the two test halves concatenated into one
# tuple, 'train_text' and 'cluster_text' are (first sample, second sample) pairs.
default_args = {'test_text': test_text1 + test_text2, 'train_text': (train_text1, train_text2),
                'cluster_text': (tuple(p_text), tuple(q_text)), 
                'featurize_model_name':'gpt2','kmeans_num_redo':5, 
                'kmeans_max_iter':100, 'kmeans_explained_var':0.9, 'is_mean':False, 
                'device_id':0, 'max_text_length': MAX_LEN, 'verbose':True}
# Grid: clusters are fit either on the task's own training texts or on the
# p_text/q_text pair; the bucket counts include the number of distinct training
# labels and twice that number.
args = {'featurize_model_name':["bert-base-cased", 'bert-large-cased',"gpt2",  "gpt2-medium", "gpt2-large", "gpt2-xl"],
        'kmeans_explained_var':[0.9], 
        'cluster_text': [(train_text1, train_text2), (tuple(p_text), tuple(q_text))],
        'is_mean': [True, False],
        'num_buckets':[len(set(trn_labels)), len(set(trn_labels))*2, 10, 25, 50, 100, 250, 500]}


def param_search(args, default_args, train_labels, test_labels):
    """Grid-search cluster settings and score clusters as a majority-vote classifier.

    For every combination in the Cartesian product of `args`, clusters are fit
    on the 'cluster_text' pair, the train and test texts are assigned to
    clusters, each cluster predicts the most frequent training label among its
    members (or the overall most frequent training label when it has no
    training members), and accuracy is measured on the test labels.

    NOTE(review): `train_clusters` is not imported in any visible cell —
    confirm where it is defined before running on a fresh kernel.

    Args:
        args: dict mapping a parameter name to the list of values to try.
        default_args: dict of baseline settings ('test_text', 'train_text',
            'cluster_text', 'featurize_model_name', 'is_mean', ...).
        train_labels: labels aligned with the concatenated training texts.
        test_labels: labels aligned with the test texts.

    Returns:
        pandas.DataFrame with one row per combination holding the grid values
        ('cluster_text' replaced by its hash), 'acc', 'base', 'pred' and 'classes'.
    """
    def assign_clusters(overrides):
        settings = copy.deepcopy(default_args)
        settings.update(overrides)
        test_text = settings['test_text']
        p_train_text, q_train_text = settings['train_text']
        p_cluster_text, q_cluster_text = settings['cluster_text']
        model_name = settings['featurize_model_name']
        mean = settings['is_mean']

        test_representations = get_representations(test_text, model_name, mean)
        train_representations = np.concatenate(
            [get_representations(p_train_text, model_name, mean),
             get_representations(q_train_text, model_name, mean)],
            axis=0,
        )

        p_cluster_representations = get_representations(p_cluster_text, model_name, mean)
        q_cluster_representations = get_representations(q_cluster_text, model_name, mean)
        clustering_model = train_clusters(p_cluster_representations, q_cluster_representations,
                                          settings['num_buckets'], settings['kmeans_explained_var'],
                                          0, norm='l2')
        pca = clustering_model['pca']
        index = clustering_model['kmeans'].index
        dimensionality = clustering_model['dimensionality']
        _, test_assignments = get_clusters(pca, index, dimensionality, test_representations)
        _, train_assignments = get_clusters(pca, index, dimensionality, train_representations)

        return list(train_assignments), list(test_assignments)

    def counts_per_class(preds, labels, num_classes):
        # Bucket the labels by cluster id; id j is stored at position j - 1.
        buckets = [[] for _ in range(num_classes)]
        for position, cluster_id in enumerate(preds):
            buckets[cluster_id - 1].append(labels[position])
        return buckets

    # Majority-class baseline, computed on the training labels.
    base_counts = Counter(train_labels)
    default_class = max(base_counts, key=base_counts.get)
    base_acc = base_counts[default_class] / len(train_labels)

    names = list(args.keys())
    rows = []
    for combo in list(itertools.product(*args.values())):
        row = dict(zip(names, combo))
        train_preds, test_preds = assign_clusters(row)
        assert len(test_preds) == len(test_labels)
        if 'num_buckets' in row:
            num_buckets = row['num_buckets']
        else:
            num_buckets = default_args['num_buckets']

        train_classes = counts_per_class(train_preds, train_labels, num_buckets)
        test_classes = counts_per_class(test_preds, test_labels, num_buckets)
        print(num_buckets, len(np.unique(test_preds)))
        train_counts = [Counter(members) for members in train_classes]
        test_counts = [Counter(members) for members in test_classes]

        # A test point is correct when its label equals its cluster's majority
        # training label.
        correct = 0
        for bucket, train_counter in enumerate(train_counts):
            if train_counter:
                predicted = max(train_counter, key=train_counter.get)
            else:
                predicted = default_class
            correct += test_counts[bucket].get(predicted, 0)
        acc = correct / len(test_labels)

        row['cluster_text'] = hash(row['cluster_text'])
        row['acc'] = acc
        row['base'] = base_acc
        row['pred'] = test_preds
        row['classes'] = test_classes
        rows.append(row)
    return pd.DataFrame(rows)


acc_df = param_search(args, default_args, trn_labels, tst_labels)

In [None]:
# Drop the list-valued columns before handing the frame to R.
acc_df_r = acc_df.drop(['pred','classes'], axis=1)
acc_df_r['cluster_text'] = acc_df_r['cluster_text'].astype(str)
# Rows whose clusters were fit on the p_text/q_text pair are labelled "WebText";
# the remaining rows keep the stringified hash as their label.
acc_df_r.loc[acc_df['cluster_text'] == hash((tuple(p_text), tuple(q_text))), 'cluster_text'] = "WebText" 
%R -i acc_df_r

In [None]:
%%R
# Display names (values) keyed by model identifier (names). Note this is the
# reverse orientation of the `supp.labs` vector defined in the base-test plot cell.
supp.labs <- c("GPT Small",  "GPT Medium", "GPT Large","GPT XL", 'BERT', 'BERT Large')
names(supp.labs) <- c("gpt2",  "gpt2-medium", "gpt2-large", "gpt2-xl", "bert-base-cased", 'bert-large-cased')

#data.names <- c("WebText", "Training Set")
# Accuracy against number of clusters (log2 axis), one colour/shape per model,
# faceted by is_mean and cluster text; the dashed horizontal line is `base`.
ggplot(aes(x = num_buckets, y = acc, 
           color=factor(featurize_model_name, levels=names(supp.labs), labels=supp.labs),
           shape=factor(featurize_model_name, levels=names(supp.labs), labels=supp.labs),
          ), 
       data = filter(acc_df_r, featurize_model_name %in% names(supp.labs))) +
    geom_point(size=5) +
    geom_line() +
    geom_hline(aes(yintercept = base), linetype="longdash") +
    facet_wrap(is_mean~factor(cluster_text))+ 
    scale_x_continuous(trans='log2', breaks=c(2,5,10,50,25,100,250,500)) +
    scale_shape_manual(values = c(15,16,17,18, 19, 20)) +
    theme_bw() +
    labs(x = "Number of Clusters", y="Accuracy", shape="GPT-2", color="GPT-2", linetype="Cluster Text") +
    theme(text=element_text(size=22,family="serif"), 
         axis.text.y=element_text(size=19,family="serif"),
        axis.text.x=element_text(size=13,family="serif", angle=30),
         aspect.ratio=1.2)


## Surface Features

In [None]:
from examples.load_gpt2_dataset import load_gpt2_dataset
# Reload the clustering/training samples with the same size as in the first
# section. The previous `num_examples=num` picked up a leftover `num` from the
# 20 Newsgroups cell (half of that training set), so the reloaded samples
# extended into the range held out as p_text2 / q_text2 and the train and test
# texts of this experiment overlapped.
# NOTE(review): assumes load_gpt2_dataset returns the first `num_examples`
# records in file order, as the [DEFAULT_NUM_EXAMPLES:] slicing above presumes.
p_text = load_gpt2_dataset('data/amazon.valid.jsonl', num_examples=DEFAULT_NUM_EXAMPLES)
q_text = load_gpt2_dataset('data/amazon-xl-1542M.valid.jsonl', num_examples=DEFAULT_NUM_EXAMPLES)

In [None]:
def _stopword_fraction(text):
    """Fraction of whitespace tokens of `text` whose lower-cased form is in `stops`."""
    hits = sum(1 for token in text.lower().split() if token in stops)
    return hits / len(text.split())


def _non_alpha_fraction(text):
    """Fraction of whitespace tokens of `text` that are not purely alphabetic."""
    hits = sum(1 for token in text.split() if not token.isalpha())
    return hits / len(text.split())


# Held-out samples are the test set; the first samples are used for clustering
# and for estimating the per-cluster means.
test_text1, test_text2 = p_text2, q_text2
train_text1, train_text2 = p_text, q_text

# Regression targets, one value per text, in the order p sample then q sample.
trn_stop_y = [_stopword_fraction(text) for text in p_text + q_text]
tst_stop_y = [_stopword_fraction(text) for text in p_text2 + q_text2]

trn_punct_y = [_non_alpha_fraction(text) for text in p_text + q_text]
tst_punct_y = [_non_alpha_fraction(text) for text in p_text2 + q_text2]

# Baseline settings; 'test_text' and 'cluster_text' are (p sample, q sample) pairs.
default_args = {
    'test_text': (tuple(test_text1), tuple(test_text2)),
    'cluster_text': (tuple(p_text), tuple(q_text)),
    'featurize_model_name': 'gpt2',
    'kmeans_num_redo': 5,
    'kmeans_max_iter': 100,
    'kmeans_explained_var': 0.9,
    'is_mean': False,
    'device_id': 0,
    'max_text_length': MAX_LEN,
    'verbose': True,
}
# Grid of values searched over.
args = {
    'featurize_model_name': ["bert-base-cased", 'bert-large-cased', 'gpt2', 'gpt2-medium', 'gpt2-large'],
    'kmeans_explained_var': [0.9],
    'is_mean': [True, False],
    'num_buckets': [5, 10, 20, 50, 100, 250, 500],
}


def param_search(args, default_args, train_ys, test_ys):
    """Grid-search cluster settings and score how well clusters explain a scalar feature.

    For every combination in the Cartesian product of `args`, clusters are fit
    on the 'cluster_text' pair; each cluster's coefficient is the mean of
    `train_ys` over the cluster texts assigned to it, and the held-out R^2 is
    1 - SSE/SST, where SSE uses the per-cluster coefficients on the test texts
    and SST uses the overall mean of `train_ys`.

    A cluster that receives no training text has no mean of its own; it falls
    back to the overall training mean, so test texts landing in it do not turn
    the whole score into NaN.

    NOTE(review): `train_clusters` is not imported in any visible cell —
    confirm where it is defined before running on a fresh kernel.

    Args:
        args: dict mapping a parameter name to the list of values to try.
        default_args: dict of baseline settings ('test_text', 'cluster_text',
            'featurize_model_name', 'is_mean', ...).
        train_ys: feature values aligned with the concatenated cluster texts.
        test_ys: feature values aligned with the concatenated test texts.

    Returns:
        pandas.DataFrame with one row per combination holding the grid values,
        the hash of the cluster text, 'R^2', 'pred' and 'classes'.
    """
    def func(cur_args):
        params = copy.deepcopy(default_args)
        params.update(cur_args)
        p_text, q_text = params['test_text']
        p_cluster_text, q_cluster_text = params['cluster_text']
        model_name, mean = params['featurize_model_name'], params['is_mean']

        p_representations = get_representations(p_text, model_name, mean)
        q_representations = get_representations(q_text, model_name, mean)
        p_cluster_representations = get_representations(p_cluster_text, model_name, mean)
        q_cluster_representations = get_representations(q_cluster_text, model_name, mean)
        clustering_model = train_clusters(p_cluster_representations, q_cluster_representations,
                                          params['num_buckets'], params['kmeans_explained_var'], 0, norm='l2')
        # Cluster ids for the test texts (p then q) ...
        _, labels = get_clusters(clustering_model['pca'], clustering_model['kmeans'].index,
                                 clustering_model['dimensionality'],
                                 np.concatenate([p_representations, q_representations], axis=0))
        # ... and for the texts the clusters were fit on (p then q).
        _, labels_clusters = get_clusters(clustering_model['pca'], clustering_model['kmeans'].index,
                                          clustering_model['dimensionality'],
                                          np.concatenate([p_cluster_representations, q_cluster_representations], axis=0))

        return list(labels_clusters), list(labels)

    def vals_per_class(preds, ys, num_classes):
        # Bucket the values by cluster id; id j is stored at position j - 1.
        # The same mapping is applied to train and test, so buckets line up.
        classes = [[] for _ in range(num_classes)]
        for i, j in enumerate(preds):
            classes[j-1].append(ys[i])
        return classes

    keys = list(args.keys())
    data = []
    y_bar = np.mean(train_ys)
    sst = sum((np.asarray(test_ys) - y_bar)**2)
    for comb in itertools.product(*args.values()):
        temp = dict(zip(keys, comb))
        train_preds, test_preds = func(temp)
        assert len(test_preds) == len(test_ys)
        num_buckets = temp.get('num_buckets', default_args.get('num_buckets', 0))

        train_classes = vals_per_class(train_preds, train_ys, num_buckets)
        test_classes = vals_per_class(test_preds, test_ys, num_buckets)
        # Per-cluster prediction; empty clusters use the overall training mean.
        train_coefs = [np.mean(c) if c else y_bar for c in train_classes]
        sse = sum([sum([(y - train_coefs[i])**2 for y in ys]) for i, ys in enumerate(test_classes)])
        temp['cluster_text'] = hash(temp.get('cluster_text', default_args.get('cluster_text')))
        temp['R^2'] = 1 - sse/sst
        temp['pred'] = test_preds
        temp['classes'] = test_classes
        data.append(temp)
    return pd.DataFrame(data)


punct_r2_df = param_search(args, default_args, trn_punct_y, tst_punct_y)
stop_r2_df = param_search(args, default_args, trn_stop_y, tst_stop_y)

In [None]:
r2_df = pd.concat([punct_r2_df.assign(feature='punct'), stop_r2_df.assign(feature='stop')])
r2_df_r = r2_df.drop(['pred','classes'], axis=1)
r2_df_r['cluster_text'] = r2_df['cluster_text'].astype(str)
%R -i r2_df_r

In [None]:
%%R
# Order of the embedding models in the legend (three GPT-2 sizes, then two BERT sizes).
model_levels <- c("gpt2",  "gpt2-medium", "gpt2-large",  "bert-base-cased", 'bert-large-cased')

# data.names <- c("WebText", "Training Set")
# R^2 against number of clusters (log2 axis), one colour/shape per model,
# faceted by is_mean and surface feature; the dashed line marks R^2 = 0.
# Six colours/shapes are supplied for the five model levels and five labels.
ggplot(aes(x = num_buckets, y = `R^2`, 
           color=factor(featurize_model_name, levels=model_levels),
           shape=factor(featurize_model_name, levels=model_levels)), data = r2_df_r) +
    geom_point(size=4) +
    geom_line() +
    facet_wrap(is_mean~factor(feature, levels=c('punct', 'stop'), labels=c('Punctuation', 'Stopwords')))+
    scale_x_continuous(trans='log2', breaks=c(2,5,10,50,20,100,250,500)) +
    theme_bw() +
    geom_hline(aes(yintercept = 0), linetype=2) +
    labs(x = "Number of Clusters", y=expression(paste(R^2))) +
    scale_color_manual(values=c("#F8766D", "#C49A00", "#A58AFF", "#00C094", 'green', 'orange'), labels=c("Small", "Medium", "Large",  "Base", "Large2"), name="GPT-2") +
    scale_shape_manual(values = c(15,16,17,18, 19, 20), labels=c("Small", "Medium", "Large",  "Base", "Large2"), name="GPT-2") +
    theme(text=element_text(size=22,family="serif"), 
         axis.text.y=element_text(size=17,family="serif"),
         legend.title=element_text(size=20,family="serif"),
        axis.text.x=element_text(size=17,family="serif", angle=30),
         aspect.ratio=1.25)
