In [None]:
# Identify the generator run whose synthetic outputs are evaluated in this notebook
model_name = 'princeton-nlp/Sheared-LLaMA-1.3B'
dataset_name = 'tab'
target_epsilon = 'inf'

# Run identifier: model name with "/" replaced by "_", followed by the dataset name
# and the target_epsilon value
model_slug = model_name.replace("/", "_")
model_config = f'{model_slug}_{dataset_name}_DP_{target_epsilon}'

# Path to the CSV file where the outputs are saved
synthetic_data_path = f'./data/synthetic/{model_config}_outputs-final.csv'

### Downstream Utility Evaluation

In [None]:
import pandas as pd
import json
from transformers import TrainingArguments as HfTrainingArguments
from synthtexteval.eval.downstream.classify.train_classifier import TrainingArguments, ModelArguments, Classifier, Arguments
from synthtexteval.utils.utils import create_classification_dataset
from synthtexteval.utils.filtering import process_df

#### Classification: Creating the dataset

Filter the raw synthetic text and convert it into a structured format.

We assume that the synthetic text was generated together with labels (in most setups, the labels serve as control codes).

In [None]:
# Create mapping from the original data
# And creating a test set for evaluating the model once it is trained
from datasets import load_from_disk, concatenate_datasets
from synthtexteval.utils.utils import encode_labels

In [None]:
# Load the original dataset and keep only the columns needed for classification
tab_data = load_from_disk('./data/generator/data/tab/')
columns_to_keep = ['country', 'text', 'year']
col_names = [
    column
    for column in tab_data['train'].column_names
    if column not in columns_to_keep
]
tab_data = tab_data.remove_columns(col_names)

# Merge all three splits into 'train' so that encode_labels sees every label value
tab_data['train'] = concatenate_datasets(
    [tab_data['train'], tab_data['validation'], tab_data['test']]
)

# Called with json_mapping_exists=False; both return values are discarded.
# Presumably this writes the label mapping to json_mapping_path — TODO confirm.
_, _ = encode_labels(
    tab_data['train'],
    label_column = 'country',
    json_mapping_exists = False,
    json_mapping_path = f'./data/benchmark/classification/data/{dataset_name}-mapping.json',
    multilabel = False,
)


In [None]:
import os  # local import: only this cell needs it

# Reload the dataset from disk (unmodified) and encode the 'country' labels of its
# 'validation' split using the existing JSON mapping (json_mapping_exists=True).
# NOTE(review): the result is saved as test.csv but is built from the 'validation'
# split — confirm this is intended.
tab_data = load_from_disk('./data/generator/data/tab/')
df, _ = encode_labels(tab_data['validation'], label_column = 'country', json_mapping_exists = True, 
                                json_mapping_path = f'./data/benchmark/classification/data/{dataset_name}-mapping.json', multilabel=False)
test_file_path = f'./data/benchmark/classification/data/test/{dataset_name}/test.csv'

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(test_file_path), exist_ok=True)

print(f"Saving test file to: {test_file_path}")
df.to_csv(test_file_path)

In [None]:
# Use this mapping for converting the synthetic data to a structured format
# Read the generated outputs, pass them through process_df on the 'output_text'
# column, then build the classification dataset using the existing label mapping
# (json_mapping_exists=True) with a 70/15/15 train/test/val ratio.
df = pd.read_csv(synthetic_data_path)
df = process_df(df, text_column = 'output_text')
_, _, _ = create_classification_dataset(
    df,
    label_column = 'country',
    json_mapping_path = f'./data/benchmark/classification/data/{dataset_name}-mapping.json',
    json_mapping_exists = True,
    output_dir = f'./data/benchmark/classification/data/{model_config}',
    multilabel = False,
    train_ratio = 0.7,
    test_ratio = 0.15,
    val_ratio = 0.15,
)

In [None]:
# The number of top-level entries in the mapping JSON is used as the number of labels
with open(f'./data/benchmark/classification/data/{dataset_name}-mapping.json') as f:
    data = json.load(f)

n_labels_task = len(data)
print(f"Number of labels: {n_labels_task}")

#### Classification: Training the model

This step can also be run as a script; a sample script is provided in `synthtexteval.eval.downstream.classify`.

In [None]:
if __name__ == "__main__":
    # Default training arguments; the model arguments are filled in below
    train_args = TrainingArguments()
    model_args = ModelArguments()

    # Task definition
    model_args.is_train = True
    model_args.problem_type = 'single_label_classification'
    model_args.n_labels = n_labels_task

    # Columns of the synthetic classification dataset
    model_args.text_field = 'output_text'
    model_args.label_field = 'Label'

    # Base model, input dataset and output location
    # (path_to_model embeds model_name, so model_name must be set first)
    model_args.model_name = 'bert-base-uncased'
    model_args.path_to_dataset = f'./data/benchmark/classification/data/{model_config}'
    model_args.path_to_model = f'./data/benchmark/classification/models/{model_args.model_name}_{model_config}'

    args = Arguments(train=train_args, model=model_args)

    print("Training:\n")
    obj = Classifier(args = args)
    obj.finetune_model()

#### Classification: Testing the model

In [None]:
if __name__ == "__main__":
    # Default training arguments; the model arguments are filled in below
    train_args = TrainingArguments()
    model_args = ModelArguments()

    # Run in test mode only
    model_args.is_train = False
    model_args.is_test = True

    # Task definition
    model_args.problem_type = "single_label_classification"
    model_args.n_labels = n_labels_task

    # Columns of the test CSV; 'country' and 'year' are passed as retain_columns
    model_args.text_field = 'text'
    model_args.label_field = 'Label'
    model_args.retain_columns = ['country', 'year']

    # Model to load and test data to read
    # (the paths below embed model_name, so model_name must be set first)
    model_args.model_name = 'bert-base-uncased'
    model_args.path_to_model = f'./data/benchmark/classification/models/{model_args.model_name}_{model_config}'
    model_args.path_to_dataset = f'./data/benchmark/classification/data/test/{dataset_name}/test.csv'

    # Output locations
    model_args.path_to_output_csv = f'./data/benchmark/classification/test-results/{model_args.model_name}_{model_config}_test_outputs.csv'
    model_args.path_to_aggregated_results = './data/benchmark/classification/compiled_benchmark_results.csv'

    args = Arguments(train=train_args, model=model_args)

    print("Testing:\n")
    obj = Classifier(args = args)
    obj.test_model()

#### Classification: Fairness auditing of the trained classifier

In [None]:
from synthtexteval.eval.downstream.classify.visualize import tabulate_results

# Same file the testing cell sets as model_args.path_to_output_csv;
# requires model_args from that cell to be defined in the kernel.
path_to_test_output = f'./data/benchmark/classification/test-results/{model_args.model_name}_{model_config}_test_outputs.csv'

tabulate_results(
    [path_to_test_output],
    n_labels = n_labels_task,
    print_fairness = True,
    subgroup_type = "country",
    problem_type = "multiclass",
)

### Descriptive Analysis of Synthetic Data

In [None]:
import nltk
import pandas as pd
from synthtexteval.eval.descriptive.descriptor import TextDescriptor
from synthtexteval.eval.descriptive.arguments import TextDescriptorArgs
# Fetch the NLTK 'punkt_tab' resource (presumably needed for tokenization by the descriptive analysis — TODO confirm)
nltk.download('punkt_tab')

In [None]:
from datasets import load_from_disk
# Load synthetic outputs and the real dataset, then truncate both to the same
# number of samples (the smaller of the two sizes)
synth_df = pd.read_csv(synthetic_data_path)
real_texts = load_from_disk('./data/generator/data/tab')

len_samples = min(len(synth_df), len(real_texts['train']))

synth_df = synth_df.head(len_samples)
# Keep only the 'text' column of the first len_samples training rows
real_texts = real_texts['train'].select(range(len_samples))['text']

#### Text length and distributional comparisons

In [None]:
# Descriptor over the synthetic texts, with the real texts as reference and plotting enabled
descriptor_args = TextDescriptorArgs(produce_plot=True)
desc_analyze = TextDescriptor(
    texts = synth_df['output_text'].tolist(),
    args = descriptor_args,
    reference_texts = real_texts,
)

In [None]:
# Compare the synthetic texts to the reference texts on the listed metrics
# (note: this calls an underscore-prefixed method of TextDescriptor)
desc_analyze._compare_to_reference_distribution(metrics = ['text-length', 'jaccard', 'cosine'])

In [None]:
import pyLDAvis
# Turn on pyLDAvis notebook rendering before building the topic-model display
pyLDAvis.enable_notebook()
# Topic-model display with 3 topics; the result is shown in the next cell
tm = desc_analyze._topic_modeling_display(num_topics=3)

In [None]:
# Display the topic-model object created in the previous cell
tm

### Privacy Leakage Assessment

#### Privacy: Defining the entities

In [None]:
import re
import pandas as pd
from datasets import load_from_disk

# Real data: training split of the dataset on disk; synthetic data: generated outputs CSV
real_texts = load_from_disk('./data/generator/data/tab')['train']
synth_df = pd.read_csv(synthetic_data_path)

In [None]:
# Collect the span text of every PERSON / DATETIME entity mention found in the
# annotations of the real data.
#
# Extraction is best-effort: records that cannot be read are skipped. Failures are
# handled per annotator rather than per document, so one unreadable annotator
# record no longer discards the remaining annotators of the same document, and the
# number of skipped records is reported instead of being silently swallowed.
target_entity_types = ('PERSON', 'DATETIME')

entities = []
skipped_records = 0
for doc_annotations in real_texts['annotations']:
    try:
        annotators = list(doc_annotations)
    except TypeError:
        # The annotation entry is not iterable (e.g. None): nothing to extract
        skipped_records += 1
        continue

    for annotator in annotators:
        try:
            for entity in doc_annotations[annotator]['entity_mentions']:
                if entity['entity_type'] in target_entity_types:
                    entities.append(entity['span_text'])
        except (KeyError, TypeError, IndexError):
            # Missing key or unexpected structure for this annotator: skip it only
            skipped_records += 1

if skipped_records:
    print(f"Skipped {skipped_records} annotation record(s) that could not be read")

In [None]:
# Number of entity spans collected from the real data
print(len(entities))

#### Evaluating leakage of entities

In [None]:
from synthtexteval.eval.privacy.metrics import entity_leakage, search_and_compute_EPO

In [None]:
# Check the synthetic texts for the collected entities; 'privacy-leakage.pkl' is
# passed as the third argument (presumably an output path — TODO confirm)
synthetic_texts = synth_df['output_text'].tolist()
total_leakage, privacy_analysis = entity_leakage(
    synthetic_texts,
    entities,
    'privacy-leakage.pkl',
)

In [None]:
# total_leakage is scaled by 100 and shown with three decimals
print(f"Percentage of leaked entities: {100*total_leakage:.3f} %")

#### Evaluating span memorization

This step is restricted to the first 15 entities because it is time-intensive.

In [None]:
# Keep only the first 15 collected entities (rebinding `entities`; re-running this cell gives the same result)
entities = entities[:15]

In [None]:
# Name of the text column exposed by BOTH frames passed to search_and_compute_EPO.
# (The original cell referenced an undefined `text_field`, which raised NameError.)
# NOTE(review): assumes the single `text_field` argument is applied to synth_file
# and ref_file alike — confirm against the function's signature.
text_field = 'text'

# Reference frame: first 10 synthetic outputs under the `text_field` column.
# NOTE(review): the reference is built from synthetic outputs, not from
# real_texts — confirm this is intended.
t_df = pd.DataFrame({text_field: synth_df['output_text'].tolist()[:10]})

# Copy of the synthetic frame that also exposes its text under `text_field`,
# so both frames share the same text column (synth_df itself is left untouched).
synth_eval_df = synth_df.assign(**{text_field: synth_df['output_text']})

# Use the (truncated) `entities` list as the patterns; the original cell passed
# an undefined `fake_entities`.
search_and_compute_EPO(synth_file = synth_eval_df, ref_file = t_df, 
                       synth_phrase_file_path = 'synth-outputs.csv', ref_phrase_file_path = 'ref-outputs.csv',
                       entity_patterns = entities, max_window_len = 3,
                       text_field = text_field)

In [None]:
# NOTE(review): total_leakage was assigned by the entity_leakage call earlier in this
# notebook; the search_and_compute_EPO call does not assign it, so this prints the same
# value as the "leaked entities" figure above — confirm which value is intended here.
print(f"Percentage of leaked entity contexts: {100*total_leakage:.3f} %")

### Qualitative Evaluation Against Real Data

In [None]:
import pandas as pd
from synthtexteval.eval.text_quality.metrics import TextQualityEval
from synthtexteval.eval.text_quality.arguments import MauveArgs, LMArgs, FrechetArgs
from dataclasses import dataclass
from datasets import load_from_disk


# Load synthetic and real samples and truncate both to the smaller of the two sizes
synthetic_samples = pd.read_csv(synthetic_data_path)
real_samples = load_from_disk('./data/generator/data/tab')
len_samples = min(len(synthetic_samples), len(real_samples['train']))
synthetic_samples = synthetic_samples.head(len_samples)
real_samples = real_samples['train'].select(range(len_samples))

# Frame with the synthetic text in 'source' and the real text in 'reference'
df = pd.DataFrame({})
df['source'] = synthetic_samples['output_text']
df['reference'] = real_samples['text']


@dataclass
class args_temp:
    """Container grouping the three argument objects handed to TextQualityEval."""
    FrechetArgs: FrechetArgs
    MauveArgs: MauveArgs
    LMArgs: LMArgs


# The argument classes themselves (not instances) are passed, as in the original cell
args_ = args_temp(FrechetArgs=FrechetArgs, MauveArgs=MauveArgs, LMArgs=LMArgs)
qual_estimator = TextQualityEval(args_)

In [None]:
# Run the perplexity and FID computations on the source/reference frame
qual_estimator.calculate_perplexity(df)
qual_estimator.calculate_fid_score(df)

In [None]:
# Print whatever return_results() yields via the estimator's own print_metrics helper
qual_estimator.print_metrics(qual_estimator.return_results())