# Setup

In [3]:
import os
import openai
# Configure the OpenAI client from environment variables so that no
# credentials are stored in the notebook. os.getenv returns None when a
# variable is not set.
openai.organization = os.getenv("OPENAI_ORGANISATION")
openai.api_key = os.getenv("OPENAI_API_KEY")

In [4]:
def get_file_names_from_df(df):
    """Return the document name embedded in a row's 'file' entry.

    Despite its name, `df` is a single row (anything where df['file'] yields
    a string). The returned value is the text after the first '-' and before
    the next '#'. Raises IndexError when the entry contains no '-'.
    """
    after_dash = df['file'].split('-')[1]
    document_name, _, _ = after_dash.partition('#')
    return document_name

In [5]:
from lrml_utils import *
from training_utils import compute_lrml


# Postprocessing functions for LRML calculations
def clean_pred(lrml, simplifications, added_spaces=True):
    """Normalise an LRML string before it is scored.

    Args:
        lrml: LRML text (a prediction or a reference).
        simplifications: accepted for interface compatibility; not read here.
        added_spaces: when True the helper functions receive ' ' as prefix,
            otherwise ''.

    Returns:
        The cleaned LRML string. The `reverse_*` / de-duplication helpers
        (from lrml_utils) are only applied when the text contains '('.
    """
    prefix = ''
    if added_spaces:
        prefix = ' '

    # Basic textual clean-up: unify bracket types and drop a '.' after ')'.
    lrml = lrml.strip()
    for old, new in (('[', '('), (']', ')'), ('{', '('), ('}', ')'), (').', ')')):
        lrml = lrml.replace(old, new)
    lrml = fix_then(lrml, prefix=prefix)

    if '(' not in lrml:
        return lrml

    # Fix errors in postprocessing (order of the helper calls is significant).
    lrml = reverse_loop(lrml, prefix=prefix)
    lrml = reverse_resolve_expressions(lrml, fix_errors=True, prefix=prefix)
    lrml = reverse_combine_rel_and_var(lrml, prefix=prefix)
    lrml = reverse_move_and_or_to_data_node(lrml)
    lrml = reverse_units(lrml, prefix=prefix)

    # Remove duplicated 'obligation' and then 'expression' wrappers.
    for keyword in ('obligation', 'expression'):
        lrml = remove_duplicate_expressions(lrml, prefix + keyword)
    return lrml

def fix_then(lrml, prefix):
    """Re-attach a nested 'then' node to the root and re-serialise the tree.

    The text is parsed with parse_to_tree. Only when the root has exactly one
    child is the first node named 'then' (after stripping whitespace) moved
    directly under the root. `prefix` is accepted but not used here.
    """
    tree = parse_to_tree(lrml)
    if len(tree.children) != 1:
        return node_to_lrml(tree)

    then_nodes = findall(tree, filter_=lambda node: node.name.strip() == 'then')
    if len(then_nodes) > 0:
        first_then = then_nodes[0]
        first_then.parent = tree
    return node_to_lrml(tree)

In [6]:
# Read the text file
def read_file(file_name):
    """Return the complete contents of the text file at `file_name`."""
    with open(file_name, 'r') as handle:
        return handle.read()

In [7]:
from IPython.display import Markdown
from openai import OpenAIError, APIError
import time

# def ask_GPT(contextualisation, prompt, should_display=True, model="gpt-4-1106-preview", temperature=0): 
def ask_GPT(contextualisation, prompt, should_display=True, model="gpt-3.5-turbo-0301", temperature=0): 
    """Send one system + user message pair to the chat model and return the reply text.

    Args:
        contextualisation: content of the system message.
        prompt: content of the user message.
        should_display: when True, render the reply as Markdown in the notebook.
        model: model name passed to the API.
        temperature: sampling temperature passed to the API.

    Returns:
        The content string of the first choice.

    Raises:
        Whatever the second attempt raises: a failed request is retried
        exactly once after a 10 second pause.
    """
    def _request():
        # Single place that builds the request so the retry cannot drift from the first attempt.
        return openai.ChatCompletion.create(model=model,
                                            messages=[{"role": "system", "content": contextualisation},
                                                      {"role": "user", "content": prompt}],
                                            temperature=temperature)

    try:
        result = _request()
    # A tuple is required to catch several exception types;
    # `OpenAIError or APIError` evaluates to just `OpenAIError`.
    except (OpenAIError, APIError):
        time.sleep(10)
        result = _request()

    content = result['choices'][0]['message']['content']
    if should_display:
        display(Markdown(content))
    return content


In [None]:
import pandas as pd
import tiktoken
enc = tiktoken.get_encoding("cl100k_base")

df = pd.read_csv('data/lrml_ds_v8_sel.csv')
df['spaced_lrml'] = df['lrml'].apply(tree_based_spacing)


def _build_split(frame, split_column, split_id):
    """Return an independent copy of the rows where `split_column == split_id`, with the prompt 'input' column added.

    The copy avoids pandas' SettingWithCopyWarning when columns are added.
    'spaced_lrml' is inherited from `frame`, so it is not recomputed per split.
    """
    split_df = frame.loc[frame[split_column] == split_id].copy()
    split_df['input'] = split_df.apply(lambda j: "Source: " + get_file_names_from_df(j) + " " + j['text'] + "\nTarget: ", axis=1)
    return split_df


def _add_inputoutput_and_max_tokens(split_df):
    """Add the 'inputoutput' column (prompt + reference) in place and return its largest token count."""
    split_df['inputoutput'] = split_df['input'] + split_df['spaced_lrml']
    return split_df['inputoutput'].apply(enc.encode).apply(len).max()


# Random split: 1 = train, 2 = validation, 3 = test
train_df = _build_split(df, 'random_split', 1)

valid_df = _build_split(df, 'random_split', 2)
max_valid = _add_inputoutput_and_max_tokens(valid_df)

test_df = _build_split(df, 'random_split', 3)
max_test = _add_inputoutput_and_max_tokens(test_df)


# Document split was only tested in last experiment
train_doc_df = _build_split(df, 'doc_split', 1)

test_doc_df = _build_split(df, 'doc_split', 3)
max_test_doc = _add_inputoutput_and_max_tokens(test_doc_df)



# Better sampling

In [12]:
import pandas as pd
from nltk import ngrams
from collections import Counter
import random

def calculate_ngram_overlap(text1, text2, n):
    """Count the distinct word n-grams (whitespace tokenisation) shared by both texts."""
    first, second = (set(ngrams(text.split(), n)) for text in (text1, text2))
    return len(first & second)

def sample_rows(train_df, input_text, n_range, num_samples):
    """Pick up to `num_samples` rows of `train_df` whose 'text' overlaps with `input_text`.

    The selection runs in passes. Each pass draws an n-gram size from
    `n_range`, ranks the not-yet-selected rows by n-gram overlap with
    `input_text`, and walks the ranking, skipping any row that shares a word
    with a row already taken in the same pass.

    Args:
        train_df: frame with a 'text' column; it is not modified.
        input_text: text the candidates are compared against.
        n_range: sequence of n-gram sizes to choose from.
        num_samples: number of rows wanted.

    Returns:
        DataFrame of the selected rows (with an extra 'overlap' column) in
        selection order. It has fewer than `num_samples` rows when the frame
        does not contain enough distinct texts.
    """
    selected_samples = []
    selected_texts = set()

    while len(selected_samples) < num_samples:
        n = random.choice(n_range)
        # assign() scores a copy, so the caller's frame gets no 'overlap' column.
        scored_df = train_df.assign(
            overlap=train_df['text'].apply(lambda x: calculate_ngram_overlap(x, input_text, n)))
        candidates = scored_df.sort_values('overlap', ascending=False)
        candidates = candidates[~candidates['text'].isin(selected_texts)]

        # Nothing left to choose from: stop instead of looping forever.
        if candidates.empty:
            break

        # Words covered by rows picked in this pass; reset for every pass.
        words_matched = set()
        for _, row in candidates.iterrows():
            words = row['text'].split()
            if words_matched.intersection(words):
                continue

            selected_samples.append(row)
            selected_texts.add(row['text'])
            words_matched.update(words)

            if len(selected_samples) == num_samples:
                break

    return pd.DataFrame(selected_samples)

In [13]:
import wandb

def get_sample_text_for_text(text, contextualisation, reversed, doc_split=False, max_length=4060):
    """Build the exemplar text for one clause using n-gram based sampling.

    Candidates are drawn with sample_rows (n-gram sizes 1 and 2) from
    train_doc_df when `doc_split` is set, otherwise from train_df, and then
    handed to get_sample_text_for_df, which trims them to `max_length`.
    """
    # 90 is the minimum estimate of tokens for a sample
    number_of_samples = int(max_length/90)
    pool_df = train_doc_df if doc_split else train_df
    selected_samples_df = sample_rows(pool_df, text, range(1,3), number_of_samples)
    return get_sample_text_for_df(
        selected_samples_df,
        contextualisation,
        reversed=reversed,
        max_length=max_length,
    )


# Only makes sure the length is not too long
def get_sample_text_for_df(df, contextualisation, wandb_log=True, reversed=False, max_length=4060):
    """Join the rows of `df` into exemplar text that fits the token budget.

    Each row becomes "Source: <document> <text>\\nTarget: <spaced_lrml>" and
    exemplars are separated by a blank line. Exemplars are dropped one at a
    time until exemplars + contextualisation + max_valid (module-level
    allowance for the query and its answer) fit into `max_length` tokens.

    Args:
        df: frame with 'file', 'text' and 'spaced_lrml' columns.
        contextualisation: prompt text whose tokens count against the budget.
        wandb_log: when True, log the number of exemplars kept to wandb.
        reversed: when True, the rows are emitted in reverse order; the same
            rows are kept as in the non-reversed case.
        max_length: total token budget.

    Returns:
        The exemplar text; '' when not even one exemplar fits.
    """
    if reversed:
        df = df[::-1]
    exemplars = ["Source: " + get_file_names_from_df(i) + " " + i['text'] + "\nTarget: " + i['spaced_lrml'] for index, i in df.iterrows()]
    number_of_samples = len(exemplars)

    # Tokens left for exemplars. Leave some space for prediction longer than the ground truth.
    budget = max_length - len(enc.encode(contextualisation)) - max_valid

    example_random_sm = '\n\n'.join(exemplars)
    # Stop at zero exemplars: with nothing left to drop the loop could never
    # finish, and exemplars[-0:] would bring back the whole list.
    while number_of_samples > 0 and len(enc.encode(example_random_sm)) > budget:
        number_of_samples -= 1
        if number_of_samples == 0:
            kept = []
        elif not reversed:
            kept = exemplars[:number_of_samples]
        else:
            kept = exemplars[-number_of_samples:]
        example_random_sm = '\n\n'.join(kept)

    if wandb_log:
        wandb.log({'num_samples': number_of_samples})
    print("Number of samples: ", number_of_samples)
    return example_random_sm

In [14]:
import evaluate
metric = evaluate.load('bleu')

In [15]:
import wandb
from tqdm import tqdm

import re

# For self-reflection experiments
def parse_number(text):
    """Return the integer following the first "Option " in `text`, or None when there is none."""
    found = re.search(r'Option (\d+)', text)
    return int(found.group(1)) if found else None
    
def run_experiment(name, sample_df, contextualisation, lrml_col='spaced_lrml', everyNth=1, should_display=False, reversed=False, cot=False, step_by_step=False, valid_df=valid_df, self_reflect=False, self_reflect_df_names=[], options=False, gpt4=False, default_temperature=0, max_length=4060, doc_split=False, cot_exemplars=None):
    """Prompt the chat model for every evaluated row, store the predictions and score them.

    Args:
        name: run name. The number of evaluated rows is appended; the result
            names the files under eppm_preds/ and the wandb run.
        sample_df: fixed exemplar frame (also saved as CSV), or None to pick
            exemplars per row with get_sample_text_for_text.
        contextualisation: instruction text combined with the exemplars.
        lrml_col: column holding the reference; scores are only computed when
            it exists in `valid_df`.
        everyNth: evaluate every Nth row of `valid_df`.
        should_display: print the reference and let ask_GPT render replies.
        reversed: forwarded to the exemplar builders.
        cot: chain-of-thought mode; 'Target: ' is removed from the prompt and
            a reply is accepted once it contains 'target:' (case-insensitive).
        step_by_step: in cot mode, replace 'Target: ' with
            "Let's think step by step:\\n" instead of removing it.
        valid_df: rows to evaluate (the default is bound when the function is defined).
        self_reflect: build prompts from earlier predictions
            (get_validation_scores / get_self_reflection_prompt).
        self_reflect_df_names: prediction file names used for self-reflection.
        options: in self-reflection mode the model answers "Option <n>" and
            the matching earlier prediction becomes the result.
        gpt4: use "gpt-4-1106-preview" instead of "gpt-3.5-turbo-0301".
        default_temperature: temperature of the first retry; each retry adds
            0.4. The initial request uses ask_GPT's own default temperature.
        max_length: token budget passed to the exemplar builders.
        doc_split: forwarded to get_sample_text_for_text.
        cot_exemplars: text appended after per-row exemplars in cot mode.

    Returns:
        (preds, scores): predictions with everything up to 'Target:' removed,
        and the metric dictionary (empty when `lrml_col` is missing).
    """
    with wandb.init(project='EPPM', entity='stefan_fuchs_phd', config={'run_name': name}):
        is_first = True
        if gpt4:
            model="gpt-4-1106-preview"
        else:
            model="gpt-3.5-turbo-0301"
        # Examples in System Intel
        preds = []
        begin = 0
        end = len(valid_df)

        name = name + "_" + str(len(valid_df.iloc[range(begin, end, everyNth)]))

        if sample_df is not None:
            _system_prompt = contextualisation + get_sample_text_for_df(sample_df, contextualisation, reversed=reversed, max_length=max_length)
            sample_df.to_csv('eppm_preds/' + name + '.csv')
        # Function defined for self-reflection experiments
        if self_reflect:
            validation_scores = get_validation_scores(valid_df, self_reflect_df_names)

        for i in tqdm(range(begin, end, everyNth)):
            temperature=default_temperature
            j = valid_df.iloc[i]
            if cot:
                if step_by_step:
                    prompt = j['input'].replace('Target: ', "Let's think step by step:\n")
                else:
                    prompt = j['input'].replace('Target: ', "")
            elif self_reflect: 
                # Function defined for self-reflection experiments
                prompt = get_self_reflection_prompt(j, validation_scores[i], options)
            else:
                prompt = j['input']
            if should_display:
                print(j[lrml_col])
            # check_condition(response) is True when the response is NOT acceptable.
            check_condition = lambda x: 'if(' not in x
            if cot:
                if sample_df is not None:
                    other_samples = get_sample_text_for_df(sample_df, contextualisation, reversed=reversed, max_length=max_length)
                    _system_prompt = other_samples + '\n\n' + contextualisation
                else:
                    _system_prompt = contextualisation
                if cot_exemplars:
                    # Gets per clause exemplars through get_sample_text_for_text and appends cot_exemplars in the end
                    _system_prompt = contextualisation + get_sample_text_for_text(j['text'], contextualisation + cot_exemplars, reversed=reversed, max_length=max_length, doc_split=doc_split) + cot_exemplars

                check_condition = lambda x: 'target:' not in x.lower()
            elif self_reflect: 
                _system_prompt = contextualisation
                if options:
                    check_condition = lambda x: not re.search(r'Option (\d+)', x)

            elif sample_df is None:
                _system_prompt = contextualisation + get_sample_text_for_text(j['text'], contextualisation, reversed=reversed, max_length=max_length, doc_split=doc_split)

            # Show the complete prompt once per run for inspection.
            if is_first:
                print(_system_prompt + '\n\n' + prompt)
                is_first = False

            # Everything is sent as the user message; the system message stays empty.
            response = ask_GPT('', _system_prompt + '\n\n' + prompt, should_display=should_display, model=model)            

            # Retry with rising temperature while the response is unusable.
            while check_condition(response) and temperature < 2.0:
                print('LOOP')
                print(response)
                print('')
                print(_system_prompt + '\n\n' + prompt)

                response = ask_GPT('', _system_prompt + '\n\n' + prompt, should_display=should_display, temperature=temperature, model=model)
                temperature += 0.4
            # Last resort: plain "Source ... Target:" prompt at the maximum temperature.
            if check_condition(response):
                print('ALTERNATIVE')
                prompt = "Source: " + get_file_names_from_df(j) + " " + j['text'] + "\nTarget: "
                response = ask_GPT('', _system_prompt + '\n\n' + prompt, should_display=should_display, temperature=2.0, model=model)
            if check_condition(response):
                print("ERROR")
            if self_reflect and options:
                print(response)
                number = parse_number(response)
                candidates = list(validation_scores[i].values())
                # Default to 0 when no option was named or the named option
                # does not exist (previously an IndexError).
                if number is None or number >= len(candidates):
                    number = 0
                response = candidates[number]
            preds.append(response)
            

        # Raw responses, one per line.
        with open('eppm_preds/' + name + '.txt', 'w') as f:
            for pred in preds:
                f.write(pred + '\n')
        
        # Keep only what follows the first 'Target:' marker, if any.
        preds =  [pred.split('Target:')[1].strip() if 'Target:' in pred else pred for pred in preds]


        if lrml_col in valid_df.columns:
            scores = compute_lrml(predictions=[clean_pred(pred, []) for pred in preds], references=[clean_pred(lrml, []) for lrml in valid_df[lrml_col].iloc[range(begin, end, everyNth)].tolist()], entity_weight=2, filter_empty=True)
            scores.update(metric.compute(predictions=preds, references=valid_df[lrml_col].iloc[range(begin, end, everyNth)].tolist()))
        else:
            scores = {}
            
        wandb.run.name = name
        wandb.log(scores)
        wandb.log({'prompt': _system_prompt, 'exemplar_num': len(sample_df) if sample_df is not None else 0})

        print(scores)

        return preds, scores

# Sampling

In [19]:
# Keep only the training rows whose 'selected_round_2' column equals 1.
new_df = train_df.loc[train_df['selected_round_2'] == 1]
# Bare expression: the notebook displays the number of selected rows.
len(new_df)

45

In [None]:
# Selected samples experiments with different numbers of samples.
# Sizes run from large to small and every sample is drawn from the previous
# one, so each smaller exemplar set is a subset of the larger ones.
# A separate name is used so `new_df` keeps all selected rows and the cell
# can be re-run (shrinking `new_df` itself made a second run fail).
subset_df = new_df
for i in reversed([1,3,5,10,15,20,25,30]):
    subset_df = subset_df.sample(i)
    exp_name = 'selected_' + str(i)
    print(exp_name)
    contextualisation = ''
    preds, score = run_experiment(exp_name, subset_df, contextualisation, lrml_col='spaced_lrml', everyNth=1)

In [None]:
# Different sets of random samples --- maximum amount fitting into the token limit
# (50 rows are drawn; run_experiment trims them to the token budget).
for i in range(10):
    exp_name = f'random_{i}'
    new_df = train_df.sample(50)
    print(exp_name)
    contextualisation = ''
    preds, score = run_experiment(
        exp_name,
        new_df,
        contextualisation,
        lrml_col='spaced_lrml',
        everyNth=1,
    )

In [None]:
# Download the following repository and navigate to it: https://github.com/rmunro/pytorch_active_learning
cd ../../pytorch_active_learning

In [None]:
# The diversity_sampling module is located inside the pytorch_active_learning repository
from diversity_sampling import DiversitySampling

In [None]:
# Sampler from the external pytorch_active_learning repository, built with its default arguments.
sampler = DiversitySampling()

In [None]:
# Navigate back into the EPPM repository
cd ../Chapter8

/Users/stefanfuchs/Repos/MultiTaskDecoding-Exp


In [None]:
# Rows in the list layout passed to the sampler:
# [index label, text, '', 'random', 0]
data = [[i, j['text'], '', 'random', 0] for i, j in train_df.iterrows()]
data_valid = [[i, j['text'], '', 'random', 0] for i, j in valid_df.iterrows()]
# Ask for 40 training items that are representative of the validation items.
samples = sampler.get_representative_samples(data_valid, data, 40)

In [None]:
# Position 0 of every item built for the sampler is the train_df index label;
# presumably the sampler returns those items unchanged -- verify against the sampler.
new_df = train_df.loc[[i[0] for i in samples]]

In [None]:
# Experiment with the exemplars chosen by representative sampling
contextualisation = ''
exp_name = 'representative_sampling'
print(exp_name)
preds, score = run_experiment(
    exp_name,
    new_df,
    contextualisation,
    lrml_col='spaced_lrml',
    everyNth=1,
    reversed=False,
)

In [None]:
# Stratified sampling by file: up to two random rows per distinct 'file' value
indices = []
for i in set(train_df['file']):
    rows_of_file = train_df[train_df['file'] == i]
    sampled_rows = rows_of_file.sample(min(len(rows_of_file), 2))
    indices.extend(sampled_rows.index.tolist())
new_df = train_df.loc[indices]
len(new_df)

31

In [None]:
# Experiment with the stratified exemplars
exp_name = 'stratified_sampling'
print(exp_name)
contextualisation = ''
# Shuffle the exemplar order (sample with frac=1 returns all rows in random order).
new_df = new_df.sample(frac=1)
# df = None
preds, score = run_experiment(
    exp_name,
    new_df,
    contextualisation,
    lrml_col='spaced_lrml',
    everyNth=1,
    reversed=False,
)

In [None]:
from sentence_transformers import SentenceTransformer, util
# all-mpnet-base-v2  bert-base-nli-mean-tokens
embedder = SentenceTransformer('all-mpnet-base-v2')
# Materialise the texts once; they are reused for the embeddings and the cluster lists.
train_texts = train_df['text'].tolist()
corpus_embeddings = embedder.encode(train_texts)

# Cluster by embedding
from sklearn.cluster import KMeans
num_clusters = 40
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
# One cluster id per training text, in the order of train_texts
cluster_assignment = clustering_model.labels_

# Training texts grouped by the cluster they were assigned to
clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(train_texts[sentence_id])



In [None]:
# Searching by similarity of either valid lrml to train text, or valid text to train text
for comparison in ['text', 'lrml']:
    search_embeddings = embedder.encode(valid_df[comparison].tolist())
    hits = util.semantic_search(search_embeddings, corpus_embeddings, score_function=util.dot_score)

    # Best-matching training row (first hit) for every validation row
    indices = [query_hits[0]['corpus_id'] for query_hits in hits]

    # Drop duplicate matches and shuffle the exemplar order.
    # (An earlier cluster-based sample built here was overwritten by this
    # line before use and has been removed.)
    new_df = train_df.iloc[list(set(indices))].sample(frac=1)

    exp_name = 'semantic_clustering_'+ comparison
    print(exp_name)
    contextualisation = ''
    preds, score = run_experiment(exp_name, new_df, contextualisation, lrml_col='spaced_lrml', everyNth=1, reversed=False)


In [None]:
# Override method to use semantic retrieval for per clause sampling
def get_sample_text_for_text(text, contextualisation, reversed, doc_split=False, max_length=4060):
    """Build the exemplar text for one clause using embedding similarity.

    The signature matches the n-gram based variant so run_experiment can call
    either one.

    Args:
        text: clause to find exemplars for.
        contextualisation: prompt text counted against the token budget.
        reversed: forwarded to get_sample_text_for_df.
        doc_split: not supported here, because `corpus_embeddings` is built
            from train_df only.
        max_length: token budget forwarded to get_sample_text_for_df.

    Raises:
        NotImplementedError: when `doc_split` is True.
    """
    if doc_split:
        raise NotImplementedError('semantic per clause sampling only has embeddings for the random split')
    # 90 is the minimum estimate of tokens for a sample (45 candidates for the default budget)
    number_of_samples = int(max_length/90)
    search_embeddings = embedder.encode([text])
    hits = util.semantic_search(search_embeddings, corpus_embeddings, score_function=util.dot_score, top_k=number_of_samples)
    # Deduplicate while keeping the similarity ranking; get_sample_text_for_df
    # trims from the end of this order, so a set would drop arbitrary rows.
    indices = list(dict.fromkeys(hit['corpus_id'] for hit in hits[0]))
    selected_samples_df = train_df.iloc[indices]
    return get_sample_text_for_df(selected_samples_df, contextualisation, reversed=reversed, max_length=max_length)

In [None]:
# LRML per clause with semantic search
reversed_options = [True, False]

# The loop variable must not be called `reversed`: at notebook level that
# would replace the builtin reversed() used by other cells.
for use_reversed in reversed_options:
    exp_name = 'perclause_semantic_search'
    if use_reversed:
        exp_name += '_reversed'
    print(exp_name)
    contextualisation = ''
    # per clause triggered through df being None
    preds, score = run_experiment(exp_name, None, contextualisation, lrml_col='spaced_lrml', everyNth=1, reversed=use_reversed)

In [None]:
# Revert back to n-gram based per clause sampling
def get_sample_text_for_text(text, contextualisation, reversed, doc_split=False, max_length=4060):
    """Build the exemplar text for one clause with n-gram overlap sampling.

    Rows are sampled from train_doc_df (document split) or train_df (random
    split) with n-gram sizes 1 and 2, then trimmed to `max_length` tokens by
    get_sample_text_for_df.
    """
    # 90 is the minimum estimate of tokens for a sample
    candidate_count = int(max_length/90)
    if doc_split:
        source_df = train_doc_df
    else:
        source_df = train_df
    candidates_df = sample_rows(source_df, text, range(1,3), candidate_count)
    return get_sample_text_for_df(candidates_df, contextualisation, reversed=reversed, max_length=max_length)

In [None]:
# n-gram based per clause sampling
# The loop variable must not be called `reversed`: at notebook level that
# would replace the builtin reversed() used by other cells.
for use_reversed in reversed_options:
    exp_name = 'perclause'
    if use_reversed:
        exp_name += '_reversed'
    print(exp_name)
    contextualisation = ''
    # per clause triggered through df being None
    preds, score = run_experiment(exp_name, None, contextualisation, lrml_col='spaced_lrml', everyNth=1, reversed=use_reversed)

In [None]:
# Per clause sampling with GPT-4 (exemplars in reversed order)

# exp_name = 'perclause_reversed_gpt4_8000'
contextualisation = ''
exp_name = 'perclause_reversed_gpt4'
print(exp_name)
# sample_df=None triggers per clause sampling
preds, score = run_experiment(
    exp_name,
    None,
    contextualisation,
    lrml_col='spaced_lrml',
    everyNth=1,
    reversed=True,
    gpt4=True,
)

# Contextualising

In [None]:
# Run contextualisation experiments with 10 and 30 samples
names = ['intro', 'spec', 'important', 'references', 'full']
num_samples = [10, 30]
for num in num_samples:
    # Exemplar sets saved by the earlier 'selected_<n>' runs
    new_df = pd.read_csv(f'eppm_preds/selected_{num}_71.csv')

    for name in names:
        # NOTE(review): the 'prompts/' prefix ends up in the output paths
        # under eppm_preds/ -- confirm that directory is intended.
        exp_name = f'prompts/context_{name}_{num}'
        print(exp_name)
        contextualisation = read_file(f'prompts/prompt_eppm_{name}.txt')
        preds, score = run_experiment(
            exp_name,
            new_df,
            contextualisation,
            lrml_col='spaced_lrml',
            everyNth=1,
            reversed=False,
            gpt4=False,
        )


In [None]:
# Run per clause experiments for intro and full contextualisation
names = ['intro', 'full']
# Passing None as the exemplar frame selects the per clause sampling strategy
new_df = None

for name in names:
    exp_name = f'context_{name}_perclause_reverse'
    print(exp_name)
    contextualisation = read_file(f'prompts/prompt_eppm_{name}.txt')
    preds, score = run_experiment(
        exp_name,
        new_df,
        contextualisation,
        lrml_col='spaced_lrml',
        everyNth=1,
        reversed=True,
    )


# CoT

In [208]:
# Load two of the CoT prompt files and display their token counts
# (cl100k_base encoding) to see how much of the budget they use.
cot = read_file('prompts/prompt_eppm_CoT.txt')
cot2 = read_file('prompts/prompt_eppm_CoT_align.txt')
len(enc.encode(cot)), len(enc.encode(cot2))

(2354, 2033)

In [None]:
# CoT experiments: the prompt file is the only context (no exemplar frame)
names = ['CoT', 'CoT_align', 'CoT_full_align']

new_df = None

for name in names:
    exp_name = f'context_{name}'
    print(exp_name)
    contextualisation = read_file(f'prompts/prompt_eppm_{name}.txt')

    preds, score = run_experiment(
        exp_name,
        new_df,
        contextualisation,
        lrml_col='spaced_lrml',
        everyNth=1,
        reversed=False,
        cot=True,
        step_by_step=False,
        should_display=False,
    )

In [12]:
# Prepare df for additional samples

used_df = pd.read_csv('eppm_preds/selected_10_71.csv')
# Only the first 30 rows of the saved exemplar file are considered
rest_df = pd.read_csv('eppm_preds/selected_max30_71.csv').iloc[:30]

# Remove all used samples from rest_df
already_used = rest_df['text'].isin(used_df['text'])
rest_df = rest_df[~already_used]
len(rest_df)

20

In [None]:
# CoT experiments with the remaining exemplars added and step-by-step prompting
names = ['CoT', 'CoT_5']

new_df = rest_df

for name in names:
    exp_name = f'context_{name}_addsamples'
    print(exp_name)
    contextualisation = read_file(f'prompts/prompt_eppm_{name}.txt')

    preds, score = run_experiment(
        exp_name,
        new_df,
        contextualisation,
        lrml_col='spaced_lrml',
        everyNth=1,
        reversed=False,
        cot=True,
        step_by_step=True,
        should_display=False,
    )

# Self reflection

In [None]:
# I can use the predictions saved in the files to get the samples for valid_df
# For example Per Clause samples/Selective/Random?
# For training of this process I'll use the best performing 10 training samples and generate 5 options with different random samples + Intro Sentence?


In [None]:
# One Time Code

# Make predictions with random samples for the 10 selected samples

# _df = pd.read_csv('eppm_preds/selected_10_71.csv')
# # train_df without _df
# _train_df = train_df[~train_df['text'].isin(_df['text'])]
# _df['input'] = _df.apply(lambda j: "Source: " + get_file_names_from_df(j) + " " + j['text'] + "\nTarget: ", axis=1)


# # Random
# for i in range(3):
#     new_df = _train_df.sample(50)
#     exp_name = 'random_for_train_10_' + str(i)
#     print(exp_name)
#     contextualisation = ''
#     preds, score = run_experiment(exp_name, new_df, contextualisation, lrml_col='spaced_lrml', everyNth=1, valid_df=_df)


In [27]:
# Load the sample for those predictions. No separator (imitated by ;;;) - each line is one prediction
_df = pd.read_csv('eppm_preds/selected_10_71.csv')
# A multi-character separator is only supported by the Python parser;
# naming the engine explicitly avoids the ParserWarning about falling back to it.
dfs = [pd.read_csv('eppm_preds/random_for_train_10_' + str(i) + '_10.txt', sep=';;;', header=None, engine='python') for i in range(3)]

  dfs = [pd.read_csv('eppm_preds/random_for_train_10_' + str(i) + '_10.txt', sep=';;;', header=None) for i in range(3)]
  dfs = [pd.read_csv('eppm_preds/random_for_train_10_' + str(i) + '_10.txt', sep=';;;', header=None) for i in range(3)]
  dfs = [pd.read_csv('eppm_preds/random_for_train_10_' + str(i) + '_10.txt', sep=';;;', header=None) for i in range(3)]


In [23]:
def shuffle_dict(my_dict):
    """Return a new dict with the same items as `my_dict` in random key order.

    The input is left untouched; the order comes from random.shuffle.
    """
    order = list(my_dict.keys())
    random.shuffle(order)
    return dict((key, my_dict[key]) for key in order)

# For each of the 10 samples in _df, check which of the 3 predictions in the dfs is the closest to the sample
def score_samples(_df, dfs):
    """Score every candidate prediction against the reference of each row.

    Args:
        _df: frame with the references in 'spaced_lrml'.
        dfs: list of prediction frames; column 0, label i, holds the
            prediction for row i of `_df`.

    Returns:
        One dict per row of `_df`, mapping the 'lrml_f_score' of a candidate
        to its prediction text, with the entries in random order.
    """
    all_scores = []
    for i in range(len(_df)):
        scores = {}
        for predictions_df in dfs:
            # First and only column, i-th row: the prediction of this run
            prediction = predictions_df[0][i]
            score = compute_lrml(predictions=[clean_pred(prediction, [])], references=[clean_pred(_df['spaced_lrml'].iloc[i], [])], entity_weight=2, filter_empty=True)['lrml_f_score']
            # Scores are dict keys: nudge equal scores apart so no candidate is overwritten
            while score in scores:
                score += 0.000000001
            scores[score] = prediction
        # Shuffle so the best run does not always sit at the same position
        all_scores.append(shuffle_dict(scores))
    return all_scores


In [24]:
def get_max_index(score_dict):
    """Return the position, in insertion order, of the largest key of `score_dict`.

    Raises ValueError for an empty dict.
    """
    best_key = max(score_dict)
    return next(position for position, key in enumerate(score_dict)
                if key is best_key or key == best_key)

def get_predicted_values(scores, options):
    """Render the candidate predictions of one row, one per line.

    Args:
        scores: dict mapping score -> prediction; only the values and their
            order are used.
        options: when truthy each line is prefixed with "Option <position>: ".

    Returns:
        The lines joined with a newline ('' for an empty dict).
    """
    if options:
        lines = []
        for position, prediction in enumerate(scores.values()):
            lines.append('Option ' + str(position) + ': ' + prediction)
        return '\n'.join(lines)
    return '\n'.join(str(prediction) for prediction in scores.values())


def get_self_reflection_samples(_df, dfs, choose=True, options=False):
    """Build the few-shot text for the self-reflection experiments.

    Every row of `_df` becomes "Source: <document> <text>", the candidate
    predictions, and a "Target: " line whose content depends on the mode:

    * options=True: "Option <n>", the position of the best-scoring candidate;
    * choose=False (options False): the reference in 'spaced_lrml';
    * otherwise: the text of the best-scoring candidate.

    The index labels of `_df` are used as positions in the score list.
    """
    all_scores = score_samples(_df, dfs)

    def target_for(index, row):
        if options:
            return "Option " + str(get_max_index(all_scores[index]))
        if not choose:
            return row['spaced_lrml']
        return all_scores[index][max(all_scores[index])]

    samples = []
    for index, i in _df.iterrows():
        samples.append("Source: " + get_file_names_from_df(i) + " " + i['text'] + "\n" + get_predicted_values(all_scores[index], options) + "\nTarget: " + target_for(index, i))
    return '\n\n'.join(samples)


In [25]:
def get_validation_scores(valid_df, df_names):
    """Load stored predictions and score them against `valid_df`.

    Args:
        valid_df: frame with the references (see score_samples).
        df_names: names of prediction files, read from 'eppm_preds/<name>.txt'.

    Returns:
        The list of score dicts produced by score_samples.
    """
    # ';;;' is a separator that does not occur, so every line becomes one cell.
    # A multi-character separator needs the Python parser; naming it
    # explicitly avoids a ParserWarning on every load.
    dfs = [pd.read_csv('eppm_preds/' + i + '.txt', sep=';;;', header=None, engine='python') for i in df_names]
    return score_samples(valid_df, dfs)

def get_self_reflection_prompt(row, scores, options):
    """Compose the self-reflection prompt for a single row.

    The prompt is a ``Source:`` line (file name from ``get_file_names_from_df``
    followed by the row's ``text``), the candidates rendered by
    ``get_predicted_values`` and an open ``Target: `` line for the completion.
    """
    source_line = "Source: " + get_file_names_from_df(row) + " " + row['text']
    candidates = get_predicted_values(scores, options)
    return source_line + "\n" + candidates + "\nTarget: "


In [None]:
# Experiment name -> names of the prediction files handed to run_experiment
# as self_reflect_df_names.
names = {
    'self_reflect_mixed': ['random_1_71', 'context_full_perclause_reverse_71', 'semantic_clustering_71'],
    'self_reflect_best': ['perclause_reversed_71', 'context_full_perclause_reverse_71', 'context_intro_perclause_reverse_71'],
}

#  [choose, options]
reflection_modes = { 'choose': [True, False], 'options': [False, True], 'improve': [False, False]}

use_gpt4=False

# The exemplar predictions are the same for every experiment, so they are read
# once instead of once per loop iteration. engine='python' is required for the
# multi-character separator (the default C parser does not support it).
dfs = [pd.read_csv('eppm_preds/random_for_train_10_' + str(i) + '_10.txt', sep=';;;', header=None, engine='python') for i in range(3)]

for name, df_names in names.items():
    for reflection_mode_name, reflection_mode in reflection_modes.items():
        exp_name = name + '_' + reflection_mode_name
        print(exp_name)
        # Rebuilt for every experiment on purpose: the exemplar text is generated per run.
        contextualisation = get_self_reflection_samples(_df, dfs, choose=reflection_mode[0], options=reflection_mode[1])

        preds, score = run_experiment(exp_name, None, contextualisation, lrml_col='spaced_lrml', everyNth=1, reversed=False, cot=False, step_by_step=False, should_display=False, self_reflect=True, self_reflect_df_names=df_names, options=reflection_mode[1], gpt4=use_gpt4)

# Recreate Training Set

In [None]:
random_1_71 = pd.read_csv('eppm_preds/random_1_71.csv')
# train_df without the first 32 samples of random_1_71 (matched on their text).
# .copy() makes this an independent frame: later cells add and overwrite columns
# on _train_df, which on a filtered slice triggers SettingWithCopyWarning.
_train_df = train_df[~train_df['text'].isin(random_1_71['text'][:32])].copy()


In [None]:
# Concatenate prompt input and target, then report the longest encoded length.
# NOTE(review): `enc` is presumably a tokenizer encoding defined earlier — confirm.
_train_df['inputoutput'] = _train_df['input'] + _train_df['spaced_lrml']
encoded_lengths = _train_df['inputoutput'].apply(enc.encode).apply(len)
max_valid = encoded_lengths.max()
max_valid

In [None]:
# intro --- Test allowed length -> 32
contextualisation = read_file('prompts/prompt_eppm_intro.txt')
get_sample_text_for_df(random_1_71, contextualisation, reversed=False, wandb_log=False)

Number of samples:  32


"Source: E2AS1 Parapets require a drained cavity for claddings except for vertical corrugated steel as outlined in Table 3.\nTarget: if( and( has( parapet, cladding), not( is( cladding. material, steel)), is( steel. type, vertical corrugated))), then( obligation( and( has( parapet, drained cavity), for( drained cavity, cladding))))\n\nSource: D1AS1 Access Route; Single Isolated Step 1.3.2 Threshold weather stops projecting no more than 20 mm above the threshold finished surface are acceptable.\nTarget: if( and( has( access route, single isolated step), is( single isolated step, threshold weather stop), less than equal( single isolated step. height, 20 mm), above( single isolated step. height, threshold finished surface))), then( permission( has( access route, single isolated step)))\n\nSource: E2AS1 For slatted decks, a minimum gap of 12 mm shall be provided between the exterior wall and the adjacent decking slat.\nTarget: if( is( deck, slatted)), then( obligation( and( adjacent( deck.

In [None]:
# Predict the remaining training rows (_train_df) using random_1_71 as exemplars.
# NOTE(review): the number below is presumably the score recorded for this run — confirm.
names = ['train_df'] # 75.39979152186623

new_df = None

for name in names:
    exp_name = name
    print(exp_name)

    contextualisation = read_file('prompts/prompt_eppm_intro.txt')

    # Only the first 32 samples of random_1_71 will be used
    preds, score = run_experiment(
        exp_name,
        random_1_71,
        contextualisation,
        lrml_col='spaced_lrml',
        everyNth=1,
        reversed=False,
        cot=False,
        step_by_step=False,
        should_display=False,
        self_reflect=False,
        valid_df=_train_df,
    )

In [288]:
# Load the predictions for the training rows (one prediction per line, single column 0).
# engine='python' is required for the multi-character separator ';;;'; without it
# pandas warns and falls back to the python engine.
prediction_df = pd.read_csv('eppm_preds/train_df_544.txt', sep=';;;', header=None, engine='python')

  prediction_df = pd.read_csv('eppm_preds/train_df_544.txt', sep=';;;', header=None)


In [295]:
# Map the predicted values to the training dataframe
for i in range(len(_train_df)):
    _train_df['spaced_lrml'].iloc[i] = prediction_df[0][i]
_train_df.iloc['spaced_lrml'] = prediction_df[0]
_train_df = _train_df[:-1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _train_df['spaced_lrml'].iloc[i] = prediction_df[0][i]


In [299]:
# Map the predicted values to the original dataframe

# Assuming the column 'spaced_lrml' doesn't exist in the original df, create it as NaN values
# (Re)create 'spaced_lrml' filled with NaN. The column is created with object
# dtype because it is filled with strings below; writing strings into a float
# column is deprecated in recent pandas versions.
df['spaced_lrml'] = pd.Series(float('NaN'), index=df.index, dtype=object)

# Check if each row in df is present in _train_df
mask = df.index.isin(_train_df.index)


# Update the 'spaced_lrml' column for rows in df that are present in _train_df
# (the right-hand Series is aligned on the index labels)
df.loc[mask, 'spaced_lrml'] = _train_df['spaced_lrml']

# Calculate and update the 'spaced_lrml' column for rows in df that are not present in _train_df
df.loc[~mask, 'spaced_lrml'] = df.loc[~mask, 'lrml'].apply(tree_based_spacing)

In [None]:
# Persist the dataframe, including the 'spaced_lrml' column filled above, without the index column
df.to_csv('data/lrml_ds_v8_gen_data.csv', index=False)

# Predict Additional Samples

In [15]:
# Due to copyright only the 150 additional generations can be shown
add_df = pd.read_csv('data/lrml_additional.csv')
add_df['input'] = add_df.apply(lambda j: "Source: " + get_file_names_from_df(j) + " " + j['text'] + "\nTarget: ", axis=1)

In [None]:
# Generate predictions for the additional samples using the intro prompt
exp_name = 'additional_train'
print(exp_name)
contextualisation = read_file('prompts/prompt_eppm_intro.txt')
preds, score = run_experiment(
    exp_name,
    None,
    contextualisation,
    lrml_col='spaced_lrml',
    everyNth=1,
    reversed=True,
    valid_df=add_df,
)

In [20]:
# Use the predictions as the 'spaced_lrml' column of the additional samples.
# NOTE(review): `preds` is whatever the most recent run_experiment call left in the
# kernel — presumably the 'additional_train' run, one prediction per row in order; confirm.
add_df['spaced_lrml'] = preds


In [21]:
# Give every additional sample the value 1 in both split columns.
# NOTE(review): presumably 1 marks the training split — confirm against the split convention.
add_df['random_split'] = 1
add_df['doc_split'] = 1

In [22]:
# Write the enriched frame back to the CSV it was loaded from (overwrites the file; index omitted)
add_df.to_csv('data/lrml_additional.csv', index=False)

In [23]:
# Reload the original dataset, replacing whatever `df` currently holds in the kernel
df = pd.read_csv('data/lrml_ds_v8_sel.csv')

In [24]:
# Append the additional samples (add_df) to df; ignore_index=True renumbers the rows from 0
df = pd.concat([df, add_df], ignore_index=True)

In [26]:
# Save the dataset extended with the additional samples (index omitted)
df.to_csv('data/lrml_ds_v8_add_data_150.csv', index=False)

In [108]:
# Reload the original dataset so the following sections do not see the additional samples in `df`
df = pd.read_csv('data/lrml_ds_v8_sel.csv')

# GPT-4

## Validation Set

In [None]:
# Validate GPT-4 with and without context and with per clause sampling
names = ['no','full']

for name in names:
    # Named '..._perclause_reversed_gpt4' so the run matches the
    # 'context_<name>_perclause_reversed_gpt4_71' prediction files consumed by the
    # GPT-4 self-reflection cell, and does not reuse the experiment name of the
    # earlier 'context_<name>_perclause_reverse' runs.
    exp_name = 'context_' + name + '_perclause_reversed_gpt4'
    print(exp_name)
    contextualisation = read_file('prompts/prompt_eppm_' + name + '.txt')
    preds, score = run_experiment(exp_name, None, contextualisation, lrml_col='spaced_lrml', everyNth=1, reversed=True, gpt4=True)

In [None]:
# Per clause sampling with longer context lengths
names = ['full']
lengths = [6000, 8000]

for name in names:
    for length in lengths:
        # The max_length value is part of the experiment name
        exp_name = 'context_' + name + '_perclause_reversed_gpt4_' + str(length)
        print(exp_name)
        contextualisation = read_file('prompts/prompt_eppm_' + name + '.txt')
        preds, score = run_experiment(
            exp_name,
            None,
            contextualisation,
            lrml_col='spaced_lrml',
            everyNth=1,
            reversed=True,
            gpt4=True,
            max_length=length,
        )

In [None]:
# Chain of thought new: Context + alignment-based + step by step
names = ['CoT_align_stepbystep']
context = 'full'

for name in names:
    exp_name = 'context_' + context + '_' + name + '_perclause_6000'
    print(exp_name)
    # The context prompt and the chain-of-thought exemplars live in separate prompt files
    contextualisation = read_file('prompts/prompt_eppm_' + context + '.txt')
    cot_exemplars = read_file('prompts/prompt_eppm_' + name + '.txt')

    preds, score = run_experiment(
        exp_name,
        None,
        contextualisation,
        lrml_col='spaced_lrml',
        everyNth=1,
        reversed=True,
        cot=True,
        step_by_step=True,
        should_display=False,
        gpt4=True,
        max_length=6000,
        cot_exemplars=cot_exemplars,
    )

In [None]:
# GPT-4 Self Reflections
# Experiment name -> names of the GPT-4 prediction files handed to run_experiment
# as self_reflect_df_names.
names = {
    'self_reflect_best-6000_gpt4': ['context_no_perclause_reversed_gpt4_71', 'context_full_perclause_reversed_gpt4_71', 'context_full_perclause_reversed_gpt4_6000_71']
}

#  [choose, options]
reflection_modes = { 'choose': [True, False]}

# NOTE(review): gpt4 is disabled for the reflection step itself even though the
# candidates come from GPT-4 runs — confirm this is intended.
use_gpt4=False

# The exemplar predictions are the same for every experiment, so they are read
# once instead of once per loop iteration. engine='python' is required for the
# multi-character separator (the default C parser does not support it).
dfs = [pd.read_csv('eppm_preds/random_for_train_10_' + str(i) + '_10.txt', sep=';;;', header=None, engine='python') for i in range(3)]

for name, df_names in names.items():
    for reflection_mode_name, reflection_mode in reflection_modes.items():
        exp_name = name + '_' + reflection_mode_name
        print(exp_name)
        # Rebuilt for every experiment on purpose: the exemplar text is generated per run.
        contextualisation = get_self_reflection_samples(_df, dfs, choose=reflection_mode[0], options=reflection_mode[1])

        preds, score = run_experiment(exp_name, None, contextualisation, lrml_col='spaced_lrml', everyNth=1, reversed=False, cot=False, step_by_step=False, should_display=False, self_reflect=True, self_reflect_df_names=df_names, options=reflection_mode[1], gpt4=use_gpt4)

## Test sets

In [None]:
# Evaluate GPT-4 on the test splits with and without context and with per clause sampling
names = ['no', 'intro', 'full']

# Each entry: [split name used in the experiment name, doc_split flag, dataframe to evaluate]
test_splits = [['test', False, test_df], ['doc_test', True, test_doc_df]]

for name in names:
    for test_split in test_splits:
        # Named '..._perclause_reversed_gpt4_<split>' so the run matches the
        # 'context_<name>_perclause_reversed_gpt4_<split>_NN' prediction files that
        # the test-set self-reflection cell reads.
        exp_name = 'context_' + name + '_perclause_reversed_gpt4_' + test_split[0]
        print(exp_name)
        contextualisation = read_file('prompts/prompt_eppm_' + name + '.txt')
        preds, score = run_experiment(exp_name, None, contextualisation, lrml_col='spaced_lrml', everyNth=1, reversed=True, gpt4=True, doc_split=test_split[1], valid_df=test_split[2])

In [None]:
# Per clause sampling with longer context lengths
names = ['full']
lengths = [6000]
# Each entry: [split name used in the experiment name, doc_split flag, dataframe to evaluate]
test_splits = [['test', False, test_df], ['doc_test', True, test_doc_df]]

for name in names:
    for length in lengths:
        for test_split in test_splits:
            exp_name = 'context_' + name + '_perclause_reversed_gpt4_' + str(length) + '_' + test_split[0]
            print(exp_name)
            contextualisation = read_file('prompts/prompt_eppm_' + name + '.txt')
            preds, score = run_experiment(
                exp_name,
                None,
                contextualisation,
                lrml_col='spaced_lrml',
                everyNth=1,
                reversed=True,
                gpt4=True,
                max_length=length,
                doc_split=test_split[1],
                valid_df=test_split[2],
            )

In [None]:
# Each entry: [split name, doc_split flag, dataframe to evaluate, prediction files to reflect on]
test_splits = [
    ['test', False, test_df, ['context_no_perclause_reversed_gpt4_test_71', 'context_full_perclause_reversed_gpt4_6000_test_71', 'context_full_perclause_reversed_gpt4_test_71']], 
    ['doc_test', True, test_doc_df, ['context_no_perclause_reversed_gpt4_doc_test_55', 'context_full_perclause_reversed_gpt4_6000_doc_test_55', 'context_full_perclause_reversed_gpt4_doc_test_55']]
    ]

#  [choose, options]
reflection_modes = { 'choose': [True, False]}

use_gpt4=False

# The exemplar predictions are the same for every experiment, so they are read
# once instead of once per loop iteration. engine='python' is required for the
# multi-character separator (the default C parser does not support it).
dfs = [pd.read_csv('eppm_preds/random_for_train_10_' + str(i) + '_10.txt', sep=';;;', header=None, engine='python') for i in range(3)]

for test_split in test_splits:
    for reflection_mode_name, reflection_mode in reflection_modes.items():
        exp_name = 'self_reflect_best-6000_gpt4_' + reflection_mode_name + '_' + test_split[0]
        print(exp_name)
        # Rebuilt for every experiment on purpose: the exemplar text is generated per run.
        contextualisation = get_self_reflection_samples(_df, dfs, choose=reflection_mode[0], options=reflection_mode[1])

        preds, score = run_experiment(exp_name, None, contextualisation, lrml_col='spaced_lrml', everyNth=1, reversed=False, cot=False, step_by_step=False, should_display=False, self_reflect=True, self_reflect_df_names=test_split[3], options=reflection_mode[1], gpt4=use_gpt4, doc_split=test_split[1], valid_df=test_split[2])

# Examples

In [147]:
# engine='python' is given explicitly: the multi-character separator ';;;' is not
# supported by the default C parser, so pandas would otherwise warn and fall back.
dfs = [pd.read_csv('eppm_preds/context_full_perclause_reverse_71.txt', sep=';;;', header=None, engine='python')]
# get_self_reflection_samples pairs rows with scores by position, so give
# valid_df a clean 0..n-1 index first (mutates valid_df in place).
valid_df.reset_index(drop=True, inplace=True)
# Use self reflection functionality for scoring the predictions
# choose=False: each sample is printed with the reference 'spaced_lrml' as its target
print(get_self_reflection_samples(valid_df, dfs, choose=False))

  dfs = [pd.read_csv('eppm_preds/context_full_perclause_reverse_71.txt', sep=';;;', header=None)]


Source: CAS2 1.4.6 For the purposes of risk group SI the term 'bed' means the number of people that are under care or detention. It can include people on: a) Beds, or b) Recliner or lounge chairs, or c) Dentist chairs, or d) Treatment tables, or e) Any other furniture where an occupant may be for a period of treatment, in care or detention.
(51.877711957171314, 'if( is( space. risk group, si)), then( define( bed, and( or( is( furniture, bed), is( furniture, recliner chair), is( furniture, lounge chair), is( furniture, dentist chair), is( furniture, treatment table), is( furniture, other)), is( furniture, occupant))))')
Target: if( is( space. risk group, si)), then( and( is( bed, person), within( person, or( care, detention)), include( bed, or( recliner, lounge chair, dentist chair, treatment table, treatment furniture))))

Source: CAS2 This Acceptable Solution is one of three Acceptable Solutions that provide a means of establishing compliance with NZBC Clauses C1 to C6 Protection from

# Number of Samples

In [13]:
# Display floats with two decimals
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# NOTE: this rebinds `df`, which earlier sections use for the dataset
df = pd.read_csv('data/eppm_num_samples.csv')

# Keep only the columns whose names do not end in MIN or MAX
kept_columns = [col for col in df.columns if not col.endswith(('MIN', 'MAX'))]
df = df[kept_columns]

# Average all values per column
column_averages = df.mean()

column_averages

Step                                                                  35.00
context_no_perclause_reversed_gpt4_doc_test_55 - num_samples          32.35
context_full_perclause_reversed_gpt4_6000_test_71 - num_samples       39.61
context_full_perclause_reversed_gpt4_6000_doc_test_55 - num_samples   43.02
context_full_perclause_reversed_gpt4_8000_71 - num_samples            55.13
context_full_perclause_reversed_gpt4_6000_71 - num_samples            39.77
context_full_perclause_reversed_gpt4_doc_test_55 - num_samples        26.47
context_full_perclause_reversed_gpt4_test_71 - num_samples            25.01
context_full_perclause_reversed_gpt4_71 - num_samples                 25.01
context_no_perclause_reversed_gpt4_71 - num_samples                   30.37
context_intro_perclause_reversed_gpt4_71 - num_samples                30.10
context_no_perclause_reversed_gpt4_test_71 - num_samples              30.44
context_full_CoT_align_stepbystep_perclause_6000_71 - num_samples     38.99
dtype: float