In [1]:
import os
import openai

# os.environ["OPENAI_API_KEY"] = "EMPTY" # Let's ensure we're not going to successfully make any API calls
# openai.api_key = os.environ["OPENAI_API_KEY"]

# my_server = 'http://192.168.1.76:8081/v1'

# os.environ["OPENAI_BASE_URL"] = my_server
# os.environ["OPENAI_API_BASE"] = my_server
# openai.base_path = my_server
# openai.api_base = my_server

import getpass, os 
# Prompt for the key instead of hard-coding it, so it never ends up in the saved notebook.
secret_key = getpass.getpass('Enter OpenAI secret key: ') 
os.environ['OPENAI_API_KEY'] = secret_key
# The pre-1.0 `openai` package (the one exposing `openai.ChatCompletion`) reads
# OPENAI_API_KEY a single time, when it is imported. `import openai` already ran
# at the top of this cell, so exporting the variable afterwards is too late on a
# fresh kernel; set the module attribute directly as well.
openai.api_key = secret_key

In [2]:
def expand_pos_code(pos_code):
    """Translate a two-character part-of-speech code into its prompt description.

    Some descriptions deliberately carry a parenthetical instruction for the
    language model (e.g. how to gloss adjectives or finite verbs), because the
    returned text is embedded verbatim in the prompt.

    Args:
        pos_code: part-of-speech code such as 'N-', 'V-' or 'RA'.

    Returns:
        The matching description, or '' when the code is not recognised.
    """
    descriptions = {
        'A-': 'adjective (gloss should be modifier)',
        'C-': 'conjunction (gloss should be conjunction)',
        'D-': 'adverb (gloss should be adverbial, e.g., "not" instead of "no")',
        'I-': 'interjection',
        'N-': 'noun',
        'P-': 'preposition',
        'RA': 'definite article',
        'RD': 'demonstrative pronoun',
        'RI': 'interrogative/indefinite pronoun (add a "?" to the end of the gloss, e.g., "who?")',
        'RP': 'personal pronoun',
        'RR': 'relative pronoun',
        'V-': 'verb (if a finite verb, include the implied subject, e.g., "he said", "they said", "you said", "we said", etc.)',
        'X-': 'particle',
    }
    if pos_code in descriptions:
        return descriptions[pos_code]
    # Unknown code: callers concatenate the result, so hand back an empty string.
    return ''

def expand_parsing_code(parsing_code):
    """Spell out a positional parsing code as ' | '-separated feature names.

    The character at position k is looked up in the k-th table below (person,
    tense, voice, mood, case, number, gender, degree). A character with no
    entry in its table (such as '-') contributes nothing, positions missing
    from a short code contribute nothing, and characters beyond the eighth
    are ignored.

    Args:
        parsing_code: positional code such as '3FMI-S--'.

    Returns:
        The recognised feature names joined with ' | ' ('' if there are none).
    """
    slot_tables = (
        {'1': '1st', '2': '2nd', '3': '3rd'},  # person
        {'P': 'present', 'I': 'imperfect', 'F': 'future', 'A': 'aorist', 'X': 'perfect', 'Y': 'pluperfect'},  # tense
        {'A': 'active', 'M': 'middle', 'P': 'passive'},  # voice
        {'I': 'indicative', 'D': 'imperative', 'S': 'subjunctive', 'O': 'optative', 'N': 'infinitive', 'P': 'participle'},  # mood
        {'N': 'nominative', 'G': 'genitive', 'D': 'dative', 'A': 'accusative'},  # case
        {'S': 'singular', 'P': 'plural'},  # number
        {'M': 'masculine', 'F': 'feminine', 'N': 'neuter'},  # gender
        {'C': 'comparative', 'S': 'superlative'},  # degree
    )

    # zip stops at the shorter sequence, which both ignores surplus characters
    # and leaves absent trailing slots out of the result.
    names = [table.get(symbol, '') for table, symbol in zip(slot_tables, parsing_code)]
    return " | ".join(name for name in names if name)

# Test the functions
print(expand_pos_code('A-'))  # Output: "adjective (gloss should be modifier)"
print(expand_parsing_code('3FMI-S--'))  # Output: "3rd | future | middle | indicative | singular" ('-' slots are dropped, not printed)


adjective (gloss should be modifier)
3rd | future | middle | indicative | singular


In [3]:
import csv

# input_csv = "/Users/ryderwishart/SBLGNT-syntax-cleanup/vocabulary/SBLGNT_norms.txt"
input_csv = "/Users/ryderwishart/SBLGNT-syntax-cleanup/vocabulary/all_norms.tsv"
output_csv = "/Users/ryderwishart/SBLGNT-syntax-cleanup/vocabulary/generalized_form_sensitive_glosses_sept_5_2023.csv"

# Survey the input: collect every distinct part-of-speech tag and parse code.
# Column layout used throughout this notebook:
#   0 = normalized word form, 1 = lemma, 2 = POS tag, 3 = parse code,
#   4 = '|'-separated glosses.
all_unique_parse_codes = set()
all_unique_pos = set()
# Decode explicitly as UTF-8 (as process_csv does): the file contains Greek
# text and the platform's default encoding is not guaranteed to be UTF-8.
with open(input_csv, 'r', encoding='utf8') as infile:
    csv_reader = csv.reader(infile, delimiter='\t')
    for row in csv_reader:
        pos = row[2]
        all_unique_pos.add(pos)
        parse_code = row[3]
        all_unique_parse_codes.add(parse_code)

In [4]:
def process_csv(input_file, output_file, max_rows=10):
    """Early draft of process_csv; a later cell in this notebook redefines it.

    This version opens both files, reports the number of input lines, and
    walks the selected rows building `parse_description`, then discards it.
    It never requests a gloss and never writes a row. Because `output_file`
    is opened in 'w' mode, calling this draft still truncates that file.

    Args:
        input_file: path of the tab-delimited input, read as UTF-8.
        output_file: path opened for writing (truncated on open).
        max_rows: row limit; values below 1 select the input's line count.
    """
    with open(input_file, 'r', encoding='utf8') as infile, open(output_file, 'w', newline='') as outfile:
        # Count lines first so the limit can default to "the whole file".
        input_file_lines_length = 0
        for line in infile.readlines():
            input_file_lines_length += 1
        print(f"Input file has {input_file_lines_length} lines.")
        
        # Reset the file pointer to the beginning
        infile.seek(0)

        csv_reader = csv.reader(infile, delimiter='\t')
        # The writer is created but no row is ever written in this draft.
        csv_writer = csv.writer(outfile, delimiter='\t')

        count = 0
        
        if max_rows < 1:
            max_rows_to_process = input_file_lines_length
        else:
            max_rows_to_process = max_rows
            
        print(f"Processing {max_rows_to_process} rows.")
        
        # 1-based index of the current row.
        i = 0
        
        for row in csv_reader:
            i += 1
            
            # Hard-coded resume point: rows before 8930 + 12592 + 3296 + 2093 (= 26911) are skipped.
            # NOTE(review): the four terms look like the sizes of earlier partial runs - confirm.
            if i < 8930 + 12592 + 3296 + 2093:
                continue
            
            
            # No-op: nothing is skipped here, despite the stated intent to skip a header row.
            if count == 0:
                pass # Skip the header row
            
            
            # The upper bound uses 8929 + 12591 + 3295 + 2092 (= 26907), four less than the
            # skip threshold above, so this window covers max_rows_to_process - 3 rows.
            if i <= max_rows_to_process + 8929 + 12591 + 3295 + 2092:
                # print(row)
                word_form = row[0]
                lemma = row[1]
                pos = row[2]
                parse_code = row[3]
                glosses = row[4].split('|')
                
                # if not 'verb' in expand_pos_code(pos):
                #     continue
                
                # POS description and expanded parse labels, separated by one space.
                parse_description = expand_pos_code(pos) + ' ' + expand_parsing_code(parse_code)
                

In [5]:
# Number of distinct parse codes and distinct POS tags found in the input file.
# The recorded output shows 14 POS values while expand_pos_code knows 13 codes,
# so at least one value in the file (possibly a header cell - TODO confirm) expands to ''.
print(len(all_unique_parse_codes), len(all_unique_pos))

453 14


In [6]:
# Few-shot prompt: one instruction line followed by USER/ASSISTANT example pairs.
# NOTE: the next cell assigns a shorter string to the same name `examples`, so on a
# top-to-bottom run this longer version is replaced before process_csv reads it.
examples = '''A chat where the helpful assistant always returns a single generalized English gloss for a Greek word with parsing information and example glosses. Always answer with only a single English gloss, no matter what.
USER: καί (conjunction ) ['and', '', 'but', 'also', 'although']
ASSISTANT: and
USER: καί (adverb ) ['also', 'even', 'and', '', '[so] also']
ASSISTANT: also
USER: δέ (conjunction ) ['now', 'and', 'then', 'however', 'but']
ASSISTANT: but
USER: λέγοντες (verb (if a finite verb, include the implied subject, e.g., "he said", "they said", "you said", "we said", etc.) present | active | participle | nominative | plural | masculine) ['saying', '', 'those saying', 'saying,', 'saying']
ASSISTANT: saying
USER: ἀποκριθείς (verb (if a finite verb, include the implied subject, e.g., "he said", "they said", "you said", "we said", etc.) aorist | passive | participle | nominative | singular | masculine) ['answering', 'having answered', 'and ~ having answered', 'and ~ having answered,', 'answering']
ASSISTANT: answering
USER: ὁ (definite article nominative | singular | masculine) ['the', '-', 'the [one]', 'who [is]', '']
ASSISTANT: the
USER: ἐγέννησε(ν) (verb (if a finite verb, include the implied subject, e.g., "he said", "they said", "you said", "we said", etc.) 3rd | aorist | active | indicative | singular) ['begat', 'fathered']
ASSISTANT: he begat
USER: τοῦ (definite article genitive | singular | masculine) ['the', '-', 'of the', 'of', 'who [is]']
ASSISTANT: of the
USER: ἐστί(ν) (verb (if a finite verb, include the implied subject, e.g., "he said", "they said", "you said", "we said", etc.) 3rd | present | active | indicative | singular) ['is', 'it is', 'he is', 'are', 'is he']
ASSISTANT: he/it is
USER: ἐν (preposition ) ['in', 'by', 'with', 'on', 'among']
ASSISTANT: in
USER: αὐτοῦ (personal pronoun genitive | singular | masculine) ['of him', 'him', 'he', 'his', '']
ASSISTANT: his
USER: αὐτοῦ (personal pronoun genitive | singular | neuter) ['it', 'of it', 'of them', 'it', 'of it']
ASSISTANT: its
USER: καί (adverb (gloss should be adverbial, e.g., "not" instead of "no")) ['also', 'even', 'and', '', '[so] also']
ASSISTANT: also
USER: οὐ (adverb (gloss should be adverbial, e.g., "not" instead of "no")) ['not', 'no', 'nothing', 'neither', 'never']
ASSISTANT: not
USER: εἶπε(ν) (verb (if a finite verb, include the implied subject, e.g., "he said", "they said", "you said", "we said", etc.) 3rd | aorist | active | indicative | singular) ['said', 'he said', '', 'commanded', 'he says']
ASSISTANT: he said
USER: ἔσται (verb (if a finite verb, include the implied subject, e.g., "he said", "they said", "you said", "we said", etc.) 3rd | future | middle | indicative | singular) ['will be', 'it will be', 'will it be', 'he will be', 'shall have been']
ASSISTANT: he/it will be
USER: οὐδέ (adverb (gloss should be adverbial, e.g., "not" instead of "no")) ['not even', 'neither', 'neither', 'even', 'nether']
ASSISTANT: neither
USER: λέγουσι(ν) (verb (if a finite verb, include the implied subject, e.g., "he said", "they said", "you said", "we said", etc.) 3rd | present | active | indicative | plural) ['they say', 'say', 'said', 'do pronounce', 'saying']
ASSISTANT: they say
USER: εἰσί(ν) (verb (if a finite verb, include the implied subject, e.g., "he said", "they said", "you said", "we said", etc.) 3rd | present | active | indicative | plural) ['are', 'there are', 'they are', 'are they', '']
ASSISTANT: they are
USER: ἦλθε(ν) (verb (if a finite verb, include the implied subject, e.g., "he said", "they said", "you said", "we said", etc.) 3rd | aorist | active | indicative | singular) ['came', 'he went', 'he came', 'is come', 'she came']
ASSISTANT: he came
USER: ἦν (verb (if a finite verb, include the implied subject, e.g., "he said", "they said", "you said", "we said", etc.) 3rd | imperfect | active | indicative | singular) ['was', 'he was', 'there was', 'came about', 'he remained']
ASSISTANT: he/it was
USER: ὧδε (adverb (gloss should be adverbial, e.g., "not" instead of "no")) ['here', 'here [is]', 'here', 'here.', 'here,']
ASSISTANT: here
USER: εἶ (verb (if a finite verb, include the implied subject, e.g., "he said", "they said", "you said", "we said", etc.) 2nd | present | active | indicative | singular) ['you are', 'are', '[it] is', 'are you', 'are']
ASSISTANT: you are
USER: τοῦ (definite article genitive | singular | neuter) ['the', 'of the', '-', 'against', 'than']
ASSISTANT: of the
'''

In [7]:
# Shorter few-shot prompt. It rebinds `examples`, replacing the longer string assigned
# in the previous cell; process_csv reads this global and puts it in front of every request.
# NOTE(review): these examples show the bare POS label with a trailing space, e.g.
# "(conjunction )", whereas process_csv builds live USER lines from the stripped
# expand_pos_code text, e.g. "(conjunction (gloss should be conjunction))". The model
# therefore sees a different format in the examples than in the actual question - confirm
# whether that is intended.
examples = '''A chat where the helpful assistant always returns a single generalized English gloss for a Greek word with parsing information and example glosses. Always answer with only a single English gloss, no matter what.
USER: καί (conjunction ) ['and', '', 'but', 'also', 'although']
ASSISTANT: and
USER: καί (adverb ) ['also', 'even', 'and', '', '[so] also']
ASSISTANT: also
USER: δέ (conjunction ) ['now', 'and', 'then', 'however', 'but']
ASSISTANT: but
USER: ἐν (preposition ) ['in', 'by', 'with', 'on', 'among']
ASSISTANT: in
USER: αὐτοῦ (personal pronoun genitive | singular | masculine) ['of him', 'him', 'he', 'his', '']
ASSISTANT: his
USER: τοῦ (definite article genitive | singular | neuter) ['the', 'of the', '-', 'against', 'than']
ASSISTANT: of the
'''

In [241]:
def generate_gloss(prompt, temperature=0.2, model="gpt-3.5-turbo", max_tokens=10):
    """Send `prompt` as a single user message and return the reply text, stripped.

    Uses the legacy `openai.ChatCompletion` interface. Generation stops at the
    first newline or semicolon, so the reply is at most one line.

    Args:
        prompt: complete text sent as the content of one "user" message.
        temperature: sampling temperature forwarded to the API.
        model: chat model name. Defaults to the previously hard-coded
            "gpt-3.5-turbo".
        max_tokens: upper bound on generated tokens. Defaults to the previously
            hard-coded 10; replies that need more tokens come back cut off
            mid-word, so raise it when that matters.

    Returns:
        The content of the first choice with surrounding whitespace removed.
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        n=1,
        stop=['\n', ';'],
        temperature=temperature,
    )
    return response.choices[0].message.content.strip()

# Openers that mark a reply as commentary, apology, or a refusal rather than a gloss.
_REJECTED_GLOSS_PREFIXES = (
    "I'm sorry",
    "The input",
    "I apologize",
    "This is an example",
    "Sorry, ",
    "The assistant",
    "The response",
    "The given word",
    "The provided",
    "Error: ",
    "Hi",
    "Hello",
    "Please ",
    "I'm not sure",
    "I don't know",
    "I don't understand",
    "I don't have",
    "It's worth ",
    "It is worth ",
    "It's important to ",
    "It is important to ",
    "It's challenging to ",   # was misspelled "challengting", which could never match
    "It is challenging to ",  # same
    "Since Gree",
    "The ",
    "An error occurred",
    "For a ",
    "ASSISTANT",
    "Firmly adhering to the directive ",
)

# Substrings that show a list of alternatives, bracketed context, or markup leaked into the reply.
_REJECTED_GLOSS_FRAGMENTS = (', ', ']', '[', '*', '~')

# How many times a rejected reply is re-requested before it is accepted as-is.
_MAX_GLOSS_RETRIES = 20

# Default resume point (1-based row number), kept from the previously hard-coded value.
_DEFAULT_START_ROW = 8930 + 12592 + 3296 + 2093


def _is_rejected_gloss(gloss):
    """Return True when `gloss` is empty, starts with a rejected opener, or contains a rejected fragment."""
    return (
        gloss == ''
        or gloss.startswith(_REJECTED_GLOSS_PREFIXES)
        or any(fragment in gloss for fragment in _REJECTED_GLOSS_FRAGMENTS)
    )


def _gloss_with_retries(prompt):
    """Request a gloss, re-asking at a rising temperature (0.1 steps, capped at 0.9) while the reply is rejected."""
    # Track the temperature in integer tenths: repeatedly adding 0.1 to a float
    # drifts (0.8999999999999999 < 0.9) and would overshoot the cap.
    temperature_tenths = 0
    gloss = generate_gloss(prompt, temperature=0)

    attempts = 0
    # The retry cap applies to EVERY rejection reason, so a reply that keeps
    # containing e.g. ', ' can no longer spin forever.
    while attempts < _MAX_GLOSS_RETRIES and _is_rejected_gloss(gloss):
        attempts += 1
        temperature_tenths = min(temperature_tenths + 1, 9)
        gloss = generate_gloss(prompt, temperature=temperature_tenths / 10)
    return gloss


def process_csv(input_file, output_file, max_rows=10, start_row=_DEFAULT_START_ROW):
    """Generate one generalized English gloss per input row and write the results.

    Reads a tab-delimited file whose columns are: word form, lemma, POS tag,
    parse code, '|'-separated glosses. For each selected row a prompt is built
    from the notebook-level `examples` string plus the word form, its expanded
    parse description and its first five glosses; the reply from
    `generate_gloss` is written to `output_file` (tab-delimited) as
    [word form, word form, POS tag, new gloss, parse code, gloss list].

    A reply that is empty, starts with a known non-gloss opener, or contains
    ', ', '[', ']', '*' or '~' is re-requested up to 20 times at increasing
    temperature; if it is still rejected after that, it is written anyway.

    No header handling is performed: when `start_row` is 1 the first line of
    the file is treated as data. `output_file` is opened in 'w' mode, so an
    existing file is overwritten even when resuming part-way through the input.

    Args:
        input_file: path of the UTF-8, tab-delimited input.
        output_file: path of the tab-delimited output (overwritten).
        max_rows: number of rows to process starting at `start_row`; a value
            below 1 means "through the end of the file".
        start_row: 1-based number of the first row to process. Defaults to the
            resume point that used to be hard-coded in the loop.
    """
    with open(input_file, 'r', encoding='utf8') as infile, \
            open(output_file, 'w', encoding='utf8', newline='') as outfile:
        # Count lines without materialising the whole file in memory.
        input_file_lines_length = sum(1 for _ in infile)
        print(f"Input file has {input_file_lines_length} lines.")

        # Reset the file pointer to the beginning
        infile.seek(0)

        csv_reader = csv.reader(infile, delimiter='\t')
        csv_writer = csv.writer(outfile, delimiter='\t')

        if max_rows < 1:
            max_rows_to_process = input_file_lines_length
        else:
            max_rows_to_process = max_rows

        print(f"Processing {max_rows_to_process} rows.")

        # Inclusive 1-based number of the last row to process. Deriving it from
        # start_row keeps the window exactly max_rows_to_process rows long (the
        # two hand-written offsets used before disagreed by four).
        last_row = start_row + max_rows_to_process - 1

        count = 0
        for i, row in enumerate(csv_reader, start=1):
            if i < start_row:
                continue
            if i > last_row:
                break  # nothing further to do; stop reading the file

            if len(row) < 5:
                # Blank or short line: report it instead of failing with IndexError.
                print(f"Skipping malformed row {i}: {row}")
                continue

            word_form = row[0]
            pos = row[2]
            parse_code = row[3]
            glosses = row[4].split('|')

            parse_description = expand_pos_code(pos) + ' ' + expand_parsing_code(parse_code)

            user = f'USER: {word_form} ({parse_description.strip()}) {glosses[:5]}'
            bot = 'ASSISTANT:'
            prompt = f"{examples}\n{user}\n{bot}"

            new_gloss = _gloss_with_retries(prompt)

            # Capitalize if word_form is capitalized (slice so an empty form cannot raise).
            if word_form[:1].isupper():
                new_gloss = new_gloss.capitalize()

            # NOTE(review): the first two output columns are both the word form and the
            # lemma (row[1]) is not written - confirm whether one of them should be the lemma.
            new_row = [row[0], word_form, row[2], new_gloss, parse_code, glosses]
            csv_writer.writerow(new_row)
            print(new_gloss, '\t\t\t<--', f'{word_form}', f'{parse_description}', count, 'of ', max_rows_to_process)
            count += 1


In [242]:
# max_rows=0 is below 1, so process_csv uses the input's line count as the row limit.
# Every processed row triggers at least one generate_gloss (API) call.
process_csv(input_csv, output_csv, max_rows=0)

Input file has 39211 lines.
Processing 39211 rows.
falsehood 			<-- ψεῦδος noun accusative | singular | neuter 0 of  39211
The provided Greek word is a compound word and 			<-- ψυχῆς noun genitive | singular | feminine 1 of  39211
air 			<-- ἀέρος noun genitive | singular | masculine 2 of  39211
love 			<-- ἀγάπη noun nominative | singular | feminine 3 of  39211
Goods 			<-- ἀγαθοῖς adjective (gloss should be modifier) dative | plural | neuter 4 of  39211
goodness 			<-- ἀγαθωσύνῃ noun dative | singular | feminine 5 of  39211
Good (as a modifier) 			<-- ἀγαθός adjective (gloss should be modifier) nominative | singular | masculine 6 of  39211
let love 			<-- ἀγαπάτω verb (if a finite verb, include the implied subject, e.g., "he said", "they said", "you said", "we said", etc.) 3rd | present | active | imperative | singular 7 of  39211
Given that you asked for a single generalized English gl 			<-- ἀγαπητά adjective (gloss should be modifier) nominative | plural | neuter 8 of  39211
belov