In [2]:
import fighting_words as fw
import pandas as pd
import spacy
import os
import csv

In [None]:
# Read the processed GPT and Llama CSVs (comma-separated, read as latin1),
# in that order, and bind them to df_gpt and df_llama.
df_gpt, df_llama = (
    pd.read_csv(csv_path, sep=',', encoding='latin1')
    for csv_path in (
        'data/data_gpt_processed.csv',
        'data/data_llama_processed.csv',
    )
)

In [9]:
def load_dataset(filepath, sep=';'):
    """Read a delimited text file into a DataFrame.

    Args:
        filepath: Location of the file, passed straight to `pd.read_csv`.
        sep: Column delimiter; defaults to a semicolon.

    Returns:
        The DataFrame produced by `pd.read_csv`, decoded as ISO-8859-1.
    """
    read_options = {'sep': sep, 'encoding': 'ISO-8859-1'}
    return pd.read_csv(filepath, **read_options)

def process_and_save(data_path, prefix, nlp=None):
    """Compute fighting words for one dataset file and save them as two CSVs.

    Args:
        data_path: Path to a semicolon-separated CSV file, read with
            `pd.read_csv`.
        prefix: Prefix for the output filenames '{prefix}_adj.csv' and
            '{prefix}_vrb.csv'.
        nlp: Optional pipeline object forwarded as the second argument to
            `fw.get_fighting_words`. When None, that function is called
            with the DataFrame only.
    """
    df = pd.read_csv(data_path, sep=';')
    # Forward the pipeline only when one was supplied, so existing
    # two-argument callers keep calling fw.get_fighting_words(df).
    extra_args = () if nlp is None else (nlp,)
    # NOTE(review): the other call sites in this notebook unpack the result
    # as (adjectives, verbs); the same order is assumed here so each list
    # lands in the matching file -- confirm against fighting_words.
    fw_adj, fw_vrb = fw.get_fighting_words(df, *extra_args)
    fw.save_to_csv(fw_vrb, f'{prefix}_vrb.csv')
    fw.save_to_csv(fw_adj, f'{prefix}_adj.csv')

In [None]:
'''
Solely adjectives and verbs
'''

# Output-file prefix for each dataset, keyed by the dataset's CSV path.
datasets = dict([
    ('data/df_gpt.csv', 'gpt'),
    ('data/df_llama.csv', 'llama'),
])

# Load the 'nl_core_news_lg' spaCy pipeline once, then hand the same object
# to every dataset in turn.
# NOTE(review): process_and_save is called with three arguments here --
# confirm its definition accepts the pipeline as a third parameter.
nlp = spacy.load('nl_core_news_lg')
for path in datasets:
    prefix = datasets[path]
    process_and_save(path, prefix, nlp)

In [10]:
'''
Genre
'''

def process_datasets(datasets, nlp):
    """Compute and save fighting words per genre for every dataset.

    For each (name, DataFrame) entry in `datasets`, the rows are grouped by
    the 'Genre' column. For each of the three genres listed below that
    occurs in the frame, `fw.get_fighting_words` is run on that subset with
    `nlp`, and its two results are written through `fw.save_to_csv` to
    '{name}_{genre}_adj.csv' and '{name}_{genre}_vrb.csv'. Genres that do
    not occur in a dataset are skipped.
    """
    target_genres = ('thriller', 'literaire fictie', 'romantisch')
    for name, frame in datasets.items():
        # Materialise the groupby as {genre value: sub-frame} for lookup.
        by_genre = dict(iter(frame.groupby('Genre')))
        for genre in target_genres:
            if genre not in by_genre:
                continue
            adjectives, verbs = fw.get_fighting_words(by_genre[genre], nlp)
            fw.save_to_csv(adjectives, f'{name}_{genre}_adj.csv')
            fw.save_to_csv(verbs, f'{name}_{genre}_vrb.csv')

Vocab size is 653
Comparing language...
Vocab size is 262
Comparing language...
Vocab size is 661
Comparing language...
Vocab size is 255
Comparing language...
Vocab size is 492
Comparing language...
Vocab size is 236
Comparing language...
Vocab size is 610
Comparing language...
Vocab size is 229
Comparing language...
Vocab size is 869
Comparing language...
Vocab size is 305
Comparing language...
Vocab size is 637
Comparing language...
Vocab size is 244
Comparing language...


In [19]:
'''
Prompt
'''

def process_datasets(datasets, nlp=None):
    """Process each dataset for specified prompt types and save results.

    Args:
        datasets: Mapping of dataset name -> DataFrame. Each frame is
            grouped by its 'Prompt_type' column; the name becomes the
            prefix of the output filenames.
        nlp: Optional spaCy pipeline to reuse. When omitted,
            'nl_core_news_lg' is loaded once here and shared by every
            dataset and prompt type, instead of being reloaded per prompt.
    """
    if nlp is None:
        nlp = spacy.load('nl_core_news_lg')
    for name, df in datasets.items():
        grouped_datasets = {prompt: group for prompt, group in df.groupby('Prompt_type')}
        for prompt in ['instructional', 'completion', 'question-answer', 'contextual']:
            process_prompt(grouped_datasets, name, prompt, nlp)

def process_prompt(grouped_datasets, dataset_name, prompt, nlp=None):
    """Process and save data for a specific prompt type.

    Args:
        grouped_datasets: Mapping of prompt type -> DataFrame subset.
        dataset_name: Prefix used in the output filenames.
        prompt: Prompt type to look up in `grouped_datasets`; nothing is
            done when it is missing.
        nlp: Optional spaCy pipeline. When omitted, 'nl_core_news_lg' is
            loaded here, but only if there is data to process.

    Writes '{dataset_name}_{prompt}_adj.csv' and
    '{dataset_name}_{prompt}_vrb.csv' through `save_to_csv`.
    """
    df_prompt = grouped_datasets.get(prompt)
    if df_prompt is None:
        return
    # Loading the model is expensive; defer it until we know it is needed.
    if nlp is None:
        nlp = spacy.load('nl_core_news_lg')
    fw_adj, fw_vrb = fw.get_fighting_words(df_prompt, nlp)
    save_to_csv(fw_adj, f'{dataset_name}_{prompt}_adj.csv')
    save_to_csv(fw_vrb, f'{dataset_name}_{prompt}_vrb.csv')

def save_to_csv(data, filename, save_path='fw_data/'):
    """Write (item, z-score) pairs to a CSV file, one pair per row.

    Args:
        data: Iterable of two-element (item, zscore) pairs.
        filename: Name of the file to create inside `save_path`; an
            existing file is overwritten.
        save_path: Output directory, created if it does not exist.
            Defaults to 'fw_data/'.

    Raises:
        ValueError / TypeError: If an element of `data` cannot be unpacked
            into exactly two values.
    """
    os.makedirs(save_path, exist_ok=True)
    full_path = os.path.join(save_path, filename)
    # newline='' leaves line endings to csv.writer. The encoding is pinned
    # to UTF-8 so non-ASCII words are written identically on every platform
    # instead of depending on the locale's default encoding.
    with open(full_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows([item, zscore] for item, zscore in data)