In [1]:
import json
import re
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, GPT2LMHeadModel, GPT2Config
from peft import LoraConfig, PeftModel, get_peft_model
from datasets import load_dataset
from trl import SFTTrainer
from accelerate import Accelerator
from ollama_interact import *

def parse_qa_from_text(file_path):
    """Parse a text dump of Hebrew instruction records into a list of dicts.

    The file is split on header lines of the form ``Q <number>:``. Within each
    resulting chunk the parser looks for three labelled fields:
    ``instruction_hebrew: ``, ``input_hebrew: `` and ``output_hebrew: ``.
    The instruction and input are single-line values; the output runs to the
    end of the chunk.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of ``{'question', 'input', 'answer'}`` dicts, one per chunk that
        has both an instruction and an output. Chunks missing either are
        skipped. A chunk without a matching ``input_hebrew: `` line gets an
        empty string as its ``'input'``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Use regex to split based on "Q \d+:" to find questions and their answers
    chunks = re.split(r'Q \d+:\n', text)

    # Remove the first empty chunk if it exists (text that starts with a header)
    if chunks[0] == '':
        chunks.pop(0)

    qa_data = []

    # Define the regex patterns for instruction (question), input, and output (answer)
    question_pattern = re.compile(r'instruction_hebrew: (.*?)\n')
    input_pattern = re.compile(r'input_hebrew: (.*?)\n')
    answer_pattern = re.compile(r'output_hebrew: (.*)', re.DOTALL)

    for chunk in chunks:
        question_match = question_pattern.search(chunk)
        input_match = input_pattern.search(chunk)
        answer_match = answer_pattern.search(chunk)

        if question_match and answer_match:
            # Drop "<newline><number>." list markers, which joins numbered
            # list items onto the preceding line.
            cleaned_answer = re.sub(r'\n\d+\.', '', answer_match.group(1))
            qa_data.append({
                'question': question_match.group(1),
                # The input field is optional: a missing or unmatched line must
                # not crash the whole parse, so fall back to an empty input.
                'input': input_match.group(1) if input_match else '',
                'answer': cleaned_answer.strip()  # Strip to remove leading/trailing whitespace
            })

    return qa_data

In [2]:
# Hub identifier of the Alpaca-GPT4 instruction dataset used as the English source.
data_name = "vicgalle/alpaca-gpt4"

# No split is requested, so the result is a DatasetDict keyed by split name.
dataset = load_dataset(data_name)

In [3]:
# Show the loaded DatasetDict: available splits, their features, and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})

In [4]:
# Text dump of the Hebrew translation, read relative to the working directory.
file_path = 'alpaca_gpt4_hebrew.txt'
qa_list = parse_qa_from_text(file_path)

# Serialize the parsed records as indented JSON; ensure_ascii=False keeps the
# Hebrew characters as-is instead of \uXXXX escapes.
json_file_path = 'converted_data.json'
json_data = json.dumps(qa_list, ensure_ascii=False, indent=4)

# Persist the JSON string to disk (nothing is printed).
with open(json_file_path, mode='w', encoding='utf-8') as file:
    file.write(json_data)

In [30]:
# Spot-check one English record (index 36) of the train split; compare it with
# qa_list[36] to see whether the Hebrew and English records line up by index.
dataset['train'][36]

{'instruction': 'Analyze the given text for its tone.',
 'input': 'The world has been greatly impacted by the COVID-19 pandemic and it has drastically changed our lives.',
 'output': 'The tone of the text is serious and somber. The use of terms such as "greatly impacted," "drastically changed," and "pandemic" suggest the seriousness and gravity of the situation, and convey a sense of heaviness and concern.',
 'text': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nAnalyze the given text for its tone.\n\n### Input:\nThe world has been greatly impacted by the COVID-19 pandemic and it has drastically changed our lives.\n\n### Response:\nThe tone of the text is serious and somber. The use of terms such as "greatly impacted," "drastically changed," and "pandemic" suggest the seriousness and gravity of the situation, and convey a sense of heaviness and concern.'

In [29]:
# Spot-check the parsed Hebrew record at index 36, the same index inspected in
# dataset['train'], to confirm the two sources are aligned.
qa_list[36]

{'question': 'בדקו את הטקסט שניתן לכם כדי לקבוע את מצב הרוח שלו.',
 'input': 'העולם עבר טלטלה משמעותית בעקבות מגפת הקורונה, והיא שינתה את חיינו באופן דרמטי.',
 'answer': 'אופי הטקסט הוא רציני וקודר. שימוש בביטויים כמו "השפעה משמעותית מאוד", "שינוי באופן דרסטי", ו"מגיפה" מרמז על החומרה והכבדות של המצב, ומביע תחושה של חרדה וכובד.'}

In [6]:
# Number of Hebrew records that parse_qa_from_text extracted from the text dump.
len(qa_list)

21601

In [7]:
import random

# Seed the global generator so the English subsample below (and any later cell
# that draws from `random`) gives the same result on Restart & Run All.
RANDOM_SEED = 42
# Fraction of English records mixed in alongside the Hebrew records.
ENGLISH_SAMPLE_RATE = 0.1
random.seed(RANDOM_SEED)

# Start with every parsed Hebrew record, renamed to the Alpaca field names
# (question -> instruction, answer -> output).
new_data = [
    {
        'instruction': item['question'],
        'input': item['input'],
        'output': item['answer'],
    }
    for item in qa_list
]

# Add a random subset of the original English records; the 'text' column is
# dropped so both sources share the same three keys.
for item in dataset['train']:
    if random.random() < ENGLISH_SAMPLE_RATE:
        new_data.append({
            'instruction': item['instruction'],
            'input': item['input'],
            'output': item['output'],
        })

In [8]:
# Build one translation example per (Hebrew, English) pair. zip() stops at the
# shorter sequence, and pairing is purely positional.
# NOTE(review): this assumes qa_list[i] is the translation of dataset['train'][i] — confirm.
for he_item, en_item in zip(qa_list, dataset['train']):
    # Two random draws per pair, in this order: which field, then which direction.
    content_to_translate = random.choice(['question_he', 'answer_he'])
    direction = random.choice(['he_to_en', 'en_to_he'])

    # Pick the Hebrew/English versions of the chosen field.
    if content_to_translate == 'question_he':
        hebrew_text, english_text = he_item['question'], en_item['instruction']
    else:
        hebrew_text, english_text = he_item['answer'], en_item['output']

    # The request is phrased in the source language and the reply prefix in the target language.
    if direction == 'he_to_en':
        sentence, translation = hebrew_text, english_text
        prompt_1 = 'אנא תרגם את המשפט הבא לאנגלית, המשפט בעברית:'
        prompt_2 = 'The translation of the sentence in English is:'
    else:
        sentence, translation = english_text, hebrew_text
        prompt_1 = 'Please translate the following sentence to Hebrew:'
        prompt_2 = 'התרגום של המשפט בעברית הוא:'

    new_data.append({
        'instruction': f'{prompt_1} {sentence}',
        'input': '',
        'output': f'{prompt_2} {translation}',
    })

In [19]:
# Total size of the combined list: Hebrew records, sampled English records,
# and the generated translation examples.
len(new_data)

48336

In [20]:
# Destination of the flattened training text (this rebinds the earlier `file_path`).
file_path = 'Hebrew_Questions_and_Answers.txt'

# Render each record as "<s>[INST]{input} {instruction}[/INST] {output} </s>".
# Records are concatenated with no separator between them.
formatted_entries = [
    f"<s>[INST]{entry['input']} {entry['instruction']}[/INST] {entry['output']} </s>"
    for entry in new_data
]
formatted_data = "".join(formatted_entries)

# Write the whole string in one go.
with open(file_path, mode='w', encoding='utf-8') as file:
    file.write(formatted_data)

# Echo the output path as the cell result.
file_path

'Hebrew_Questions_and_Answers.txt'

In [None]:
import re

# Load the concatenated training text written by the formatting cell.
with open('Hebrew_Questions_and_Answers.txt', 'r', encoding='utf-8') as file:
    content = file.read()

# One example runs from "<s>[INST]" to the nearest "</s>"; DOTALL lets the
# match cross line breaks inside an example.
segment_pattern = re.compile(r'(<s>\[INST\].*?</s>)', re.DOTALL)
segments = segment_pattern.findall(content)

# Turn each real line break into the two-character sequence backslash + "n",
# so every example fits on a single physical line.
processed_segments = []
for segment in segments:
    processed_segments.append(segment.replace('\n', '\\n'))

# Emit one example per line.
with open('output.txt', 'w', encoding='utf-8') as file:
    for segment in processed_segments:
        file.writelines([segment, '\n'])

In [16]:
# Serialize the combined dataset as indented JSON; ensure_ascii=False keeps
# Hebrew text readable instead of \uXXXX escapes.
json_file_path = 'all_data.json'
json_data = json.dumps(new_data, ensure_ascii=False, indent=4)

# Persist the JSON string to disk (nothing is printed).
with open(json_file_path, mode='w', encoding='utf-8') as file:
    file.write(json_data)