In [2]:
import openai
import configparser

# Read the configuration file
config = configparser.ConfigParser()
config.read('config.ini')

# Retrieve the API key from the configuration file
api_key = config.get('openai', 'api_key')

# Use the API key in your code
openai.api_key = api_key

In [2]:
# Step 1: Collecting Story Prompts
prompts = [
    "In a small town by the river, there lived a mysterious old man.",
    "The spaceship landed on an unknown planet, revealing a surprising discovery.",
    "She opened the door to her childhood home, only to find something unexpected.",
    # Add more prompts as needed
]

In [3]:

# Step 2: Preprocessing Prompts
tokenized_prompts = []
for prompt in prompts:
    # Tokenize prompt into smaller chunks, e.g., sentences or paragraphs
    tokenized_prompt = prompt.split(". ")
    tokenized_prompts.extend(tokenized_prompt)

In [4]:
tokenized_prompts

['In a small town by the river, there lived a mysterious old man.',
 'The spaceship landed on an unknown planet, revealing a surprising discovery.',
 'She opened the door to her childhood home, only to find something unexpected.']

In [6]:
# Step 3: Data Augmentation (if desired)
# Implement data augmentation techniques here, e.g., synonym replacement, word swapping, etc.

# Step 4: Create Prompt-Response Pairs
prompt_response_pairs = []
for tokenized_prompt in tokenized_prompts:
    # Generate response from the GPT-4 API
    response = openai.Completion.create(
        model="text-davinci-003",
        temperature=0.7,
        prompt=tokenized_prompt,
        max_tokens=1024,  # Set the desired maximum length of the generated story
        n=1,  # Set the number of responses to generate
        stop=None,  # Set any stopping criteria, if needed
    )
    # Extract the generated story from the API response
    generated_story = response.choices[0].text.strip()
    # Create prompt-response pair
    prompt_response_pair = (tokenized_prompt, generated_story)
    prompt_response_pairs.append(prompt_response_pair)


In [8]:
prompt_response_pairs

[('In a small town by the river, there lived a mysterious old man.',
  "He was a quiet man who rarely spoke to anyone, but the townspeople knew he was wise and kind. Each day, the old man would take long walks along the riverbank, collecting sticks and stones that he would later use to build sculptures and small works of art.\n\nWhenever someone in the town was in need of advice or help, they would seek out the old man and he would always offer wisdom and kindness. He had a deep understanding of the world and could often offer insight into seemingly impossible problems.\n\nOver the years, the old man's reputation grew and soon people from all over the area were coming to him for advice. He was respected and admired by the townspeople, who saw him as a wise and benevolent figure.\n\nThe old man's life was quiet and he was content with his simple existence. He never asked for anything in return and would often give away his sculptures and artwork to those in need. He was a mysterious fig

In [9]:
# Prompt Engineering

# Step 1: Adding Context to Prompts
original_prompt = "In a small town by the river, there lived a mysterious old man."
context = "He was known for his eccentric behavior and strange inventions."

# Combine the original prompt with additional context
augmented_prompt = original_prompt + " " + context

# Step 2: Modifying Prompt Format
question_format_prompt = "What happens when " + original_prompt + "?"
gap_filling_prompt = "Complete the story: " + original_prompt + " ___________."

In [10]:
response = openai.Completion.create(
        model="text-davinci-003",
        temperature=0.7,
        prompt=augmented_prompt,
        max_tokens=1024,  # Set the desired maximum length of the generated story
        n=1,  # Set the number of responses to generate
        stop=None,  # Set any stopping criteria, if needed
    )
# Extract the generated story from the API response
generated_story = response.choices[0].text.strip()
generated_story

"He rarely left his house, but when he did, he would often be seen tinkering with some strange contraption or another.\n\nPeople in the town whispered about the old man, wondering what he was up to. Some even said they had seen him flying around town on a strange contraption of his own invention. Some kids even dared each other to sneak into the old man's house and see what he was doing, but no one ever did.\n\nOne day, a young boy decided to take on the challenge. He snuck into the old man's house and discovered a workshop full of strange, mechanical devices and gadgets. The old man was not there, but the boy was able to piece together what he was up to.\n\nThe old man had invented a time machine! He was using it to travel back in time to experience different moments in history. The boy was amazed and filled with wonder. He quickly ran back to town and told everyone what he had discovered.\n\nThe old man soon became famous for his time machine and the people of the town began to respe

In [13]:
from transformers import MarianMTModel, MarianTokenizer

# Load pre-trained model and tokenizer for English to French translation
model_name = 'Helsinki-NLP/opus-mt-en-fr'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate_text(text, source_lang, target_lang):
    # Tokenize the input text
    input_ids = tokenizer.encode(text, return_tensors='pt', source_lang=source_lang)

    # Generate translation
    translated_ids = model.generate(input_ids=input_ids, 
                                    decoder_start_token_id=model.config.pad_token_id, 
                                    num_beams=4, 
                                    max_length=128,
                                    early_stopping=True)
    
    # Decode the translated tokens
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    
    return translated_text

# Example usage
source_text = "Hello, how are you?"
source_language = "en"
target_language = "fr"

translated_text = translate_text(source_text, source_language, target_language)

print(f"Source text ({source_language}): {source_text}")
print(f"Translated text ({target_language}): {translated_text}")


Keyword arguments {'source_lang': 'en'} not recognized.


Source text (en): Hello, how are you?
Translated text (fr): Bonjour, comment ça va ?


In [18]:
from transformers import MarianMTModel, MarianTokenizer
import torch
from torch.utils.data import DataLoader

# Load pre-trained model and tokenizer for English to French translation
model_name = 'Helsinki-NLP/opus-mt-en-fr'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate_text(text, source_lang, target_lang):
    # Tokenize the input text
    input_ids = tokenizer.encode(text, return_tensors='pt', source_lang=source_lang)

    # Generate translation
    translated_ids = model.generate(input_ids=input_ids, 
                                    decoder_start_token_id=model.config.pad_token_id, 
                                    num_beams=4, 
                                    max_length=128,
                                    early_stopping=True)
    
    # Decode the translated tokens
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    
    return translated_text

# Define evaluation dataset
eval_dataset = [
    {'source_text': "Hello, how are you?", 'target_text': "Bonjour, comment ça va ?"},
    {'source_text': "I love this place!", 'target_text': "J'adore cet endroit !"},
    # Add more evaluation samples as needed
]

def evaluate_translation(model, dataset):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    total_loss = 0.0
    total_correct = 0

    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

    with torch.no_grad():
        for batch in dataloader:
            source_text = batch['source_text'][0]
            target_text = batch['target_text'][0]

            # Translate the source text
            translated_text = translate_text(source_text, source_language, target_language)

            # Compute perplexity
            input_ids = tokenizer.encode(target_text, return_tensors='pt').to(device)
            target_ids = tokenizer.encode(target_text, return_tensors='pt').to(device)
            outputs = model(input_ids=input_ids, labels=target_ids)
            loss = outputs.loss.item()
            total_loss += loss

            # Compute accuracy
            predicted_ids = tokenizer.encode(translated_text, return_tensors='pt').to(device)
            correct = torch.all(torch.eq(predicted_ids, target_ids)).item()
            total_correct += correct

    num_samples = len(dataset)
    avg_loss = total_loss / num_samples
    accuracy = total_correct / num_samples

    return avg_loss, accuracy

# Example usage
source_language = "en"
target_language = "fr"

avg_loss, accuracy = evaluate_translation(model,eval_dataset)
perplexity = 2 ** avg_loss

print(f"Perplexity: {perplexity}")
print(f"Accuracy: {accuracy}")


Keyword arguments {'source_lang': 'en'} not recognized.
Keyword arguments {'source_lang': 'en'} not recognized.


Perplexity: 2.9522607509748147
Accuracy: 1.0


The evaluation results show a perplexity of 2.952 and an accuracy of 1.0, which indicates that the translation model performed well on the evaluation dataset

In [40]:
# Import the required libraries
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch import optim
import torch.nn as nn

from transformers import MarianMTModel, MarianTokenizer
from torch.utils.data import DataLoader

# Load pre-trained model and tokenizer for English to French translation
model_name = 'Helsinki-NLP/opus-mt-en-fr'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Define the evaluation dataset
eval_dataset = [
    {'source_text': "Hello, how are you?", 'target_text': "Bonjour, comment ça va ?"},
    {'source_text': "I love this place!", 'target_text': "J'adore cet endroit !"},
    {'source_text': "What time is it?", 'target_text': "Quelle heure est-il ?"},
    {'source_text': "Can you help me?", 'target_text': "Pouvez-vous m'aider ?"},
    # Add more evaluation samples as needed
]

# Tokenization
def tokenize_text(text):
    tokenized_text = tokenizer.encode(text, return_tensors='pt')
    return tokenized_text

# Normalization
def normalize_text(text):
    normalized_text = text.lower()
    return normalized_text

def multi_task_learning(model, dataset):
    # Define loss functions for each task
    task1_criterion = nn.CrossEntropyLoss()
    
    # Define optimizers for each task
    task1_optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # Training loop for multi-task learning
    for epoch in range(num_epochs):
        model.train()
        running_loss1 = 0.0

        for data in dataset:
            source_text, target_text = data['source_text'], data['target_text']
            
            # Tokenize the source and target text
            inputs = tokenizer.encode(source_text, return_tensors='pt')
            labels_task1 = tokenizer.encode(target_text, return_tensors='pt')
            
            # Forward pass and compute losses for each task
            outputs = model(
                input_ids=inputs[0],
                decoder_input_ids=labels_task1.input_ids[:, :-1],  # Exclude the last token
                labels=labels_task1.input_ids[:, 1:]  # Exclude the first token
            )
            loss1 = task1_criterion(outputs.logits, labels_task1.input_ids[:, 1:])  # Exclude the first token
            
            # Backward pass and update parameters for each task
            task1_optimizer.zero_grad()
            loss1.backward()
            task1_optimizer.step()
            
            running_loss1 += loss1.item()     
        # Print training losses for each epoch
        print(f"Epoch [{epoch+1}/{num_epochs}], Task 1 Loss: {running_loss1 / len(dataset)}")


# Distillation
def distillation(large_model, small_model, dataset):
    # Define optimizer and loss function for distillation
    optimizer = optim.Adam(small_model.parameters(), lr=0.001)
    distillation_loss = nn.MSELoss()
    
    # Training loop for distillation
    for epoch in range(num_epochs):
        large_model.train()
        small_model.train()
        running_loss = 0.0
        
        for data in dataset:
            inputs, labels = data['source_text'], data['target_text']
            
            # Forward pass on the large model and get outputs
            outputs_large = large_model(inputs)
            
            # Forward pass on the small model and get outputs
            outputs_small = small_model(inputs)
            
            # Compute distillation loss
            loss = distillation_loss(outputs_small, outputs_large.detach())
            
            # Backward pass and update parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        
        # Print training loss for each epoch
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(dataset)}")

# Main program
if __name__ == "__main__":
    # Fine-tuning with advanced techniques
    num_epochs = 10

    # Perform multi-task learning
    multi_task_learning(model, eval_dataset)

    # Create a smaller model for distillation
    small_model = T5ForConditionalGeneration.from_pretrained('t5-small')

    # Perform distillation
    distillation(model, small_model, eval_dataset)

    # Evaluate on the dataset
    for sample in eval_dataset:
        source_text = sample['source_text']
        target_text = sample['target_text']

        # Tokenize the source text
        tokenized_source_text = tokenize_text(source_text)

        # Generate the translation using the model
        outputs = model.generate(input_ids=tokenized_source_text, max_length=50, do_sample=True)
        translated_text = tokenizer.decode(outputs[0])

        # Normalize the target text
        normalized_target_text = normalize_text(target_text)

        # Print the results
        print("Source Text:", source_text)
        print("Target Text:", target_text)
        print("Translated Text:", translated_text)
        print("Normalized Target Text:", normalized_target_text)
        print()

Source Text: Hello, how are you?
Target Text: Bonjour, comment ça va ?
Translated Text: <pad> Bonjour, comment allez-vous?</s>
Normalized Target Text: bonjour, comment ça va ?

Source Text: I love this place!
Target Text: J'adore cet endroit !
Translated Text: <pad> J'adore cet endroit!</s>
Normalized Target Text: j'adore cet endroit !

Source Text: What time is it?
Target Text: Quelle heure est-il ?
Translated Text: <pad> Quelle heure est-il?</s>
Normalized Target Text: quelle heure est-il ?

Source Text: Can you help me?
Target Text: Pouvez-vous m'aider ?
Translated Text: <pad> Tu peux m'aider?</s>
Normalized Target Text: pouvez-vous m'aider ?

