In [None]:
from transformers import LlamaTokenizer, LlamaForCausalLM, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from memory import Memory
import torch

## Check if Cuda is available

In [None]:
# Report whether a CUDA device is usable before the models are loaded.
cuda_available = torch.cuda.is_available()
cuda_device_count = torch.cuda.device_count()
print(cuda_available)
print(cuda_device_count)

# torch.cuda.current_device() raises when CUDA cannot be initialised, so it is
# only queried when a device is actually available; otherwise say so explicitly.
if cuda_available:
    print(torch.cuda.current_device())
else:
    print("No CUDA device available")

## Load Dialogue Model

In [None]:
# Model id shared by the tokenizer and the causal language model
dialogue_model_id = "databricks/dolly-v2-3b"

# Tokenizer is configured with padding_side="left"
tokenizer = AutoTokenizer.from_pretrained(dialogue_model_id, padding_side="left")

# Load options for the dialogue model: automatic device placement,
# bfloat16 weights, and 'dolly-v2-3b' as the cache directory
model_load_options = dict(
    device_map="auto",
    torch_dtype=torch.bfloat16,
    cache_dir='dolly-v2-3b',
)
model = AutoModelForCausalLM.from_pretrained(dialogue_model_id, **model_load_options)

# Show the model architecture
print(model)

## Sentence Embedding Model

In [None]:
# Sentence-embedding model; it is handed to memory.generate_prompt in the dialogue cells
sentence_model_id = 'sentence-transformers/paraphrase-MiniLM-L6-v2'
sentence_model = SentenceTransformer(sentence_model_id, cache_folder='paraphrase-MiniLM-L6-v2')

# Show the embedding model architecture
print(sentence_model)

## Define Player and NPC Names

In [None]:
# Speaker label used for the player's turns in the conversation history
player_name = 'Player'

# Name of the NPC; prefixed to every memory sentence and used as the NPC's speaker label
npc_name = 'Balgruuf the Greater'

## Define Memories with Importance Rating

In [None]:
# Each memory is stored together with its importance rating so that the two
# can never get out of step (the ratings used to live in a separate list).
#
# 'is a Nord.' and 'resides in the great hall ...' were previously fused into
# one string by a missing comma. They are separate memories now; both carry
# the rating (1) that the fused entry had.
# NOTE(review): confirm the intended ratings for these two entries.
rated_memories = [
    ('is the jarl of Whiterun.', 4),
    ('is a Nord.', 1),
    ('resides in the great hall Dragonsreach in Whiterun.', 1),
    ('wears noble clothes.', 2),
    ('wears a crown.', 1),
    ('has a unique war axe.', 2),
    ('has a brother called Hrongar.', 3),
    ('has three children called Frothar, Dagny and Nelkir.', 3),
    ('has no wife.', 2),
    ('puts Whiteruns interests first.', 5),
    ('did not permit the Imperials to garrison soldiers in the city.', 2),
    ('takes no sides in the war.', 3),
    ('is always on the side of Whiterun', 4),
    ('does not like Ulfric and Galmar, because they attacked Whiterun.', 3),
    ('does not like the stormcloaks, because they attacked Whiterun.', 3),
    ('is concerned about the dragons, because they attacked Whiterun.', 5),
    ('worships Talos, the god of the Nords.', 2),
    ('hates the Thalmor.', 4),
    ('is friends with Irileth, because he fought with her in the war.', 4),
]

# Memory sentences: NPC name + fact, with a trailing space after each sentence
memories = [npc_name + " " + text + " " for text, _ in rated_memories]

# Importance ratings, in the same order as `memories`
memory_ratings = [rating for _, rating in rated_memories]

## Initialize Memory for Dialogue 

In [None]:
# Build the NPC memory object from the memory sentences and their ratings.
# NOTE(review): Memory comes from the local `memory` module, which is not visible
# here; presumably it expects one rating per memory, in matching order — confirm.
memory = Memory(memories, memory_ratings)

# Start the NPC Dialogue

In [None]:
# Number of sentences used in initial prompt
num_sentences = 4

# Start conversation history; every player input and NPC response is appended to it
conversation = ""

# Start dialogue loop. It ends when the player submits an empty line or the
# input is interrupted, so the cells below remain reachable.
while True:
    # Get user input
    try:
        user_input = input('Player: ')
    except (EOFError, KeyboardInterrupt):
        break
    if not user_input.strip():
        break

    # Generate initial prompt based on current user input
    initial_prompt = memory.generate_prompt(user_input, sentence_model, num_sentences=num_sentences)
    print(f'\nInitial Prompt: {initial_prompt}')

    # Add user input to conversation history
    conversation += f"\n{player_name}:\n" + user_input + f"\n{npc_name}: "

    # Add initial prompt to conversation history to generate model input
    input_text = initial_prompt + "\n" + conversation

    # Tokenize input and move it to the device the model lives on. The model is
    # loaded with device_map="auto", so a hard-coded 'cuda' is not guaranteed to match.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    prompt_token_count = inputs.input_ids.shape[1]

    # Generate response from dialogue model. `temperature` only influences the
    # output when sampling is enabled, hence do_sample=True. The attention mask
    # is passed explicitly instead of letting generate() guess it.
    generation_output = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=64,
        do_sample=True,
        temperature=0.2,
    )

    # Decode only the newly generated tokens. Slicing by token count is exact,
    # whereas slicing the decoded text by len(input_text) breaks as soon as
    # decoding does not reproduce the prompt character for character.
    generated_ids = generation_output[0][prompt_token_count:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Limit response to the NPC's answer: the first non-empty generated line.
    # Falls back to an empty string instead of raising when nothing usable was generated.
    response = next(
        (line.strip() for line in generated_text.split('\n') if line.strip()), ""
    )
    print(f"\n{npc_name}: " + response)

    # Add response to conversation history
    conversation += response

## Setting a higher Temperature:
A higher temperature produces more interesting and more creative responses. However, it also causes more hallucinations: the dialogue model is more likely to make up facts that are not part of the NPC memory.

In [None]:
# Number of sentences used in initial prompt
num_sentences = 4

# Start conversation history; every player input and NPC response is appended to it
conversation = ""

# Start dialogue loop. It ends when the player submits an empty line or the
# input is interrupted, so the cells below remain reachable.
while True:
    # Get user input
    try:
        user_input = input('Player: ')
    except (EOFError, KeyboardInterrupt):
        break
    if not user_input.strip():
        break

    # Generate initial prompt based on current user input
    initial_prompt = memory.generate_prompt(user_input, sentence_model, num_sentences=num_sentences)
    print(f'\nInitial Prompt: {initial_prompt}')

    # Add user input to conversation history
    conversation += f"\n{player_name}:\n" + user_input + f"\n{npc_name}: "

    # Add initial prompt to conversation history to generate model input
    input_text = initial_prompt + "\n" + conversation

    # Tokenize input and move it to the device the model lives on. The model is
    # loaded with device_map="auto", so a hard-coded 'cuda' is not guaranteed to match.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    prompt_token_count = inputs.input_ids.shape[1]

    # Generate response from dialogue model with the higher temperature (0.8).
    # `temperature` only influences the output when sampling is enabled, hence
    # do_sample=True; without it this cell would behave like the low-temperature one.
    generation_output = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=64,
        do_sample=True,
        temperature=0.8,
    )

    # Decode only the newly generated tokens. Slicing by token count is exact,
    # whereas slicing the decoded text by len(input_text) breaks as soon as
    # decoding does not reproduce the prompt character for character.
    generated_ids = generation_output[0][prompt_token_count:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Limit response to the NPC's answer: the first non-empty generated line.
    # Falls back to an empty string instead of raising when nothing usable was generated.
    response = next(
        (line.strip() for line in generated_text.split('\n') if line.strip()), ""
    )
    print(f"\n{npc_name}: " + response)

    # Add response to conversation history
    conversation += response

## Adding more sentences to the initial prompt
This causes the initial prompt to contain more information. However, the quality of the initial prompt then depends more heavily on the quality of the similarity scoring. Therefore, a longer initial prompt is not always useful.

In [None]:
# Number of sentences used in initial prompt. Raised above the baseline of 4
# so that this cell actually demonstrates a longer initial prompt.
num_sentences = 8

# Start conversation history; every player input and NPC response is appended to it
conversation = ""

# Start dialogue loop. It ends when the player submits an empty line or the
# input is interrupted, so the cells below remain reachable.
while True:
    # Get user input
    try:
        user_input = input('Player: ')
    except (EOFError, KeyboardInterrupt):
        break
    if not user_input.strip():
        break

    # Generate initial prompt based on current user input
    initial_prompt = memory.generate_prompt(user_input, sentence_model, num_sentences=num_sentences)
    print(f'\nInitial Prompt: {initial_prompt}')

    # Add user input to conversation history
    conversation += f"\n{player_name}:\n" + user_input + f"\n{npc_name}: "

    # Add initial prompt to conversation history to generate model input
    input_text = initial_prompt + "\n" + conversation

    # Tokenize input and move it to the device the model lives on. The model is
    # loaded with device_map="auto", so a hard-coded 'cuda' is not guaranteed to match.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    prompt_token_count = inputs.input_ids.shape[1]

    # Generate response from dialogue model. `temperature` only influences the
    # output when sampling is enabled, hence do_sample=True. The attention mask
    # is passed explicitly instead of letting generate() guess it.
    generation_output = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=64,
        do_sample=True,
        temperature=0.2,
    )

    # Decode only the newly generated tokens. Slicing by token count is exact,
    # whereas slicing the decoded text by len(input_text) breaks as soon as
    # decoding does not reproduce the prompt character for character.
    generated_ids = generation_output[0][prompt_token_count:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Limit response to the NPC's answer: the first non-empty generated line.
    # Falls back to an empty string instead of raising when nothing usable was generated.
    response = next(
        (line.strip() for line in generated_text.split('\n') if line.strip()), ""
    )
    print(f"\n{npc_name}: " + response)

    # Add response to conversation history
    conversation += response

## Reducing the number of sentences in the initial prompt
This causes worse responses, since the dialogue model relies heavily on the quality of the sentence embedding model. If the sentence embedding model fails, the initial prompt will be unrelated to the player's input.

In [None]:
# Number of sentences used in initial prompt (reduced to a single sentence)
num_sentences = 1

# Start conversation history; every player input and NPC response is appended to it
conversation = ""

# Start dialogue loop. It ends when the player submits an empty line or the
# input is interrupted, so the cells below remain reachable.
while True:
    # Get user input
    try:
        user_input = input('Player: ')
    except (EOFError, KeyboardInterrupt):
        break
    if not user_input.strip():
        break

    # Generate initial prompt based on current user input
    initial_prompt = memory.generate_prompt(user_input, sentence_model, num_sentences=num_sentences)
    print(f'\nInitial Prompt: {initial_prompt}')

    # Add user input to conversation history
    conversation += f"\n{player_name}:\n" + user_input + f"\n{npc_name}: "

    # Add initial prompt to conversation history to generate model input
    input_text = initial_prompt + "\n" + conversation

    # Tokenize input and move it to the device the model lives on. The model is
    # loaded with device_map="auto", so a hard-coded 'cuda' is not guaranteed to match.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    prompt_token_count = inputs.input_ids.shape[1]

    # Generate response from dialogue model. `temperature` only influences the
    # output when sampling is enabled, hence do_sample=True. The attention mask
    # is passed explicitly instead of letting generate() guess it.
    generation_output = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=64,
        do_sample=True,
        temperature=0.2,
    )

    # Decode only the newly generated tokens. Slicing by token count is exact,
    # whereas slicing the decoded text by len(input_text) breaks as soon as
    # decoding does not reproduce the prompt character for character.
    generated_ids = generation_output[0][prompt_token_count:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Limit response to the NPC's answer: the first non-empty generated line.
    # Falls back to an empty string instead of raising when nothing usable was generated.
    response = next(
        (line.strip() for line in generated_text.split('\n') if line.strip()), ""
    )
    print(f"\n{npc_name}: " + response)

    # Add response to conversation history
    conversation += response

## Don't use memory weighting
The scores of each memory are independent of the memory ratings. Therefore, the dialogue model's responses are closer to the player input. However, the dialogue model neglects important parts of the NPC's memory.

In [None]:
# Number of sentences used in initial prompt
num_sentences = 4

# Start conversation history; every player input and NPC response is appended to it
conversation = ""

# Start dialogue loop. It ends when the player submits an empty line or the
# input is interrupted, so the cell can finish without a kernel interrupt.
while True:
    # Get user input
    try:
        user_input = input('Player: ')
    except (EOFError, KeyboardInterrupt):
        break
    if not user_input.strip():
        break

    # Generate initial prompt based on current user input, with weighted=False
    # (memory weighting switched off for this experiment)
    initial_prompt = memory.generate_prompt(user_input, sentence_model, num_sentences=num_sentences, weighted=False)
    print(f'\nInitial Prompt: {initial_prompt}')

    # Add user input to conversation history
    conversation += f"\n{player_name}:\n" + user_input + f"\n{npc_name}: "

    # Add initial prompt to conversation history to generate model input
    input_text = initial_prompt + "\n" + conversation

    # Tokenize input and move it to the device the model lives on. The model is
    # loaded with device_map="auto", so a hard-coded 'cuda' is not guaranteed to match.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    prompt_token_count = inputs.input_ids.shape[1]

    # Generate response from dialogue model. `temperature` only influences the
    # output when sampling is enabled, hence do_sample=True. The attention mask
    # is passed explicitly instead of letting generate() guess it.
    generation_output = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=64,
        do_sample=True,
        temperature=0.2,
    )

    # Decode only the newly generated tokens. Slicing by token count is exact,
    # whereas slicing the decoded text by len(input_text) breaks as soon as
    # decoding does not reproduce the prompt character for character.
    generated_ids = generation_output[0][prompt_token_count:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Limit response to the NPC's answer: the first non-empty generated line.
    # Falls back to an empty string instead of raising when nothing usable was generated.
    response = next(
        (line.strip() for line in generated_text.split('\n') if line.strip()), ""
    )
    print(f"\n{npc_name}: " + response)

    # Add response to conversation history
    conversation += response