In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import string

  from .autonotebook import tqdm as notebook_tqdm
2022-12-08 15:48:13.484552: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Instantiate model and tokenizer from the same checkpoint.
# The checkpoint id is defined once so the tokenizer and the model cannot
# accidentally be loaded from different checkpoints.
MODEL_NAME = "microsoft/GODEL-v1_1-base-seq2seq"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [3]:
def predict(user_input, history, top_p=0.9, min_length=8, max_length=200):
    """Generate one reply to ``user_input`` with the module-level GODEL model.

    The reply is sampled (``do_sample=True``), so repeated calls with the
    same arguments can return different responses.

    Args:
        user_input: Latest user utterance (str).
        history: Earlier dialog turns as a list of strings, oldest first.
            ``None`` is treated as an empty history. The passed list is
            never mutated.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``:
            only the smallest set of most probable tokens whose cumulative
            probability reaches ``top_p`` is sampled from.
        min_length: Forwarded to ``model.generate`` as ``min_length``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        tuple: ``(response, dialog_list)`` where ``response`` is the decoded
        reply and ``dialog_list`` is a new list holding the previous
        history followed by ``user_input`` and ``response``.
    """
    # Task instruction for the model. Keep the wording verbatim: it is part
    # of the prompt the model receives.
    instruction = 'Instruction: given a dialog context, you need to response empathically.'

    # No external knowledge is supplied; the slot is left blank.
    knowledge = '  '

    # Work on a copy so the caller's history is left untouched.
    dialog_list = list(history) if history else []
    dialog_list.append(user_input)

    # Dialog turns are joined with the ' EOS ' separator.
    dialog = ' EOS '.join(dialog_list)

    # '[CONTEXT]' introduces the dialog turns after the instruction
    # (query layout as in the GODEL model card -- confirm there if changed).
    query = f"{instruction} [CONTEXT] {dialog} {knowledge}"

    # Tokenize the full query into a tensor of input ids.
    input_ids = tokenizer.encode(query, return_tensors='pt')

    # int() keeps the call working when lengths arrive as floats.
    output = model.generate(input_ids,
                            min_length=int(min_length),
                            max_length=int(max_length),
                            top_p=top_p,
                            do_sample=True)

    # Decode the first (only) generated sequence, dropping special tokens.
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extend the dialog with the model's reply.
    dialog_list.append(response)

    return response, dialog_list

In [12]:
# Smoke test: single-turn call with an empty history.
# predict() samples its output (do_sample=True), so the reply varies between runs.
input1 = 'Can you help my?'
history1 = []
predict(input1, history1)

('oh this is all good', ['Can you help my?', 'oh this is all good'])

In [14]:
# Create the list of bad words: one entry per line of the file.
# The explicit encoding makes the read independent of the platform default,
# and blank lines are skipped so the list never contains empty entries.
with open("../filter/bad_words.txt", encoding="utf-8") as file:
    bad_words = [line.rstrip() for line in file if line.strip()]


In [15]:
# Display the loaded bad-word entries as a sanity check on the file read.
bad_words

['2g1c',
 '2 girls 1 cup',
 'acrotomophilia',
 'alabama hot pocket',
 'alaskan pipeline',
 'anal',
 'anilingus',
 'anus',
 'apeshit',
 'arsehole',
 'ass',
 'asshole',
 'assmunch',
 'auto erotic',
 'autoerotic',
 'babeland',
 'baby batter',
 'baby juice',
 'ball gag',
 'ball gravy',
 'ball kicking',
 'ball licking',
 'ball sack',
 'ball sucking',
 'bangbros',
 'bangbus',
 'bareback',
 'barely legal',
 'barenaked',
 'bastard',
 'bastardo',
 'bastinado',
 'bbw',
 'bdsm',
 'beaner',
 'beaners',
 'beaver cleaver',
 'beaver lips',
 'beastiality',
 'bestiality',
 'big black',
 'big breasts',
 'big knockers',
 'big tits',
 'bimbos',
 'birdlock',
 'bitch',
 'bitches',
 'black cock',
 'blonde action',
 'blonde on blonde action',
 'blowjob',
 'blow job',
 'blow your load',
 'blue waffle',
 'blumpkin',
 'bollocks',
 'bondage',
 'boner',
 'boob',
 'boobs',
 'booty call',
 'brown showers',
 'brunette action',
 'bukkake',
 'bulldyke',
 'bullet vibe',
 'bullshit',
 'bung hole',
 'bunghole',
 'busty',


In [16]:
# Create the list of trigger words: one entry per line of the file.
# The explicit encoding makes the read independent of the platform default,
# and blank lines are skipped so the list never contains empty entries.
with open("../filter/trigger_words.txt", encoding="utf-8") as file:
    trigger_words = [line.rstrip() for line in file if line.strip()]


In [17]:
# Display the loaded trigger-word entries as a sanity check on the file read.
trigger_words

['suicide',
 'suicidal',
 'kill',
 'death',
 'dead',
 'murder',
 'self-murder',
 'self-slaughter',
 'self-suicide',
 'cut my throat',
 'cut my veins',
 'slice my veins',
 'jump off a bridge',
 'in front of a train',
 'fall off a bridge',
 'hang myself',
 'hang up',
 'take sleeping pills',
 'not want to wake up',
 'no longer want to wake up']

In [18]:
def filter_preprocessing(sentence):
    """Normalise ``sentence`` for word filtering.

    The text is lower-cased, every ASCII punctuation character (as listed in
    ``string.punctuation``) is deleted, and leading/trailing whitespace is
    stripped. Punctuation is removed without a replacement, so
    'self-murder' becomes 'selfmurder'.

    Args:
        sentence: Text to normalise (str).

    Returns:
        str: The normalised text.
    """
    # One translation table deletes all punctuation characters in a single pass.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    lowered = sentence.lower()
    return lowered.translate(drop_punctuation).strip()

In [19]:
# Example: upper-case letters and punctuation are normalised away.
filter_preprocessing('Hello, I like you!')

'hello i like you'

In [20]:
def check_bad_words(sentence, bad_words):
    """Return True if ``sentence`` contains any entry of ``bad_words``.

    The sentence and every list entry are normalised with
    ``filter_preprocessing`` and compared on whole-word boundaries. This
    lets multi-word entries such as 'booty call' match (a token-by-token
    membership test can never find them) while a short entry still does
    not fire inside a longer, unrelated word.

    Args:
        sentence: Text to screen (str).
        bad_words: Iterable of banned words or phrases (str).

    Returns:
        bool: True if at least one entry occurs in the sentence.
    """
    # Collapse whitespace and pad with spaces so that a plain substring test
    # only matches complete words or complete word sequences.
    padded_sentence = ' ' + ' '.join(filter_preprocessing(sentence).split()) + ' '

    for entry in bad_words:
        phrase = ' '.join(filter_preprocessing(entry).split())
        # An entry that is empty after normalisation must not match anything.
        if phrase and f' {phrase} ' in padded_sentence:
            return True
    return False

In [22]:
# Positive case: a sentence containing a listed word, written with a capital letter.
evil_sentence = 'I Fuck you'
check_bad_words(evil_sentence, bad_words)

True

In [23]:
# Negative case: a harmless sentence.
good_sentence = 'I love you'
check_bad_words(good_sentence, bad_words)

False

In [24]:
def check_trigger_words(sentence, trigger_words):
    """Return True if ``sentence`` contains any entry of ``trigger_words``.

    The sentence and every list entry are normalised with
    ``filter_preprocessing`` and compared on whole-word boundaries. Many
    trigger entries are phrases ('cut my throat') or contain punctuation
    ('self-murder'); comparing single sentence tokens against the raw
    entries can never match those, so entries are normalised and matched
    as complete word sequences instead.

    Args:
        sentence: Text to screen (str).
        trigger_words: Iterable of trigger words or phrases (str).

    Returns:
        bool: True if at least one entry occurs in the sentence.
    """
    # Collapse whitespace and pad with spaces so that a plain substring test
    # only matches complete words or complete word sequences.
    normalized = ' '.join(filter_preprocessing(sentence).split())
    haystack = f' {normalized} '

    for entry in trigger_words:
        needle = ' '.join(filter_preprocessing(entry).split())
        # An entry that is empty after normalisation must not match anything.
        if needle and f' {needle} ' in haystack:
            return True
    return False

In [25]:
# Positive case: a sentence containing a listed trigger word, written with a capital letter.
trigger_sentence = 'I will Kill myself'
check_trigger_words(trigger_sentence, trigger_words)

True

In [26]:
# Negative case: a sentence without any trigger word.
no_trigger_sentence = 'I will love myself'
check_trigger_words(no_trigger_sentence, trigger_words)

False

In [35]:
def get_nice_response(model_response, user_input, history, history_updated, max_retries=3):
    """Return a model reply that is free of bad words, regenerating if needed.

    ``model_response`` is screened with ``check_bad_words`` against the
    module-level ``bad_words`` list. If it is clean it is returned together
    with ``history_updated``. Otherwise ``predict(user_input, history)`` is
    called again, up to ``max_retries`` times, until a clean reply is
    produced.

    Args:
        model_response: Reply already produced by the model (str).
        user_input: User utterance the reply answers (str).
        history: Dialog before this turn; passed to ``predict`` when a new
            reply has to be generated.
        history_updated: Dialog including ``user_input`` and
            ``model_response``; returned unchanged when the reply is clean.
        max_retries: Maximum number of regeneration attempts.

    Returns:
        tuple: ``(response, history)`` for the accepted reply. If every
        attempt still contains a bad word, the last attempt is returned
        (with ``max_retries=0`` that is the original reply).
    """
    # Screen the reply that was actually passed in, not a stale global.
    if not check_bad_words(model_response, bad_words):
        return model_response, history_updated

    candidate, candidate_history = model_response, history_updated
    for _ in range(max_retries):
        # predict() samples, so a new call can yield a different reply.
        candidate, candidate_history = predict(user_input, history)
        if not check_bad_words(candidate, bad_words):
            break
    return candidate, candidate_history
    

In [37]:
# Simulate a model reply that contains a bad word.
# `history` is the dialog before this turn, `history_updated` the dialog after it.
input2 = 'Nice to see you'
model_response = 'I Fuck you'
history = []
history_updated = ['Nice to see you', model_response]

get_nice_response(model_response, input2, history, history_updated)


("It's nice to see you too", ['Nice to see you', "It's nice to see you too"])

In [39]:
def predict_with_filter(user_input, history, bad_words, trigger_words):
    """Answer ``user_input``, screening both the input and the model reply.

    The input is checked first for bad words, then for trigger words. In
    either case a fixed message is returned together with the unchanged
    ``history`` and the model is not called. Otherwise a reply is generated
    with ``predict`` and passed through ``get_nice_response``.

    Args:
        user_input: Latest user utterance (str).
        history: Dialog so far as a list of strings.
        bad_words: Entries used by ``check_bad_words`` on the input.
        trigger_words: Entries used by ``check_trigger_words`` on the input.

    Returns:
        tuple: ``(response, history)``.
    """
    # Decide whether the input must be rejected without asking the model.
    canned_reply = None
    if check_bad_words(user_input, bad_words):
        canned_reply = 'Rethink your language'
    elif check_trigger_words(user_input, trigger_words):
        canned_reply = 'You used a trigger word'

    if canned_reply is not None:
        return canned_reply, history

    # Input is acceptable: generate a reply, then screen the reply itself.
    raw_reply, raw_history = predict(user_input, history)
    final_reply, final_history = get_nice_response(raw_reply, user_input, history, raw_history)
    return final_reply, final_history
    
    


In [50]:
# Clean input: neither check rejects it, so the reply comes from the model.
input2 = 'I love you very much'
history2 = []
predict_with_filter(input2, history2, bad_words, trigger_words)

('My friend, what does that mean?',
 ['I love you very much', 'My friend, what does that mean?'])

In [51]:
# Bad-word input: rejected with a fixed message; the history is returned unchanged.
# Uses this cell's own `history3` so the cell runs on a fresh kernel.
input3 = 'I fuck you'
history3 = []
predict_with_filter(input3, history3, bad_words, trigger_words)

('Rethink your language', [])

In [52]:
# Trigger-word input: rejected with a fixed message; the history is returned unchanged.
input4 = 'I kill you'
history4 = []
predict_with_filter(input4, history4, bad_words, trigger_words)

('You used a trigger word', [])