In [1]:
import pandas as pd
import re

In [2]:
# Score cut-off above which a value is treated as toxic.
TOXICITY_THRESHOLD = 0.8
# Directory that holds the data files (relative path).
DATA_PATH = '../data/'

# Read the tab-separated dataset and preview its first rows.
df = pd.read_csv(DATA_PATH + 'raw/filtered.tsv', sep='\t')
df.head()

Unnamed: 0,id,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


In [4]:
# Build a dictionary of toxic phrases and their replacements.
# Keys are the lower-cased `reference` texts whose `ref_tox` exceeds
# TOXICITY_THRESHOLD; values are the lower-cased `translation` texts of the
# same rows. When a lower-cased reference occurs more than once, the last row
# wins (same as repeated dict assignment).
# Rows with a missing reference or translation are dropped: they cannot be
# lower-cased and are unusable as a phrase/replacement pair.
toxic_rows = df.loc[df['ref_tox'] > TOXICITY_THRESHOLD, ['reference', 'translation']].dropna()
toxic_dict = dict(zip(
    toxic_rows['reference'].astype(str).str.lower(),
    toxic_rows['translation'].astype(str).str.lower(),
))

In [6]:
# Helper function to maintain the same case
def replace_with_same_case(match_obj):
    """Return the replacement for one regex match, mirroring its initial case.

    Args:
        match_obj: ``re.Match`` whose full matched text is looked up
            (lower-cased) in ``toxic_dict``.

    Returns:
        The ``toxic_dict`` value for the lower-cased matched text. When the
        matched text starts with an upper-case character the value is passed
        through ``str.capitalize()``. If the lower-cased matched text is not a
        key of ``toxic_dict``, the matched text itself is returned so the
        substitution leaves that span untouched.
    """
    match_str = match_obj.group(0)
    # A case-insensitive regex can match text whose str.lower() form is not
    # literally one of the keys, so a plain subscript could raise KeyError in
    # the middle of a substitution. Fall back to the original text instead.
    replacement = toxic_dict.get(match_str.lower())
    if replacement is None:
        return match_str
    # Slicing yields '' for an empty match, where match_str[0] would raise
    # IndexError; ''.isupper() is simply False.
    if match_str[:1].isupper():
        return replacement.capitalize()
    return replacement

# Function to detoxify text
def detoxify(text):
    """Replace every phrase of ``text`` that is a key of ``toxic_dict``.

    Keys are matched literally (they are regex-escaped) and
    case-insensitively; each match is rewritten by
    ``replace_with_same_case``.

    Args:
        text: String to clean.

    Returns:
        ``text`` with every matched phrase substituted. ``text`` is returned
        unchanged when ``toxic_dict`` contains no non-empty key.
    """
    # An empty key would add an empty alternative that matches at every
    # position, so such keys are ignored.
    # Regex alternation accepts the first alternative that matches, not the
    # longest one, so longer keys are listed first; otherwise a key that is a
    # prefix of another key would always shadow it. sorted() is stable, so
    # keys of equal length keep their insertion order.
    keys = sorted((key for key in toxic_dict if key), key=len, reverse=True)
    if not keys:
        # With no alternatives the pattern would be '' and match everywhere,
        # handing empty matches to the replacement callback.
        return text
    pattern = re.compile('|'.join(re.escape(key) for key in keys), re.IGNORECASE)
    return pattern.sub(replace_with_same_case, text)

In [7]:
# Sanity check: run the detoxifier on a sample sentence and print the result.
test_sentence = "I like that shit."
detoxified_sentence = detoxify(test_sentence)
print(detoxified_sentence)

I like this.


In [8]:
# Sanity check on a longer sentence: detoxify it and print the result.
test_sentence = "Trying to keep me fucking drugged so I don't know what's going on."
detoxified_sentence = detoxify(test_sentence)
print(detoxified_sentence)

You want to fool me so i don't know what's going on.


In [9]:
# Sanity check on a sentence without spaces after its commas: detoxify it and print the result.
test_sentence = "Damn,man,i wanted the old football coach,man!"
detoxified_sentence = detoxify(test_sentence)
print(detoxified_sentence)

Oh, man, i wanted an old american football game, man!


In [10]:
from transformers import pipeline
# Sentiment-analysis pipeline; the model name indicates a DistilBERT checkpoint
# fine-tuned on SST-2. Creating it may download the model files on first use.
classifier = pipeline(
    task='sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [11]:
def is_toxic(text_to_analyze, comments = False):
    """Heuristically flag text as toxic using the sentiment classifier.

    Negative sentiment is used as a proxy for toxicity: the text is flagged
    when the classifier labels it ``NEGATIVE`` with a score above
    ``TOXICITY_THRESHOLD``.

    Args:
        text_to_analyze: Input handed unchanged to ``classifier``.
        comments: When true, print the label, the score and a one-line
            verdict.

    Returns:
        The verdict for the first result the classifier returns, or ``False``
        when the classifier returns no results.
    """
    # Run the classifier (model and tokenizer)
    results = classifier(text_to_analyze)

    # Only the first result decides the verdict. Taking it explicitly (rather
    # than returning from inside a loop) makes that visible and gives a
    # definite False, not an implicit None, when there are no results.
    result = next(iter(results), None)
    if result is None:
        return False

    label = result['label']
    score = result['score']

    # Heuristic for toxicity based on negative sentiment score
    res = label == 'NEGATIVE' and score > TOXICITY_THRESHOLD
    if comments:
        # Print results
        print(f"Label: {label}, Score: {score}")
        if res:
            print("The text may be considered toxic.")
        else:
            print("The text is unlikely to be toxic.")
    return res

In [12]:
# End-to-end example: detoxify a sentence, show it, then classify the result.
toxic_text = "i like that shit"
text_to_analyze = detoxify(toxic_text)
print(text_to_analyze)
verdict = is_toxic(text_to_analyze, comments=True)
print("Is toxic:", verdict)

i like that shit
Label: POSITIVE, Score: 0.6746519207954407
The text is unlikely to be toxic.
Is toxic: False
