In [10]:
from datasets import load_dataset
import jieba
import pandas as pd
import re
from tqdm import tqdm
import spacy

# 加载毒性词典
lexicon = load_dataset("textdetox/multilingual_toxic_lexicon", split="de")
toxic_words = set(lexicon["text"])

# 中文分词正则备用
space_re = re.compile(r"\s+")

#def delete_detox(text: str) -> str:
 #   words = jieba.cut(text)
  #  return "".join([w for w in words if w not in toxic_words])

#nlp = spacy.load("es_core_news_sm")

# 删除毒词（支持词形还原）
#def delete_detox(text: str) -> str:
 #   doc = nlp(text)
  #  clean_tokens = [token.text for token in doc if token.lemma_.lower() not in toxic_words]
   # return spacy.tokens.Doc(doc.vocab, words=clean_tokens).text


def delete_detox(text: str) -> str:
    words = space_re.split(text)
    return " ".join([w for w in words if w.lower().strip() not in toxic_words])


# 加载原始数据
es_ds = load_dataset("textdetox/multilingual_paradetox", split="de")
es_toxic = es_ds["toxic_sentence"]
langs = ["de"] * len(es_toxic)

# 去毒
es_detox = []
for sent in tqdm(es_toxic, desc="Detoxifying via deletion"):
    es_detox.append(delete_detox(sent))

# 构建 DataFrame
result_df = pd.DataFrame({
    "toxic_sentence": es_toxic,
    "neutral_sentence": es_detox,
    "lang": langs
})

# 保存
result_df.to_csv("baseline_delete_output_de.tsv", sep="\t", index=False)
print("Delete baseline detox completed.")

Detoxifying via deletion: 100%|██████████| 400/400 [00:00<00:00, 193620.50it/s]

Delete baseline detox completed.





In [11]:
toxic_words

{'MILF',
 'abficker',
 'aerschen',
 'affenarsch',
 'analritter',
 'arsch',
 'arsche',
 'arschen',
 'arschficken',
 'arschficker',
 'arschfratze',
 'arschgeburt',
 'arschgeige',
 'arschgesicht',
 'arschig',
 'arschkalt',
 'arschkrampe',
 'arschlecker',
 'arschloch',
 'arschlocher',
 'arschlochkleber',
 'arschlöcher',
 'arschlöchern',
 'bastard',
 'bastarde',
 'bescheissen',
 'bescheißen',
 'beschissenen',
 'bimbo',
 'blasen',
 'bloed',
 'blöd',
 'blöde',
 'blöden',
 'blöder',
 'blödes',
 'blödsinn',
 'bratze',
 'brustwarzen',
 'brüste',
 'bumsen',
 'crackhure',
 'depp',
 'deppen',
 'doedel',
 'doof',
 'doofe',
 'drecksschlampe',
 'duemmsten',
 'dumm',
 'dumme',
 'dummen',
 'dummer',
 'dummes',
 'dummkopf',
 'durchficken',
 'dödel',
 'dümmlichste',
 'dümmsten',
 'fick',
 'ficke',
 'ficken',
 'ficker',
 'fickfresse',
 'fickgesicht',
 'fickgesichter',
 'fickst',
 'fickt',
 'fiesling',
 'flittchen',
 'fotze',
 'fotzen',
 'fotzengesicht',
 'fratze',
 'fucker',
 'furz',
 'furzen',
 'furzt',
 

In [15]:
if 'toxic_words' in toxic_words:
    print(1)