Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Helped fix major limitations; added 4 tests for it #49

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 46 additions & 3 deletions better_profanity/better_profanity.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from collections.abc import Iterable

from .constants import ALLOWED_CHARACTERS
from .constants import ALLOWED_CHARACTERS, ALLOWED_CONTANING_PROFANITY
from .utils import (
any_next_words_form_swear_word,
get_complete_path_of_file,
Expand All @@ -13,7 +13,7 @@


class Profanity:
def __init__(self, words=None):
def __init__(self, words=None, whitelist=None):
"""
Args:
words (Iterable/str): Collection of words or file path for a list of
Expand Down Expand Up @@ -43,6 +43,11 @@ def __init__(self, words=None):
}
self.MAX_NUMBER_COMBINATIONS = 1
self.ALLOWED_CHARACTERS = ALLOWED_CHARACTERS

self.whitelist = whitelist or set([])
self.whitelist = set(self.whitelist)
self.whitelist.update(ALLOWED_CONTANING_PROFANITY)

self._default_wordlist_filename = get_complete_path_of_file(
"profanity_wordlist.txt"
)
Expand Down Expand Up @@ -98,7 +103,9 @@ def _populate_words_to_wordset(self, words, *, whitelist_words=None):
)

# Validation
whitelist_words = whitelist_words or []
whitelist_words = whitelist_words or set([])
self.whitelist.update(whitelist_words)

for index, word in enumerate(whitelist_words):
if not isinstance(word, str):
raise ValueError(
Expand Down Expand Up @@ -176,11 +183,13 @@ def _hide_swear_words(self, text, censor_char):
cur_word = ""
continue


# Iterate the next words combined with the current one
# to check if it forms a swear word
next_words_indices = self._update_next_words_indices(
text, next_words_indices, index
)

contains_swear_word, end_index = any_next_words_form_swear_word(
cur_word, next_words_indices, self.CENSOR_WORDSET
)
Expand All @@ -194,16 +203,50 @@ def _hide_swear_words(self, text, censor_char):
if cur_word.lower() in self.CENSOR_WORDSET:
cur_word = get_replacement_for_swear_word(censor_char)



censored_text += cur_word + char
cur_word = ""




# Final check
if cur_word != "" and skip_index < len(text) - 1:
if cur_word.lower() in self.CENSOR_WORDSET:
cur_word = get_replacement_for_swear_word(censor_char)


# Check if removeing letters from behind makes a swear word
cur_word = self._check_for_profanity_within(cur_word, censor_char, [])
censored_text += cur_word

return censored_text

def _check_for_profanity_within(self, cur_word, censor_char, next_words_indices):
    """Censor profanity embedded inside `cur_word`.

    Two passes are made over the word:
      1. suffix pass -- strip leading characters one at a time and censor
         the tail if the remainder is a swear word (e.g. "Afoobar" -> "A****");
      2. prefix pass -- test prefixes from longest to shortest and censor
         the head if it matches (e.g. "foobarAAA" -> "****AAA").

    A word whose lowercase form is in ``self.whitelist`` is returned
    untouched; so is a word that is itself in the censor wordset
    (whole-word censoring is assumed to be handled by the caller).

    Args:
        cur_word (str): word to inspect.
        censor_char (str): character used to build the replacement.
        next_words_indices (list): unused here; kept for signature
            compatibility with the caller.

    Returns:
        str: the word with any embedded profanity replaced.
    """
    if cur_word in self.CENSOR_WORDSET:
        return cur_word

    if cur_word.lower() not in self.whitelist:
        # Suffix pass: drop characters from the front until the remainder
        # matches a swear word, then censor that remainder.
        for start in range(len(cur_word)):
            if cur_word[start:].lower() in self.CENSOR_WORDSET:
                cur_word = cur_word[:start] + get_replacement_for_swear_word(
                    censor_char
                )
                break

        # Prefix pass, longest prefix first.  The previous implementation
        # began one character PAST the end of the word (`cur_word + 'a'`),
        # testing a string that never appears in the input -- an off-by-one.
        # Only real prefixes (length len(cur_word) down to 2) are tested now.
        for end in range(len(cur_word), 1, -1):
            if cur_word[:end].lower() in self.CENSOR_WORDSET:
                cur_word = (
                    get_replacement_for_swear_word(censor_char) + cur_word[end:]
                )
                break

    return cur_word

def _get_start_index_of_next_word(self, text, start_idx):
"""Return the index of the first character of the next word in the given text."""
start_idx_of_next_word = len(text)
Expand Down
6 changes: 6 additions & 0 deletions better_profanity/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,9 @@
# Pre-load the unicode characters considered part of words: merge the
# letters listed in the bundled alphabetic_unicode.json into the
# ALLOWED_CHARACTERS set (presumably `load` is json.load -- TODO confirm
# against the file's import block, which is outside this view).
with open(get_complete_path_of_file("alphabetic_unicode.json"), "r") as json_file:
    ALLOWED_CHARACTERS.update(load(json_file))



# Words that merely CONTAIN a profane substring but are themselves clean
# (e.g. "night"); they are exempted from the substring-censoring pass.
# NOTE(review): the "contaning" spelling is preserved because the constant
# and the data file are referenced by this name elsewhere in the package.
# Iterate the file handle directly instead of materializing readlines().
with open(get_complete_path_of_file("contaning_allowed.txt"), "r") as txt_file:
    ALLOWED_CONTANING_PROFANITY = [line.strip() for line in txt_file]

4 changes: 4 additions & 0 deletions better_profanity/contaning_allowed.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
night
nightmare
laborum
hello
21 changes: 21 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,27 @@ def test_censorship_with_ending_swear_word(self):
bad_text = "That wh0re gave m3 a very good H@nD j0b."
censored_text = "That **** gave m3 a very good ****."
self.assertEqual(profanity.censor(bad_text), censored_text)

def test_obstructing_letter_1(self):
    # A single leading letter glued onto a swear word must not hide it.
    self.assertEqual(profanity.censor("Afoobar"), "A****")

def test_obstructing_letter_multible(self):
    # Many leading letters glued onto a swear word must not hide it.
    # (Method name spelling kept: unittest discovers tests by name.)
    self.assertEqual(
        profanity.censor("AAAAAAAAAfoobar"), "AAAAAAAAA****"
    )

def test_end_letter_obstructing(self):
    # Trailing junk letters after a swear word must not prevent censoring.
    self.assertEqual(
        profanity.censor("foobarAAAAAAAAA"), "****AAAAAAAAA"
    )

def test_clean_word_that_contains(self):
    # "night" embeds a profane substring but is whitelisted, so it must
    # come back unchanged.
    word = "night"
    self.assertEqual(profanity.censor(word), word)


def test_censorship_empty_text(self):
    # Censoring the empty string is a no-op.
    self.assertEqual(profanity.censor(""), "")
Expand Down Expand Up @@ -205,6 +225,7 @@ def setUp(self):
def test_whitelist_words(self):
bad_text = "I have boobs"
censored_text = "I have ****"

self.assertEqual(profanity.censor(bad_text), censored_text)

# Whitelist the word `boobs`
Expand Down