In [None]:
!pip install presidio_analyzer

In [None]:
!pip install presidio_anonymizer

In [None]:
!python -m spacy download en_core_web_lg

# Presidio Analyzer & Anonymizer 
Detects PII (personally identifiable information) in text and masks each detected entity with its entity-type label (for example `<PERSON>`).

In [6]:
from html import entities
from langcodes import Language
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer.entities import OperatorConfig

# Demo sentence containing a person name, a phone number and an email address.
txt = "Justin Bieber had a concert in San Francisco last Friday. He can be reached at 647-671-6711 and his email id is justin.bieber@gmail.com"

# Build both engines up front; neither depends on the other.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Only these three entity types are requested from the analyzer.
analyzer_results = analyzer.analyze(
    text=txt,
    entities=["EMAIL_ADDRESS", "PERSON", "PHONE_NUMBER"],
    language='en',
)

# No operators are passed, so the anonymizer's defaults apply; the recorded
# output shows each detected span replaced by its <ENTITY_TYPE> tag.
anonymized_results = anonymizer.anonymize(
    analyzer_results=analyzer_results,
    text=txt,
)

print(anonymized_results.text)

<PERSON> had a concert in San Francisco last Friday. He can be reached at <PHONE_NUMBER> and his email id is <EMAIL_ADDRESS>


# SymSpell

In [None]:
!pip install symspellpy==6.7.1

In [14]:
from symspellpy import SymSpell, Verbosity

import pkg_resources

# Shared SymSpell instance used by symspell_corrector() below.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Resolve the frequency dictionary file inside the installed symspellpy package.
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")

# load_dictionary() signals a missing file by returning False rather than
# raising; fail loudly here so later lookups never run against an empty
# dictionary. term_index / count_index select the term and count columns.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(f"SymSpell dictionary could not be loaded: {dictionary_path}")

def symspell_corrector(input_term):
    """Spell-correct a phrase with the notebook-level ``sym_spell`` instance.

    Args:
        input_term: The phrase to pass to ``sym_spell.lookup_compound``.

    Returns:
        The string ``"OUTPUT : <term>"`` built from the first suggestion, or
        ``None`` when the lookup yields no suggestions.
    """
    candidates = sym_spell.lookup_compound(
        phrase=input_term,
        max_edit_distance=2,
        ignore_non_words=True,
        ignore_term_with_digits=True,
        transfer_casing=True,
        split_by_space=True,
    )

    # Only the first suggestion is ever used.
    best = next(iter(candidates), None)
    if best is None:
        return None
    return f"OUTPUT : {best.term}"

# Sample with misspellings and a missing space ("greatfood").
text = "The resturant had greatfood. The location ws nise as wel."
# Bare last expression: the notebook displays the returned string itself.
# Note that the recorded output no longer contains the sentence-ending periods.
symspell_corrector(text)

'OUTPUT : The restaurant had great food The location is nice as well'

# PySBD (Python Sentence Boundary Disambiguation)

In [None]:
!pip install pysbd

In [20]:
from pysbd import Segmenter
# Sample full of periods that do not end a sentence: initials ("S. K."),
# a page reference ("p. 45") and abbreviations ("P.M.", "A.W.S.").
text = "My name is S. K. Bhattacharyya. I have read the book up to p. 45. At 5: 30 P.M. we will talk about A.W.S."

# English segmenter, created with pysbd's `clean` option switched on.
segmenter = Segmenter(language='en', clean=True)

# Prints the list of sentences returned by segment(); the recorded run shows three.
print(segmenter.segment(text))

['My name is S. K. Bhattacharyya.', 'I have read the book up to p. 45.', 'At 5: 30 P.M. we will talk about A.W.S.']


# Contractions

In [None]:
!pip install contractions

In [23]:
import contractions

# Informal sample mixing an apostrophe-less contraction ("ive"), a regular one
# ("i'll") and slang forms ("gotta", "Gonna", "yall").
a = "ive gotta go! Gonna miss you. i'll see yall later."
# slang=True is presumably what expands the slang forms; in the recorded output
# "gotta" became "got to", "yall" became "you all", and "Gonna" came back
# title-cased as "Going To".
text = contractions.fix(a, slang=True)
print(text)

i have got to go! Going To miss you. i will see you all later.


# Gibberish Detector

In [None]:
!pip install gibberish_detector

In [25]:
# Runs the gibberish-detector CLI's `train` command on big.txt and redirects
# its stdout into gibberish-detector.model, the file loaded by the next cell.
# NOTE(review): big.txt is read from the current working directory and no
# visible cell creates or downloads it — confirm it is provided with the notebook.
!gibberish-detector train big.txt > gibberish-detector.model

In [26]:
from gibberish_detector import detector
# Load the model file produced by the training command.
# (The capitalised name presumably avoids shadowing the imported `detector` module.)
Detector = detector.create_from_model('gibberish-detector.model')

# One random-looking string and one ordinary English word.
text1 = "jkdjsjgsdfgjsdgfjdsgfjsd"
text2 = "apple"

# Print one is_gibberish() result per sample, in order.
for sample in (text1, text2):
    print(Detector.is_gibberish(sample))


True
False
