DyslexiaLens

Text Ingestion

In [55]:
#text = input(Text Input: )
text = """Pursuant to the overarching objectives articulated within the Strategic Urban Development Framework, municipal authorities have commenced implementation of multifaceted infrastructural initiatives intended to augment both intermodal connectivity and socioeconomic inclusivity. The preliminary phase encompasses the recalibration of extant transportation modalities, the optimization of resource allocation protocols, and the systemic integration of public-private partnerships to facilitate sustainable urban growth trajectories. Notwithstanding the ostensible commitment to participatory governance paradigms, stakeholder engagement mechanisms have, in practice, predominantly prioritized fiscal imperatives and regulatory compliance over transparent community empowerment. Consequently, marginalized demographics continue to encounter disproportionate barriers to accessing essential civic services, despite nominal adherence to equity-centric policy mandates."""


Preprocessing

1. Text Cleaning

In [56]:
import ftfy
import re

In [57]:
def clean_text(text):

    #to clear the encoding issues
    text = ftfy.fix_text(text)

    #to remove the non printable letters
    text = ''.join(c for c in text if c.isprintable())
    
    #to remove the extra spaces
    text = re.sub(r'[\r\n]+', '\n', text)
    text = re.sub(r'\s{2,}', ' ', text)

    return text.strip()

In [58]:
cleanText = clean_text(text)

2. Segmentation

In [59]:
import nltk 
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/prathoseraaj-v/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [60]:
def segmentation_text(text):
    paragraph = [p for p in text.split('\n') if p.strip()]
    sentance = []

    for para in paragraph:
        sentance.extend(nltk.sent_tokenize(para))

    tokens = [nltk.word_tokenize(sent) for sent in sentance]
    return {
        'paragraph': paragraph,
        'sentences': sentance,
        'tokens': tokens,
    }

In [61]:
preprocessed_text = segmentation_text(text)

Readability Assesment

In [62]:
full_text = " ".join(preprocessed_text["paragraph"])
sentences = preprocessed_text["sentences"]

print(full_text)
print(sentences)

Pursuant to the overarching objectives articulated within the Strategic Urban Development Framework, municipal authorities have commenced implementation of multifaceted infrastructural initiatives intended to augment both intermodal connectivity and socioeconomic inclusivity. The preliminary phase encompasses the recalibration of extant transportation modalities, the optimization of resource allocation protocols, and the systemic integration of public-private partnerships to facilitate sustainable urban growth trajectories. Notwithstanding the ostensible commitment to participatory governance paradigms, stakeholder engagement mechanisms have, in practice, predominantly prioritized fiscal imperatives and regulatory compliance over transparent community empowerment. Consequently, marginalized demographics continue to encounter disproportionate barriers to accessing essential civic services, despite nominal adherence to equity-centric policy mandates.
['Pursuant to the overarching objecti

In [63]:
import spacy

In [64]:
nlp = spacy.load("en_core_web_sm")

1. Readability Score

In [65]:
import textstat

In [66]:
def readability_score(text):
    return{
        "flesch_reading_ease" : textstat.flesch_reading_ease(text),
        "flesch_kincaid_grade" : textstat.flesch_kincaid_grade(text),
        "gunning_fog": textstat.gunning_fog(text),
        "smog_index": textstat.smog_index(text),
        "coleman_liau_index" : textstat.coleman_liau_index(text),
        "automated_readability_index" : textstat.automated_readability_index(text),
        "dale_chall_readability_score" : textstat.dale_chall_readability_score(text),
        "difficult_words_count" : textstat.difficult_words(text),
        "difficult_words_list" : textstat.difficult_words_list(text),
        }

In [67]:
readability_score(text)

{'flesch_reading_ease': -64.40692307692302,
 'flesch_kincaid_grade': 28.701923076923077,
 'gunning_fog': 34.63076923076923,
 'smog_index': 25.800858919413376,
 'coleman_liau_index': 30.186538461538458,
 'automated_readability_index': 30.472788461538464,
 'dale_chall_readability_score': 16.7686,
 'difficult_words_count': 76,
 'difficult_words_list': ['implementation',
  'despite',
  'stakeholder',
  'policy',
  'equitycentric',
  'Urban',
  'Notwithstanding',
  'commitment',
  'nominal',
  'adherence',
  'encompasses',
  'practice',
  'objectives',
  'transparent',
  'participatory',
  'partnerships',
  'publicprivate',
  'recalibration',
  'initiatives',
  'urban',
  'governance',
  'extant',
  'integration',
  'authorities',
  'continue',
  'facilitate',
  'imperatives',
  'Development',
  'intended',
  'Framework',
  'allocation',
  'compliance',
  'intermodal',
  'commenced',
  'transportation',
  'within',
  'systemic',
  'engagement',
  'accessing',
  'articulated',
  'mechanisms'

2. Detect long sentance

In [68]:
def detect_long_sentance(sentances, threshold=25):
    return [sent for sent in sentances if len(sent.split()) > threshold]

In [69]:
detect_long_sentance(sentances=sentences)

['Pursuant to the overarching objectives articulated within the Strategic Urban Development Framework, municipal authorities have commenced implementation of multifaceted infrastructural initiatives intended to augment both intermodal connectivity and socioeconomic inclusivity.',
 'The preliminary phase encompasses the recalibration of extant transportation modalities, the optimization of resource allocation protocols, and the systemic integration of public-private partnerships to facilitate sustainable urban growth trajectories.']

3. Detect passive voice

In [70]:
import spacy

In [71]:
def detect_passive_voice(sentences):
    passive_sentences = []

    for sent in sentences:
        doc = nlp(sent)
        for token in doc:
            if token.dep_ == "nsubjpass":
                passive_sentences.append(sent)
                break

    return passive_sentences

In [72]:
detect_long_sentance(sentances=sentences)

['Pursuant to the overarching objectives articulated within the Strategic Urban Development Framework, municipal authorities have commenced implementation of multifaceted infrastructural initiatives intended to augment both intermodal connectivity and socioeconomic inclusivity.',
 'The preliminary phase encompasses the recalibration of extant transportation modalities, the optimization of resource allocation protocols, and the systemic integration of public-private partnerships to facilitate sustainable urban growth trajectories.']

4. Detect ambigious structures

In [73]:
def detect_ambiguous_structures(sentences):
    ambiguous_keywords = ["might", "could", "possibly", "maybe", "potentially", "approximately", "suggests", "appears"]
    return [sent for sent in sentences if any(word in sent.lower() for word in ambiguous_keywords)]

In [74]:
detect_ambiguous_structures(sentences=sentences)

[]

5. Return the readability assesment 

In [75]:
def assesment_data(preprocessed_text):

    paragraph = "".join(preprocessed_text['paragraph'])
    sentences = preprocessed_text["sentences"]

    return{
        "readability_score" : readability_score(paragraph),
        "long_sentences" : detect_long_sentance(sentences),
        "passive_voice" : detect_passive_voice(sentences),
        "detect_ambiguous_structures" : detect_ambiguous_structures(sentences),
    }

In [76]:
results = assesment_data(preprocessed_text)

Text Simplification

1. Simplified Synonyms

In [77]:
from nltk.corpus import wordnet
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/prathoseraaj-v/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [78]:
def simplified_synonyms(word):
    synsets = wordnet.synsets(word)

    if not synsets:
        return word

    simpleword = synsets[0].lemmas()[0].name()

    if simpleword.lower != word.lower:
        return simpleword.replace('-','')
    
    return word

2. Simplify lexials

In [79]:
def lexical_simplify(sentence, difficult_words):
    words = nltk.word_tokenize(sentence)
    simplified = []
    for w in words:
        if w.lower() in [dw.lower() for dw in difficult_words]:
            simple = simplified_synonyms(w)
            simplified.append(simple)
        else:
            simplified.append(w)
            
    return ' '.join(simplified)

3. Syntatic symplification

In [80]:
def syntactic_simplify(sentence, max_length=20):
    words = nltk.word_tokenize(sentence)
    if len(words) <= max_length:
        return sentence
    split_points = [m.start() for m in re.finditer(r'(,| and | but )', sentence)]
    if split_points:
        parts = []
        prev = 0
        for idx in split_points:
            parts.append(sentence[prev:idx].strip())
            prev = idx
        parts.append(sentence[prev:].strip())
        return '. '.join([p for p in parts if p])
    else:
        midpoint = len(words) // 2
        return ' '.join(words[:midpoint]) + '. ' + ' '.join(words[midpoint:])

4. Passive Voice Parsing

In [81]:
def paraphrase_passive(sentence):
    m = re.match(r'(.*) (is|are|was|were) (.*) by (.*)\.', sentence)
    if m:
        obj, be, verb, agent = m.groups()
        return f"{agent.strip()} {verb.strip()} {obj.strip()}."
    return sentence

5. Simplification to flagged sentence

In [82]:
def simplify_text(results):
    simplified = []
    for sent in results.get("long_sentences", []):
        s = lexical_simplify(sent, results["readability_score"]["difficult_words_list"])
        s = syntactic_simplify(s)
        simplified.append(s)
    for sent in results.get("passive_voice", []):
        s = paraphrase_passive(sent)
        simplified.append(s)
    return simplified

In [83]:
simplified = simplify_text(results)
print(simplified)

['pursuant to the overarch aim joint inside the strategic urban development model. , municipal government have get_down execution of manysided infrastructural enterprise intend to augment both intermodal connectivity. and socioeconomic inclusivity .', 'The preliminary phase embrace the recalibration of extant transportation_system modality. , the optimization of resource allotment protocol. ,. and the systemic integration of public-private partnership to facilitate sustainable urban growth trajectory .']
