DyslexiaLens

Text Ingestion

In [1]:
#text = input(Text Input: )
text = """In many cities today, large construction projects often make life harder for poor and vulnerable people. These projects can push people out of their homes and make it harder to get basic services like water, transport, or healthcare.
Although city planners say they want to build in a fair and sustainable way, they often focus more on money than on including everyone in the process. As a result, the needs of the most at-risk communities are often ignored."""


Preprocessing

1. Text Cleaning

In [2]:
import ftfy
import re

In [3]:
def clean_text(text):

    #to clear the encoding issues
    text = ftfy.fix_text(text)

    #to remove the non printable letters
    text = ''.join(c for c in text if c.isprintable())
    
    #to remove the extra spaces
    text = re.sub(r'[\r\n]+', '\n', text)
    text = re.sub(r'\s{2,}', ' ', text)

    return text.strip()

In [4]:
cleanText = clean_text(text)

2. Segmentation

In [5]:
import nltk 
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/prathoseraaj-v/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
def segmentation_text(text):
    paragraph = [p for p in text.split('\n') if p.strip()]
    sentance = []

    for para in paragraph:
        sentance.extend(nltk.sent_tokenize(para))

    tokens = [nltk.word_tokenize(sent) for sent in sentance]
    return {
        'paragraph': paragraph,
        'sentences': sentance,
        'tokens': tokens,
    }

In [7]:
preprocessed_text = segmentation_text(text)

Readability Assesment

In [8]:
full_text = " ".join(preprocessed_text["paragraph"])
sentences = preprocessed_text["sentences"]

print(full_text)
print(sentences)

In many cities today, large construction projects often make life harder for poor and vulnerable people. These projects can push people out of their homes and make it harder to get basic services like water, transport, or healthcare. Although city planners say they want to build in a fair and sustainable way, they often focus more on money than on including everyone in the process. As a result, the needs of the most at-risk communities are often ignored.
['In many cities today, large construction projects often make life harder for poor and vulnerable people.', 'These projects can push people out of their homes and make it harder to get basic services like water, transport, or healthcare.', 'Although city planners say they want to build in a fair and sustainable way, they often focus more on money than on including everyone in the process.', 'As a result, the needs of the most at-risk communities are often ignored.']


In [9]:
import spacy

In [10]:
nlp = spacy.load("en_core_web_sm")

1. Readability Score

In [11]:
import textstat

In [12]:
def readability_score(text):
    return{
        "flesch_reading_ease" : textstat.flesch_reading_ease(text),
        "flesch_kincaid_grade" : textstat.flesch_kincaid_grade(text),
        "gunning_fog": textstat.gunning_fog(text),
        "smog_index": textstat.smog_index(text),
        "coleman_liau_index" : textstat.coleman_liau_index(text),
        "automated_readability_index" : textstat.automated_readability_index(text),
        "dale_chall_readability_score" : textstat.dale_chall_readability_score(text),
        "difficult_words_count" : textstat.difficult_words(text),
        "difficult_words_list" : textstat.difficult_words_list(text),
        }

In [13]:
readability_score(text)

{'flesch_reading_ease': 57.97326923076926,
 'flesch_kincaid_grade': 10.017564102564105,
 'gunning_fog': 10.876923076923077,
 'smog_index': 10.686352973137792,
 'coleman_liau_index': 10.26923076923077,
 'automated_readability_index': 11.326538461538462,
 'dale_chall_readability_score': 9.259725641025641,
 'difficult_words_count': 18,
 'difficult_words_list': ['communities',
  'services',
  'transport',
  'projects',
  'Although',
  'sustainable',
  'planners',
  'healthcare',
  'focus',
  'result',
  'ignored',
  'vulnerable',
  'cities',
  'harder',
  'construction',
  'basic',
  'including',
  'process']}

2. Detect long sentance

In [14]:
def detect_long_sentance(sentances, threshold=25):
    return [sent for sent in sentances if len(sent.split()) > threshold]

In [15]:
detect_long_sentance(sentances=sentences)

['Although city planners say they want to build in a fair and sustainable way, they often focus more on money than on including everyone in the process.']

3. Detect passive voice

In [16]:
import spacy

In [17]:
def detect_passive_voice(sentences):
    passive_sentences = []

    for sent in sentences:
        doc = nlp(sent)
        for token in doc:
            if token.dep_ == "nsubjpass":
                passive_sentences.append(sent)
                break

    return passive_sentences

In [18]:
detect_long_sentance(sentances=sentences)

['Although city planners say they want to build in a fair and sustainable way, they often focus more on money than on including everyone in the process.']

4. Detect ambigious structures

In [19]:
def detect_ambiguous_structures(sentences):
    ambiguous_keywords = ["might", "could", "possibly", "maybe", "potentially", "approximately", "suggests", "appears"]
    return [sent for sent in sentences if any(word in sent.lower() for word in ambiguous_keywords)]

In [20]:
detect_ambiguous_structures(sentences=sentences)

[]

5. Return the readability assesment 

In [None]:
def assesment_data(preprocessed_text):

    paragraph = "".join(preprocessed_text['paragraph'])
    sentences = preprocessed_text["sentences"]

    return{
        "readability_score" : readability_score(paragraph),
        "long_sentences" : detect_long_sentance(sentences),
        "passive_voice" : detect_passive_voice(sentences),
        "detect_ambiguous_structures" : detect_ambiguous_structures(sentences),
    }

In [22]:
assesment_data(preprocessed_text)

{'readability_score': {'flesch_reading_ease': 57.64949675324678,
  'flesch_kincaid_grade': 10.000616883116887,
  'gunning_fog': 11.336363636363638,
  'smog_index': 11.20814326018867,
  'coleman_liau_index': 10.607792207792208,
  'automated_readability_index': 11.500324675324677,
  'dale_chall_readability_score': 9.102728571428571,
  'difficult_words_count': 17,
  'difficult_words_list': ['communities',
   'services',
   'transport',
   'projects',
   'sustainable',
   'planners',
   'healthcareAlthough',
   'result',
   'focus',
   'ignored',
   'vulnerable',
   'cities',
   'harder',
   'construction',
   'basic',
   'including',
   'process']},
 'long_sentences': ['Although city planners say they want to build in a fair and sustainable way, they often focus more on money than on including everyone in the process.'],
 'passive_voice': ['As a result, the needs of the most at-risk communities are often ignored.'],
 'detect_ambiguous_structures': []}