In [1]:
#importing libraries
# !pip install beautifulsoup4
# !pip install urllib3

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import bs4 as BeautifulSoup
import urllib.request 

In [18]:
# fetching the content from url
fetched_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Citizenship_(Amendment)_Act,_2019')           


article_read = fetched_data.read()

# parsing the URL and storing in a variable
article_parsed = BeautifulSoup.BeautifulSoup(article_read, 'html.parser')

# returning <p> tags
paragraphs = article_parsed.find_all('p')

article_content = ''

# looping through the paragraphs and adding them to the variable
for p in paragraphs:
    article_content += p.text

In [19]:
article_content



In [20]:
def _create_dictionary_table(text_string) -> dict:
    
    # removing stop words
    stop_words  = set(stopwords.words("english"))
    
    words  = word_tokenize(text_string)
    
    # reducing words to their root form 
    stem = PorterStemmer()
    
    # creating dictionary for the word frequency table
    frequency_table = dict()
    for wd in words:
        wd = stem.stem(wd)
        if wd in stop_words:
            continue
        if wd in frequency_table:
            frequency_table[wd] += 1
        else:
            frequency_table[wd] = 1
            
    return frequency_table

In [21]:
freqtable = _create_dictionary_table(article_content)
freqtable

{'citizenship': 52,
 '(': 23,
 'amend': 31,
 ')': 23,
 'act': 53,
 ',': 260,
 '2019': 23,
 'wa': 28,
 'pass': 7,
 'parliament': 6,
 'india': 62,
 '11': 4,
 'decemb': 20,
 '.': 190,
 'It': 12,
 '1955': 8,
 'provid': 8,
 'path': 1,
 'indian': 42,
 'hindu': 16,
 'sikh': 12,
 'buddhist': 7,
 'jain': 5,
 'parsi': 6,
 'christian': 9,
 'religi': 30,
 'minor': 23,
 'flee': 2,
 'persecut': 29,
 'pakistan': 20,
 'bangladesh': 16,
 'afghanistan': 12,
 '[': 207,
 '2': 6,
 ']': 207,
 'muslim': 26,
 'given': 2,
 'elig': 7,
 '3': 2,
 '4': 4,
 'first': 5,
 'time': 2,
 'religion': 9,
 'use': 5,
 'criterion': 1,
 'law': 18,
 '5': 2,
 'ha': 17,
 'seriou': 1,
 'widespread': 2,
 'problem': 1,
 '6': 4,
 '7': 3,
 '8': 3,
 'nationalist': 3,
 'bharatiya': 3,
 'janata': 4,
 'parti': 5,
 'bjp': 10,
 'lead': 1,
 'govern': 20,
 'promis': 3,
 'previou': 2,
 'elect': 6,
 'manifesto': 2,
 'offer': 6,
 'neighbor': 2,
 'countri': 23,
 '9': 4,
 '10': 5,
 'migrant': 15,
 'enter': 7,
 '31': 5,
 '2014': 4,
 'suffer': 3,
 '

In [22]:
sentences = sent_tokenize(article_content)
sentences

['\n\nThe Citizenship (Amendment) Act, 2019 was passed by the Parliament of India on 11 December 2019.',
 'It amended the Citizenship Act of 1955 by providing a path to Indian citizenship for Hindu, Sikh, Buddhist, Jain, Parsi, and Christian religious minorities fleeing persecution from Pakistan, Bangladesh and Afghanistan.',
 '[2] Muslims were not given such eligibility.',
 '[3][4] The act was the first time religion had been used as a criterion for citizenship under Indian law.',
 '[5]\nThe religious persecution of minorities such as Hindus, Sikhs and Christians has been a serious and widespread problem in Pakistan.',
 '[6][7][8] The Hindu nationalist Bharatiya Janata Party (BJP), which leads the Indian government, had promised in previous election manifestos to offer Indian citizenship to persecuted religious minorities from neighboring countries.',
 '[9][10] Under the 2019 amendment, migrants who had entered India by 31 December 2014, and had suffered "religious persecution or fear

In [23]:
def _calculate_sentence_scores(sentences, frequency_table) -> dict:
    
    # algorithm for scoring a sentence by its words
    sentence_weight = dict()
    
    for sentence in sentences:
        # sentence_wordcount = (len(word_tokenize(sentence)))
        sentence_wordcount = 0
        for word in frequency_table:
            if word in sentence.lower():
                sentence_wordcount += 1
                if sentence[:7] in sentence_weight:
                    sentence_weight[sentence[:7]] += frequency_table[word]
                else:
                    sentence_weight[sentence[:7]] = frequency_table[word]
                
            
        sentence_weight[sentence[:7]] = sentence_weight[sentence[:7]] / sentence_wordcount
    
    return sentence_weight
    

In [24]:
sentscore = _calculate_sentence_scores(sentences, freqtable)
sentscore

{'\n\nThe C': 32.96,
 'It amen': 23.30952380952381,
 '[2] Mus': 59.27272727272727,
 '[3][4] ': 36.96153846153846,
 '[5]\nThe': 40.30769230769231,
 '[6][7][': 33.33556231003039,
 '[9][10]': 37.42037037037037,
 '[2] The': 41.88235294117647,
 '[11] Ac': 37.03225806451613,
 'About 2': 22.96875,
 '[12][13': 38.7037037037037,
 '[14][15': 23.72340425531915,
 '[16] Cr': 37.0,
 'Comment': 29.25,
 '[17][18': 28.102040816326532,
 '[10][19': 45.416666666666664,
 '[20][21': 38.8421052631579,
 '[23] As': 28.391304347826086,
 '[24][25': 31.71560846560847,
 '[27]  M': 48.125,
 'Student': 15.058823529411764,
 '[28] Th': 27.52777777777778,
 '[29][30': 37.42857142857143,
 'The Ind': 32.07857142857143,
 'This ac': 26.66193181818182,
 'The act': 24.25,
 'The 195': 15.947368421052632,
 '[31]\nIn': 28.692307692307693,
 '[32][33': 43.875,
 'All ref': 26.4,
 'While I': 25.8,
 '[35][36': 29.944444444444443,
 '[35][38': 27.44,
 'They al': 15.481159420289854,
 '[40] Sp': 31.685714285714287,
 'Eligibi': 16.44,
 '[

In [25]:
def _calculate_average_score(sentence_weight) -> int:
    
    # calculating the average score for the sentences
    sum_value = 0
    for entry in sentence_weight:
        sum_value += sentence_weight[entry]
        
    # getting sentence average value from source text
    average_score = (sum_value / len(sentence_weight))
    
    return average_score

In [26]:
threshold = _calculate_average_score(sentscore)
threshold

31.010034117730548

In [27]:
def _get_article_summary(sentences, sentence_weight, threshold):
    
    article_summary = ''
    for sentence in sentences:
        if sentence[:7] in sentence_weight and sentence_weight[sentence[:7]] >= (threshold): 
            article_summary += " "+sentence
            
    return article_summary

In [28]:
def _run_article_summary(article):
    
    #creating a dictionary for the aord frequency table
    frequency_table = _create_dictionary_table(article)
    
    # tokenising the sentences
    sentences = sent_tokenize(article)
    
    # algorithm for scoring a sentence bu its words
    sentence_scores = _calculate_sentence_scores(sentences, frequency_table)
    
    #getting the threshold
    threshold = _calculate_average_score(sentence_scores)
    
    #producing the summary 
    article_summary = _get_article_summary(sentences, sentence_scores, 1.5*threshold)
    
    return article_summary

    
    
    

In [29]:
summary = _run_article_summary(article_content)

In [30]:
print(summary)

 [2] Muslims were not given such eligibility. [27]  Major protests against the Act were held at universities in India. [6] This has led to attacks and forced conversion of Christians and Hindus, as well as attacks on Sufis and Ahmadis. [75] The act will come into force on a date chosen by the Government of India, and will be notified as such. They should be treated as minorities. [25]
Internet access was restricted in Assam state. The states have no power to reject it,". [156]
