In [1]:
import numpy as np 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize,word_tokenize
from bs4 import BeautifulSoup
import requests
import re

In [2]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
def clean(sentences):
	lemmatizer = WordNetLemmatizer()
	cleaned_sentences = []
	for sentence in sentences:
		sentence = sentence.lower()
		sentence = re.sub(r'[^a-zA-Z]',' ',sentence)
		sentence = sentence.split()
		sentence = [lemmatizer.lemmatize(word) for word in sentence if word not in set(stopwords.words('english'))]
		sentence = ' '.join(sentence)
		cleaned_sentences.append(sentence)
	return cleaned_sentences

In [4]:
def init_probability(sentences):
	probability_dict = {}
	words = word_tokenize('. '.join(sentences))
	total_words = len(set(words))
	for word in words:
		if word!='.':
			if not probability_dict.get(word):
				probability_dict[word] = 1
			else:
				probability_dict[word] += 1

	for word,count in probability_dict.items():
		probability_dict[word] = count/total_words 
	
	return probability_dict

In [5]:
def update_probability(probability_dict,word):
	if probability_dict.get(word):
		probability_dict[word] = probability_dict[word]**2
	return probability_dict

In [6]:
def average_sentence_weights(sentences,probability_dict):
	sentence_weights = {}
	for index,sentence in enumerate(sentences):
		if len(sentence) != 0:
			average_proba = sum([probability_dict[word] for word in sentence if word in probability_dict.keys()])
			average_proba /= len(sentence)
			sentence_weights[index] = average_proba 
	return sentence_weights


In [7]:
def generate_summary(sentence_weights,probability_dict,cleaned_article,tokenized_article,summary_length = 30):
	summary = ""
	current_length = 0
	while current_length < summary_length :
		highest_probability_word = max(probability_dict,key=probability_dict.get)
		sentences_with_max_word= [index for index,sentence in enumerate(cleaned_article) if highest_probability_word in set(word_tokenize(sentence))]
		sentence_list = sorted([[index,sentence_weights[index]] for index in sentences_with_max_word],key=lambda x:x[1],reverse=True)
		summary += tokenized_article[sentence_list[0][0]] + "\n"
		for word in word_tokenize(cleaned_article[sentence_list[0][0]]):
			probability_dict = update_probability(probability_dict,word)
		current_length+=1
	return summary

In [8]:
from bs4 import BeautifulSoup
import requests

url = 'https://www.hrw.org/report/2020/11/24/sparkling-jewels-opaque-supply-chains/jewelry-companies-changing-sourcing'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

whole_section = soup.find('div',{'class':'WordSection2'})
paras = whole_section.findAll('p')
content = ""
model_summary = ""

for para in paras:
  if para.text != 'Around the world, people living near or working at gold and diamond mines have for many years suffered serious human rights abuses, including those stemming from large-scale environmental destruction. An estimated 40 million people work in artisanal and small-scale mining, and an additional 100 million people indirectly depend on the sector for their livelihoods.[1] Artisanal and small-scale mines operate with little or no machinery and often belong to the informal sector. By comparison, around seven million people work globally in industrial, large-scale mining operations.[2]':
    model_summary = model_summary + para.text
    
  else:
    break
    
for para in paras[18:]:
    content= content + para.text

	


In [9]:
print(model_summary)

The Covid-19 pandemic has demonstrated the fragility of global supply chains and the vulnerability of people working at the bottom of these supply chains. In the mining sector, the pandemic has had devastating effects on workers and communities around the world. In some parts of Africa, Asia, and Latin America, small-scale mining activity has been reduced or halted due to lockdowns and blocked trade routes. Where mining has been suspended, mine workers and their families have lost their income. Where mining has continued, workers and affected communities have been exposed to increased risks to their human rights. In some small-scale mining areas, child labor has risen.In addition, some illegal mine operators and traders have made use of the Covid-19 pandemic to expand their unlawful small-scale mining activities. Illegal gold mining in Africa and Latin America threatens the environment and rights protections, especially the rights of Indigenous peoples. And while lockdowns have been im

In [10]:
print(content)

Around the world, people living near or working at gold and diamond mines have for many years suffered serious human rights abuses, including those stemming from large-scale environmental destruction. An estimated 40 million people work in artisanal and small-scale mining, and an additional 100 million people indirectly depend on the sector for their livelihoods.[1] Artisanal and small-scale mines operate with little or no machinery and often belong to the informal sector. By comparison, around seven million people work globally in industrial, large-scale mining operations.[2]Basic labor rights are violated in the context of artisanal and small-scale mining. For example, young children have worked in small-scale gold or diamond mines, often at the expense of their education. Disregard for health and safety standards has resulted in mining accidents, injuring and killing child as well as adult miners.[3] Miners have also been subject to trafficking or forced labor in both small-scale an

In [11]:
required_length = int(input("Enter the number of required sentences"))
tokenized_article = sent_tokenize(content)
cleaned_article = clean(tokenized_article) 
probability_dict = init_probability(cleaned_article)
sentence_weights = average_sentence_weights(cleaned_article,probability_dict)
summary = generate_summary(sentence_weights,probability_dict,cleaned_article,tokenized_article,required_length)
print(summary)

Enter the number of required sentences60
[50]The human rights responsibility of companies is articulated in the 2011 UN Guiding Principles on Business and Human Rights (the “UN Guiding Principles”).
In 2019, Alrosa produced 35.5 million carats of rough diamonds and its revenue from sales was about US$2.7billion.
In 2019, Bulgari exclusively sourced recycled gold.
[251] The company has no publicly available sourcing policy and does not publish information on any human rights due diligence measures or the names of its suppliers.
[41] By July 2020, most mines had resumed operations.
They set vicious dogs on us which mauled us for about 10 to 15 minutes as they watched, leaving us severely injured.”[21]Since the outbreak of Covid-19, the Zimbabwean government has declared mining an essential service and allowed operations to continue.
[136] The standard requires all mines to be audited on a regular basis.
[51] Under the UN Guiding Principles, businesses have a responsibility to ensure that