In [26]:
import numpy as np
import pandas as pd
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize

In [27]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [28]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
import pickle

path = '/content/drive/My Drive/nlp/glove.840B.300d.pkl'

with open(path,'rb') as f:
  embeddings = pickle.load(f)

In [30]:
from nltk.stem import WordNetLemmatizer
import re
lem = WordNetLemmatizer()

In [31]:
def clean(sentence):
  sentence = sentence.lower()
  sentence = re.sub(r'http\S+',' ',sentence)
  sentence = re.sub(r'[^a-zA-Z]',' ',sentence)
  sentence = sentence.split()
  sentence = [lem.lemmatize(word) for word in sentence if word not in stopwords.words('english')]
  sentence = ' '.join(sentence)
  sentence = sentence + "\n"
  return sentence

In [32]:
def average_vector(sentence):
  words = sentence.split()
  size = len(words)
  average_vector = np.zeros((size,300))
  unknown_words=[]

  for index,word in enumerate(words):
    try:  
        average_vector[index] = embeddings[word].reshape(1,-1)
    except Exception as e:
      unknown_words.append(word)
      average_vector[index] = 0

  if size!=0:
    average_vector = sum(average_vector)/size
  return average_vector,unknown_words

In [33]:
def cosine_similarity(vector_1,vector_2):
  cosine_similarity = 0
  try:
    cosine_similarity = (np.dot(vector_1,vector_2)/(np.linalg.norm(vector_1)*np.linalg.norm(vector_2)))
  except Exception as e :
    # print("Exception occured",str(e))
    pass
  return cosine_similarity

In [34]:
def find_similarity(string1,string2):
  # string1,string2 = clean(string1),clean(string2)
  vector1,unknown_words1 = average_vector(string1)
  vector2,unknown_words2 = average_vector(string2)
  similarity = cosine_similarity(vector1,vector2)
  return similarity

In [None]:
""" https://www.hrw.org/report/2020/11/24/sparkling-jewels-opaque-supply-chains/jewelry-companies-changing-sourcing, WordSection2, Around the world, people living near or working at gold and diamond mines have for many years suffered serious human rights abuses, including those stemming from large-scale environmental destruction. An estimated 40 million people work in artisanal and small-scale mining, and an additional 100 million people indirectly depend on the sector for their livelihoods.[1] Artisanal and small-scale mines operate with little or no machinery and often belong to the informal sector. By comparison, around seven million people work globally in industrial, large-scale mining operations.[2]:"""
"""https://www.hrw.org/report/2020/11/20/no-law-no-justice-no-state-victims/culture-impunity-post-conflict-nepal, WordSection3, Around the world, people living near or working at gold and diamond mines have for many years suffered serious human rights abuses, including those stemming from large-scale environmental destruction. An estimated 40 million people work in artisanal and small-scale mining, and an additional 100 million people indirectly depend on the sector for their livelihoods.[1] Artisanal and small-scale mines operate with little or no machinery and often belong to the informal sector. By comparison, around seven million people work globally in industrial, large-scale mining operations.[2]:"""


In [35]:
from bs4 import BeautifulSoup
import requests

url = 'https://www.hrw.org/report/2020/11/24/sparkling-jewels-opaque-supply-chains/jewelry-companies-changing-sourcing'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

whole_section = soup.find('div',{'class':'WordSection2'})
paras = whole_section.findAll('p')
content = ""
model_summary = ""

for para in paras:
  if para.text != 'Around the world, people living near or working at gold and diamond mines have for many years suffered serious human rights abuses, including those stemming from large-scale environmental destruction. An estimated 40 million people work in artisanal and small-scale mining, and an additional 100 million people indirectly depend on the sector for their livelihoods.[1] Artisanal and small-scale mines operate with little or no machinery and often belong to the informal sector. By comparison, around seven million people work globally in industrial, large-scale mining operations.[2]':
    model_summary = model_summary + para.text + "\n" 
    
  else:
    break
    
for para in paras[18:]:
    content= content + para.text
  

In [36]:
print(model_summary)

The Covid-19 pandemic has demonstrated the fragility of global supply chains and the vulnerability of people working at the bottom of these supply chains. In the mining sector, the pandemic has had devastating effects on workers and communities around the world. In some parts of Africa, Asia, and Latin America, small-scale mining activity has been reduced or halted due to lockdowns and blocked trade routes. Where mining has been suspended, mine workers and their families have lost their income. Where mining has continued, workers and affected communities have been exposed to increased risks to their human rights. In some small-scale mining areas, child labor has risen.
In addition, some illegal mine operators and traders have made use of the Covid-19 pandemic to expand their unlawful small-scale mining activities. Illegal gold mining in Africa and Latin America threatens the environment and rights protections, especially the rights of Indigenous peoples. And while lockdowns have been i

In [37]:
sentences = sent_tokenize(content)

In [38]:
cleaned_sentences=[]
for sentence in sentences:
  cleaned_sentences.append(clean(sentence))

In [39]:
similarity_matrix = np.zeros((len(cleaned_sentences),len(cleaned_sentences)))

for i in range(0,len(cleaned_sentences)):
  for j in range(0,len(cleaned_sentences)):
    if type(find_similarity(cleaned_sentences[i],cleaned_sentences[j])) == np.float64 :
      similarity_matrix[i,j] = find_similarity(cleaned_sentences[i],cleaned_sentences[j])

In [40]:
similarity_matrix

array([[1.        , 0.82800042, 0.79895498, ..., 0.49927547, 0.59172314,
        0.59727143],
       [0.82800042, 1.        , 0.82593145, ..., 0.46811074, 0.5110569 ,
        0.52954401],
       [0.79895498, 0.82593145, 1.        , ..., 0.43262233, 0.51386363,
        0.54765815],
       ...,
       [0.49927547, 0.46811074, 0.43262233, ..., 1.        , 0.76279871,
        0.72187086],
       [0.59172314, 0.5110569 , 0.51386363, ..., 0.76279871, 1.        ,
        0.84437642],
       [0.59727143, 0.52954401, 0.54765815, ..., 0.72187086, 0.84437642,
        1.        ]])

In [41]:
class Graph:
  
  def __init__(self,graph_dictionary):
    if not graph_dictionary:
      graph_dictionary={}
    self.graph_dictionary = graph_dictionary
  
  def vertices(self):
    return self.graph_dictionary.keys()
  
  def edges(self):
    return self.generate_edges()

  def add_vertex(self,vertex):
    if vertex not in graph_dictionary.keys():
      graph_dictionary[vertex] = []
  
  def add_edge(self,edge):
    vertex1,vertex2 = tuple(set(edge))
    if vertex1 in graph_dictionary.keys():
      graph_dictionary[vertex1].append(vertex2)
    else:
      graph_dictionary[vertex1] = [vertex2]

  def generate_edges(self):
    edges = set()
    for vertex in graph_dictionary.keys():
      for edges in graph_dictionary[vertex]:
        edges.add([vertex,edge])
    return list(edges)

In [42]:
similarity_threshold = 0.70
network_dictionary = {}

for i in range(len(cleaned_sentences)):
    network_dictionary[i] = []  

for i in range(len(cleaned_sentences)):
  for j in range(len(cleaned_sentences)):
    if similarity_matrix[i][j] > similarity_threshold:
      if j not in network_dictionary[i]:
        network_dictionary[i].append(j)
      if i not in network_dictionary[j]:
        network_dictionary[j].append(i)


In [43]:
graph = Graph(network_dictionary)

In [44]:
def page_rank(graph,iterations = 50,sentences=66):
  ranks = []
  # ranks = {}
  network = graph.graph_dictionary
  current_ranks = np.squeeze(np.zeros((1,len(cleaned_sentences))))
  prev_ranks = np.array([1/len(cleaned_sentences)]*len(cleaned_sentences))
  for iteration in range(0,iterations):
    for i in range(0,len(list(network.keys()))):
      current_score = 0
      adjacent_vertices = network[list(network.keys())[i]]
      for vertex in adjacent_vertices:
          current_score += prev_ranks[vertex]/len(network[vertex])
      current_ranks[i] = current_score
    prev_ranks = current_ranks
  
  for index in range(len(cleaned_sentences)):
      # ranks[index] = prev_ranks[index]
      if prev_ranks[index]: 
        ranks.append((index,prev_ranks[index]))
  # ranks = {index:rank for index,rank in sorted(ranks.items(),key=ranks.get,reverse=True)}[:sentences]
  ranks = sorted(ranks,key = lambda x:x[1],reverse=True)[:sentences]

  return ranks

ranks = page_rank(graph,iterations=1000)

In [45]:
summary = ""
for index,rank in ranks:
  summary+=sentences[index]+" "

In [46]:
summary

'Furthermore, diamond and colored gemstone companies have been granted a “pilot period” with additional time to reach compliance, after diamond industry companies or groups pushed for this exception[106]: Between April 2020 and April 2021, companies in the diamond and gemstone supply chain will be assessed for some, but not all of the Code’s requirements—notably, they will not be checked for steps 3 and 4 of the OECD Minerals Guidance which require companies to identify and address human rights risks and undergo a third-party audit. Therefore, Human Rights Watch was unable to determine whether the remaining companies have codes for their suppliers.Traceability: Very few companies can trace all their mined gold or diamonds back to the mines of origin, ensuring full chain of custody in order to reliably assess whether their materials are free from abuse. [226] The Code of Conduct mostly deals with business ethics and professional behavior, but also briefly touches on labor rights.Christ 

In [47]:
!pip install rouge



In [48]:
!pip install pyrouge



In [49]:
'''from pyrouge import Rouge155
r = Rouge155()
r.system_dir = '/content/drive/My Drive/nlp'
r.model_dir = '/content/drive/My Drive/Colab Notebooks'
r.system_filename_pattern = 'system_textrank.txt'
r.model_filename_pattern = 'model_textrank.txt'

output = r.convert_and_evaluate()
print(output)
output_dict = r.output_to_dict(output)

SyntaxError: ignored