**Extracting Data from Website**

In [0]:
# !pip install requests --upgrade

In [0]:
import requests
import time
import re
# import string

In [0]:
URL = "https://www.businesswire.com/portal/site/home/news/"
page = requests.get(URL)

In [0]:
from bs4 import BeautifulSoup

In [0]:
soup = BeautifulSoup(page.content,'html5lib')

In [0]:
news_items = soup.find_all(itemscope="itemscope")

In [0]:
news = {'date': [], 'time' : [], 'url' : [], 'content' : []}
for i in range(len(news_items)):
    if i&1 != 1:
        news['date'].append(news_items[i].find('time')['datetime'][:10])
        news['time'].append(news_items[i].find('time')['datetime'][11:-1])
        news['url'].append(news_items[i]['itemid'])
        news['content'].append('')

In [0]:
sep = ''
for i in range(len(news['url'])):
    temp = requests.get(news['url'][i])
    s = BeautifulSoup(temp.content,'html5lib')
    news['content'][i] += sep.join(i.getText() for i in s.find_all('p'))

All data stored in news & googletrans to translate non english articles

In [0]:
# !pip install googletrans
# from googletrans import Translator
# translator = Translator()

**Preprocessing and Cleaning**

In [0]:
# !pip install nltk --upgrade

In [0]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

In [0]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

sep = ' '
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

sentences = []

for i in range(len(news['content'])):
    sentences.append(sent_tokenize(news['content'][i]))

    # grading sentences
    temp = sentences[i][:]
    for j in range(len(temp)):
        temp[j] = temp[j].lower()
        temp[j] = re.sub(r'\d+', '', temp[j])           # Removing Numbers
        # temp[j] = temp[j].translate(string.maketrans("",""), string.punctuation)      # Removing Punctuations
        temp[j] = re.sub(r'[^\w\s]', '', temp[j])       # Removing Punctuations using Regex
        temp[j] = temp[j].strip()

        tokens = word_tokenize(temp[j])
        # pre_tagg = [stemmer.stem(r) for r in tokens if not r in stop_words]        # Stemming and removing stop words
        # OR
        pre_tagg = [lemmatizer.lemmatize(r) for r in tokens if not r in stop_words]        # Lemmatizing and removing stop words
        # pre_tagg = lemmatizer.lemmatize(tokens)
        tagged = nltk.pos_tag(pre_tagg)     # PoS Tagging

        temp[j] = sep.join([r[0] for r in tagged if r[1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'NN', 'NNS', 'NNP', 'NNPS']])   # Selecting only Verbs and Nouns
        
    sentences[i] = temp

In [0]:
# sep = ' '
# print(sep.join(nltk.RegexpTokenizer(r'\w+').tokenize(sentences[0])))

**TD-IDF Approach for Scoring**

In [14]:
# total_documents = len(sentences)
# total_documents

25

In [0]:
def freq_matrix(sentences):
    matrix = {}

    for s in sentences:
        freq_table = {}
        words = word_tokenize(s)

        for w in words:
            if w in freq_table:
                freq_table[w] += 1
            else:
                freq_table[w] = 1
        
        matrix[s] = freq_table
    
    return matrix

In [21]:
print(freq_matrix(sentences[0]))

{'la primera compañía del sector bananero conseguir dole committed reaching water practice dolemanaged farm packing facility': {'la': 1, 'primera': 1, 'compañía': 1, 'del': 1, 'sector': 1, 'bananero': 1, 'conseguir': 1, 'dole': 1, 'committed': 1, 'reaching': 1, 'water': 1, 'practice': 1, 'dolemanaged': 1, 'farm': 1, 'packing': 1, 'facility': 1}, 'photo business wirecharlotte carolina del wiredole food company anunció hoy que fincas bananeras colombia do ecuador han logrado la certificación del estándar para la para alliance water stewardship fin nivel más alto proveedores dole': {'photo': 1, 'business': 1, 'wirecharlotte': 1, 'carolina': 1, 'del': 2, 'wiredole': 1, 'food': 1, 'company': 1, 'anunció': 1, 'hoy': 1, 'que': 1, 'fincas': 1, 'bananeras': 1, 'colombia': 1, 'do': 1, 'ecuador': 1, 'han': 1, 'logrado': 1, 'la': 2, 'certificación': 1, 'estándar': 1, 'para': 2, 'alliance': 1, 'water': 1, 'stewardship': 1, 'fin': 1, 'nivel': 1, 'más': 1, 'alto': 1, 'proveedores': 1, 'dole': 1}, 'el

In [0]:
def tf_matrix(freq_matrix):
    matrix = {}

    for sent, f_table in freq_matrix.items():
        table = {}

        count = len(f_table)
        for word, coun in f_table.items():
            table[word] = coun/count

        matrix[sent] = table
    
    return matrix

In [23]:
print(tf_matrix(freq_matrix(sentences[0])))

{'la primera compañía del sector bananero conseguir dole committed reaching water practice dolemanaged farm packing facility': {'la': 0.0625, 'primera': 0.0625, 'compañía': 0.0625, 'del': 0.0625, 'sector': 0.0625, 'bananero': 0.0625, 'conseguir': 0.0625, 'dole': 0.0625, 'committed': 0.0625, 'reaching': 0.0625, 'water': 0.0625, 'practice': 0.0625, 'dolemanaged': 0.0625, 'farm': 0.0625, 'packing': 0.0625, 'facility': 0.0625}, 'photo business wirecharlotte carolina del wiredole food company anunció hoy que fincas bananeras colombia do ecuador han logrado la certificación del estándar para la para alliance water stewardship fin nivel más alto proveedores dole': {'photo': 0.03225806451612903, 'business': 0.03225806451612903, 'wirecharlotte': 0.03225806451612903, 'carolina': 0.03225806451612903, 'del': 0.06451612903225806, 'wiredole': 0.03225806451612903, 'food': 0.03225806451612903, 'company': 0.03225806451612903, 'anunció': 0.03225806451612903, 'hoy': 0.03225806451612903, 'que': 0.03225806

In [0]:
def docs_per_word(freq_matrix):
    table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in table:
                table[word] += 1
            else:
                table[word] = 1
    return table

In [25]:
print(docs_per_word(freq_matrix(sentences[0])))

{'la': 9, 'primera': 1, 'compañía': 1, 'del': 8, 'sector': 1, 'bananero': 1, 'conseguir': 1, 'dole': 5, 'committed': 1, 'reaching': 1, 'water': 2, 'practice': 1, 'dolemanaged': 1, 'farm': 1, 'packing': 1, 'facility': 1, 'photo': 1, 'business': 1, 'wirecharlotte': 1, 'carolina': 1, 'wiredole': 1, 'food': 3, 'company': 3, 'anunció': 1, 'hoy': 1, 'que': 7, 'fincas': 2, 'bananeras': 2, 'colombia': 1, 'do': 1, 'ecuador': 1, 'han': 1, 'logrado': 1, 'certificación': 1, 'estándar': 3, 'para': 6, 'alliance': 1, 'stewardship': 1, 'fin': 1, 'nivel': 1, 'más': 7, 'alto': 1, 'proveedores': 1, 'el': 4, 'tiene': 1, 'como': 4, 'beneficios': 1, 'sociales': 2, 'ambientales': 1, 'económicos': 1, 'paisaje': 1, 'abarca': 1, 'todos': 3, 'captación': 1, 'lo': 1, 'le': 1, 'comprender': 1, 'mejor': 1, 'cómo': 1, 'agua': 4, 'impacta': 1, 'trabajar': 1, 'colaborativa': 1, 'transparente': 1, 'gestión': 1, 'identificó': 1, 'conservación': 1, 'práctica': 1, 'relevante': 1, 'e': 6, 'impactante': 1, 'toda': 1, 'sus':

In [0]:
import math
def idf_matrix(freq_matrix, docs_per_word, total_documents):

    matrix = {}

    for sent, f_table in freq_matrix.items():
        table = {}

        for word in f_table.keys():
            table[word] = math.log10(total_documents / float(docs_per_word[word]))

        matrix[sent] = table

    return matrix

In [33]:
print(idf_matrix(freq_matrix(sentences[0]), docs_per_word(freq_matrix(sentences[0])), len(sentences[0])))

{'la primera compañía del sector bananero conseguir dole committed reaching water practice dolemanaged farm packing facility': {'la': 0.3467874862246563, 'primera': 1.3010299956639813, 'compañía': 1.3010299956639813, 'del': 0.3979400086720376, 'sector': 1.3010299956639813, 'bananero': 1.3010299956639813, 'conseguir': 1.3010299956639813, 'dole': 0.6020599913279624, 'committed': 1.3010299956639813, 'reaching': 1.3010299956639813, 'water': 1.0, 'practice': 1.3010299956639813, 'dolemanaged': 1.3010299956639813, 'farm': 1.3010299956639813, 'packing': 1.3010299956639813, 'facility': 1.3010299956639813}, 'photo business wirecharlotte carolina del wiredole food company anunció hoy que fincas bananeras colombia do ecuador han logrado la certificación del estándar para la para alliance water stewardship fin nivel más alto proveedores dole': {'photo': 1.3010299956639813, 'business': 1.3010299956639813, 'wirecharlotte': 1.3010299956639813, 'carolina': 1.3010299956639813, 'del': 0.3979400086720376,

In [0]:
def tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(), f_table2.items()):
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [35]:
print(tf_idf_matrix(tf_matrix(freq_matrix(sentences[0])), idf_matrix(freq_matrix(sentences[0]), docs_per_word(freq_matrix(sentences[0])), len(sentences[0]))))

{'la primera compañía del sector bananero conseguir dole committed reaching water practice dolemanaged farm packing facility': {'la': 0.02167421788904102, 'primera': 0.08131437472899883, 'compañía': 0.08131437472899883, 'del': 0.02487125054200235, 'sector': 0.08131437472899883, 'bananero': 0.08131437472899883, 'conseguir': 0.08131437472899883, 'dole': 0.03762874945799765, 'committed': 0.08131437472899883, 'reaching': 0.08131437472899883, 'water': 0.0625, 'practice': 0.08131437472899883, 'dolemanaged': 0.08131437472899883, 'farm': 0.08131437472899883, 'packing': 0.08131437472899883, 'facility': 0.08131437472899883}, 'photo business wirecharlotte carolina del wiredole food company anunció hoy que fincas bananeras colombia do ecuador han logrado la certificación del estándar para la para alliance water stewardship fin nivel más alto proveedores dole': {'photo': 0.041968709537547784, 'business': 0.041968709537547784, 'wirecharlotte': 0.041968709537547784, 'carolina': 0.041968709537547784, 

In [0]:
def score_sentences(tf_idf_matrix) -> dict:
    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

In [0]:
sentence_scores = score_sentences(tf_idf_matrix(tf_matrix(freq_matrix(sentences[0])), idf_matrix(freq_matrix(sentences[0]), docs_per_word(freq_matrix(sentences[0])), len(sentences[0]))))

In [0]:
def average_score(sentenceValue) -> int:
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    average = (sumValues / len(sentenceValue))

    return average

In [42]:
print(average_score(sentence_scores))

0.131844550056537


In [0]:
def generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary