### Library - Import & Config

In [4]:
import newspaper
import csv, io
import os
import re
import pandas as pd
import nltk
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from docx import Document
from docx.text.paragraph import Paragraph
from docx.shared import Cm, Pt

from newspaper import Article
from newspaper import Config

# Newspaper3k config: shared by every Article download in this notebook
config = Config()
config.request_timeout = 120
# Raw string so the Windows-style backslashes are never parsed as escape sequences
LINK_PATH = r'Text Summarization\data\link_swa.csv'

# Huggingface's Transformers config: tokenizer and seq2seq model are loaded from a local directory
MODEL_PATH = "./Text Summarization/model/t5-base-indonesian-summarization-cased/"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

# Path of the CSV file that holds the downloaded news data
DATA_PATH = r'Text Summarization\data\news_swa.csv'

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at ./Text Summarization/model/t5-base-indonesian-summarization-cased/ and are newly initialized: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Scraping

In [2]:
def news_download(first, last, LINK_PATH, DATA_PATH):
    """Download the articles listed in a link CSV and append them to a data CSV.

    Parameters
    ----------
    first, last : int
        Positions (inclusive) of the rows of the link file to process; the
        rows are selected with ``rows[first:last+1]``.
    LINK_PATH : str
        Path of the CSV file with the links. Column 0 of each row is used as
        the news ID and column 1 as the article URL.
    DATA_PATH : str
        Path of the CSV file that receives one appended row per article:
        ID, title, text, publish date, URL and newspaper's own summary.

    Returns
    -------
    None

    Notes
    -----
    Uses the module-level newspaper ``config``. Any error raised while
    downloading or parsing an article propagates to the caller; rows written
    before the failure are kept because the output file is reopened in append
    mode for every article.
    """
    # newline='' is the csv module's recommended way to open files; the
    # encoding matches the one used for the output file below.
    with open(LINK_PATH, 'r', newline='', encoding='utf-8') as f:
        links = list(csv.reader(f, delimiter=","))[first:last+1]

    # Download news text
    for url in links:
        a = Article(url[1], language='id', config=config)
        a.download()
        a.parse()
        # Drop everything up to and including the first newline, then one
        # more leading character.
        a.text = a.text[(a.text.find('\n')+1):]
        a.text = a.text[1:]
        a.nlp()
        # Flatten the generated summary onto a single line
        a.summary = a.summary.replace('\n\n', ' ').replace('\n', ' ')
        with open(DATA_PATH, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
            writer.writerow([url[0], a.title, a.text, a.publish_date, a.url, a.summary])
        print("News ID " + str(url[0]) + " downloaded")

### Summarization

In [3]:
# Generate chunks of paragraphs shorter than max_chars characters
# (a character budget used as a rough stand-in for the model's 512-token limit)
def nest_paragraph(document, max_chars=1024):
    """Split a document into chunks of consecutive paragraphs.

    The document is split on blank lines (``'\\n\\n'``). Paragraphs containing
    ``"www.swa.co.id"`` or ``"Editor"`` are discarded. The remaining
    paragraphs are accumulated, in order, into a chunk while the running
    character count stays below ``max_chars``; the paragraph that reaches the
    limit starts the next chunk.

    Parameters
    ----------
    document : str
        Full article text.
    max_chars : int, optional
        Character budget per chunk (default 1024, the original constant).

    Returns
    -------
    list of str
        One string per chunk, its paragraphs joined with a single space.
        A single paragraph of ``max_chars`` characters or more still forms a
        chunk of its own; it is not split further here.
    """
    paragraphs = [
        p for p in document.split('\n\n')
        if "www.swa.co.id" not in p and "Editor" not in p
    ]

    chunks = []
    current = []
    length = 0
    for p in paragraphs:
        length += len(p)
        if length < max_chars:
            current.append(p)
            continue
        # Only flush a chunk that actually holds text: when the very first
        # paragraph is already over the limit there is nothing to flush, and
        # emitting an empty chunk would produce a summary of nothing.
        if current:
            chunks.append(current)
        current = [p]
        length = len(p)

    if current:
        chunks.append(current)

    return [' '.join(chunk) for chunk in chunks]

# Generate summary on text with <= 512 tokens
def _length_stats(chars, words, tokens):
    """Format average chars/words/tokens plus max/min tokens as info text."""
    def avg(values):
        return round(sum(values) / len(values))

    return (
        str(avg(chars)) + " characters(avg), \n"
        + str(avg(words)) + " words(avg), \n"
        + str(avg(tokens)) + " tokens(avg), \n"
        + str(max(tokens)) + " tokens(max), \n"
        + str(min(tokens)) + " tokens (min)."
    )


def generate_summary(nested_paragraph):
    """Summarize every text chunk with the module-level ``tokenizer``/``model``.

    Parameters
    ----------
    nested_paragraph : list of str
        Text chunks, e.g. the output of ``nest_paragraph``.

    Returns
    -------
    list
        ``[summaries, infos]`` where ``summaries`` is a list with one summary
        string per chunk and ``infos`` is a human-readable string with the
        average character/word/token counts (and max/min token counts) of the
        chunks and of their summaries. For an empty input the result is
        ``[[], "No text to summarize."]``.
    """
    # Without chunks there is nothing to average; return early instead of
    # dividing by zero below.
    if not nested_paragraph:
        return [[], "No text to summarize."]

    summaries = []
    info = {
        'text_chars': [],
        'text_words': [],
        'text_tokens': [],
        'sum_chars': [],
        'sum_words': [],
        'sum_tokens': []
    }

    for elem in nested_paragraph:
        # T5 uses a max_length of 512 so we cut the article to 512 tokens.
        inputs = tokenizer.encode("summarize: " + elem, return_tensors="pt", max_length=512, truncation=True)
        outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)

        summaries.append(summary)
        info['text_chars'].append(len(elem))
        info['text_words'].append(len(elem.split(' ')))
        info['text_tokens'].append(len(inputs[0]))
        info['sum_chars'].append(len(summary))
        info['sum_words'].append(len(summary.split(' ')))
        info['sum_tokens'].append(len(outputs[0]))

    infos = (
        "Text : \n"
        + _length_stats(info['text_chars'], info['text_words'], info['text_tokens'])
        + " " + "\n\n"
        + "Summary : "
        + _length_stats(info['sum_chars'], info['sum_words'], info['sum_tokens'])
    )
    return [summaries, infos]

# Combine all sub-summary result
def summarization(document):
    """Summarize a whole document chunk by chunk.

    The document is split with ``nest_paragraph``, every chunk is summarized
    by ``generate_summary``, and the partial summaries are joined with a
    single space.

    Returns a two-element list: the combined summary string and the info
    string produced by ``generate_summary``.
    """
    outcome = generate_summary(nest_paragraph(document))
    combined = ' '.join(outcome[0])
    return [combined, outcome[1]]

### Summarize All News Dataset

In [4]:
# Summarize news data from id a to b => write in docx file
def _append_table_row(table, values):
    """Append one row to ``table`` and fill its cells with ``str(value)``."""
    row = table.add_row()
    for cell, value in zip(row.cells, values):
        cell.text = str(value)
    return row


def news_summarization(first, last, DATA_PATH):
    """Summarize a range of news items and write the results to a .docx table.

    Parameters
    ----------
    first, last : int
        1-based positions (inclusive) of the rows of the data CSV to process;
        the rows are selected with ``iloc[first-1:last]``.
    DATA_PATH : str
        Path of the news CSV. The columns ``Num``, ``Title`` and ``Content``
        are read.

    Returns
    -------
    None

    Notes
    -----
    The output document has the columns Num, Title, Summary, Content and
    Info. It is saved after every row, so an interrupted run keeps the rows
    finished so far. Rows whose ``Content`` is missing are not summarized and
    get ``'-'`` as summary and info.
    """
    data = pd.read_csv(DATA_PATH, encoding='utf-8')
    data = data.iloc[first-1:last]

    # docx writer config (raw string: the backslashes are path separators)
    SUMMARY_PATH = r'Text Summarization\summary\summary_swa_final_' + str(first) + '-' + str(last) + '.docx'

    word_document = Document()
    table = word_document.add_table(0, 0)  # we add rows iteratively
    table.style = 'Table Grid'
    # Column widths in cm: Num, Title, Summary, Content, Info
    for width in (1, 3, 5, 5, 1):
        table.add_column(Cm(width))
    _append_table_row(table, ['Num', 'Title', 'Summary', 'Content', 'Info'])

    for i, content_missing in enumerate(data['Content'].isnull()):
        num = data['Num'].iloc[i]
        title = data['Title'].iloc[i]
        content = data['Content'].iloc[i]

        if content_missing:
            # No text to summarize: use placeholders rather than reusing the
            # previous article's info (or failing when this is the first row).
            _append_table_row(table, [num, title, '-', content, '-'])
            status = " skipped (no content)"
        else:
            summary, infos = summarization(content)
            # Remove the backslash from any backslash-escaped percent sign
            summary = summary.replace('\\%', '%')
            _append_table_row(table, [num, title, summary, content, infos])
            status = " summarized"

        word_document.save(SUMMARY_PATH)
        print("News ID " + str(num) + status)

### Run Scraping

In [49]:
# Raw strings so the Windows-style backslashes are never parsed as escape sequences
LINK_PATH = r'Text Summarization\data\link_swa.csv'
DATA_PATH = r'Text Summarization\data\news_swa_owewoe.csv'

# Download news item number a to b
news_download(1, 5, LINK_PATH, DATA_PATH)

News ID 1 downloaded
News ID 2 downloaded
News ID 3 downloaded
News ID 4 downloaded
News ID 5 downloaded


### Run Summarization

In [5]:
# CSV file with the downloaded news to summarize
DATA_PATH = r'Text Summarization\data\news_swa.csv'

# Summarize news items 500 through 581 and write them to the .docx report
news_summarization(first=500, last=581, DATA_PATH=DATA_PATH)

News ID 500 summarized
News ID 501 summarized
News ID 502 summarized
News ID 503 summarized
News ID 504 summarized
News ID 505 summarized
News ID 506 summarized
News ID 507 summarized
News ID 508 summarized
News ID 509 summarized
News ID 510 summarized
News ID 511 summarized
News ID 512 summarized
News ID 513 summarized
News ID 514 summarized
News ID 515 summarized
News ID 516 summarized
News ID 517 summarized
News ID 518 summarized
News ID 519 summarized
News ID 520 summarized
News ID 521 summarized
News ID 522 summarized
News ID 523 summarized
News ID 524 summarized
News ID 525 summarized
News ID 526 summarized
News ID 527 summarized
News ID 528 summarized
News ID 529 summarized
News ID 530 summarized
News ID 531 summarized
News ID 532 summarized
News ID 533 summarized
News ID 534 summarized
News ID 535 summarized
News ID 536 summarized
News ID 537 summarized
News ID 538 summarized
News ID 539 summarized
News ID 540 summarized
News ID 541 summarized
News ID 542 summarized
News ID 543

### Info - Pemilihan Ringkasan Berita

In [5]:
# Count the rows whose 'Select' column is marked 'V'.
# NOTE(review): the reason for subtracting 4 is not visible in this notebook — confirm.
data = pd.read_csv(DATA_PATH, encoding='utf-8')
print("Jumlah berita terpilih = {}".format((data.Select == 'V').sum() - 4))

Jumlah berita terpilih = 225
