### Text Summarization for ACER Technical Assignment 
#### Shiva Safaei
The objective is to take a link to an online document (.txt or .pdf format) and summarize its content.

The script is ready to use. The only part that we need to change is [here](#Data-Prep), where we can define the inputs: the link and the summary length.

In [344]:
import re
import nltk
import heapq
import requests 
import urllib
import numpy as np
import networkx as nx
import bs4 as bs
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO


A helper class for converting a PDF file to plain text:

In [345]:
class PdfConverter:
    """Extract the text content of a PDF file using pdfminer."""

    def __init__(self, file_path):
        """Remember the path of the PDF file to convert.

        Args:
            file_path: Path of the PDF file on disk.
        """
        self.file_path = file_path

    def convert_pdf_to_txt(self):
        """Return the text of every page of the PDF as a single string.

        Returns:
            The extracted text, in page order.

        Raises:
            OSError: If the file at ``file_path`` cannot be opened.
        """
        # The context managers close the file and the buffer even when
        # pdfminer raises part-way through a malformed document.
        with open(self.file_path, 'rb') as fp, StringIO() as retstr:
            rsrcmgr = PDFResourceManager()
            device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # maxpages=0 together with an empty pagenos set means
                # "process every page".
                for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0,
                                              password='', caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
            finally:
                device.close()
            # Read the buffer before the ``with`` block closes it.
            return retstr.getvalue()


Let's assume `file_url` is the link whose content we want to summarize. Here I chose 10 sentences (`top_sentences = 10`) as the length of the summary – this value is entirely adjustable.
The cells below show how the text is extracted depending on the format of the linked document.

##### Put the link and summary length below:  <a class="anchor" id="Data-Prep"></a>

In [346]:
# Link to the document to summarize; the fetch cell below downloads it.
file_url = 'https://docs.financialaid.uic.edu/docs/PDF_upload_guide.pdf'
# Number of sentences to keep in each generated summary.
top_sentences = 10 

In [347]:
from urllib.parse import urlparse

# Fetch the document once. The timeout stops a dead server from hanging the
# notebook, and raise_for_status() stops an HTTP error page from being saved
# and summarized as if it were the document.
with requests.get(file_url, stream=True, timeout=30) as r:
    r.raise_for_status()
    # Decide PDF vs. HTML/text from the served content type or the URL path
    # extension, not from the substring 'pdf' appearing anywhere in the URL.
    content_type = r.headers.get('Content-Type', '').lower()
    is_pdf = ('application/pdf' in content_type
              or urlparse(file_url).path.lower().endswith('.pdf'))
    if is_pdf:
        with open('dl_file.pdf', 'wb') as pdf:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    pdf.write(chunk)
    else:
        txt = r.content

if is_pdf:
    pdfConverter = PdfConverter(file_path='dl_file.pdf')
    article = pdfConverter.convert_pdf_to_txt()
else:
    parsed_txt = bs.BeautifulSoup(txt, 'lxml')
    paragraphs = parsed_txt.find_all('p')
    # Join with a space so the last sentence of one paragraph is not glued
    # to the first word of the next, which would break sentence splitting.
    article = ' '.join(p.text for p in paragraphs)

In [348]:
# Normalize the extracted text: turn newlines into spaces, drop bracketed
# numeric citation markers such as "[12]", then collapse every run of
# whitespace into a single space so the article is one line.
article = re.sub(r'\n', ' ', article)
article = re.sub(r'\[[0-9]*\]', ' ', article)
article = re.sub(r'\s+', ' ', article)

# Save the cleaned, single-line article for the second summarizer.
with open('article.txt', 'w') as text_file:
    text_file.write(article)

There are many methods for creating a summary of a text. Below is a simple way to extract the most important parts of a text.
This method is based on word frequency: words that appear more often carry more weight, and the sentences containing the highest-weighted words are considered the most important.
    

In [349]:
from collections import Counter

# Letters-only, lower-cased copy of the article used for counting words.
# Lower-casing matters twice: the stop-word list is lower-case, and the
# scoring loop below looks words up in lower case, so without it capitalised
# words would bypass the stop-word filter and never contribute to a score.
formatted_article = re.sub('[^a-zA-Z]', ' ', article).lower()
sentence_list = nltk.sent_tokenize(article)
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)  # O(1) membership tests

word_frequencies = Counter(
    word for word in nltk.word_tokenize(formatted_article)
    if word not in stop_word_set
)

# Scale counts to (0, 1] relative to the most frequent word. ``default``
# keeps an article with no countable words from crashing max().
maximum_frequncy = max(word_frequencies.values(), default=1)
word_frequencies = {
    word: count / maximum_frequncy
    for word, count in word_frequencies.items()
}

# A sentence's score is the sum of the weights of its known words.
# Sentences of 30 or more space-separated tokens are skipped, and a sentence
# with no known word gets no entry at all.
sentence_scores = {}
for sent in sentence_list:
    if len(sent.split(' ')) >= 30:
        continue
    matched = [
        word_frequencies[word]
        for word in nltk.word_tokenize(sent.lower())
        if word in word_frequencies
    ]
    if matched:
        sentence_scores[sent] = sentence_scores.get(sent, 0) + sum(matched)

# Keep the highest-scoring sentences, best first.
summary_sentences = heapq.nlargest(top_sentences, sentence_scores, key=sentence_scores.get)

summary = ' '.join(summary_sentences)
with open('summary_1.txt', 'w') as text_file:
    text_file.write(summary)

The result is saved in `summary_1.txt`.

The second method summarizes the text by comparing sentence vectors: sentences are ranked by how similar they are to the other sentences in the article.

In [350]:
def sentence_smlrty(sent1, sent2, stopwords=None):
    """Return the cosine similarity of two tokenised sentences.

    Each sentence is turned into a word-count vector over the combined
    lower-cased vocabulary of both sentences, leaving out stop words.

    Args:
        sent1: First sentence as a sequence of word strings.
        sent2: Second sentence as a sequence of word strings.
        stopwords: Optional iterable of lower-case words to ignore.

    Returns:
        The cosine similarity of the two count vectors, or 0.0 when either
        sentence has no countable word (empty, or only stop words).
    """
    ignored = set(stopwords) if stopwords is not None else set()

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    # Assign every distinct word a vector slot once; the dict lookup avoids
    # a linear list.index() scan for each word.
    slot_of = {word: slot for slot, word in enumerate(set(sent1 + sent2))}
    vector1 = [0] * len(slot_of)
    vector2 = [0] * len(slot_of)

    for w in sent1:
        if w not in ignored:
            vector1[slot_of[w]] += 1
    for w in sent2:
        if w not in ignored:
            vector2[slot_of[w]] += 1

    # An all-zero vector has no direction: the cosine would be 0/0 (NaN plus
    # a RuntimeWarning). Such sentences share nothing, so report 0.
    if not any(vector1) or not any(vector2):
        return 0.0
    return 1 - cosine_distance(vector1, vector2)


def build_smlrty_mtrx(sentences, stop_words):
    """Build the pairwise sentence-similarity matrix.

    Args:
        sentences: List of tokenised sentences (each a list of words).
        stop_words: Words passed through to ``sentence_smlrty`` to ignore.

    Returns:
        A square NumPy array whose entry ``[i][j]`` is the similarity of
        sentence ``i`` to sentence ``j``. The diagonal stays 0 and NaN
        similarities are stored as 0.
    """
    total = len(sentences)
    similarity_matrix = np.zeros((total, total))
    for row, first in enumerate(sentences):
        for col, second in enumerate(sentences):
            # A sentence is never compared with itself.
            if row != col:
                similarity_matrix[row][col] = sentence_smlrty(first, second, stop_words)
                # Replace an undefined similarity with "no similarity".
                if np.isnan(similarity_matrix[row][col]):
                    similarity_matrix[row][col] = 0
    return similarity_matrix
 

In [352]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
summarize_text = []

# Read the single-line cleaned article. readline() returns '' for an empty
# file instead of raising, and the ``with`` block closes the handle.
with open('article.txt', "r") as file:
    article = file.readline().split(". ")

# Drop blank fragments (e.g. the empty tail left when the text ends with
# ". ") instead of unconditionally discarding the last sentence.
fragments = [fragment for fragment in article if fragment.strip()]

# ``sentences`` keeps the original tokens for the output summary.
# ``cleaned_sentences`` holds letters-only tokens for the similarity
# computation; re.sub is required here because str.replace treats the
# pattern as literal text and would change nothing.
sentences = [fragment.split(" ") for fragment in fragments]
cleaned_sentences = [re.sub(r"[^a-zA-Z]", " ", fragment).split() for fragment in fragments]

# Rank sentences by running PageRank over the sentence-similarity graph.
sentence_smlrty_mtrx = build_smlrty_mtrx(cleaned_sentences, stop_words)
sentence_smlrty_grph = nx.from_numpy_array(sentence_smlrty_mtrx)
scores = nx.pagerank(sentence_smlrty_grph, max_iter = 150)
ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

# Slicing copes with articles shorter than ``top_sentences``.
for _, tokens in ranked_sentence[:top_sentences]:
    summarize_text.append(" ".join(tokens))

# Splitting on ". " removed the sentence terminators; restore them so the
# summary is not written as one run-together string.
summary_text = ". ".join(summarize_text)
if summary_text and not summary_text.endswith("."):
    summary_text += "."

with open('summary_2.txt', 'w') as text_file:
    text_file.write(summary_text)

This result is saved in `summary_2.txt`.