In [1]:
import json
import pandas as pd
from sumy.nlp.stemmers import (
  Stemmer,
)
from sumy.nlp.tokenizers import (
  Tokenizer,
)
from sumy.parsers.plaintext import (
  PlaintextParser,
)
from sumy.utils import (
  get_stop_words,
)
from utils.metrics import (
  get_rouge_scores,
)
from utils.summarizer import (
  TestSummarizer,
  LevenshteinSummarizer,
)
from utils.visualizations import (
  draw_matrix,
)

In [2]:
# Read the article spreadsheet into a DataFrame; later cells read its
# "Article" and "Summary" columns.
df = pd.read_excel("data/business_articles.xlsx")

In [3]:
# Language name handed to the sumy tokenizer, stemmer and stop-word lookup below.
LANGUAGE = "english"
# Value passed to the summarizers as their second argument.
SENTENCES_COUNT = 4

In [4]:
# Take the "Article" cell of the row labelled 0 with a single label-based
# lookup rather than chained indexing (df["Article"][0]).
text = df.loc[0, "Article"]

In [5]:
# Parse the article text with a tokenizer configured for LANGUAGE; later cells
# work on parser.document.
tokenizer = Tokenizer(LANGUAGE)
parser = PlaintextParser.from_string(text, tokenizer)

# Stemmer for the same language; both summarizers in this notebook are built with it.
stemmer = Stemmer(LANGUAGE)

In [6]:
# Create the TestSummarizer around the shared stemmer, then attach the
# stop-word list for LANGUAGE.
summarizer = TestSummarizer(stemmer)
stop_words = get_stop_words(LANGUAGE)
summarizer.stop_words = stop_words

In [7]:
# Run the TestSummarizer on the parsed document, passing SENTENCES_COUNT.
# NOTE(review): `summary` is reassigned by the Levenshtein run further down
# before any visible cell reads it, so this value is currently unused.
summary = summarizer(
    parser.document,
    SENTENCES_COUNT,
)

In [8]:
# Matrix the TestSummarizer builds for the document; it is displayed and
# plotted in the following cells.
# NOTE(review): the Levenshtein summarizer's create_matrix is called with the
# document only — confirm the extra SENTENCES_COUNT argument is intended here.
matrix = summarizer.create_matrix(
    parser.document,
    SENTENCES_COUNT,
)

In [9]:
# Show the TestSummarizer matrix as this cell's output.
matrix

array([[0.33333333, 0.33333333, 0.        , 0.        , 0.33333333,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.25      , 0.25      , 0.        , 0.        , 0.25      ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.25      ],
       [0.        , 0.        , 0.14285714, 0.        , 0.        ,
        0.        , 0.14285714, 0.14285714, 0.        , 0.        ,
        0.        , 0.14285714, 0.14285714, 0.14285714, 0.        ,
        0.        , 0.14285714],
       [0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.2       , 0.2       , 0.        , 0.      

In [10]:
# Per-sentence word sequences as returned by the summarizer for this document.
sentences_words = summarizer.get_sentences_words(
    parser.document,
)

In [11]:
# Join each sentence's words with single spaces: one string per sentence.
sentences = [" ".join(words) for words in sentences_words]

In [12]:
# Show the space-joined sentences as this cell's output.
sentences

['uk economi face risk',
 'the uk manufactur sector will continu to face serious challeng over the next two year the british chamber of commerc bcc has said',
 'the group quarter survey of compani found export had pick up in the last three month of to their best level in eight year',
 'the rise came despit exchang rate be cite as a major concern',
 'howev the bcc found the whole uk economi still face major risk and warn that growth is set to slow',
 'it recent forecast econom growth will slow from more than in to a littl below in both and',
 'manufactur domest sale growth fell back slight in the quarter the survey of firm found',
 'employ in manufactur also fell and job expect were at their lowest level for a year',
 'despit some posit news for the export sector there are worri sign for manufactur the bcc said',
 'these result reinforc our concern over the sector persist inabl to sustain recoveri',
 'the outlook for the servic sector was uncertain despit an increas in export and order 

In [13]:
# Draw the TestSummarizer matrix under the image name "test_matrix_graph".
draw_matrix(
    matrix,
    image_name="test_matrix_graph",
)

# Levenshtein Summarizer

### Data Preprocessing

In [14]:
# Replace the previous summarizer with a LevenshteinSummarizer built on the
# same stemmer, then attach the stop-word list for LANGUAGE.
summarizer = LevenshteinSummarizer(stemmer)
stop_words = get_stop_words(LANGUAGE)
summarizer.stop_words = stop_words

In [15]:
# Run the LevenshteinSummarizer on the parsed document, passing SENTENCES_COUNT.
# This is the `summary` that the ROUGE evaluation cells at the end consume.
summary = summarizer(
    parser.document,
    SENTENCES_COUNT,
)

In [16]:
# Matrix the LevenshteinSummarizer builds for the document; it overwrites the
# earlier `matrix` and is displayed and plotted in the following cells.
matrix = summarizer.create_matrix(
    parser.document,
)

In [17]:
# Show the LevenshteinSummarizer matrix as this cell's output.
matrix

array([[ 1.        , -1.0625    , -0.3       , -0.25      , -0.77083333,
        -0.20833333, -0.8125    , -1.125     , -0.625     , -0.6875    ,
        -0.17261905, -0.5       , -0.5375    , -0.6875    , -0.0625    ,
        -0.3125    , -0.35416667],
       [-1.0625    ,  1.        , -0.66356108, -0.92361111, -0.48416667,
        -0.55555556, -0.79603175, -0.40625   , -0.6372549 , -0.4709707 ,
        -0.47562358, -0.61086957, -0.76955782, -0.19047619, -0.70714286,
        -0.66873706, -0.7826087 ],
       [-0.3       , -0.66356108,  1.        , -0.89583333, -0.36416667,
        -0.42645503, -0.66      , -0.87127976, -0.43529412, -0.55769231,
        -0.49206349, -0.60138889, -0.85357143, -0.47619048, -0.50583333,
        -0.58055556, -0.625     ],
       [-0.25      , -0.92361111, -0.89583333,  1.        , -0.46805556,
        -0.72222222, -0.90972222, -0.42777778, -0.65277778, -0.6359127 ,
        -0.55416667, -0.44444444, -0.5577381 , -0.78055556, -0.58333333,
        -0.47222222

In [18]:
# Per-sentence word sequences as returned by the LevenshteinSummarizer.
sentences_words = summarizer.get_sentences_words(
    parser.document,
)

In [19]:
# Join each sentence's words with single spaces: one string per sentence.
sentences = [" ".join(words) for words in sentences_words]

In [20]:
# Show the space-joined sentences as this cell's output.
sentences

['uk economi face risk',
 'the uk manufactur sector will continu to face serious challeng over the next two year the british chamber of commerc bcc has said',
 'the group quarter survey of compani found export had pick up in the last three month of to their best level in eight year',
 'the rise came despit exchang rate be cite as a major concern',
 'howev the bcc found the whole uk economi still face major risk and warn that growth is set to slow',
 'it recent forecast econom growth will slow from more than in to a littl below in both and',
 'manufactur domest sale growth fell back slight in the quarter the survey of firm found',
 'employ in manufactur also fell and job expect were at their lowest level for a year',
 'despit some posit news for the export sector there are worri sign for manufactur the bcc said',
 'these result reinforc our concern over the sector persist inabl to sustain recoveri',
 'the outlook for the servic sector was uncertain despit an increas in export and order 

In [21]:
# Draw the LevenshteinSummarizer matrix under the image name
# "levenshtein_matrix_graph".
draw_matrix(
    matrix,
    image_name="levenshtein_matrix_graph",
)

In [22]:
# Flatten the selected sentences into one space-separated string, which the
# next cell scores with ROUGE.
# str(sentence) replaces the read of the private `_text` attribute, and
# str.join replaces the manual separator bookkeeping and repeated concatenation.
# NOTE(review): assumes `summary` holds sumy Sentence objects whose str() is
# their text — confirm for LevenshteinSummarizer.
summary_result = " ".join(str(sentence) for sentence in summary)

In [23]:
# Reference summary for the row labelled 0 (the same row the article text was
# taken from), fetched with one label-based lookup instead of chained indexing.
reference_summary = df.loc[0, "Summary"]

# Score the generated summary against the reference and pretty-print the result.
rouge_scores = get_rouge_scores(summary_result, reference_summary)
print(json.dumps(rouge_scores, indent=2))

{
  "rouge-1": {
    "r": 0.4166666666666667,
    "p": 0.7142857142857143,
    "f": 0.5263157848199447
  },
  "rouge-2": {
    "r": 0.2890625,
    "p": 0.578125,
    "f": 0.38541666222222226
  },
  "rouge-l": {
    "r": 0.40625,
    "p": 0.6964285714285714,
    "f": 0.5131578900831026
  }
}
