In [10]:
import pandas as pd
import numpy as np
import textwrap
import nltk

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Error loading punkt: <urlopen error [WinError 10054]
[nltk_data]     既存の接続はリモート ホストに強制的に切断されました。>
[nltk_data] Error loading stopwords: <urlopen error [WinError 10054]
[nltk_data]     既存の接続はリモート ホストに強制的に切断されました。>


False

In [3]:
df = pd.read_csv("bbc_text_cls.csv")
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [4]:
doc = df[df.labels == "business"]["text"].sample(random_state=42)

In [5]:
def wrap(x):
    return textwrap.fill(x, replace_whitespace=False, fix_sentence_endings=True)

In [6]:
print(wrap(doc.iloc[0]))

Christmas sales worst since 1981

UK retail sales fell in December,
failing to meet expectations and making it by some counts the worst
Christmas since 1981.

Retail sales dropped by 1% on the month in
December, after a 0.6% rise in November, the Office for National
Statistics (ONS) said.  The ONS revised the annual 2004 rate of growth
down from the 5.9% estimated in November to 3.2%. A number of
retailers have already reported poor figures for December.  Clothing
retailers and non-specialist stores were the worst hit with only
internet retailers showing any significant growth, according to the
ONS.

The last time retailers endured a tougher Christmas was 23 years
previously, when sales plunged 1.7%.

The ONS echoed an earlier
caution from Bank of England governor Mervyn King not to read too much
into the poor December figures.  Some analysts put a positive gloss on
the figures, pointing out that the non-seasonally-adjusted figures
showed a performance comparable with 2003. The Novembe

In [7]:
sents = nltk.sent_tokenize(doc.iloc[0].split("\n", 1)[1])

In [8]:
featurizer = TfidfVectorizer(stop_words=stopwords.words("english"), norm="l1")

In [9]:
X = featurizer.fit_transform(sents)

In [11]:
S = cosine_similarity(X)

In [12]:
S.shape

(17, 17)

In [13]:
len(sents)

17

In [14]:
# normalize similarity matrix
S /= S.sum(axis=1, keepdims=True)

In [15]:
S[0].sum()

1.0

In [17]:
# uniform transition matrix
U = np.ones_like(S) / len(S)

In [18]:
U[0].sum()

1.0

In [21]:
# smoothed similarity matrix
factor = 0.15
S = (1-factor)*S + factor * U

In [23]:
S

array([[0.4929689 , 0.06826234, 0.00882353, 0.03785608, 0.03413282,
        0.05491983, 0.02798498, 0.00882353, 0.02594136, 0.02409533,
        0.00882353, 0.00882353, 0.08454704, 0.02626517, 0.0244107 ,
        0.05449782, 0.00882353],
       [0.06112506, 0.43483377, 0.0711173 , 0.03773799, 0.02973765,
        0.02987004, 0.05045042, 0.00882353, 0.05014396, 0.02403321,
        0.00882353, 0.03074897, 0.02751639, 0.02619422, 0.0457084 ,
        0.05431204, 0.00882353],
       [0.00882353, 0.09063931, 0.56833951, 0.00882353, 0.08805576,
        0.00882353, 0.04301184, 0.00882353, 0.04563373, 0.00882353,
        0.00882353, 0.00882353, 0.06726102, 0.00882353, 0.00882353,
        0.00882353, 0.00882353],
       [0.03270201, 0.03585018, 0.00882353, 0.40701987, 0.0484944 ,
        0.03240349, 0.09056751, 0.04713236, 0.03107236, 0.04852244,
        0.03426744, 0.00882353, 0.00882353, 0.06840302, 0.06268159,
        0.02558921, 0.00882353],
       [0.03375284, 0.03223484, 0.07635335, 0.056333

In [22]:
S[0].sum()

1.0

In [24]:
# find the limiting / stationary distribution
eigenvals, eigenvecs = np.linalg.eig(S.T)

In [25]:
eigenvals

array([1.        , 0.24245466, 0.72108199, 0.67644122, 0.34790129,
       0.34417302, 0.3866884 , 0.40333562, 0.41608572, 0.44238593,
       0.63909999, 0.62556792, 0.58922572, 0.57452382, 0.48511399,
       0.51329157, 0.52975372])

In [33]:
eigenvecs

array([[-0.24206557,  0.04377712, -0.09795041, -0.17120649,  0.37197052,
        -0.07807211,  0.0633532 , -0.22641429,  0.18850271,  0.66319382,
         0.13967363, -0.11225173, -0.34837879,  0.09100741,  0.14157192,
         0.235432  ,  0.08191252],
       [-0.27051337,  0.01675531, -0.02050407, -0.17661766, -0.71846797,
         0.19978134,  0.45245165, -0.14947402, -0.14647799,  0.26064626,
         0.15545193, -0.02759344,  0.08217927, -0.02424053,  0.07812715,
         0.066269  , -0.28824215],
       [-0.2213806 , -0.05262294, -0.13956121, -0.24098096,  0.33006313,
        -0.00243623,  0.05490982, -0.26101717, -0.1027169 , -0.14528343,
         0.29992951, -0.30090827,  0.50890324, -0.37948465, -0.04123118,
        -0.09913074, -0.09930405],
       [-0.28613638, -0.42902783, -0.09548058,  0.02415075,  0.12462774,
         0.71290317, -0.29554217,  0.01692517,  0.09273292,  0.09826122,
        -0.1875244 ,  0.17139712, -0.03271213,  0.07061452, -0.27278039,
        -0.12547525

In [31]:
eigenvecs[0]

array([-0.24206557,  0.04377712, -0.09795041, -0.17120649,  0.37197052,
       -0.07807211,  0.0633532 , -0.22641429,  0.18850271,  0.66319382,
        0.13967363, -0.11225173, -0.34837879,  0.09100741,  0.14157192,
        0.235432  ,  0.08191252])

In [32]:
eigenvecs[:,0]

array([-0.24206557, -0.27051337, -0.2213806 , -0.28613638, -0.25065894,
       -0.2499217 , -0.279622  , -0.21515455, -0.2226665 , -0.22745415,
       -0.2059112 , -0.20959727, -0.23526242, -0.24203809, -0.23663025,
       -0.2940483 , -0.20865607])

In [34]:
eigenvecs[:,0].dot(S)

array([-0.24206557, -0.27051337, -0.2213806 , -0.28613638, -0.25065894,
       -0.2499217 , -0.279622  , -0.21515455, -0.2226665 , -0.22745415,
       -0.2059112 , -0.20959727, -0.23526242, -0.24203809, -0.23663025,
       -0.2940483 , -0.20865607])

In [35]:
eigenvecs[:,0] / eigenvecs[:,0].sum()

array([0.05907327, 0.06601563, 0.05402535, 0.06982824, 0.06117038,
       0.06099047, 0.06823848, 0.05250595, 0.05433915, 0.05550753,
       0.05025022, 0.05114976, 0.05741304, 0.05906657, 0.05774684,
       0.07175905, 0.05092007])

In [36]:
limiting_dist = np.ones(len(S)) / len(S)
threshold = 1e-8
delta = float("inf")
iters = 0
while delta > threshold:
    iters += 1

    # Markov transition
    p = limiting_dist.dot(S)

    # compute change in limiting distribution
    delta = np.abs(p - limiting_dist).sum()

    # update limiting distribution
    limiting_dist = p

print(iters)

41


In [37]:
limiting_dist

array([0.05907327, 0.06601563, 0.05402534, 0.06982824, 0.06117038,
       0.06099047, 0.06823848, 0.05250595, 0.05433915, 0.05550753,
       0.05025022, 0.05114977, 0.05741304, 0.05906657, 0.05774685,
       0.07175905, 0.05092008])

In [39]:
limiting_dist.sum()

0.9999999999999981

In [42]:
np.abs(eigenvecs[:,0] / eigenvecs[:,0].sum() - limiting_dist).sum()

1.9964739035593926e-08

In [43]:
scores = limiting_dist

In [44]:
sort_idx = np.argsort(-scores)

In [45]:
# Options for how tochoose which sentences to include:

# 1) top N sentences
# 2) top N words or characters.
# 3) top X% sentences or top X% words
# 4) sentences with scores > average score
# 5) sentences with scores > factor * average score

print("Generated summary:")
for i in sort_idx[:5]:
    print(wrap("%.2f: %s" % (scores[i], sents[i])))

Generated summary:
0.07: "The retail sales figures are very weak, but as Bank of England
governor Mervyn King indicated last night, you don't really get an
accurate impression of Christmas trading until about Easter," said Mr
Shaw.
0.07: A number of retailers have already reported poor figures for
December.
0.07: The ONS echoed an earlier caution from Bank of England governor
Mervyn King not to read too much into the poor December figures.
0.07: Retail sales dropped by 1% on the month in December, after a
0.6% rise in November, the Office for National Statistics (ONS) said.
0.06: Clothing retailers and non-specialist stores were the worst hit
with only internet retailers showing any significant growth, according
to the ONS.
