In [1]:
import numpy as np
import pandas as pd
import textwrap
import nltk
from nltk import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [3]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dc13208\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
df = pd.read_csv('bbc_text_cls.csv')

In [5]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [7]:
labels = set(df['labels'])
labels

{'business', 'entertainment', 'politics', 'sport', 'tech'}

In [13]:
# pick a label whose data we want to train from
label = "business"

In [14]:
texts = df[df['labels'] == label]['text']
texts.head()

0    Ad sales boost Time Warner profit\n\nQuarterly...
1    Dollar gains on Greenspan speech\n\nThe dollar...
2    Yukos unit buyer faces loan claim\n\nThe owner...
3    High fuel prices hit BA's profits\n\nBritish A...
4    Pernod takeover talk lifts Domecq\n\nShares in...
Name: text, dtype: object

In [15]:
# collect counts
probs = {} # key: (w(t-1), w(t+1)), value: {w(t): count(w(t))}

for doc in texts:
    lines = doc.split("\n")
    for line in lines:
        tokens = word_tokenize(line)
        for i in range(len(tokens) - 2):
            t_0 = tokens[i]
            t_1 = tokens[i + 1]
            t_2 = tokens[i + 2]
            key = (t_0, t_2)
            if key not in probs: 
                probs[key] = {}

            # add count for middle token
            if t_1 not in probs[key]:
                probs[key][t_1] = 1
            else:
                probs[key][t_1] += 1

In [17]:
# normalize probabilities
for key, d in probs.items():
    # d should represent a distribution
    total = sum(d.values())
    for k, v in d.items():
        d[k] = v / total

In [18]:
probs

{('Ad', 'boost'): {'sales': 1.0},
 ('sales', 'Time'): {'boost': 1.0},
 ('boost', 'Warner'): {'Time': 1.0},
 ('Time', 'profit'): {'Warner': 1.0},
 ('Quarterly', 'at'): {'profits': 1.0},
 ('profits', 'US'): {'at': 1.0},
 ('at', 'media'): {'US': 1.0},
 ('US', 'giant'): {'media': 0.1,
  'telecoms': 0.1,
  'banking': 0.2,
  'foods': 0.1,
  'retail': 0.1,
  'oil': 0.2,
  'mortgage': 0.1,
  'agrochemical': 0.1},
 ('media', 'TimeWarner'): {'giant': 1.0},
 ('giant', 'jumped'): {'TimeWarner': 1.0},
 ('TimeWarner', '76'): {'jumped': 1.0},
 ('jumped', '%'): {'76': 0.14285714285714288,
  '1.8': 0.14285714285714288,
  '11': 0.14285714285714288,
  '6': 0.14285714285714288,
  '10.7': 0.14285714285714288,
  '7': 0.14285714285714288,
  '22': 0.14285714285714288},
 ('76', 'to'): {'%': 1.0},
 ('%', '$'): {'to': 0.7727272727272727, 'at': 0.22727272727272727},
 ('to', '1.13bn'): {'$': 1.0},
 ('$', '('): {'1.13bn': 0.0068027210884353696,
  '900m': 0.0068027210884353696,
  '280bn': 0.020408163265306107,
  '86

In [19]:
texts.iloc[0].split("\n")

['Ad sales boost Time Warner profit',
 '',
 'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.',
 '',
 'The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.',
 '',
 "Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers a

In [21]:
def spin_document(doc):
    # split the document into lines (paragraphs)
    lines = doc.split("\n")
    output = []
    for line in lines:
        if line:
            new_line = spin_line(line)
        else:
            new_line = line
        output.append(new_line)
    return "\n".join(output)


In [22]:
detokenizer = TreebankWordDetokenizer()

In [32]:
texts.iloc[0].split("\n")[2]

'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.'

In [35]:
detokenizer.detokenize(word_tokenize(texts.iloc[0].split("\n")[2]))

'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.'

In [36]:
def sample_word(d):
    p0 = np.random.random()
    cumulative = 0
    for t, p in d.items():
        cumulative += p
        if p0 < cumulative:
            return t
    assert(False) # should never get here

In [37]:
def spin_line(line):
    tokens = word_tokenize(line)
    i = 0
    output = [tokens[0]]
    while i < (len(tokens) - 2):
        t_0 = tokens[i]
        t_1 = tokens[i + 1]
        t_2 = tokens[i + 2]
        key = (t_0, t_2)
        p_dist = probs[key]
        if len(p_dist) > 1 and np.random.random() < 0.3:
            # lets replace the middle word
            middle = sample_word(p_dist)
            output.append(t_1)
            output.append("<" + middle + ">")
            output.append(t_2)
            # we wont replace the 3rd token since the middle token was dependent on it
            # instead, skip ahead 2 steps
            i += 2
        else:
            # we wont replace this middle word
            output.append(t_1)
            i += 1
    # append the final token - only if there was no replacement
    if i == len(tokens) - 2:
        output.append(tokens[-1])
    return detokenizer.detokenize(output)

In [41]:
np.random.seed(99)

In [42]:
i = np.random.choice(texts.shape[0])
doc = texts.iloc[i]
new_doc = spin_document(doc)

In [43]:
print(textwrap.fill(new_doc, replace_whitespace=False, fix_sentence_endings=True))

Aviation firms eye booming India

India's defence minister has opened
the country's Aero India 2005 <'s> air show with an invitation for
global aerospace firms to outsource jobs to <to> the nation.

Pranab
Mukherjee said such companies could take advantage of India's highly
skilled workers and low wages . More than 240 civil and military
aerospace firms from 31 countries are attending the show . Analysts
said India <they> could spend up to $35bn <5bn> (£18.8bn <GM>) in the
aviation <buy-to-let> market over the next 20 <five> years . Giants
<Producers> such Boeing and Airbus <Tesco> - on the civil aviation
front - as well as Lockheed Martin and France <Nintendo>'s Snecma
<workforce> - on the military side - are some <some> of the firms
attending <visited> the show <year>. "There <That> is tremendous scope
for outsourcing from India in areas where <where> the companies are
competitive," said Mr Mukerjee <Cescau>. "We <We> are keen to welcome
<an> international collaborations that are in 