In [1]:
from transformers import pipeline

import textwrap
import numpy as np
import pandas as pd

from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("../bbc_text_cls.csv")

In [3]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [4]:
labels = set(df['labels'])
labels

{'business', 'entertainment', 'politics', 'sport', 'tech'}

In [5]:
# Pick a label
label = 'business'

In [6]:
texts = df[df['labels'] == label]['text']
texts.head()

0    Ad sales boost Time Warner profit\n\nQuarterly...
1    Dollar gains on Greenspan speech\n\nThe dollar...
2    Yukos unit buyer faces loan claim\n\nThe owner...
3    High fuel prices hit BA's profits\n\nBritish A...
4    Pernod takeover talk lifts Domecq\n\nShares in...
Name: text, dtype: object

In [11]:
np.random.seed(1234)

In [12]:
i = np.random.choice(texts.shape[0])
print(i)
doc = texts.iloc[i]

303


In [14]:
print(textwrap.fill(doc, replace_whitespace=False, fix_sentence_endings=True))

Bombardier chief to leave company

Shares in train and plane-making
giant Bombardier have fallen to a 10-year low following the departure
of its chief executive and two members of the board.

Paul Tellier,
who was also Bombardier's president, left the company amid an ongoing
restructuring.  Laurent Beaudoin, part of the family that controls the
Montreal-based firm, will take on the role of CEO under a newly
created management structure.  Analysts said the resignations seem to
have stemmed from a boardroom dispute.  Under Mr Tellier's tenure at
the company, which began in January 2003, plans to cut the worldwide
workforce of 75,000 by almost a third by 2006 were announced.  The
firm's snowmobile division and defence services unit were also sold
and Bombardier started the development of a new aircraft seating 110
to 135 passengers.

Mr Tellier had indicated he wanted to stay at the
world's top train maker and third largest manufacturer of civil
aircraft until the restructuring was comple

In [15]:
mlm = pipeline("fill-mask")

No model was supplied, defaulted to distilroberta-base and revision ec58a5b (https://huggingface.co/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading (…)lve/main/config.json: 100%|██████████| 480/480 [00:00<00:00, 240kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading pytorch_model.bin: 100%|██████████| 331M/331M [00:38<00:00, 8.55MB/s] 
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 3.38MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 2.23MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 4.61MB/s]


In [16]:
mlm("Bombardier chief to leave <mask>")

[{'score': 0.06950803846120834,
  'token': 633,
  'token_str': ' job',
  'sequence': 'Bombardier chief to leave job'},
 {'score': 0.06693095713853836,
  'token': 1470,
  'token_str': ' France',
  'sequence': 'Bombardier chief to leave France'},
 {'score': 0.05273529887199402,
  'token': 558,
  'token_str': ' office',
  'sequence': 'Bombardier chief to leave office'},
 {'score': 0.025823015719652176,
  'token': 2201,
  'token_str': ' Paris',
  'sequence': 'Bombardier chief to leave Paris'},
 {'score': 0.0213686041533947,
  'token': 896,
  'token_str': ' Canada',
  'sequence': 'Bombardier chief to leave Canada'}]

In [17]:
text = 'Shares in <mask> and plane-making ' + \
  'giant Bombardier have fallen to a 10-year low following the departure ' + \
  'of its chief executive and two members of the board.'

mlm(text)

[{'score': 0.6640956401824951,
  'token': 11016,
  'token_str': ' Airbus',
  'sequence': 'Shares in Airbus and plane-making giant Bombardier have fallen to a 10-year low following the departure of its chief executive and two members of the board.'},
 {'score': 0.2614656686782837,
  'token': 6722,
  'token_str': ' Boeing',
  'sequence': 'Shares in Boeing and plane-making giant Bombardier have fallen to a 10-year low following the departure of its chief executive and two members of the board.'},
 {'score': 0.023635413497686386,
  'token': 15064,
  'token_str': ' aerospace',
  'sequence': 'Shares in aerospace and plane-making giant Bombardier have fallen to a 10-year low following the departure of its chief executive and two members of the board.'},
 {'score': 0.014581811614334583,
  'token': 8537,
  'token_str': ' airlines',
  'sequence': 'Shares in airlines and plane-making giant Bombardier have fallen to a 10-year low following the departure of its chief executive and two members of th

In [18]:
text = 'Shares in train and plane-making ' + \
  'giant Bombardier have fallen to a 10-year low following the <mask> ' + \
  'of its chief executive and two members of the board.'

pprint(mlm(text))

[{'score': 0.5513928532600403,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-year low following the resignation of its chief '
              'executive and two members of the board.',
  'token': 6985,
  'token_str': ' resignation'},
 {'score': 0.21090444922447205,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-year low following the departure of its chief executive '
              'and two members of the board.',
  'token': 5824,
  'token_str': ' departure'},
 {'score': 0.13041961193084717,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-year low following the departures of its chief '
              'executive and two members of the board.',
  'token': 25624,
  'token_str': ' departures'},
 {'score': 0.03651558980345726,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-ye

In [19]:
text = 'Shares in train and plane-making ' + \
  'giant Bombardier have fallen to a 10-year low following the departure ' + \
  'of its chief <mask> and two members of the board.'

pprint(mlm(text))

[{'score': 0.9897111654281616,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-year low following the departure of its chief executive '
              'and two members of the board.',
  'token': 1031,
  'token_str': ' executive'},
 {'score': 0.006391061004251242,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-year low following the departure of its chief '
              'executives and two members of the board.',
  'token': 4585,
  'token_str': ' executives'},
 {'score': 0.0016239311080425978,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-year low following the departure of its chief economist '
              'and two members of the board.',
  'token': 7473,
  'token_str': ' economist'},
 {'score': 0.0007142738904803991,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-ye

In [20]:
text = 'Shares in train and plane-making ' + \
  'giant Bombardier have fallen to a 10-year low following the departure ' + \
  'of its chief executive and two <mask> of the board.'

pprint(mlm(text))

[{'score': 0.9420545697212219,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-year low following the departure of its chief executive '
              'and two members of the board.',
  'token': 453,
  'token_str': ' members'},
 {'score': 0.032231464982032776,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-year low following the departure of its chief executive '
              'and two thirds of the board.',
  'token': 29193,
  'token_str': ' thirds'},
 {'score': 0.011232919991016388,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-year low following the departure of its chief executive '
              'and two directors of the board.',
  'token': 5392,
  'token_str': ' directors'},
 {'score': 0.0030280486680567265,
  'sequence': 'Shares in train and plane-making giant Bombardier have fallen '
              'to a 10-year low 

In [21]:
# Exercise: Write a function that automatically masks and replaces words
# in a whole document. You might choose which words to replace based on some
# statistic, e.g. TF-IDF.