In [96]:
from sklearn.datasets import load_files
import pandas as pd
import spacy
import re
import numpy as np
from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)


# spaCy pipeline with the parser and NER components disabled; the notebook
# only reads token.pos_ and token.lemma_ from the resulting docs.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Seed handed to load_files below.
# NOTE(review): the LdaModel cell passes its own hard-coded random_state=100
# rather than this value -- confirm that is intended.
random_state = 0

# Root folder of the corpus; load_files returns the documents in data['data']
# and their integer category ids in data['target'].
DATA_DIR = "./bbc/"
data = load_files(DATA_DIR, encoding="utf-8", decode_error="replace", random_state=random_state)
# One row per document: raw text plus its integer label.
# NOTE: the name `data` is rebound to a plain list of strings in a later cell.
df = pd.DataFrame(list(zip(data['data'], data['target'])), columns=['text', 'label'])

In [97]:
# NLTK Stop words
from nltk.corpus import stopwords

import nltk; nltk.download('stopwords')

# Start from NLTK's English stop-word list, then add a few extra tokens that
# should also be dropped during preprocessing.
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu'])



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thewa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [98]:
# Convert to list
data = df.text.values.tolist()
# Collapse every run of whitespace (newlines included) into a single space.
# The pattern is a raw string: '\s' inside a plain string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("'", "", sent) for sent in data]

def sent_to_words(sentences):
    """Lazily tokenise each document into a list of lowercase word tokens.

    Every item of ``sentences`` is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (accent marks are
    stripped from the tokens). One token list is yielded per input item.
    """
    yield from (
        gensim.utils.simple_preprocess(str(document), deacc=True)
        for document in sentences
    )

# Materialise the generator so the token lists can be iterated more than once
# (they are used for phrase detection and again for stop-word removal).
data_words = list(sent_to_words(data))

# Peek at the tokens of the first document.
print(data_words[:1])


[['tate', 'lyle', 'boss', 'bags', 'top', 'award', 'tate', 'lyles', 'chief', 'executive', 'has', 'been', 'named', 'european', 'businessman', 'of', 'the', 'year', 'by', 'leading', 'business', 'magazine', 'iain', 'ferguson', 'was', 'awarded', 'the', 'title', 'by', 'us', 'publication', 'forbes', 'for', 'returning', 'one', 'of', 'the', 'uks', 'venerable', 'manufacturers', 'to', 'the', 'countrys', 'top', 'companies', 'the', 'sugar', 'group', 'had', 'been', 'absent', 'from', 'the', 'ftse', 'for', 'seven', 'years', 'until', 'mr', 'ferguson', 'helped', 'it', 'return', 'to', 'growth', 'tates', 'shares', 'have', 'leapt', 'this', 'year', 'boosted', 'by', 'firming', 'sugar', 'prices', 'and', 'sales', 'of', 'its', 'artificial', 'sweeteners', 'after', 'years', 'of', 'sagging', 'stock', 'price', 'and', 'seven', 'year', 'hiatus', 'from', 'the', 'ftse', 'one', 'of', 'britains', 'venerable', 'manufacturers', 'has', 'returned', 'to', 'the', 'vaunted', 'index', 'forbes', 'said', 'mr', 'ferguson', 'took', '

In [99]:
# Build the bigram model
# Learns which adjacent token pairs should be merged into a single token.
# NOTE: it is trained on data_words (stop words still present) but later
# applied to the stop-word-filtered tokens.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
#trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a bigram
# bigram_mod[tokens] returns the token list with detected pairs joined by '_'
# (e.g. 'chief_executive' in the printed example).
bigram_mod = gensim.models.phrases.Phraser(bigram)
#trigram_mod = gensim.models.phrases.Phraser(trigram)

In [100]:
# See trigram example
# print(trigram_mod[bigram_mod[data_words[0]]])

In [101]:
# Show the first document after bigram merging.
print(bigram_mod[data_words[0]])

['tate', 'lyle', 'boss', 'bags', 'top', 'award', 'tate', 'lyles', 'chief_executive', 'has', 'been', 'named', 'european', 'businessman', 'of', 'the', 'year', 'by', 'leading', 'business', 'magazine', 'iain', 'ferguson', 'was', 'awarded', 'the', 'title', 'by', 'us', 'publication', 'forbes', 'for', 'returning', 'one', 'of', 'the', 'uks', 'venerable', 'manufacturers', 'to', 'the', 'countrys', 'top', 'companies', 'the', 'sugar', 'group', 'had', 'been', 'absent', 'from', 'the', 'ftse', 'for', 'seven', 'years', 'until', 'mr', 'ferguson', 'helped', 'it', 'return', 'to', 'growth', 'tates', 'shares', 'have', 'leapt', 'this', 'year', 'boosted', 'by', 'firming', 'sugar', 'prices', 'and', 'sales', 'of', 'its', 'artificial', 'sweeteners', 'after', 'years', 'of', 'sagging', 'stock', 'price', 'and', 'seven', 'year', 'hiatus', 'from', 'the', 'ftse', 'one', 'of', 'britains', 'venerable', 'manufacturers', 'has', 'returned', 'to', 'the', 'vaunted', 'index', 'forbes', 'said', 'mr', 'ferguson', 'took', 'the'

In [102]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
# from nltk.stem.porter import *
# p_stemmer = PorterStemmer()

def remove_stopwords(texts):
    """Drop stop words from every tokenised document.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.

    Returns:
        A list with one token list per document, containing only the tokens
        that are not in the module-level ``stop_words`` collection. Token
        order is preserved.
    """
    # Build the lookup set once per call so each membership test is O(1)
    # instead of a linear scan over the stop-word list for every token.
    stop_set = set(stop_words)
    return [[word for word in doc if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phrase model to each document.

    Returns a list holding, for every document in ``texts``, whatever
    ``bigram_mod[document]`` evaluates to.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

# def make_trigrams(texts):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces, run through the
    module-level spaCy pipeline ``nlp``, and reduced to the lemmas of the
    tokens whose coarse POS tag (``token.pos_``) is in ``allowed_postags``.
    Tag names: https://spacy.io/api/annotation

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Collection of POS tags to keep. The default is an
            immutable tuple so the shared default can never be mutated.

    Returns:
        A list with one list of lemma strings per input document.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [103]:
# Preprocessing pipeline: each stage produces a new list with one token list
# per document, in the same document order as data_words.

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
#data_words_bigrams = make_trigrams(data_words_nostops)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [104]:
# Show the lemmas kept for the first document.
print(data_lemmatized[:1])

[['top', 'award', 'name', 'award', 'title', 'publication', 'forbe', 'return', 'uks', 'venerable', 'manufacturer', 'countrys', 'top', 'company', 'sugar', 'year', 'help', 'return', 'growth', 'tate', 'share', 'year', 'boost', 'firm', 'sugar', 'price', 'sale', 'artificial', 'sweetener', 'year', 'sag', 'stock', 'price', 'year', 'britain', 'venerable', 'manufacturer', 'return', 'vaunted', 'index', 'forbe', 'say', 'take', 'company', 'spend', 'career', 'consumer', 'good', 'original', 'member', 'index', 'operate', 'factory', 'additional', 'production', 'facility', 'country', 'previous', 'winner', 'include', 'fre', 'former']]


In [105]:
# Create Dictionary
# Maps every distinct lemma in the corpus to an integer id.
id2word = corpora.Dictionary(data_lemmatized)

# Alias only; no copy is made.
texts = data_lemmatized

# Term Document Frequency
# Each document becomes a bag-of-words list of (token_id, count) pairs.
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 2), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1), (30, 3), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 2), (38, 1), (39, 1), (40, 1), (41, 1), (42, 2), (43, 1), (44, 1), (45, 2), (46, 1), (47, 4)]]


In [106]:
# Human-readable view of the first document: (word, count) instead of (id, count).
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('additional', 1),
  ('artificial', 1),
  ('award', 2),
  ('boost', 1),
  ('britain', 1),
  ('career', 1),
  ('company', 2),
  ('consumer', 1),
  ('country', 1),
  ('countrys', 1),
  ('facility', 1),
  ('factory', 1),
  ('firm', 1),
  ('forbe', 2),
  ('former', 1),
  ('fre', 1),
  ('good', 1),
  ('growth', 1),
  ('help', 1),
  ('include', 1),
  ('index', 2),
  ('manufacturer', 2),
  ('member', 1),
  ('name', 1),
  ('operate', 1),
  ('original', 1),
  ('previous', 1),
  ('price', 2),
  ('production', 1),
  ('publication', 1),
  ('return', 3),
  ('sag', 1),
  ('sale', 1),
  ('say', 1),
  ('share', 1),
  ('spend', 1),
  ('stock', 1),
  ('sugar', 2),
  ('sweetener', 1),
  ('take', 1),
  ('tate', 1),
  ('title', 1),
  ('top', 2),
  ('uks', 1),
  ('vaunted', 1),
  ('venerable', 2),
  ('winner', 1),
  ('year', 4)]]

In [107]:
# Build LDA model
# Trains on the bag-of-words corpus using id2word as the vocabulary.
# NOTE(review): random_state is hard-coded to 100 here instead of reusing the
# notebook-level `random_state` (0) -- confirm which seed is intended.
# NOTE(review): num_topics is 10, but the topic-label dictionaries used later
# in the notebook only name topics 0-4; looking up any topic id >= 5 there
# raises KeyError. Confirm this matches the model that is actually used.
# per_word_topics=True is what later cells rely on when they take element [0]
# of each per-document result.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=50,
                                           passes=50,
                                           alpha='auto',
                                           per_word_topics=True)

In [110]:
# Replace `lda_model` with a model previously saved to 'lda.model'.
# NOTE(review): this discards whatever model the training cell produced and
# fails if 'lda.model' is not present in the working directory; the matching
# lda_model.save('lda.model') call is commented out elsewhere in this
# notebook. The loaded model must also agree with the id2word dictionary
# rebuilt in this session -- confirm.
lda_model = gensim.models.ldamodel.LdaModel.load('lda.model')

In [108]:
# Show each topic as a weighted list of its top words ("weight*word" terms).
pprint(lda_model.print_topics())
#doc_lda = lda_model[corpus]

[(0,
  '0.031*"year" + 0.022*"say" + 0.015*"rise" + 0.014*"sale" + 0.012*"market" + '
  '0.012*"month" + 0.011*"fall" + 0.011*"last" + 0.011*"economy" + '
  '0.011*"figure"'),
 (1,
  '0.014*"go" + 0.013*"play" + 0.012*"game" + 0.012*"time" + 0.011*"win" + '
  '0.011*"take" + 0.010*"first" + 0.010*"say" + 0.010*"good" + 0.009*"come"'),
 (2,
  '0.041*"say" + 0.025*"people" + 0.016*"make" + 0.014*"work" + 0.012*"also" + '
  '0.011*"many" + 0.011*"new" + 0.009*"help" + 0.009*"would" + 0.009*"call"'),
 (3,
  '0.031*"film" + 0.019*"good" + 0.017*"year" + 0.016*"show" + 0.016*"include" '
  '+ 0.015*"music" + 0.013*"also" + 0.010*"top" + 0.010*"number" + '
  '0.010*"match"'),
 (4,
  '0.049*"firm" + 0.035*"company" + 0.021*"deal" + 0.017*"share" + '
  '0.016*"foreign" + 0.016*"financial" + 0.016*"account" + 0.015*"aid" + '
  '0.015*"bank" + 0.014*"investor"'),
 (5,
  '0.023*"company" + 0.018*"case" + 0.017*"file" + 0.016*"court" + '
  '0.015*"charge" + 0.013*"firm" + 0.012*"fraud" + 0.011*"lega

0:business
1:entertainment
2:politics
3:sport
4:tech

In [90]:
# Dataset label id -> LDA topic id carrying the same category name.
# Labels 1 (entertainment) and 3 (sport) are swapped relative to the topic
# numbering used by the topic-name dictionaries; the others map to themselves.
label_dic = {0: 0, 1: 3, 2: 2, 3: 1, 4: 4}

In [404]:
# x = lda_model.get_document_topics(corpus)
# x = np.array(x)
# x.shape

  x = np.array(x)


(2225,)

In [80]:
# Inspect the topic mixture of the first document.
# NOTE(review): the trailing [0] assumes each per-document result is a tuple
# whose first item is the list of (topic_id, probability) pairs -- presumably
# because the model was built with per_word_topics=True; confirm for the
# model that is loaded.
list(lda_model[corpus[:1]][0][0])

[(0, 0.62965584), (1, 0.023978624), (2, 0.0544572), (3, 0.2884717)]

In [91]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    """Build a per-document table describing each document's dominant topic.

    Args:
        ldamodel: Trained gensim LDA model. Indexing it with ``corpus`` must
            yield, per document, a sequence whose first element is the list
            of ``(topic_id, probability)`` pairs (presumably a model created
            with ``per_word_topics=True`` -- confirm for other models).
        corpus: Bag-of-words documents to score.
        texts: Original document strings, in the same order as ``corpus``.

    Returns:
        pandas.DataFrame with the columns ``Dominant_Topic``,
        ``Dominant_Topic_Label``, ``Perc_Contribution`` and
        ``Topic_Keywords``, followed by an unnamed column (``0``) holding
        ``texts``. ``Dominant_Topic`` holds integer topic ids. A document
        for which the model reports no topics gets a row of missing values
        so that the following rows stay aligned with their texts.

    Raises:
        KeyError: If a dominant topic id has no entry in the label mapping
            below (only topics 0-4 are named).
    """
    topic_dic = {0:'business',
                1:'sport',
                2:'politics',
                3:'entertainment',
                4:'tech'}
    columns = ['Dominant_Topic', 'Dominant_Topic_Label', 'Perc_Contribution', 'Topic_Keywords']

    # Keyword strings depend only on the topic, so compute each one once.
    keywords_by_topic = {}
    # Rows are collected in a list and turned into a DataFrame in one go:
    # growing a DataFrame row by row is quadratic, and DataFrame.append was
    # removed in pandas 2.0.
    records = []
    for row in ldamodel[corpus]:
        doc_topics = row[0]
        if len(doc_topics) == 0:
            records.append((None, None, None, None))
            continue
        # Dominant topic = highest probability; on ties max() keeps the first
        # pair, matching a stable descending sort.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((
            topic_num,
            topic_dic[topic_num],
            round(float(prop_topic), 4),
            keywords_by_topic[topic_num],
        ))
    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


# Score every document; `data` here is the cleaned list of document strings.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)

# Format
# reset_index() turns the row index into a leading column, which is named
# 'Document_No' below; the six names cover that column, the four topic
# columns and the trailing text column.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic','Dominant_Topic_Label', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Dominant_Topic_Label,Topic_Perc_Contrib,Keywords,Text
0,0,0.0,business,0.6297,"say, year, company, firm, rise, sale, last, ma...",Tate & Lyle boss bags top award Tate & Lyles c...
1,1,1.0,sport,0.4212,"say, game, go, play, time, take, win, first, m...",Halo 2 sells five million copies Microsoft is ...
2,2,2.0,politics,0.4124,"say, would, government, people, could, plan, m...",MSPs hear renewed climate warning Climate chan...
3,3,1.0,sport,0.9389,"say, game, go, play, time, take, win, first, m...",Pavey focuses on indoor success Jo Pavey will ...
4,4,2.0,politics,0.9424,"say, would, government, people, could, plan, m...",Tories reject rethink on axed MP Sacked MP How...
5,5,2.0,politics,0.7891,"say, would, government, people, could, plan, m...",Lib Dems predict best ever poll The Lib Dems a...
6,6,2.0,politics,0.826,"say, would, government, people, could, plan, m...",Howard attacks pay later Budget Tory leader Mi...
7,7,4.0,tech,0.7563,"say, use, people, technology, user, site, comp...",Games win for Blu-ray DVD format The next-gene...
8,8,2.0,politics,0.7878,"say, would, government, people, could, plan, m...",Labour pig poster anti-Semitic The Labour Part...
9,9,1.0,sport,0.657,"say, game, go, play, time, take, win, first, m...",Costin aims for comeback in 2006 Jamie Costin ...


In [85]:
# Accuracy of the dominant topic against the dataset labels: a prediction is
# correct when it equals the topic id that label_dic assigns to the true label.
predictions2 = df_dominant_topic.Dominant_Topic.tolist()
dt = df['label'].tolist()
count = sum(
    1
    for position, predicted in enumerate(predictions2)
    if predicted == label_dic[dt[position]]
)

print(f'acc:{100*(count/len(predictions2))}')


acc:90.47191011235955


In [76]:
#lda_model.save('lda.model')

In [111]:
# Visualize the topics
# enable_notebook() is called first so the prepared visualisation is rendered
# inline when `vis` is the cell's last expression.
# NOTE(review): newer pyLDAvis releases expose this adapter as
# `pyLDAvis.gensim_models` -- confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [54]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords2(text):
    """Return the tokens of one document that are not in ``stop_words``.

    Single-document counterpart of the corpus-level stop-word filter; token
    order is preserved.
    """
    kept = []
    for word in text:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

def make_bigrams2(text):
    """Apply the module-level ``bigram_mod`` phrase model to one document.

    Returns whatever ``bigram_mod[text]`` evaluates to for the given tokens.
    """
    merged_tokens = bigram_mod[text]
    return merged_tokens

def make_trigrams2(text):
    """Apply the bigram model and then a trigram model to one document.

    NOTE(review): ``trigram_mod`` is only created in commented-out code in
    the visible part of this notebook, so calling this function raises
    NameError unless ``trigram_mod`` is defined elsewhere -- confirm.
    """
    return trigram_mod[bigram_mod[text]]

def lemmatization2(text, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise one tokenised document, keeping only selected parts of speech.

    The tokens are joined with single spaces and run through the module-level
    spaCy pipeline ``nlp``. Tag names: https://spacy.io/api/annotation

    Args:
        text: Iterable of string tokens for a single document.
        allowed_postags: Collection of coarse POS tags (``token.pos_``) to
            keep. The default is an immutable tuple so the shared default can
            never be mutated.

    Returns:
        List of lemma strings for the tokens whose POS tag is allowed.
    """
    doc = nlp(" ".join(text))
    return [token.lemma_ for token in doc if token.pos_ in allowed_postags]

In [87]:
def test_document(unseen_document):
    """Score one raw text against the LDA model and print its topic mixture.

    The text goes through the same steps as the training corpus: whitespace
    and single-quote normalisation, tokenisation, stop-word removal, bigram
    merging and lemmatisation. It is then converted to a bag-of-words with
    ``id2word`` and scored by ``lda_model``.

    Args:
        unseen_document: The raw document text (converted with ``str()``).

    Returns:
        None. The original text, its lemmas and one line per topic (sorted by
        descending score) are printed.

    Raises:
        KeyError: If the model reports a topic id that has no name in the
            label mapping below (only topics 0-4 are named).
    """
    # Apply the normalisation the training documents received before
    # tokenising (collapse whitespace, drop single quotes). Without it,
    # possessives and contractions tokenise differently here than they did
    # when the dictionary was built.
    cleaned = re.sub(r'\s+', ' ', str(unseen_document))
    cleaned = re.sub("'", "", cleaned)
    data_words = gensim.utils.simple_preprocess(cleaned, deacc=True)

    # Remove Stop Words
    data_words_nostops = remove_stopwords2(data_words)

    # Form Bigrams
    data_words_bigrams = make_bigrams2(data_words_nostops)

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization2(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Term Document Frequency (named `bow` so the module-level `corpus` is
    # not shadowed)
    bow = id2word.doc2bow(data_lemmatized)

    topic_dic = {0:'business',
                1:'sport',
                2:'politics',
                3:'entertainment',
                4:'tech'
    }

    print(f'Original text:\n{unseen_document}\n After Lemmatization:\n{data_lemmatized}\n')
    # Element [0] of the model output is the list of (topic_id, score) pairs
    # (this relies on a model that returns per-word topics as well).
    doc_topics = lda_model[bow][0]
    for index, score in sorted(doc_topics, key=lambda tup: tup[1], reverse=True):
        # 'entertainment' is the longest label, so it needs one tab fewer to
        # keep the score column aligned.
        tabs = '\t' if index == 3 else '\t\t'
        print(f"Topic:{topic_dic[index]} {tabs} Score: {score:.2f}\t Topic keywords: {lda_model.print_topic(index, 5)}")


In [113]:
#unseen_document = 'Labour plans maternity pay rise\r\n\r\nMaternity pay for new mothers is to rise by Â as part of new proposals announced by the Trade and Industry Secretary Patricia Hewitt.\r\n\r\nIt would mean paid leave would be increased to nine months by 2007, Ms Hewitt told GMTV\'s Sunday programme. Other plans include letting maternity pay be given to fathers and extending rights to parents of older children. The Tories dismissed the maternity pay plan as \"desperate\", while the Liberal Democrats said it was misdirected.\r\n\r\nMs Hewitt said: \"We have already doubled the length of maternity pay, it was 13 weeks when we were elected, we have already taken it up to 26 weeks. \"We are going to extend the pay to nine months by 2007 and the aim is to get it right up to the full 12 months by the end of the next Parliament.\" She said new mothers were already entitled to 12 months leave, but that many women could not take it as only six of those months were paid. \"We have made a firm commitment. We will definitely extend the maternity pay, from the six months where it now is to nine months, that\'s the extra Â.\" She said ministers would consult on other proposals that could see fathers being allowed to take some of their partner\'s maternity pay or leave period, or extending the rights of flexible working to carers or parents of older children. The Shadow Secretary of State for the Family, Theresa May, said: \"These plans were announced by Gordon Brown in his pre-budget review in December and Tony Blair is now recycling it in his desperate bid to win back women voters.\"\r\n\r\nShe said the Conservatives would announce their proposals closer to the General Election. 
Liberal Democrat spokeswoman for women Sandra Gidley said: \"While mothers would welcome any extra maternity pay the Liberal Democrats feel this money is being misdirected.\" She said her party would boost maternity pay in the first six months to allow more women to stay at home in that time.\r\n\r\nMs Hewitt also stressed the plans would be paid for by taxpayers, not employers. But David Frost, director general of the British Chambers of Commerce, warned that many small firms could be \"crippled\" by the move. \"While the majority of any salary costs may be covered by the government\'s statutory pay, recruitment costs, advertising costs, retraining costs and the strain on the company will not be,\" he said. Further details of the government\'s plans will be outlined on Monday. New mothers are currently entitled to 90% of average earnings for the first six weeks after giving birth, followed by Â a week until the baby is six months old.\r\n'
#unseen_document = 'Bitcoin\'s value surged above $34,000 (Â£24,850) for the first time on Sunday as the leading cryptocurrency continued to soar.\r\n\r\nIt put the gain this year at almost $5,000, although by 17:00 GMT the price had drifted lower to about $33,000, according to the Coindesk website.\r\n\r\nThe rise was put down to interest from big investors seeking quick profits.\r\n\r\nIt comes after Bitcoin soared 300% last year, with the price of many other digital currencies also rising sharply.\r\n\r\nEthereum, the second biggest cryptocurrency, gained 465% in 2020\r\n\r\nSome analysts think Bitcoin\'s value could rise even further as the US dollar drops further.\r\n\r\nWhile the value of the US currency rose in March at the start of the coronavirus pandemic as investors sought safety amid the uncertainty, it has since dropped due to major stimulus from the US Federal Reserve. The currency ended last year with its biggest annual loss since 2017.\r\n\r\n    Covid worries help Bitcoin to three-year high\r\n    How do cryptocurrencies work?\r\n\r\nBitcoin is traded in much the same way as real currencies like the US dollar and pound sterling.\r\n\r\nRecently it has won growing support as a form of payment online, with PayPal among the most recent adopters of digital currencies. 
\r\nBut the cryptocurrency has also proved to be a volatile investment.\r\n\r\nThe soaring price has raised concerns that Bitcoin is due for a dramatic correction, as happened three years ago when the value collapsed after a bull run.\r\n\r\nDuring the rally in 2017 Bitcoin came close to breaking through the $20,000 level, only to hit extreme lows and fall below $3,300.\r\n\r\nIt passed $19,000 in November last year before dropping sharply again.\r\n\r\n    â€˜Robin Hoodâ€™ hackers giving stolen money to charity\r\n    \'One day everyone will use China\'s digital currency\'\r\n\r\nIn October, Bank of England Governor Andrew Bailey cautioned over Bitcoin\'s use as a payment method.\r\n\r\n\"I have to be honest, it is hard to see that Bitcoin has what we tend to call intrinsic value,\" he said. \"It may have extrinsic value in the sense that people want it.\"\r\n\r\nMr Bailey added that he was \"very nervous\" about people using Bitcoin for payments pointing out that investors should realise its price is extremely volatile.'
#unseen_document = 'Technological advances made during the pandemic are an \"opportunity as well as a threat\" to workers, an expert says.\r\n\r\nThe use of automation, robotics and artificial intelligence (AI) has grown by 30% since March 2020, according to industry body Technology Connected.\r\n\r\nA report found this could harm the work prospects of lower paid staff, women, young people and minority groups.\r\n\r\nBut Cardiff University professor Philip Brown said technology must be used to help shape a better future.\r\n\r\n\"We need to really get a better understanding of how technologies can be used in a way which is positive for the workforce, in a way which is positive for Wales,\" said Prof Brown.\r\n\r\n    AI steps up in battle against Covid-19\r\n    Covid could mean more facial recognition tech\r\n\r\nHe chaired an independent review into digital innovation, the economy and work for the Welsh Government.\r\n\r\n\"It\'s very difficult to make a judgement about where we are because of course lots of businesses are actually closed and not able to open.\r\n\r\n\"But I think basically we\'re not in a great position, and I think we really do need to get our act together.\"\r\n\"We\'ve got to use that opportunity but we\'ll fail if we actually don\'t come together around government, business, unions and other stakeholders to really present Wales differently, and really see how can we shift the economy,\" he said.\r\n\r\n\"How can we move to create a better future of work for people. 
Because if we just allow the technologies to determine our fate we are in for a very difficult future.\"\r\n\r\nX-STK is an automation company with offices in Cardiff that offers products and training in the use of \"cobots\", or collaborative robots.\r\n\r\nThese cobots are used in a range of industries, from pharmaceuticals to automotive and in universities.\r\n\r\nThe company has seen a major increase in demand for automation - with people not being able to go to work - to help with reduced numbers of staff and measures such as social distancing.\r\n\r\nBut does automation inevitably lead to people being replaced? Not according to Jessica Watts from the company.\r\n\"Output increases, which means higher demand so those people that typically would have been doing the very repetitive, potentially dangerous jobs - you know with RSI [repetitive strain industry] and other things like that - actually, those people go on and be upskilled, and they\'ve got more rewarding jobs, and they\'re doing things that have got more value,\" she explained.\r\n\r\nTechnology Connected said nearly 70% of all businesses questioned had increased their use of technology as a result of the pandemic and 48% said it had caused them to evaluate and speed up their adoption of new technology and working practices.\r\n\r\n\"Technology is totally ubiquitous and it\'s not just in industry,\" said managing director Avril Lewis.\r\n\r\n\"Who would have thought that we\'d be connecting with our loved ones via, you know, Zoom calls or who would have thought that we would be visiting our doctor surgeries remotely, taking photographs on our phones and sending those through for diagnosis.\r\n\r\n\"Who\'d have thought our children would be educated whilst online at home.\"'
#unseen_document = 'The Cleveland Browns have secured a play-off spot for the first time since 2002 with a 24-22 win over the Pittsburgh Steelers on the final day of the regular season.\r\n\r\nNick Chubb and Jarvis Landry each ran for a touchdown and Baker Mayfield threw for another.\r\n\r\nIt ended the NFL\'s longest active post-season drought.\r\n\r\nThe Green Bay Packers clinched the top seed in the NFC Conference while the Tennessee Titans won the AFC South.\r\n\r\nThe Baltimore Ravens, the Indianapolis Colts, the Los Angeles Rams and the Chicago Bears also qualified for the play-offs, which start on Saturday.\r\n\r\nWashington took the final play-off spot with a 20-14 victory over the Philadelphia Eagles to win the NFC East division despite a 7-9 record.\r\n\r\nThey are just the third NFL team to reach the play-offs with a losing record (excluding the strike-shortened 1982 season). The Carolina Panthers were the last to do so in 2014, following the Seattle Seahawks in 2010 - both went on to win their play-off openers.\r\n\r\nThe Bears lost 35-16 to Green Bay to finish 8-8 but still qualified for the wildcard round because of the Arizona Cardinals\' 18-7 defeat by the Rams.\r\n\r\n    As it happened - final Sunday of NFL\'s regular season\r\n    How to follow the 2020-21 season on the BBC\r\n\r\nThe Browns, three years after an 0-16 campaign, withstood a disrupted week of preparation with their training facility closed for three of the past four days because of Covid-19 issues.\r\n\r\n\"We wanted this. I could see that determination,\" Browns coach Kevin Stefanski said. \"It\'s not over. We\'ve got work left to do. 
I like how this team responds when their back is up against the wall.\"\r\nLamar Jackson threw for 113 yards and three touchdowns and ran 11 times for 97 yards as Baltimore beat Cincinnati 38-3 to clinch one of the three wildcard spots in the AFC Conference.\r\n\r\nHe became the first quarterback in NFL history with back-to-back 1,000-yard rushing seasons while JK Dobbins ran for 160 yards and two touchdowns in the emphatic Ravens victory.\r\n\r\nThe Titans, who later won 41-38 at the Houston Texans, secured a play-off spot when the Miami Dolphins were beaten 56-26 at the Buffalo Bills.'
unseen_document = 'Anyone who has seen The Secret of Kells or Song of the Sea will recognise the Celtic mysticism and the ornate, angular animation favoured by their director, Tomm Moore. But Wolfwalkers is his most delightful yet. Its plucky heroine is Robyn (voiced by Honor Kneafsey), the daughter of an English huntsman (Sean Bean) stationed in Ireland in the mid-1600s. He has the job of clearing the wolves from a forest near Kilkenny, but when Robyn goes exploring, she finds that one of those wolves can transform into a human girl (Eva Whittaker). As firmly rooted as it is in the history and landscape of Ireland, this is a cartoon about magic that actually feels magical. The girls\' battles against destructive adults are exhilarating fun, and every frame is a work of art. (NB)\r\n\r\nCharlie Kaufman\'s mind-bending horror drama is like no other film this year â€“ or any other year, for that matter. Adapted from Iain Reid\'s novel, it stars the superb Jessie Buckley as a young woman who drives through the snow with her faintly menacing boyfriend, Jesse Plemons, to meet his eccentric parents, Toni Collette and David Thewlis, on their family farm. Nothing goes wrong, but nothing is quite right, either: the characters\' clothes and ages keep changing, and the heroine is never quite sure of her own name. And then things really start getting weird. As you\'d expect from the screenwriter of Being John Malkovich and Adaptation, Kaufman twists the rules of film narrative in all sorts of clever and surreal ways, but I\'m Thinking of Ending Things is ultimately a powerfully melancholy treatise on the brevity of life and love. It may be confusing, but it\'s undoubtedly moving\r\n\r\nRocks was directed by Sarah Gavron (Brick Lane, Suffragette) and written by Theresa Ikoko and Claire Wilson, but, as Gavron regularly says, much of its plot and dialogue was suggested by the young people she met in inner-city London. 
The upshot is that Rocks is the most authentic teen movie in years, showing all the difficulties of adolescence in a deprived area, but all the laughter, warmth and ebullient energy, too. Bukky Bakray stars as a schoolgirl who has to fend for her younger brother (the adorable D\'angelou Osei Kissiedu) when their mother disappears from their flat. She makes bad choices, but she has good friends. A heart-rending but hopeful tribute to the bravery and resilience of youth. \r\n\r\nChadwick Boseman\'s astounding performance would have put him at the front of the Oscar race even if the role hadn\'t turned out, so sadly, to be his last. Based on August Wilson\'s play, the story is set in 1924 at a recording session for the real-life blues legend Ma Rainey, played brilliantly as a sly and imperious diva by Viola Davis. Boseman is Levee, a talented musician traumatised by a racist attack he witnessed in childhood. In Boseman\'s perfectly modulated performance, Levee is a charming, dancing, smiling young man with a future, whose anger finally explodes. Every minute of his performance and of this film is alive with energy and passion.'


test_document(unseen_document)

Original text:
Anyone who has seen The Secret of Kells or Song of the Sea will recognise the Celtic mysticism and the ornate, angular animation favoured by their director, Tomm Moore. But Wolfwalkers is his most delightful yet. Its plucky heroine is Robyn (voiced by Honor Kneafsey), the daughter of an English huntsman (Sean Bean) stationed in Ireland in the mid-1600s. He has the job of clearing the wolves from a forest near Kilkenny, but when Robyn goes exploring, she finds that one of those wolves can transform into a human girl (Eva Whittaker). As firmly rooted as it is in the history and landscape of Ireland, this is a cartoon about magic that actually feels magical. The girls' battles against destructive adults are exhilarating fun, and every frame is a work of art. (NB)

Charlie Kaufman's mind-bending horror drama is like no other film this year â€“ or any other year, for that matter. Adapted from Iain Reid's novel, it stars the superb Jessie Buckley as a young woman who drives 