# 1. Prerequisites

In [104]:
# pip install PyMuPDF                    # (install PyMuPDF for extracting info from PDF files)
# pip install tika                       # (install tika for extracting paragraphs from PDF files)
# pip install spacy==2.2.0               # (install spacy for lemmatization)
# conda install gensim                   # (intall gesim for topic modelling)
# pip install pyLDAvis                   # (install pyLDAvis for topic modelling visulisation)
# conda install -c conda-forge pyldavis  # (if you use aconda to install pyLADvis)

In [105]:
import pandas as pd
import numpy as np
import re

# glob for extracting the directories of metadata
import glob

# PyMuPDF
import fitz

# tika
import tika               
from tika import parser   

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Visualisation
import plotly.express as px
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import os

# 2. Import pdf files, data wrangling and overview

In [106]:
# Extract the directories of the PDF files, make sure the folder name does not contain number
pdf_dir = "D:\LEON\Business Analytics\Study\9. Business Project\Data set\Olympics"
pdf_files = glob.glob("%s/*.pdf" % pdf_dir)
pdf_files[:1]

['D:\\LEON\\Business Analytics\\Study\\9. Business Project\\Data set\\Olympics\\Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf']

In [107]:
# Use PyMuPDF to extract all info of the PDF files (text, title, date, etc)
list_metadata = []
for i in pdf_files:
    with fitz.open(i) as doc:
        info = doc.metadata
        info['file_name'] = os.path.basename(i)
        text = ''
        for page in doc:
            text+= page.getText()
        info['Content'] = text       
    list_metadata.append(info)

In [108]:
df = pd.DataFrame(list_metadata)
df['document_id'] = df.index
df = df.drop_duplicates(subset = 'Content')             # drop duplicate rows
df = df.dropna(subset=['Content'])                      # drop rows whose text content is NaN
df['Word_count'] = df ['Content'].str.count(' ') + 1
df.head(3)

Unnamed: 0,format,title,author,subject,keywords,creator,producer,creationDate,modDate,trapped,encryption,file_name,Content,document_id,Word_count
0,PDF 1.7,,B Lewis,,,Microsoft Word,,D:20210822083603+00'00',D:20210822083603+00'00',,,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,Examination of Witnesses (1-19) \n16 SEPTEMBER...,0,6115
1,PDF 1.7,,B Lewis,,,Microsoft Word,,D:20210822083606+00'00',D:20210822083606+00'00',,,Examination_of_Witnesses_Sept_2003_-_Q20-39.pdf,Examination of Witnesses (20-39) \n16 SEPTEMBE...,1,4002
2,PDF 1.7,,B Lewis,,,Microsoft Word,,D:20210822083609+00'00',D:20210822083609+00'00',,,Examination_of_Witnesses_Sept_2003_-_Q40-44.pdf,Examination of Witnesses (40-44) \n16 SEPTEMBE...,2,1007


In [109]:
# check if there are documents with few words
min_word_count= 10                                               # set the threshold of the minimum word count of each document 
min_word_count_filter = df['Word_count'] <= min_word_count
df_few_words = df[min_word_count_filter][['file_name', 'Content']]
df_few_words

Unnamed: 0,file_name,Content


In [110]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169 entries, 0 to 168
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   format        169 non-null    object
 1   title         169 non-null    object
 2   author        169 non-null    object
 3   subject       169 non-null    object
 4   keywords      169 non-null    object
 5   creator       169 non-null    object
 6   producer      169 non-null    object
 7   creationDate  169 non-null    object
 8   modDate       169 non-null    object
 9   trapped       169 non-null    object
 10  encryption    3 non-null      object
 11  file_name     169 non-null    object
 12  Content       169 non-null    object
 13  document_id   169 non-null    int64 
 14  Word_count    169 non-null    int64 
dtypes: int64(2), object(13)
memory usage: 21.1+ KB


In [111]:
# Word count
df['Word_count'].sum( )

1054090

# 3. Natural language processing

### 3.1. Tokenisation

In [112]:
data = df.Content.values.tolist()

In [113]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence).encode('utf-8'), deacc=True))  # deacc=True removes punctuations

data_words= list(sent_to_words(data))

### 3.2. Processing words: 
Remove Stopwords, Make Bigrams and Trigrams,Lemmatisation, remove short words and meaningless words

In [114]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [115]:
# import the stop_words from gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
stop_words = [i for i in STOPWORDS]

# add more stop words after analysing the overall term frequncy of each topic in pyLDAvis in the "Word frequency of each topic" section
new_stop_words = ['go', 'would', 'make', 'think', 'take', 'say', 'need', 'want', 'thing', 'have', 'lot', 'people', 'year',
                   'work','time', 'know', 'use', 'try', 'happen', 'ask', 'new', 'way', 'jonathan', 'stephen']            
stop_words.extend(new_stop_words)
stop_words

['does',
 'whereby',
 'not',
 'didn',
 'ourselves',
 'off',
 'top',
 'thick',
 'next',
 'below',
 'do',
 're',
 'everyone',
 'un',
 'if',
 'nobody',
 'own',
 'namely',
 'thru',
 'then',
 'often',
 'too',
 'de',
 'we',
 'no',
 'yourselves',
 'call',
 'well',
 'otherwise',
 'anything',
 'make',
 'ten',
 'none',
 'name',
 'but',
 'system',
 'such',
 'each',
 'other',
 'the',
 'hence',
 'yet',
 'toward',
 'now',
 'or',
 'her',
 'describe',
 'be',
 'four',
 'further',
 'seem',
 'much',
 'least',
 'what',
 'whatever',
 'with',
 'towards',
 'becomes',
 'keep',
 'in',
 'nine',
 'some',
 'when',
 'seeming',
 'anyhow',
 'have',
 'all',
 'anyone',
 'ever',
 'most',
 'over',
 'after',
 'am',
 'everywhere',
 'beside',
 'where',
 'whenever',
 'along',
 'get',
 'about',
 'once',
 'never',
 'done',
 'interest',
 'thus',
 'a',
 'ie',
 'part',
 'because',
 'con',
 'ours',
 'show',
 'their',
 'my',
 'there',
 'full',
 'elsewhere',
 'that',
 'mill',
 'an',
 'than',
 'just',
 'former',
 'from',
 'himself',

In [116]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stop_words(texts):
    return [[word for word in doc if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [117]:
# Form Trigrams
data_words_trigrams = make_trigrams(data_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

#increase the maximum length of text that the parser or NER can process
nlp.max_length = 13000000 #

# Do lemmatization keeping only noun, adj, verb
data_lemmatized1 = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB'])

# Set a threshold for removing the words with length less than the threshold
minimum_len = 3 
data_lemmatized2 = []
for i in data_lemmatized1:
    new_element = [x for x in i if len(x) >= minimum_len]
    data_lemmatized2.append(new_element)

# remove stop words
data_lemmatized = remove_stop_words(data_lemmatized2)
data_lemmatized[:1]

[['examination',
  'witness',
  'morning',
  'like',
  'welcome',
  'today',
  'occasional',
  'series',
  'checking',
  'session',
  'follow',
  'regard',
  'olympic',
  'bid',
  'committee',
  'report',
  'bring',
  'good',
  'express',
  'number',
  'concern',
  'issue',
  'ought',
  'consider',
  'interested',
  'hear',
  'understand',
  'course',
  'acceptable',
  'appearance',
  'like',
  'open',
  'statement',
  'happy',
  'listen',
  'thank',
  'thank',
  'opportunity',
  'come',
  'today',
  'talk',
  'little_bit',
  'plan',
  'hope',
  'come',
  'month',
  'month',
  'shall',
  'able',
  'meet',
  'update',
  'progress',
  'like',
  'role',
  'week',
  'explain',
  'sitting',
  'role',
  'offer',
  'accept',
  'like',
  'little_bit',
  'idea',
  'job',
  'philosophy',
  'bid',
  'update',
  'initial',
  'thought',
  'reaction',
  'date',
  'sense',
  'progress',
  'week',
  'talk',
  'little_bit',
  'future',
  'motivation',
  'role',
  'come',
  'number',
  'area',
  'lucky'

###  3.3. Dictionary and Corpus

In [118]:
# Create Dictionary, set the parameters to filter out tokens in the dictionary by their frequency
no_below = 5             # remove the tokens less frequent than no_below documents (absolute number)
no_above = 0.85          # remove the tokens more frequent than no_above documents (fraction of the total corpus size)
id2word = corpora.Dictionary(data_lemmatized)
id2word.filter_extremes(no_below = no_below, no_above = no_above)

# print the number of reserved unique tokens and word count afer removal of high and low frequency words
print('After removal of high and low frequency words - Number of unique tokens: %d, %d' % (len(id2word),id2word.num_pos))

After removal of high and low frequency words - Number of unique tokens: 3409, 340269


In [119]:
# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 2), (1, 4), (2, 3), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 3), (13, 1), (14, 1), (15, 2), (16, 3), (17, 1), (18, 2), (19, 3), (20, 1), (21, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 5), (29, 1), (30, 4), (31, 2), (32, 1), (33, 1), (34, 2), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 2), (41, 4), (42, 3), (43, 1), (44, 1), (45, 1), (46, 2), (47, 1), (48, 3), (49, 1), (50, 1), (51, 1), (52, 1), (53, 11), (54, 17), (55, 1), (56, 1), (57, 61), (58, 4), (59, 3), (60, 1), (61, 1), (62, 5), (63, 1), (64, 3), (65, 1), (66, 1), (67, 11), (68, 1), (69, 2), (70, 2), (71, 8), (72, 5), (73, 1), (74, 2), (75, 1), (76, 1), (77, 2), (78, 1), (79, 10), (80, 2), (81, 3), (82, 1), (83, 4), (84, 1), (85, 1), (86, 2), (87, 4), (88, 16), (89, 1), (90, 1), (91, 2), (92, 3), (93, 16), (94, 1), (95, 1), (96, 3), (97, 2), (98, 1), (99, 2), (100, 5), (101, 1), (102, 1), (103, 1), (104, 1), (105, 4), (106, 1), (107, 3), (108, 1), (109, 1), (

#  4. LDA Model

### 4.1. Building LDA Model, Perparameter/Hyperparameter tuning

In [120]:
# set training parameters and hyperameters
k = 20
passes = 20
iterations = 100
alpha = 50.0/k   
eta = 0.01
random_state = 12345
minimum_probability = 0

Plotting the coherence score against k to identify the opitmal k where the coherence socre reaches the highest point. Because running it is quite time-consuming, I stopped the chunk below and just set k to be 12 based on the analysis of the reuslt. If users want to fit the model to the other corpus, they can remove the hashs to  reactivate the chunk.

In [121]:
#start=3; limit=63; step=3
#coherence_values = []
#model_list = []
#for i in range(start,limit,step):
    #model = gensim.models.LdaModel(corpus = corpus,id2word = id2word,alpha = alpha,eta = eta,
    #                              iterations = iterations,num_topics = i,passes = passes,random_state = 12345,minimum_probability = minimum_probability)
    #model_list.append(model)
    #coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
    #coherence_values.append(coherencemodel.get_coherence())

In [122]:
#list_num_topics = [i for i in range(start, limit, step)]
#df_coherence1 = pd.DataFrame({'Number_of_Topics': list_num_topics, 'Coherence_Score': coherence_values})
#df_coherence1.to_pickle('./df_coherence1.pkl') #save the result to disk
#df_coherence = pd.read_pickle('./df_coherence1.pkl') #load the result from disk

In [123]:
#fig1 = px.line(df_coherence, x = 'Number_of_Topics', y = "Coherence_Score", title = 'Coherence scores against number of topics')
#fig1.update_layout(autosize=False, width=1000, height=400)
#fig1.update_traces(mode = "lines + markers")
#fig1.show()

In [124]:
# num of topics =  to get the highest coherence socre
k = 12
lda_model = gensim.models.LdaModel(
    corpus = corpus,
    id2word = id2word,
    alpha = alpha,
    eta = eta,
    iterations = iterations,
    num_topics = k,
    passes = passes,
    random_state = 12345,
    minimum_probability = minimum_probability)

In [125]:
# print the coherence of the LDA model
coherencemodel2 = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_score = coherencemodel2.get_coherence()
coherence_score

0.4208466929396401

### 4.2. Topic distribution of documents

In [126]:
# create the function for converting a list of tuples into a dictionary
def Convert(tup, di):
    di = dict(tup)
    return di

In [127]:
# topic distribution of documents
list_topic = []
dictionary_topic = {}
for d in texts:
    bow = id2word.doc2bow(d)
    belong = lda_model[bow]                        # generate a list of tuples of topic distribution of a document
    belong_dic = Convert(belong, dictionary_topic) # convert the list of tuples into a dictionary
    list_topic.append(belong_dic)           
                      
df_topic_distribution = pd.DataFrame(list_topic)   # convert the list of dictionaries into a dataframe

# rename the topic IDs to ensure they are as same as the topic IDs in the pyLDAvis
original_topic_id = [*df_topic_distribution]; new_topic_id = [x + 1 for x in original_topic_id]
df_topic_distribution = df_topic_distribution.rename(columns = dict(zip(original_topic_id, new_topic_id))) #rename the topic IDs to ensure they are as same as the topic IDs in the pyLDAvis
df_topic = pd.merge(df, df_topic_distribution, how = 'left', left_index=True, right_index=True) # merge with info of documents
df_topic.drop(['title','format','creator', 'producer', 'keywords', 'trapped', 'encryption','subject', 'modDate'], axis = 1)

Unnamed: 0,author,creationDate,file_name,Content,document_id,Word_count,1,2,3,4,5,6,7,8,9,10,11,12
0,B Lewis,D:20210822083603+00'00',Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,Examination of Witnesses (1-19) \n16 SEPTEMBER...,0,6115,0.010725,0.012664,0.008528,0.007332,0.180764,0.725537,0.008626,0.007885,0.006884,0.006341,0.009331,0.015384
1,B Lewis,D:20210822083606+00'00',Examination_of_Witnesses_Sept_2003_-_Q20-39.pdf,Examination of Witnesses (20-39) \n16 SEPTEMBE...,1,4002,0.032201,0.017489,0.006726,0.011709,0.320539,0.453667,0.010538,0.007622,0.008823,0.010536,0.067056,0.053094
2,B Lewis,D:20210822083609+00'00',Examination_of_Witnesses_Sept_2003_-_Q40-44.pdf,Examination of Witnesses (40-44) \n16 SEPTEMBE...,2,1007,0.027157,0.031576,0.018927,0.022364,0.248048,0.428007,0.032854,0.022809,0.026884,0.026380,0.086787,0.028207
3,Bronwen Lewis,D:20210822084116+00'00',Further_supplementary_memorandum_submitted_by_...,Further supplementary memorandum submitted by ...,3,431,0.028281,0.020322,0.504126,0.034461,0.026796,0.026107,0.026675,0.106673,0.039447,0.125504,0.029139,0.032469
4,Bronwen Lewis,D:20210822083921+00'00',Further_Supplementary_Memorandum_submitted_by_...,Further supplementary memorandum submitted by ...,4,288,0.119681,0.064246,0.058652,0.042788,0.099493,0.065156,0.056939,0.065553,0.087256,0.160921,0.131773,0.047540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,Bronwen Lewis,D:20210822084528+00'00',Written_evidence_submitted_by_UK_Sport_-_Jan_2...,Written evidence submitted by UK Sport \n \n ...,164,3089,0.072124,0.253418,0.008633,0.483050,0.012194,0.016957,0.027191,0.006808,0.014276,0.010390,0.058512,0.036446
165,Bronwen Lewis,D:20210822084531+00'00',Written_evidence_submitted_by_Vision_2020_UK_-...,Written evidence submitted by Vision 2020 UK ...,165,2284,0.843995,0.018983,0.008188,0.013050,0.010891,0.017523,0.009616,0.018343,0.008116,0.007716,0.021596,0.021984
166,Bronwen Lewis,D:20210822084535+00'00',Written_evidence_submitted_by_VisitBritain_-_J...,Written evidence submitted by VisitBritain \n...,166,2372,0.005055,0.893003,0.011858,0.008562,0.012724,0.006802,0.007480,0.009944,0.016306,0.008309,0.006184,0.013773
167,Bronwen Lewis,D:20210822084543+00'00',Written_evidence_submitted_by_Womens_Sport_and...,Written evidence submitted by the Women's Spor...,167,1966,0.021705,0.833179,0.016557,0.017655,0.013123,0.012271,0.012026,0.012222,0.008117,0.013012,0.020493,0.019641


# 5. Topic interpretation tools

I first identify the salient topics defined by PTBI proposed by Marchetti and Puranam (2020), then combine both the topic visualisation of pyLDAvis and the prototypical texts defined by PTBI to facilitate the topic interpretation.

##  5.1. Salient topics for interpretation
PTBI assumes that the topics with little salience are not worthy of interpretation. To extract the most salient topics for interpretation, for each topic, we need to compute the fraction of documents with the probability that the documents belong to the topic is more than > 1/K (Marchetti and Puranam, 2020, p. 14), and I defined the fraction as the “salience” of the topic. 

The scree plot below shows that when the topics are sorted by salience in descending order, the salience tends to level off on topic 8, as a result, we can select the topics ahead of topic 8 as the salient topics for interpretation.

In [128]:
# compute salience: the fraction of documents with the probability that the document belongs to the topic is more than > 1/K for each document
list_percent_above = []
for i in df_topic_distribution:
    num_above = df_topic_distribution[i][df_topic_distribution[i] > 1/k].count()
    percent_above =  num_above/len(df_topic_distribution)
    list_percent_above.append(percent_above)
    
df_salient_topic = pd.DataFrame({'topic_ID':  [str(i) for i in new_topic_id], 'salience': list_percent_above}).sort_values(
    by = 'salience', ascending = False)

In [129]:
fig_L1 = px.line(df_salient_topic, x = 'topic_ID', y = 'salience', title="Scree plot of salience of topics")
fig_L1.update_layout(autosize=False, width=800, height=400)
fig_L1.update_traces(mode = "lines + markers")
fig_L1.show()

## 5.2. Topic visualisation

Check the words of each topic, if there're common words with high overall frequency such as "think" "want" or "make", return to the "import the stop_words from gensim" section, add these words to the list of stop words to remove them.

In [130]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, sort_topics = False )
vis

## 5.3. prototypical paragraphs
The prototypical paragraphs, the paragraphs with a high probabitiy that they belong to a topic, can be used to assist topic interpretation. This section classify the paragraphs into topics and provides the users 4 types of filters to select the prototypical paragraphs: N most prototypical paragraphs overall, N most prototypical paragraphs where the belong() function is greater than the threshold L, N most prototypical paragraphs of each topic and N most prototypical paragraphs of a specific topic.

### 5.3.1.  Classify the paragraphs based on the trained model

##### Extract paragraphs from documents

In [131]:
# define the function for spliting texts into paragraphs by delimiter '.\n\n' or '. \n\n'
def para_split(i):
    j = parser.from_file(i)
    m = j['content']
    import re
    return re.split('[?.!-]\n|[?.!-] \n|  \n\n|\n\n[0-9]', m)

In [132]:
list_paragraphs = []
list_para_id = []
for i in pdf_files:
    para = para_split(i)
    para = [w.replace('\n', '') for w in para]
    para = [x.strip() for x in para if x.strip()] # remove empty elements
    para_id = [x for x in range(len(para))] 
    list_paragraphs.append(para)
    list_para_id.append(para_id)

In [133]:
df_para1 = df.copy()
df_para1['paragraphs'] = list_paragraphs
df_para1['para_id'] = list_para_id
df_para2 = df_para1.apply(pd.Series.explode)
df_para3 = df_para2.reset_index()
df_para4 = df_para3[['creationDate', 'document_id', 'file_name', 'para_id', 'paragraphs']]
len(df_para4) # number of paragraphs extracted

21640

In [134]:
# set a filter to filter out the paragraphs with short words
n_word_count = 10                                                        # set the threshold of word count
para_word_count = df_para4['paragraphs'].str.split().str.len()           # word count of each paragraph
df_para = df_para4[(para_word_count>=n_word_count)].reset_index()        # select the paragraphs with word count not less than the threshold
df_para

Unnamed: 0,index,creationDate,document_id,file_name,para_id,paragraphs
0,2,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,2,"MS BARBARA CASSANI Q1 Chairman: Good morning, ..."
1,3,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,3,Ms Cassani: Thank you very much. Thank you ver...
2,4,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,4,8 months I shall be able to meet frequently wi...
3,5,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,5,The first thing I should like to say is that I...
4,6,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,6,Really the backdrop is that I believe in the G...
...,...,...,...,...,...,...
17709,21631,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,37,7.3 When the impact of Olympics and Paralympi...
17710,21633,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,39,11 2007-08 School Sport Survey. 12 As ...
17711,21634,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,40,13 Gold Young Ambassadors work across School...
17712,21635,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,41,14 From national data supplied by Department...


##### Process the paragraphs

In [135]:
# tokenization
data2 = df_para.paragraphs.values.tolist()
data_words2 = list(sent_to_words(data2))

In [136]:
# Form Trigrams
data_words_trigrams2 = make_trigrams(data_words2)

# Do lemmatization keeping only noun, adj, vb
data_lemmatized2 = lemmatization(data_words_trigrams2, allowed_postags=['NOUN', 'ADJ', 'VERB'])

# set the length of word threshold as same as before for removing the words less than the threshold
data_lemmatized2_2 = []
for i in data_lemmatized2:
    new_element = [x for x in i if len(x) >= minimum_len]
    data_lemmatized2_2.append(new_element)
    
# Remove Stop Words
data_lemmatized2_1 = remove_stop_words(data_lemmatized2_2)

##### Classify the paragraphs based on the extracted topics

In [137]:
# belong function: classify topics of paragraphs, it might take a long time because there are 148,651 paragraphs in the 11,132,849-word corpus
list_topic_para = []
dictionary_topic_para = {}
for d in data_lemmatized2_1:
    bow = id2word.doc2bow(d)
    belong = lda_model[bow]
    doc_dic = Convert(belong, dictionary_topic_para)
    list_topic_para.append(doc_dic)
    df_topic_para = pd.DataFrame(list_topic_para)

In [138]:
# rename the topic IDs to ensure they are as same as the topic IDs in the pyLDAvis
df_topic_para = df_topic_para.rename(columns = dict(zip(original_topic_id, new_topic_id)))

# topic distribution of paragraphs
df_topic_para1_1 = pd.merge(df_para, df_topic_para, how = 'left', left_index=True, right_index=True)
df_topic_para1_1

Unnamed: 0,index,creationDate,document_id,file_name,para_id,paragraphs,1,2,3,4,5,6,7,8,9,10,11,12
0,2,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,2,"MS BARBARA CASSANI Q1 Chairman: Good morning, ...",0.049290,0.057799,0.052240,0.051225,0.120243,0.235192,0.046411,0.101134,0.070472,0.065950,0.087475,0.062570
1,3,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,3,Ms Cassani: Thank you very much. Thank you ver...,0.066549,0.067266,0.067827,0.063631,0.136346,0.118108,0.067207,0.091102,0.075200,0.069947,0.103458,0.073359
2,4,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,4,8 months I shall be able to meet frequently wi...,0.079225,0.071757,0.095059,0.077315,0.112131,0.091406,0.072867,0.082869,0.075167,0.080996,0.080382,0.080826
3,5,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,5,The first thing I should like to say is that I...,0.041947,0.096321,0.029758,0.026998,0.077918,0.475506,0.041892,0.046509,0.029957,0.034317,0.053890,0.044986
4,6,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,6,Really the backdrop is that I believe in the G...,0.065745,0.076654,0.052975,0.096067,0.063671,0.196147,0.058440,0.064122,0.090837,0.062271,0.075833,0.097238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17709,21631,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,37,7.3 When the impact of Olympics and Paralympi...,0.090832,0.245551,0.062688,0.090365,0.052118,0.051488,0.066706,0.056014,0.055876,0.057229,0.061379,0.109755
17710,21633,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,39,11 2007-08 School Sport Survey. 12 As ...,0.081229,0.163673,0.069663,0.078291,0.077352,0.073786,0.065894,0.063376,0.071884,0.077780,0.086246,0.090827
17711,21634,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,40,13 Gold Young Ambassadors work across School...,0.119819,0.085556,0.067685,0.137824,0.071079,0.068831,0.074449,0.072185,0.066447,0.071587,0.087186,0.077353
17712,21635,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,41,14 From national data supplied by Department...,0.111826,0.093539,0.086258,0.094410,0.080932,0.071638,0.075615,0.083544,0.069175,0.080237,0.073071,0.079757


In [139]:
# save the result to disk
df_topic_para1_1.to_pickle('./df_topic_para_Olympics.pkl')

In [140]:
# load the result from disk
df_topic_para1 = pd.read_pickle('./df_topic_para_Olympics.pkl') 

In [141]:
# drop the paragraphs with high frequency but meaningless for interperation based on the extraction of prototypical paragraphs below
list_remove_para = [1519, 12966]                                # input the index you want to drop
df_topic_para2 = df_topic_para1.copy().drop(list_remove_para)

### 5.3.2. N most prototypical paragraphs overall

In [142]:
df_topic_para2_n = df_topic_para2.copy()
df_topic_para2_n['highest_p'] = df_topic_para2_n.iloc[:, 6:].max(axis = 1)          # get the highest probability among the topic distribution of each paragraph
df_topic_para2_n['salient_topic'] = df_topic_para2_n.iloc[:, 6:].idxmax(axis = 1)    # get the corresponding topic id
df_topic_para2_n = df_topic_para2_n[['paragraphs', 'salient_topic', 'highest_p']]

In [143]:
N1 = 5   # Set N to get the N most prototypical paragraphs overall
df_topic_para2_n.nlargest(N1,['highest_p']).style.set_properties(subset = ['paragraphs'], **{'width':'1000px', 'length': '50px'})

Unnamed: 0,paragraphs,salient_topic,highest_p
8304,"You think that the assessment of risks is our best estimate of the most likely outcome of the budget as a whole. But actually the assessment of risk-and how we have compiled it-is this: we have not sought to estimate how likely it is that every risk arises. We just said, ""Let us think about every risk that could arise, and let us assume that they all arise and work out the likely cost of them all arising."" On top of that, we said, ""And there will be some risks that we just cannot think about that are unknown unknowns. There will be some multiple consequentials if everything came together."" So we end up with an estimate not of the most likely cost of the project, which is what the burden of paragraph 1 of the PAC Report understands it is, but an estimate of how much we would need to set aside in the very unlikely event that all risks arise and some more unknown risks arise as well. The purpose of that is not to get to an estimate of the likely outcome of the budget. Its purpose and why we do it is to see, against any reasonable view of the likely risk that might arise, even on an assumption that they all arise and some more unknown risks arise, whether we have enough money. The conclusion has always been, yes, we had. Against what is therefore, in my view, a conservative and prudent estimate, we had £36 million headroom at the time of the NAO Report. We had more, and indeed the picture over the six-month period since the original figures on which the NAO was recording this, is that the contingency has gone down by £27 million or so-we reckon, because these are provisional figures, but I want to give our best figures-and the assessed risks on that very conservative and prudent basis have gone down by £136 million. So the picture on the budget as a whole is that we are spending contingency significantly slower than risks are disappearing from the programme. That is why, without in any sense being complacent, I am confident that we will bring this in within budget, and I do not think that the budget is close to being used up",8,0.65663
17552,"4. Most clubs seem ill-prepared for enquiries from, and inclusion of, people with disabilities who wish to participate in the sport offered by the club. There is little or no support for specialist clubs who provide opportunities for sport that cannot be integrated ie wheelchair basketball or blind cricket! There is often nothing locally to support the child or their parent in accessing the specialist provision and this often involves their having to undertake extensive travel to specialist facilities or organisations catering for this group. There is poor information regarding availability etc and http://www.parasport.org.uk was established as a portal to provide pathway and provision information. The then Mayor of London published a strategy in 2007, which highlighted all these issues and to date there has been little, if any, action to redress these anomalies in London or in the rest of the country. DCSF have, in my view, shown no leadership regarding the legacy of 2012 and its impact on PE in schools and the inclusion of those with disabilities in core curriculum activities or sporting opportunities within or after school. DCMS held a legacy event in 2008 and again in April 2009 focusing on the legacy of the Games for those with disabilities. One outcome was to seek greater links between DWP, DCSF and themselves to ensure that joint strategies were developed and pathways established that enabled children to enjoy and participate in PE and sport within schools/after school clubs/integrated and specialist provision in the community, with good national talent forums and pathways established for those who wish to participate in sport at a higher level and finally, with governing bodies having clear inclusive programmes for sports men and women with disabilities active at a national and international level",1,0.626673
8388,"Jonathan Stephens: There are two aspects to risk: likelihood and impact. What we are saying is that we made no estimate of likelihood, we just wrote in a 100% likelihood of all the risks we could think of and some unknown risks that we could not think of. We then looked at impact, and on impact we said, ""If this risk were to materialise-we are assuming a 100% likelihood that it materialises-what is the likely cost?"" That is where you get the low, the most likely outcome and the high outcome. When you add those together, you do not get to an outcome of, ""What is the most likely expenditure on the programme?""; you get to an outcome of, ""If all conceivable risks arise, plus some unknown risks that we cannot identify, what is the likely expenditure?"" That is a conservative and prudent view of, ""Do we have enough contingency left, if all those risks arise?"" In practice, they won’t all arise. It is conceivable that some will arise, but it is pretty unlikely that all of them will arise. It is perfectly conceivable that some individual risk will arise at a higher estimate than the most likely estimate, but the prospect of all those risks arising is unlikely. The prospect of them all arising at the very highest possible cost is so unlikely as to not provide a good basis for planning. I am sorry. I am going on at some length, but there is a real point",8,0.578037
3329,"6 PREPARATIONS FOR THE LONDON 2012 OLyMPIC AND PARALyMPIC GAMES: PROGRESS REPORT JuNE 2008 10 The start and completion dates for the construction of the main venue and infrastructure projects delivered by the Olympic Delivery Authority at the end of March 2008 compared with the milestones in the November 2007 Programme Baseline ReportProjectEnabling Works (site preparation) Power Lines under Grounding (switchover only) Structures, Bridges and Highways utilities Main Stadium Aquatics Centre VeloparkHandball/Indoor Sports ArenaBasketballInternational Broadcast Centre/ Main Press CentreOlympic Village Eton Manor (training facilities and Paralympic events) Broxbourne (white water canoeing) Eton Dorney (rowing) Weymouth and Portland (sailing)construction start date November 2007 March 2008 Change in programme Forecast start date baseline (months)October 2006 October 2006 0 July 2008 July 2008 0 April 2008 April 2008 0 January 2008 January 2008 0 July 2008 May 2008 –21 September 2008 September 2008 0 March 2009 March 2009 0August 2009 June 2009 –2July 2009 November 2009 4May 2009 March 2009 –2 June 2008 May 2008 –1 March 2010 January 2010 –2 August 2008 May 2009 9 March 2009 January 2009 –2 May 2008 January 2008 –4construction end date November 2007 March 2008 Change in programme Forecast end date baseline (months)September 2009 September 2009 0 September 2008 November 2008 2 December 2011 December 2011 0 December 2011 August 2011 –4 Construction Construction end date end dateFebruary 2011 April 2011 2Completion date Completion date for construction for construction and initial overlay and initial overlay for test events for test eventsJune 2011 June 2011 0Construction Construction end date end dateApril 2011 August 2011 4Completion date Completion date for construction for construction and initial overlay and initial overlay for test events for test eventsJuly 2011 August 2011 1April 2011 February 2011 –2April 2011 March 2011 –1April 2011 April 2011 0June 2011 July 2011 1 December 2011 December 2011 0 February 2012 April 2011 –10 June 2010 October 2010 4 April 2010 July 2009 –9 February 2009 January 2009 –1Source: National Audit Office examination of actual and forecast progress against the November 2007 Programme BaselineNOTE",3,0.570649
454,"The Committee suspended from 4.09 pm to 4.23 pm for a division in the House Alan Keen 202. I did not get to the end of the question at the beginning but the point I am making is that because we have to have a village and all the events have to be in that area it adds costs to hosting the Olympics, I reckon at least half a billion and probably a billion. If we could spread them round the country—and I went to Japan for the World Cup and the atmosphere was brilliant. We went to different places—more people could get to see it. If we could do that with the Olympics, the point I am really asking you is that it is difficult for the Government. The Minister and the Secretary of State are going to see the President of the IOC on Friday. It will not do our bid any good if they go there telling them how they should organise the Olympic Games in the future. I am really asking you as the main channellers of funding in sport in this country, will you make these representations that the Olympics, just for the sake of having 18,000 athletes in one village, which is very nice, although it is not so nice for those whose event comes on the last day and they want a party—we could save somewhere between half a billion and a billion pounds by using facilities we have got around the country now. The athletics could be at Wembley as they were supposed to be. The football could be at the main stadium and spread around the country as it is going to be in fact. What I am saying is that instead of having the athletes all together in one village for the three weeks of the Olympics, we could put a party on for them and they could stay for a week after the Olympics when they could all get drunk if that is what they do. I think somebody needs to go to the IOC and put this point to them. We have been taking evidence from people in the last couple of days and there are tremendous difficulties. There would hardly be a difficulty if we could use stadia around the country and we did not have to have the village. It is the village that causes all the problems that we are facing now",6,0.568952


### 5.3.3.  N most prototypical paragraphs where the belong() function is greater than the threshold L
I followed the method of extraction of prototypical text suggested by PTBI (Marchetti and Puranam, 2020. p. 14). PTBI attempts to not only extract the prototypical documents to improve interpretability, but also to find the minimum number of prototypical documents for topic interpretation. The algorithm is shown as follows:
1. Defines a threshold L  (L < ∈ [0,1]). For instance, we set L to be 0.5.
2. For each topic, select the documents with the probability that they belong to the topic is not less than L (0.5). 
3. For each topic, check whether the number of documents selected is not less than 1/L. For instance, if L = 0.5, for each topic we need at least 2 documents for topic interpretation. This method weakens the limitation that a few documents have a high proportion of a topic is because of randomness.
4. Compute the percentage of interpretable topics as described in step iii
5. Change L, keep iterating and find the optimal L with which the percentage of interpretable topics is the highest. 

##### Indenfication of the optimal L and miminum number of paragraphs for topic interpretation

In [144]:
List_num_doc = [x for x in range(1, 20, 1)] # generate a list of 1/L (minimum number of documents to interpret a topic)
list_L = [1/x for x in List_num_doc]        # generate a list of L

In [145]:
# create the function for computing the percentage of potentially interpretable topics against parameter L
def perc(i, df):
    list_num_topics = []
    for j in df:                                  
        topic_filter = df[j] >= i         
        m = df[j][topic_filter].count()           
        list_num_topics.append(m)                                             
        count1 = sum(map(lambda x : x >= 1/i, list_num_topics))                                     
        perc1 = count1 / k
    return(perc1)

The plot shows that when L = 0.333, the percentage of interpretable topics is 100%, so I set L to be 0.333 - ie, each topic needs at least 3 (1/0.333) paragraphs with the probability that they belong to the topic is no less than 3  for interpretation. It is worth noting that L is inversely proportional to the minimum number of paragraphs of each topic for interpretation (1/L), in other words, the lower the threshold L is, the more paragraphs that users need to interpret the topics. Although when L = 0.1 the percentage of interpretable topics is also 100%, the minimum number of paragraphs of each topic for interpretation also rises to 10 (1/0.1), which increases the workload of interpretation significantly.

In [146]:
list_perc2 = []
for i in list_L:
    num = perc(i, df_topic_para.drop(list_remove_para))
    list_perc2.append(num)

df_L2 = pd.DataFrame({'Threshold_L': list_L, 'Percentage of interpretable topics': list_perc2})
fig_L2 = px.line(df_L2, x = 'Threshold_L', y="Percentage of interpretable topics", title = 'Percentage of interpretable topics')
fig_L2.update_layout(autosize=False, width=800, height=400)
fig_L2.update_traces(mode = "lines + markers")
fig_L2.show()

In [147]:
# define the function for extracting the highest N ranked paragraphs from each topic
def top_n_filter(df, top_n):
    list_topic_id = [x+1 for x in range(0,k)]
    list_n_para = []
    list_n_p = []
    list_n_index = []
    for x in range(1, k + 1): 
        n_para = [i for i in df.nlargest(top_n, [x])['paragraphs']]
        n_p = [i for i in df.nlargest(top_n, [x])[x]]
        n_index = [i for i in df_topic_para1.nlargest(top_n, [x]).index]
        list_n_para.append(n_para)
        list_n_p.append(n_p)
        list_n_index.append(n_index)
    pd_n_para = pd.DataFrame({'Index':list_n_index, 'topic_id': list_topic_id, 'salient_paragraph': list_n_para, 'probability': list_n_p})
    return(pd_n_para.apply(pd.Series.explode).reset_index().drop('index', axis = 1))

Below we get the the 4 most prototypical paragraphs of each topic when we set the optimal L to be 0.25. 

In [148]:
L = 1/3                                                # set the optimal L based on the analysis above                                             
top_n_above_L = top_n_filter(df_topic_para2, int(1/L))
top_n_above_L['porobability >= L'] = top_n_above_L['probability'] >= L
top_n_above_L.style.set_properties(subset = ['salient_paragraph'], **{'width':'500px', 'length': '50px'})

Unnamed: 0,Index,topic_id,salient_paragraph,probability,porobability >= L
0,17552,1,"4. Most clubs seem ill-prepared for enquiries from, and inclusion of, people with disabilities who wish to participate in the sport offered by the club. There is little or no support for specialist clubs who provide opportunities for sport that cannot be integrated ie wheelchair basketball or blind cricket! There is often nothing locally to support the child or their parent in accessing the specialist provision and this often involves their having to undertake extensive travel to specialist facilities or organisations catering for this group. There is poor information regarding availability etc and http://www.parasport.org.uk was established as a portal to provide pathway and provision information. The then Mayor of London published a strategy in 2007, which highlighted all these issues and to date there has been little, if any, action to redress these anomalies in London or in the rest of the country. DCSF have, in my view, shown no leadership regarding the legacy of 2012 and its impact on PE in schools and the inclusion of those with disabilities in core curriculum activities or sporting opportunities within or after school. DCMS held a legacy event in 2008 and again in April 2009 focusing on the legacy of the Games for those with disabilities. One outcome was to seek greater links between DWP, DCSF and themselves to ensure that joint strategies were developed and pathways established that enabled children to enjoy and participate in PE and sport within schools/after school clubs/integrated and specialist provision in the community, with good national talent forums and pathways established for those who wish to participate in sport at a higher level and finally, with governing bodies having clear inclusive programmes for sports men and women with disabilities active at a national and international level",0.626673,True
1,17553,1,"5. Integration of disabled people into mainstream sporting provision is a concept rather than a reality. Grants to organisations has largely been based on physical access, rather than actual provision of activities, coaching etc. For some it has been a tick box action, rather than an attempt to include and integrate their provision. In order to create a lasting legacy from 2012, this offer of inclusion has to be both genuine and meaningful, ie accessible facilities with no manmade barriers re attitudes, lack of coaching or energy to find solutions regarding sporting opportunities. These are all major threats to establishing a broad base of participation pyramid with hundreds of disabled people participating in sport at the base level, filtering through at representative level for club and nationally, leading on to the tip of the pyramid—international competition at Paralympic level. This pathway does not currently exist for those with a disability and therefore, a different solution needs to be found that provides a separate provision (where appropriate) and uses the mainstream provision (where appropriate)",0.56892,True
2,1519,1,"3. After school activities often exclude this group of children ie no possibility of inclusion in team sports such as football, cricket, rugby, basket ball etc., limited access to swimming baths, athletic fields etc. All of these sports are undertaken by people with a disability, but not now normally at school. Specialist schools did provide a massive range of sporting opportunities and sport played a major part of my own adjustment to disability. I learned about teamwork, was able to set individual goals, have competition to extend my abilities, occasionally experienced ""being a winner"" and had that thrill of competition. One example of efforts to redress this issue within a school setting is in Leeds, which has a programme of monthly sporting and physical activities arranged in school time for vision impaired children within the schools. One solution easily achieved would be for groups of schools to come together monthly to provide sporting and recreational activities for disabled children and young people within their schools",0.564087,True
3,16542,2,"2.3.1.1 Creative industries grow at least 1% more than the rest of the economy. [1] Music grew by 4.7% from 2007 to 2008. [2] 2.3.1.2 Creative industries more than double our investment. The contribution of music to the UK economy reached well over £3.5 billion with the arts estimated to put over £2 back into the economy for every £1 invested [2]. Over 120,000 people are employed in music. [1] In addition, consumers spent over £4 billion on music in 2000. [2] 2.3.2 Music education and effective teaching of music technology are vital to the continuing world leadership of the UK in music. The UK currently lacks the capability to take full advantage of music technology even though music technology has the ability to motivate pupils. [3] The Cultural Olympiad gives an opportunity to put music technology at the centre of Continuing Professional Development. We believe that there is an opportunity to create Cultural Olympiad music technology teachers, available to every local authority and able to provide training and continuing professional development to the current music teacher workforce",0.444426,True
4,17588,2,"2.2 ""Winning: a tourism strategy for 2012 and beyond"", recognises that although London and Britain are already among the world's top city and country destinations, only a very small percentage of Britain's 200,000 tourism businesses will be high up in the 2012 Games supply chain and thus have any direct engagement with the 2012 Games and their visitors and spectators (see figure 1 below). It also recognises the potential of the 2012 Games to act as a catalyst in addressing a number of critical issues which are currently limiting the potential growth of tourism into and within this country. These include the need to reach out to younger visitors and to new markets, to improve aspects and overall perceptions of Britain's welcome and to counteract potential tourism displacement effects which have proved to be a problem for previous Olympic Games host countries.",0.44122,True
5,17650,2,"— Of specific concern to us is the real risk that even if there is an increase in participation, this will not substantially include women. Women's participation already seriously lags behind that of men (only 12.7% of women compared to 20.6% of men participate in at least three 30 minute sessions of sport per week) and is going down while men's is going up (Active People 3 data).[10] Women are also a very different marketplace: women tend to take part in different activities, have different motivations and face some specific barriers. For this reason, it is vital that the participation legacy for 2012 should not just focus on sport, but should also include physical activity",0.434702,True
6,3329,3,"6 PREPARATIONS FOR THE LONDON 2012 OLyMPIC AND PARALyMPIC GAMES: PROGRESS REPORT JuNE 2008 10 The start and completion dates for the construction of the main venue and infrastructure projects delivered by the Olympic Delivery Authority at the end of March 2008 compared with the milestones in the November 2007 Programme Baseline ReportProjectEnabling Works (site preparation) Power Lines under Grounding (switchover only) Structures, Bridges and Highways utilities Main Stadium Aquatics Centre VeloparkHandball/Indoor Sports ArenaBasketballInternational Broadcast Centre/ Main Press CentreOlympic Village Eton Manor (training facilities and Paralympic events) Broxbourne (white water canoeing) Eton Dorney (rowing) Weymouth and Portland (sailing)construction start date November 2007 March 2008 Change in programme Forecast start date baseline (months)October 2006 October 2006 0 July 2008 July 2008 0 April 2008 April 2008 0 January 2008 January 2008 0 July 2008 May 2008 –21 September 2008 September 2008 0 March 2009 March 2009 0August 2009 June 2009 –2July 2009 November 2009 4May 2009 March 2009 –2 June 2008 May 2008 –1 March 2010 January 2010 –2 August 2008 May 2009 9 March 2009 January 2009 –2 May 2008 January 2008 –4construction end date November 2007 March 2008 Change in programme Forecast end date baseline (months)September 2009 September 2009 0 September 2008 November 2008 2 December 2011 December 2011 0 December 2011 August 2011 –4 Construction Construction end date end dateFebruary 2011 April 2011 2Completion date Completion date for construction for construction and initial overlay and initial overlay for test events for test eventsJune 2011 June 2011 0Construction Construction end date end dateApril 2011 August 2011 4Completion date Completion date for construction for construction and initial overlay and initial overlay for test events for test eventsJuly 2011 August 2011 1April 2011 February 2011 –2April 2011 March 2011 –1April 2011 April 2011 0June 2011 July 2011 1 December 2011 December 2011 0 February 2012 April 2011 –10 June 2010 October 2010 4 April 2010 July 2009 –9 February 2009 January 2009 –1Source: National Audit Office examination of actual and forecast progress against the November 2007 Programme BaselineNOTE",0.570649,True
7,2641,3,"main reasons for the increase in costs from the time of the bid to the budget announced in march 2007Source: National Audit Office, drawing on information from the Department for Culture, Media and SportOlympic Delivery Authority programme management costs – increased from £16 million to £570 millionAt the time of the bid, the Department based its estimate of the resources the then proposed delivery body (the Olympic Delivery Authority) would need on the model of an urban development corporation. The Olympic Delivery Authority is now expected to require a significantly higher level of resources and this, together with the need to secure the necessary expertise within the timescales of the programme, has resulted in higher programme management costs. The £570 million programme management costs (excluding VAT) comprise: n £344 million Delivery Authority staff costs and Delivery Partner costs. The Delivery Partner (cLm) is engaged on a framework basis. under this arrangement there is no commitment for ODA to use cLm and it may use other project managers or recruit directly. While they have agreed staffing requirements and base level costs to July 2008 there is no contractual commitment beyond this date. It is therefore uncertain what the division of costs between ODA and cLm will be over the duration of the programme. The average monthly number of full time equivalent staff employed by the Delivery Authority during the year to 31 march 2007 was 152",0.567027,True
8,3288,3,"Progress made on the Olympic Delivery Authority’s construction programme3.2 The majority of the Authority’s spending will be on developing the Olympic Park, some 500 acres of previously used and contaminated land in the Lower Lea Valley in East London where access is severely limited by existing roads, railways and waterways. The preparation of the Olympic Park site and the procurement of the main construction contractors for the venues and infrastructure started in earnest in mid-2006. As plans for the design and layout of the Olympic Park have been firmed up, and stakeholders have provided more information on their requirements, the Authority has continually reviewed its plans for delivering the venues and infrastructure for the Games. In November 2007 the Authority produced the Programme Baseline Report which details the activities which are critical to the successful completion of its delivery programme, sets out the scope, expected costs, cash flow, risks and key milestones for individual projects, and is the baseline against which progress will be reported for the life of the programme (we comment on development of the Programme Baseline Report in paragraphs 2.11 to 2.14)",0.548697,True
9,3639,4,"4 UK Sport’s ‘ultimate goals’ for medal success at the London 2012 Games will require a step change in performance amongst elite athletes. The achievements of athletes at recent elite international events in a number of sports, including sailing, cycling, rowing, boxing, disability equestrian and disability shooting, suggest that performance levels in some sports are already improving significantly. Following increased spending on elite sport, host nations can typically expect to win an extra six or seven gold medals at an Olympic Games and to win medals across a wider range of sports. This ‘host nation effect’ would not in itself be enough to deliver UK Sport’s Olympic goal, which is likely to require an improvement of eight or nine gold medals over the Great Britain team’s performance at the Athens Games in 2004 if the relative performance of other nations remained the same. Changes in the performance of other nations since 2004, especially in the context of a general trend of increased spending on elite sport, sometimes referred to as a ‘global sporting arms race’, may also have implications for UK Sport in delivering its medal aspirations",0.546636,True


### 5.3.4. N most prototypical paragraphs of each topic

In [150]:
# 2 most prototypical paragraphs of each topic
N2 = 2
top_n_filter(df_topic_para2, N2).style.set_properties(subset = ['salient_paragraph'], **{'width':'500px', 'length': '50px'})

Unnamed: 0,Index,topic_id,salient_paragraph,probability
0,17552,1,"4. Most clubs seem ill-prepared for enquiries from, and inclusion of, people with disabilities who wish to participate in the sport offered by the club. There is little or no support for specialist clubs who provide opportunities for sport that cannot be integrated ie wheelchair basketball or blind cricket! There is often nothing locally to support the child or their parent in accessing the specialist provision and this often involves their having to undertake extensive travel to specialist facilities or organisations catering for this group. There is poor information regarding availability etc and http://www.parasport.org.uk was established as a portal to provide pathway and provision information. The then Mayor of London published a strategy in 2007, which highlighted all these issues and to date there has been little, if any, action to redress these anomalies in London or in the rest of the country. DCSF have, in my view, shown no leadership regarding the legacy of 2012 and its impact on PE in schools and the inclusion of those with disabilities in core curriculum activities or sporting opportunities within or after school. DCMS held a legacy event in 2008 and again in April 2009 focusing on the legacy of the Games for those with disabilities. One outcome was to seek greater links between DWP, DCSF and themselves to ensure that joint strategies were developed and pathways established that enabled children to enjoy and participate in PE and sport within schools/after school clubs/integrated and specialist provision in the community, with good national talent forums and pathways established for those who wish to participate in sport at a higher level and finally, with governing bodies having clear inclusive programmes for sports men and women with disabilities active at a national and international level",0.626673
1,17553,1,"5. Integration of disabled people into mainstream sporting provision is a concept rather than a reality. Grants to organisations has largely been based on physical access, rather than actual provision of activities, coaching etc. For some it has been a tick box action, rather than an attempt to include and integrate their provision. In order to create a lasting legacy from 2012, this offer of inclusion has to be both genuine and meaningful, ie accessible facilities with no manmade barriers re attitudes, lack of coaching or energy to find solutions regarding sporting opportunities. These are all major threats to establishing a broad base of participation pyramid with hundreds of disabled people participating in sport at the base level, filtering through at representative level for club and nationally, leading on to the tip of the pyramid—international competition at Paralympic level. This pathway does not currently exist for those with a disability and therefore, a different solution needs to be found that provides a separate provision (where appropriate) and uses the mainstream provision (where appropriate)",0.56892
2,16542,2,"2.3.1.1 Creative industries grow at least 1% more than the rest of the economy. [1] Music grew by 4.7% from 2007 to 2008. [2] 2.3.1.2 Creative industries more than double our investment. The contribution of music to the UK economy reached well over £3.5 billion with the arts estimated to put over £2 back into the economy for every £1 invested [2]. Over 120,000 people are employed in music. [1] In addition, consumers spent over £4 billion on music in 2000. [2] 2.3.2 Music education and effective teaching of music technology are vital to the continuing world leadership of the UK in music. The UK currently lacks the capability to take full advantage of music technology even though music technology has the ability to motivate pupils. [3] The Cultural Olympiad gives an opportunity to put music technology at the centre of Continuing Professional Development. We believe that there is an opportunity to create Cultural Olympiad music technology teachers, available to every local authority and able to provide training and continuing professional development to the current music teacher workforce",0.444426
3,17588,2,"2.2 ""Winning: a tourism strategy for 2012 and beyond"", recognises that although London and Britain are already among the world's top city and country destinations, only a very small percentage of Britain's 200,000 tourism businesses will be high up in the 2012 Games supply chain and thus have any direct engagement with the 2012 Games and their visitors and spectators (see figure 1 below). It also recognises the potential of the 2012 Games to act as a catalyst in addressing a number of critical issues which are currently limiting the potential growth of tourism into and within this country. These include the need to reach out to younger visitors and to new markets, to improve aspects and overall perceptions of Britain's welcome and to counteract potential tourism displacement effects which have proved to be a problem for previous Olympic Games host countries.",0.44122
4,3329,3,"6 PREPARATIONS FOR THE LONDON 2012 OLyMPIC AND PARALyMPIC GAMES: PROGRESS REPORT JuNE 2008 10 The start and completion dates for the construction of the main venue and infrastructure projects delivered by the Olympic Delivery Authority at the end of March 2008 compared with the milestones in the November 2007 Programme Baseline ReportProjectEnabling Works (site preparation) Power Lines under Grounding (switchover only) Structures, Bridges and Highways utilities Main Stadium Aquatics Centre VeloparkHandball/Indoor Sports ArenaBasketballInternational Broadcast Centre/ Main Press CentreOlympic Village Eton Manor (training facilities and Paralympic events) Broxbourne (white water canoeing) Eton Dorney (rowing) Weymouth and Portland (sailing)construction start date November 2007 March 2008 Change in programme Forecast start date baseline (months)October 2006 October 2006 0 July 2008 July 2008 0 April 2008 April 2008 0 January 2008 January 2008 0 July 2008 May 2008 –21 September 2008 September 2008 0 March 2009 March 2009 0August 2009 June 2009 –2July 2009 November 2009 4May 2009 March 2009 –2 June 2008 May 2008 –1 March 2010 January 2010 –2 August 2008 May 2009 9 March 2009 January 2009 –2 May 2008 January 2008 –4construction end date November 2007 March 2008 Change in programme Forecast end date baseline (months)September 2009 September 2009 0 September 2008 November 2008 2 December 2011 December 2011 0 December 2011 August 2011 –4 Construction Construction end date end dateFebruary 2011 April 2011 2Completion date Completion date for construction for construction and initial overlay and initial overlay for test events for test eventsJune 2011 June 2011 0Construction Construction end date end dateApril 2011 August 2011 4Completion date Completion date for construction for construction and initial overlay and initial overlay for test events for test eventsJuly 2011 August 2011 1April 2011 February 2011 –2April 2011 March 2011 –1April 2011 April 2011 0June 2011 July 2011 1 December 2011 December 2011 0 February 2012 April 2011 –10 June 2010 October 2010 4 April 2010 July 2009 –9 February 2009 January 2009 –1Source: National Audit Office examination of actual and forecast progress against the November 2007 Programme BaselineNOTE",0.570649
5,2641,3,"main reasons for the increase in costs from the time of the bid to the budget announced in march 2007Source: National Audit Office, drawing on information from the Department for Culture, Media and SportOlympic Delivery Authority programme management costs – increased from £16 million to £570 millionAt the time of the bid, the Department based its estimate of the resources the then proposed delivery body (the Olympic Delivery Authority) would need on the model of an urban development corporation. The Olympic Delivery Authority is now expected to require a significantly higher level of resources and this, together with the need to secure the necessary expertise within the timescales of the programme, has resulted in higher programme management costs. The £570 million programme management costs (excluding VAT) comprise: n £344 million Delivery Authority staff costs and Delivery Partner costs. The Delivery Partner (cLm) is engaged on a framework basis. under this arrangement there is no commitment for ODA to use cLm and it may use other project managers or recruit directly. While they have agreed staffing requirements and base level costs to July 2008 there is no contractual commitment beyond this date. It is therefore uncertain what the division of costs between ODA and cLm will be over the duration of the programme. The average monthly number of full time equivalent staff employed by the Delivery Authority during the year to 31 march 2007 was 152",0.567027
6,3639,4,"4 UK Sport’s ‘ultimate goals’ for medal success at the London 2012 Games will require a step change in performance amongst elite athletes. The achievements of athletes at recent elite international events in a number of sports, including sailing, cycling, rowing, boxing, disability equestrian and disability shooting, suggest that performance levels in some sports are already improving significantly. Following increased spending on elite sport, host nations can typically expect to win an extra six or seven gold medals at an Olympic Games and to win medals across a wider range of sports. This ‘host nation effect’ would not in itself be enough to deliver UK Sport’s Olympic goal, which is likely to require an improvement of eight or nine gold medals over the Great Britain team’s performance at the Athens Games in 2004 if the relative performance of other nations remained the same. Changes in the performance of other nations since 2004, especially in the context of a general trend of increased spending on elite sport, sometimes referred to as a ‘global sporting arms race’, may also have implications for UK Sport in delivering its medal aspirations",0.546636
7,3798,4,"UK Sport will set targets for the London 2012 Games following a review of performance at the Beijing 2008 Games and in the light of available funding 3.4 In its funding submission for 2012, UK Sport referred to an ‘ultimate goal of finishing fourth in the 2012 Olympics’, though the published funding agreement between the Department and UK Sport at that time, set before the London games were secured, said that the target was fifth. When questioned by the Committee of Public Accounts in March 2006, both the Department and UK Sport maintained there was no target. They attributed the confusion to their inappropriate use of the term “target”, and agreed that various published documents had confused long term aims and specific, measurable, agreed and resourced targets. The Committee recommended that “performance expectations need to be unambiguous and clearly explicable”. They also recommended that, in the knowledge of the resources available to it in the run up to London 2012, UK Sport should decide its medal table targets for 2012, which should be reflected in the targets it agreed with individual sports and reviewed in the light of performance at the Beijing 2008 Games",0.527726
8,4954,5,"Stratford is a huge churn. We have done research into the nature of the flats that have gone up around Stratford—the better end—and they are being rented by people who work and basically it is a place to sleep and they go out and do other things, which is what people do when they are young and that is fine. What we could easily end up with is what we have ended up with elsewhere in the borough, which is a desert, with people not interested in the community and so we have a bunch of social tenants in there—not huge numbers—and no real community. I think we have to have a proper debate and discussion around what will happen with the rest of the housing. Ideally we want people who live there to buy and then continue to have an interest in the community. So any discussion about the Village I would argue needs to start with what happens to the private sector area because this has to work; if it does not work it can damage the rest of the Olympic area and the Olympic Park. We are now arguing, for example, for a Royal Park because we think that that will raise the whole standard of the area and people will get some sense of a better place, a place where it is really good to live. The OPLC understands that it needs to get housing and family housing and not just blocks of flats, so starting with that. Then you get to this question about what would the nature of the allocations be and we are about to have a big discussion with various partners where we talk about who would we move in. Certainly from a Newham point of view—because the Village is in Newham—we took a court case which opposed housing by need and were successful in turning the policy around, and the Government has now introduced a policy which would allow us to support people who are working into social housing because our view would tend to be if you have a low rent that is a great benefit if you are working and on a lower income, but if you are on benefit that is not much of a benefit at all. We are certainly at the moment having a discussion around allocations policy and there are plans to have it. We need to make the community work—start off with the community. I get very frustrated when I hear people talk about ""units""; talk about the community that you are trying to create and how that will look. If we want to have this as a place that people want to live in in 100 years' time the community has to work. To do that you do not then take, for example, social housing for everybody that is not working together because we have evidence that that does not work, so if it does not work perhaps we should stop trying to do it. The nature of the allocation as to who moves in there is, I think, up for debate at the moment and all the people involved are willing to look at that in a radical way that will enable us to build a community there. But I will come back again, you can do that in the social and affordable rent side, what about the rented properties that you will end up with? People will invest and buy chunks off the plan and they will end up with a lot of people that do not necessarily spend a lot of time there. For me it is a very clear thing, this should be the responsibility of the OPLC, it should not be the responsibility of the ODA and the ODA should be handing over that responsibility to the OPLC tomorrow because the ODA has no locus or interest there. It is not that they are doing badly because these guys have built an Olympic site on time and on budget, they are doing a really good job from the point of view of doing what they were asked to do, but they will not be there after 2014 and we will. The OPLC should be the people who are actually involved in the Village now. I am very clear about that",0.56036
9,4943,5,"00,000 square feet, you have the Broadcast Centre which, again, is very well connected in terms of power and Internet connectivity, again, good for the latest approaches to production, whether it be broadcast production or any kind of creative media industries that rely on high power and digital connections, which has seen the industry move from having its heart in Soho over the years—where you cannot get the power to run the air conditioning and the servers and the Internet connectivity not being so good—to Shoreditch and benefiting from cheaper warehouse-type accommodation and the bandwidth afforded by the City of London being adjacent to that. Now prices are rising there and more of those warehouses are being turned into lofts and prices are rising and bandwidth is being constrained and not so much investment is going in there or, if it is, it is to maintain that for the City and not spare bandwidth that they can use sending films and recordings and stuff backwards and forward between here and the West Coast of America so, again, they need somewhere new. I think that there is an issue about how attractive the Broadcast Centre is as a building and so there will be a big issue for the Legacy Company (OPLC) to make that an attractive place which is going to be appealing to media professionals. Often it has been somewhat, I would say, disparagingly termed a ""big tin shed"", and undoubtedly that is what it is, but actually big large spaces are what some of these companies need. The fact that there is good transport access, Internet power and all that stuff has the potential to be a really good offer. I do not want to take up the Committee's time but the OPLC now, it is great that that has been formed and it is up and running because what has been lacking is someone—other than the boroughs—who really has the power and the control to market that building because it is never really the ODA's job because they are gone after 2014 so filling it is not really their responsibility and never has been. The tenants that you would want in there—or even the tenants you do not want in there—no potential tenant was going to sign in 2006-07 for a building that they could not occupy until 2014 and beyond, and unfortunately that was often portrayed as no-one is interested. I would not be surprised if no-one was interested—it was six or seven years out and no one even knew what the building was going to look like inside or out. I would think that we should be getting worried by about 2012 if still two to three years out from occupation no-one has expressed an interest, that is when I think we should be worried. Here at 2010 now I think is when we should be generating interest for people to occupy it",0.557435


### 5.3.5.  N most prototypical paragraphs of a specific topic

In [151]:
topic_id_chosen = 10                                    # choose the topic ID
num_para = 2                                            # set N to extract the N most prototypical paragraphs of a specific topic
df_n_topic_k = top_n_filter(df_topic_para2, num_para)
topic_id_filter = df_n_topic_k['topic_id'] == topic_id_chosen
df_n_topic_k[topic_id_filter].style.set_properties(subset = ['salient_paragraph'], **{'width':'500px', 'length': '50px'})

Unnamed: 0,Index,topic_id,salient_paragraph,probability
18,12700,10,"2. British Cycling told us that the cycling facilities at the Velopark had “the potential to be absolutely world-class” and that they “should be the very best anywhere in the world”.152 There has nonetheless been a certain amount of controversy about the extent to which the Velopark will offer a suitable replacement for off-road facilities at the former Eastway Circuit, lost when land was assembled by the LDA for incorporation into the Olympic Park. The design currently proposed by the ODA for the Velopark offers most of the facilities previously available at Eastway, albeit in a more fragmented layout. British Cycling, despite being supportive of the proposed design for use during the Games and despite anticipating that, after the Games, the Velopark will “provide a boost for cycling”,153 initially lodged objections to the relevant planning applications on the grounds that they did “not provide an adequate or comparable replacement for the road and off-road facilities provided to cycling on the Eastway Circuit”. British Cycling is now satisfied that the ODA has taken on board its concerns and that current plans for the Velopark offer an acceptable replacement for Eastway. The Eastway Users Group, which has campaigned for off-road cycling facilities in the Velopark in legacy mode, remains frustrated by the uncertainty about future provision, and it has pointed out to us that facilities at Eastway closed before the ODA or LDA had provided any suitable temporary alternative, causing 149 Ev 107",0.51311
19,12643,10,"1. Beyond the initial payment to the LDA, proceeds will be split between the LDA, to repay costs associated with the remediation and disposal of land and buildings in the Olympic Park, and DCMS, which will act as a channel for reimbursement of the National Lottery Distribution Fund (NLDF). An initial tranche of £631 million will be allocated pro rata according to a formula which would lead to reimbursement of three-quarters of the funds due to the NLDF and one quarter of the remaining funds due to the LDA. A further £544 million will then be allocated according to a formula which would, if receipts from land sales allow, lead to reimbursement of the remaining quarter of the funds due to the NLDF and the remaining three-quarters of the funds due to the LDA. The treatment of any further surplus arising from land sales “will be determined separately at the time by agreement between the Government and the Mayor”. The Revised Memorandum of Understanding does not provide for repayment to the LDA of its grant of £250 million to the ODA, included within the original Public Sector Funding Package.80 It is important to observe that the Memorandum of Understanding does not provide for uprating of these amounts for general inflation. This is a significant omission, to which we return below",0.492892
