# 1. Prerequisites

In [202]:
# pip install PyMuPDF                    # (install PyMuPDF for extracting info from PDF files)
# pip install tika                       # (install tika for extracting paragraphs from PDF files)
# pip install spacy==2.2.0               # (install spacy for lemmatization)
# conda install gensim                   # (intall gesim for topic modelling)
# pip install pyLDAvis                   # (install pyLDAvis for topic modelling visulisation)
# conda install -c conda-forge pyldavis  # (if you use aconda to install pyLADvis)

In [203]:
import pandas as pd
import numpy as np
import re

# glob for extracting the directories of metadata
import glob

# PyMuPDF
import fitz

# tika
import tika               
from tika import parser   

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Visualisation
import plotly.express as px
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import os

# 2. Import pdf files, data wrangling and overview

In [204]:
# Extract the directories of the PDF files, make sure the folder name does not contain number
pdf_dir = "D:\LEON\Business Analytics\Study\9. Business Project\Data set\Olympics"
pdf_files = glob.glob("%s/*.pdf" % pdf_dir)
pdf_files[:1]

['D:\\LEON\\Business Analytics\\Study\\9. Business Project\\Data set\\Olympics\\Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf']

In [205]:
# Use PyMuPDF to extract all info of the PDF files (text, title, date, etc)
list_metadata = []
for i in pdf_files:
    with fitz.open(i) as doc:
        info = doc.metadata
        info['file_name'] = os.path.basename(i)
        text = ''
        for page in doc:
            text+= page.getText()
        info['Content'] = text       
    list_metadata.append(info)

In [206]:
df = pd.DataFrame(list_metadata)
df['document_id'] = df.index
df = df.drop_duplicates(subset = 'Content')             # drop duplicate rows
df = df.dropna(subset=['Content'])                      # drop rows whose text content is NaN
df['Word_count'] = df ['Content'].str.count(' ') + 1
df.head(3)

Unnamed: 0,format,title,author,subject,keywords,creator,producer,creationDate,modDate,trapped,encryption,file_name,Content,document_id,Word_count
0,PDF 1.7,,B Lewis,,,Microsoft Word,,D:20210822083603+00'00',D:20210822083603+00'00',,,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,Examination of Witnesses (1-19) \n16 SEPTEMBER...,0,6115
1,PDF 1.7,,B Lewis,,,Microsoft Word,,D:20210822083606+00'00',D:20210822083606+00'00',,,Examination_of_Witnesses_Sept_2003_-_Q20-39.pdf,Examination of Witnesses (20-39) \n16 SEPTEMBE...,1,4002
2,PDF 1.7,,B Lewis,,,Microsoft Word,,D:20210822083609+00'00',D:20210822083609+00'00',,,Examination_of_Witnesses_Sept_2003_-_Q40-44.pdf,Examination of Witnesses (40-44) \n16 SEPTEMBE...,2,1007


In [207]:
# check if there are documents with few words
min_word_count= 10                                               # set the threshold of the minimum word count of each document 
min_word_count_filter = df['Word_count'] <= min_word_count
df_few_words = df[min_word_count_filter][['file_name', 'Content']]
df_few_words

Unnamed: 0,file_name,Content


In [208]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169 entries, 0 to 168
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   format        169 non-null    object
 1   title         169 non-null    object
 2   author        169 non-null    object
 3   subject       169 non-null    object
 4   keywords      169 non-null    object
 5   creator       169 non-null    object
 6   producer      169 non-null    object
 7   creationDate  169 non-null    object
 8   modDate       169 non-null    object
 9   trapped       169 non-null    object
 10  encryption    3 non-null      object
 11  file_name     169 non-null    object
 12  Content       169 non-null    object
 13  document_id   169 non-null    int64 
 14  Word_count    169 non-null    int64 
dtypes: int64(2), object(13)
memory usage: 21.1+ KB


In [209]:
# Word count
df['Word_count'].sum( )

1054090

# 3. Natural language processing

### 3.1. Tokenisation

In [210]:
data = df.Content.values.tolist()

In [211]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence).encode('utf-8'), deacc=True))  # deacc=True removes punctuations

data_words= list(sent_to_words(data))

### 3.2. Processing words: 
Remove Stopwords, Make Bigrams and Trigrams,Lemmatisation, remove short words and meaningless words

In [212]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [213]:
# import the stop_words from gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
stop_words = [i for i in STOPWORDS]

# add more stop words after analysing the overall term frequncy of each topic in pyLDAvis in the "Word frequency of each topic" section
new_stop_words = ['go', 'would', 'make', 'think', 'take', 'say', 'need', 'want', 'thing', 'have', 'lot', 
                  'know', 'use', 'try', 'happen', 'ask', 'new', 'way', 'new', 'jonathan', 'stephen']            
stop_words.extend(new_stop_words)
stop_words

['amoungst',
 'mine',
 'describe',
 'another',
 'within',
 'somehow',
 'during',
 'being',
 'de',
 'fifty',
 'whereas',
 'every',
 'will',
 'ltd',
 'than',
 'itself',
 'an',
 'next',
 'therein',
 'noone',
 'four',
 'back',
 'is',
 'third',
 'last',
 'what',
 'whenever',
 'neither',
 'thick',
 'thereby',
 'thin',
 'show',
 'behind',
 'or',
 'thereupon',
 'either',
 'moreover',
 'thence',
 'nevertheless',
 'three',
 'hereby',
 'became',
 'it',
 'seemed',
 'seems',
 'everyone',
 'myself',
 'say',
 'each',
 'them',
 'whose',
 'give',
 'among',
 'sometime',
 'might',
 'now',
 'thru',
 'km',
 'does',
 'various',
 'herself',
 'should',
 'although',
 'meanwhile',
 'all',
 'everywhere',
 'yourselves',
 'been',
 'who',
 'hereafter',
 'amount',
 'very',
 'eg',
 'whether',
 'least',
 'didn',
 'indeed',
 'while',
 'interest',
 'none',
 'nobody',
 'computer',
 'using',
 'per',
 'whom',
 'hence',
 'everything',
 'they',
 'own',
 'beside',
 'would',
 'sixty',
 'find',
 'under',
 'because',
 'our',
 'm

In [214]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stop_words(texts):
    return [[word for word in doc if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [215]:
# Form Trigrams
data_words_trigrams = make_trigrams(data_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

#increase the maximum length of text that the parser or NER can process
nlp.max_length = 13000000 #

# Do lemmatization keeping only noun, adj, verb
data_lemmatized1 = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB'])

# Set a threshold for removing the words with length less than the threshold
minimum_len = 3 
data_lemmatized2 = []
for i in data_lemmatized1:
    new_element = [x for x in i if len(x) >= minimum_len]
    data_lemmatized2.append(new_element)

# remove stop words
data_lemmatized = remove_stop_words(data_lemmatized2)
data_lemmatized[:1]

[['examination',
  'witness',
  'morning',
  'like',
  'welcome',
  'today',
  'occasional',
  'series',
  'checking',
  'session',
  'follow',
  'regard',
  'olympic',
  'bid',
  'committee',
  'report',
  'bring',
  'good',
  'express',
  'number',
  'concern',
  'issue',
  'ought',
  'consider',
  'interested',
  'hear',
  'understand',
  'course',
  'acceptable',
  'appearance',
  'like',
  'open',
  'statement',
  'happy',
  'listen',
  'thank',
  'thank',
  'opportunity',
  'come',
  'today',
  'talk',
  'little_bit',
  'plan',
  'hope',
  'come',
  'month',
  'month',
  'shall',
  'able',
  'meet',
  'update',
  'progress',
  'like',
  'role',
  'week',
  'explain',
  'sitting',
  'role',
  'offer',
  'accept',
  'work',
  'like',
  'little_bit',
  'idea',
  'job',
  'philosophy',
  'bid',
  'update',
  'initial',
  'thought',
  'reaction',
  'work',
  'date',
  'sense',
  'progress',
  'week',
  'talk',
  'little_bit',
  'future',
  'motivation',
  'role',
  'come',
  'number',

###  3.3. Dictionary and Corpus

In [216]:
# Create Dictionary, set the parameters to filter out tokens in the dictionary by their frequency
no_below = 5             # remove the tokens less frequent than no_below documents (absolute number)
no_above = 0.85          # remove the tokens more frequent than no_above documents (fraction of the total corpus size)
id2word = corpora.Dictionary(data_lemmatized)
id2word.filter_extremes(no_below = no_below, no_above = no_above)

# print the number of reserved unique tokens and word count afer removal of high and low frequency words
print('After removal of high and low frequency words - Number of unique tokens: %d, %d' % (len(id2word),id2word.num_pos))

After removal of high and low frequency words - Number of unique tokens: 3413, 349895


In [217]:
# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 2), (1, 4), (2, 3), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 3), (13, 1), (14, 1), (15, 2), (16, 3), (17, 1), (18, 2), (19, 3), (20, 1), (21, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 5), (29, 1), (30, 4), (31, 2), (32, 1), (33, 1), (34, 2), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 2), (41, 4), (42, 3), (43, 1), (44, 1), (45, 1), (46, 2), (47, 1), (48, 3), (49, 1), (50, 1), (51, 1), (52, 1), (53, 11), (54, 17), (55, 1), (56, 1), (57, 61), (58, 4), (59, 3), (60, 1), (61, 1), (62, 5), (63, 1), (64, 3), (65, 1), (66, 1), (67, 11), (68, 1), (69, 2), (70, 2), (71, 8), (72, 5), (73, 1), (74, 2), (75, 1), (76, 1), (77, 2), (78, 1), (79, 10), (80, 2), (81, 3), (82, 1), (83, 4), (84, 1), (85, 1), (86, 2), (87, 4), (88, 16), (89, 1), (90, 1), (91, 2), (92, 3), (93, 16), (94, 1), (95, 1), (96, 3), (97, 2), (98, 1), (99, 2), (100, 5), (101, 1), (102, 1), (103, 1), (104, 1), (105, 4), (106, 1), (107, 3), (108, 1), (109, 1), (

#  4. LDA Model

### 4.1. Building LDA Model, Perparameter/Hyperparameter tuning

In [218]:
# set training parameters and hyperameters
k = 20
passes = 20
iterations = 100
alpha = 50.0/k   
eta = 0.01
random_state = 12345
minimum_probability = 0

Plot the coherence score against number of topics to identify the opitmal k

In [219]:
#start=3; limit=63; step=3
#coherence_values = []
#model_list = []
#for i in range(start,limit,step):
    #model = gensim.models.LdaModel(corpus = corpus,id2word = id2word,alpha = alpha,eta = eta,
    #                               iterations = iterations,num_topics = i,passes = passes,random_state = 12345,minimum_probability = minimum_probability)
    #model_list.append(model)
    #coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
    #coherence_values.append(coherencemodel.get_coherence())

In [220]:
#list_num_topics = [i for i in range(start, limit, step)]
#df_coherence1 = pd.DataFrame({'Number_of_Topics': list_num_topics, 'Coherence_Score': coherence_values})
#save the result to disk
#df_coherence1.to_pickle('./df_coherence1.pkl')
#load the result from disk
#df_coherence = pd.read_pickle('./df_coherence1.pkl') 

In [221]:
#fig1 = px.line(df_coherence, x = 'Number_of_Topics', y = "Coherence_Score", title = 'Coherence scores against number of topics')
#fig1.update_layout(autosize=False, width=1000, height=400)
#fig1.update_traces(mode = "lines + markers")
#fig1.show()

In [222]:
# num of topics = 15 to get the optimal coherence socre
k = 15
lda_model = gensim.models.LdaModel(
    corpus = corpus,
    id2word = id2word,
    alpha = alpha,
    eta = eta,
    iterations = iterations,
    num_topics = k,
    passes = passes,
    random_state = 12345,
    minimum_probability = minimum_probability)

### 4.2. Topic distribution of documents

In [223]:
# create the function for converting a list of tuples into a dictionary
def Convert(tup, di):
    di = dict(tup)
    return di

In [224]:
# topic distribution of documents
list_topic = []
dictionary_topic = {}
for d in texts:
    bow = id2word.doc2bow(d)
    belong = lda_model[bow]                        # generate a list of tuples of topic distribution of a document
    belong_dic = Convert(belong, dictionary_topic) # convert the list of tuples into a dictionary
    list_topic.append(belong_dic)           
                      
df_topic_distribution = pd.DataFrame(list_topic)   # convert the list of dictionaries into a dataframe

# rename the topic IDs to ensure they are as same as the topic IDs in the pyLDAvis
original_topic_id = [*df_topic_distribution]; new_topic_id = [x + 1 for x in original_topic_id]
df_topic_distribution = df_topic_distribution.rename(columns = dict(zip(original_topic_id, new_topic_id))) #rename the topic IDs to ensure they are as same as the topic IDs in the pyLDAvis
df_topic = pd.merge(df, df_topic_distribution, how = 'left', left_index=True, right_index=True) # merge with info of documents
df_topic.drop(['title','format','creator', 'producer', 'keywords', 'trapped', 'encryption','subject', 'modDate'], axis = 1)

Unnamed: 0,author,creationDate,file_name,Content,document_id,Word_count,1,2,3,4,...,6,7,8,9,10,11,12,13,14,15
0,B Lewis,D:20210822083603+00'00',Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,Examination of Witnesses (1-19) \n16 SEPTEMBER...,0,6115,0.004820,0.011355,0.032990,0.012826,...,0.663834,0.019587,0.014590,0.091884,0.008266,0.052450,0.003564,0.057776,0.015176,0.006010
1,B Lewis,D:20210822083606+00'00',Examination_of_Witnesses_Sept_2003_-_Q20-39.pdf,Examination of Witnesses (20-39) \n16 SEPTEMBE...,1,4002,0.010785,0.039997,0.026004,0.006935,...,0.717799,0.008112,0.021213,0.075689,0.010199,0.010522,0.007809,0.011441,0.025342,0.009812
2,B Lewis,D:20210822083609+00'00',Examination_of_Witnesses_Sept_2003_-_Q40-44.pdf,Examination of Witnesses (40-44) \n16 SEPTEMBE...,2,1007,0.024581,0.030571,0.084446,0.018693,...,0.580960,0.020036,0.028731,0.060172,0.021893,0.018353,0.020019,0.021789,0.026806,0.022368
3,Bronwen Lewis,D:20210822084116+00'00',Further_supplementary_memorandum_submitted_by_...,Further supplementary memorandum submitted by ...,3,431,0.040956,0.049721,0.026363,0.268673,...,0.022124,0.085368,0.022797,0.022044,0.235568,0.111551,0.021174,0.020154,0.028024,0.023715
4,Bronwen Lewis,D:20210822083921+00'00',Further_Supplementary_Memorandum_submitted_by_...,Further supplementary memorandum submitted by ...,4,288,0.033830,0.214867,0.099047,0.049259,...,0.056045,0.112267,0.038180,0.079453,0.044316,0.064786,0.031144,0.037890,0.043030,0.055079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,Bronwen Lewis,D:20210822084528+00'00',Written_evidence_submitted_by_UK_Sport_-_Jan_2...,Written evidence submitted by UK Sport \n \n ...,164,3089,0.012864,0.035563,0.009788,0.006837,...,0.025465,0.007117,0.515936,0.041069,0.006454,0.007099,0.010358,0.263849,0.011717,0.030494
165,Bronwen Lewis,D:20210822084531+00'00',Written_evidence_submitted_by_Vision_2020_UK_-...,Written evidence submitted by Vision 2020 UK ...,165,2284,0.004715,0.004972,0.005505,0.004796,...,0.004288,0.005174,0.005367,0.004263,0.006522,0.004256,0.004678,0.006269,0.007911,0.004397
166,Bronwen Lewis,D:20210822084535+00'00',Written_evidence_submitted_by_VisitBritain_-_J...,Written evidence submitted by VisitBritain \n...,166,2372,0.012091,0.007760,0.015914,0.006989,...,0.066306,0.013051,0.015264,0.012516,0.009403,0.009534,0.005359,0.026393,0.622641,0.170108
167,Bronwen Lewis,D:20210822084543+00'00',Written_evidence_submitted_by_Womens_Sport_and...,Written evidence submitted by the Women's Spor...,167,1966,0.008041,0.013320,0.026277,0.011300,...,0.015589,0.013262,0.019673,0.008634,0.015239,0.012286,0.006389,0.816624,0.013542,0.007976


# 5. Topic interpretation tools

I first identify the salient topics defined by PTBI proposed by Marchetti and Puranam (2020), then combine both the topic visualisation of pyLDAvis and the prototypical texts defined by PTBI to facilitate the topic interpretation.

##  5.1. Salient topics for interpretation
PTBI assumes that the topics with little salience are not worthy of interpretation. To extract the most salient topics for interpretation, for each topic, we need to compute the fraction of documents with the probability that the document belongs to the topic is more than > 1/K (Marchetti and Puranam, 2020, p. 14), and I defined the fraction as the “salience” of the topic. 

The scree plot below shows that when the topics are sorted by salience in descending order, the salience tends to level off on topic 5, as a result, we can select the topics ahead of topic 5 as the salient topics for interpretation.

In [225]:
# compute salience: the fraction of documents with the probability that the document belongs to the topic is more than > 1/K for each document
list_percent_above = []
for i in df_topic_distribution:
    num_above = df_topic_distribution[i][df_topic_distribution[i] > 1/k].count()
    percent_above =  num_above/len(df_topic_distribution)
    list_percent_above.append(percent_above)
    
df_salient_topic = pd.DataFrame({'topic_ID':  [str(i) for i in new_topic_id], 'salience': list_percent_above}).sort_values(
    by = 'salience', ascending = False)

In [226]:
fig_L1 = px.line(df_salient_topic, x = 'topic_ID', y = 'salience', title="Scree plot of salience of topics")
fig_L1.update_layout(autosize=False, width=800, height=400)
fig_L1.update_traces(mode = "lines + markers")
fig_L1.show()

## 5.2. Topic visualisation

Check the words of each topic, if there're common words with high overall frequency such as "think" "want" or "make", return to the "import the stop_words from gensim" section, add these words to the list of stop words to remove them.

In [227]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, sort_topics = False )
vis

## 5.3. prototypical paragraphs
The prototypical paragraphs, the paragraphs with a high probabitiy that they belong to a topic, can be used to assist topic interpretation. This section classify the paragraphs into topics and provides the users 4 types of filters to select the prototypical paragraphs: N most prototypical paragraphs overall, N most prototypical paragraphs where the belong() function is greater than the threshold L, N most prototypical paragraphs of each topic and N most prototypical paragraphs of a specific topic.

### 5.3.1.  Classify the paragraphs based on the trained model

##### Extract paragraphs from documents

In [228]:
# define the function for spliting texts into paragraphs by delimiter '.\n\n' or '. \n\n'
def para_split(i):
    j = parser.from_file(i)
    m = j['content']
    import re
    return re.split('[?.!-]\n|[?.!-] \n|  \n\n|\n\n[0-9]', m)

In [229]:
list_paragraphs = []
list_para_id = []
for i in pdf_files:
    para = para_split(i)
    para = [w.replace('\n', '') for w in para]
    para = [x.strip() for x in para if x.strip()] # remove empty elements
    para_id = [x for x in range(len(para))] 
    list_paragraphs.append(para)
    list_para_id.append(para_id)

In [230]:
df_para1 = df.copy()
df_para1['paragraphs'] = list_paragraphs
df_para1['para_id'] = list_para_id
df_para2 = df_para1.apply(pd.Series.explode)
df_para3 = df_para2.reset_index()
df_para4 = df_para3[['creationDate', 'document_id', 'file_name', 'para_id', 'paragraphs']]
len(df_para4) # number of paragraphs extracted

21640

In [231]:
# set a filter to filter out the paragraphs with short words
n_word_count = 10                                                        # set the threshold of word count
para_word_count = df_para4['paragraphs'].str.split().str.len()           # word count of each paragraph
df_para = df_para4[(para_word_count>=n_word_count)].reset_index()        # select only the paragraphs with word count above the threshold
df_para

Unnamed: 0,index,creationDate,document_id,file_name,para_id,paragraphs
0,2,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,2,"MS BARBARA CASSANI Q1 Chairman: Good morning, ..."
1,3,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,3,Ms Cassani: Thank you very much. Thank you ver...
2,4,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,4,8 months I shall be able to meet frequently wi...
3,5,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,5,The first thing I should like to say is that I...
4,6,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,6,Really the backdrop is that I believe in the G...
...,...,...,...,...,...,...
17709,21631,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,37,7.3 When the impact of Olympics and Paralympi...
17710,21633,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,39,11 2007-08 School Sport Survey. 12 As ...
17711,21634,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,40,13 Gold Young Ambassadors work across School...
17712,21635,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,41,14 From national data supplied by Department...


##### Process the paragraphs

In [232]:
# tokenization
data2 = df_para.paragraphs.values.tolist()
data_words2 = list(sent_to_words(data2))

In [233]:
# Form Trigrams
data_words_trigrams2 = make_trigrams(data_words2)

# Do lemmatization keeping only noun, adj, vb
data_lemmatized2 = lemmatization(data_words_trigrams2, allowed_postags=['NOUN', 'ADJ', 'VERB'])

# set the length of word threshold as same as before for removing the words less than the threshold
data_lemmatized2_2 = []
for i in data_lemmatized2:
    new_element = [x for x in i if len(x) >= minimum_len]
    data_lemmatized2_2.append(new_element)
    
# Remove Stop Words
data_lemmatized2_1 = remove_stop_words(data_lemmatized2_2)

##### Classify the paragraphs based on the extracted topics

In [234]:
# belong function: classify topics of paragraphs, it might take a long time because there are 148,651 paragraphs in the 11,132,849-word corpus
list_topic_para = []
dictionary_topic_para = {}
for d in data_lemmatized2_1:
    bow = id2word.doc2bow(d)
    belong = lda_model[bow]
    doc_dic = Convert(belong, dictionary_topic_para)
    list_topic_para.append(doc_dic)
    df_topic_para = pd.DataFrame(list_topic_para)

In [235]:
# rename the topic IDs to ensure they are as same as the topic IDs in the pyLDAvis
df_topic_para = df_topic_para.rename(columns = dict(zip(original_topic_id, new_topic_id)))

# topic distribution of paragraphs
df_topic_para1_1 = pd.merge(df_para, df_topic_para, how = 'left', left_index=True, right_index=True)
df_topic_para1_1

Unnamed: 0,index,creationDate,document_id,file_name,para_id,paragraphs,1,2,3,4,...,6,7,8,9,10,11,12,13,14,15
0,2,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,2,"MS BARBARA CASSANI Q1 Chairman: Good morning, ...",0.052466,0.060493,0.086755,0.044444,...,0.198510,0.068130,0.043441,0.077602,0.047930,0.090144,0.043446,0.048106,0.052635,0.043472
1,3,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,3,Ms Cassani: Thank you very much. Thank you ver...,0.059793,0.063920,0.094343,0.056322,...,0.147986,0.064243,0.053368,0.053634,0.055082,0.064897,0.060908,0.055276,0.059132,0.054798
2,4,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,4,8 months I shall be able to meet frequently wi...,0.058416,0.066560,0.067657,0.074964,...,0.096682,0.072135,0.062945,0.057784,0.076686,0.063779,0.058246,0.061903,0.065554,0.058407
3,5,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,5,The first thing I should like to say is that I...,0.036563,0.042053,0.081075,0.027618,...,0.400027,0.024713,0.025360,0.051575,0.025596,0.037690,0.026853,0.123473,0.040269,0.027735
4,6,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,6,Really the backdrop is that I believe in the G...,0.054045,0.070232,0.073157,0.043558,...,0.125000,0.034828,0.074170,0.084799,0.046659,0.064144,0.054329,0.071765,0.088206,0.057285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17709,21631,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,37,7.3 When the impact of Olympics and Paralympi...,0.041718,0.054527,0.046149,0.053871,...,0.044309,0.066878,0.067978,0.045310,0.049543,0.048153,0.043755,0.234817,0.064371,0.065416
17710,21633,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,39,11 2007-08 School Sport Survey. 12 As ...,0.050951,0.063548,0.065188,0.054643,...,0.063866,0.049182,0.064142,0.060112,0.060950,0.055253,0.052571,0.163371,0.067487,0.050939
17711,21634,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,40,13 Gold Young Ambassadors work across School...,0.059381,0.069187,0.059225,0.056139,...,0.057528,0.055017,0.112016,0.052689,0.056770,0.056151,0.062635,0.101657,0.065263,0.058240
17712,21635,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,41,14 From national data supplied by Department...,0.056593,0.061943,0.063379,0.066049,...,0.061020,0.073316,0.073771,0.058207,0.071526,0.062140,0.057978,0.072690,0.063120,0.063438


In [236]:
# save the result to disk
df_topic_para1_1.to_pickle('./df_topic_para_Olympics.pkl')

In [237]:
# load the result from disk
df_topic_para1 = pd.read_pickle('./df_topic_para_Olympics.pkl') 

In [238]:
# drop the paragraphs with high frequency but meaningless for interperation based on the extraction of prototypical paragraphs below
list_remove_para = [13309, 12966]                                # input the index you want to drop
df_topic_para2 = df_topic_para1.copy().drop(list_remove_para)

### 5.3.2. N most prototypical paragraphs overall

In [239]:
df_topic_para2_n = df_topic_para2.copy()
df_topic_para2_n['highest_p'] = df_topic_para2_n.iloc[:, 6:].max(axis = 1)          # get the highest probability among the topic distribution of each paragraph
df_topic_para2_n['salient_topic'] = df_topic_para2_n.iloc[:, 6:].idxmax(axis = 1)    # get the corresponding topic id
df_topic_para2_n = df_topic_para2_n[['paragraphs', 'salient_topic', 'highest_p']]

In [240]:
N1 = 5   # Set N to get the N most prototypical paragraphs overall
df_topic_para2_n.nlargest(N1,['highest_p']).style.set_properties(subset = ['paragraphs'], **{'width':'1000px', 'length': '50px'})

Unnamed: 0,paragraphs,salient_topic,highest_p
17552,"4. Most clubs seem ill-prepared for enquiries from, and inclusion of, people with disabilities who wish to participate in the sport offered by the club. There is little or no support for specialist clubs who provide opportunities for sport that cannot be integrated ie wheelchair basketball or blind cricket! There is often nothing locally to support the child or their parent in accessing the specialist provision and this often involves their having to undertake extensive travel to specialist facilities or organisations catering for this group. There is poor information regarding availability etc and http://www.parasport.org.uk was established as a portal to provide pathway and provision information. The then Mayor of London published a strategy in 2007, which highlighted all these issues and to date there has been little, if any, action to redress these anomalies in London or in the rest of the country. DCSF have, in my view, shown no leadership regarding the legacy of 2012 and its impact on PE in schools and the inclusion of those with disabilities in core curriculum activities or sporting opportunities within or after school. DCMS held a legacy event in 2008 and again in April 2009 focusing on the legacy of the Games for those with disabilities. One outcome was to seek greater links between DWP, DCSF and themselves to ensure that joint strategies were developed and pathways established that enabled children to enjoy and participate in PE and sport within schools/after school clubs/integrated and specialist provision in the community, with good national talent forums and pathways established for those who wish to participate in sport at a higher level and finally, with governing bodies having clear inclusive programmes for sports men and women with disabilities active at a national and international level",5,0.657481
4954,"Stratford is a huge churn. We have done research into the nature of the flats that have gone up around Stratford—the better end—and they are being rented by people who work and basically it is a place to sleep and they go out and do other things, which is what people do when they are young and that is fine. What we could easily end up with is what we have ended up with elsewhere in the borough, which is a desert, with people not interested in the community and so we have a bunch of social tenants in there—not huge numbers—and no real community. I think we have to have a proper debate and discussion around what will happen with the rest of the housing. Ideally we want people who live there to buy and then continue to have an interest in the community. So any discussion about the Village I would argue needs to start with what happens to the private sector area because this has to work; if it does not work it can damage the rest of the Olympic area and the Olympic Park. We are now arguing, for example, for a Royal Park because we think that that will raise the whole standard of the area and people will get some sense of a better place, a place where it is really good to live. The OPLC understands that it needs to get housing and family housing and not just blocks of flats, so starting with that. Then you get to this question about what would the nature of the allocations be and we are about to have a big discussion with various partners where we talk about who would we move in. Certainly from a Newham point of view—because the Village is in Newham—we took a court case which opposed housing by need and were successful in turning the policy around, and the Government has now introduced a policy which would allow us to support people who are working into social housing because our view would tend to be if you have a low rent that is a great benefit if you are working and on a lower income, but if you are on benefit that is not much of a benefit at all. We are certainly at the moment having a discussion around allocations policy and there are plans to have it. We need to make the community work—start off with the community. I get very frustrated when I hear people talk about ""units""; talk about the community that you are trying to create and how that will look. If we want to have this as a place that people want to live in in 100 years' time the community has to work. To do that you do not then take, for example, social housing for everybody that is not working together because we have evidence that that does not work, so if it does not work perhaps we should stop trying to do it. The nature of the allocation as to who moves in there is, I think, up for debate at the moment and all the people involved are willing to look at that in a radical way that will enable us to build a community there. But I will come back again, you can do that in the social and affordable rent side, what about the rented properties that you will end up with? People will invest and buy chunks off the plan and they will end up with a lot of people that do not necessarily spend a lot of time there. For me it is a very clear thing, this should be the responsibility of the OPLC, it should not be the responsibility of the ODA and the ODA should be handing over that responsibility to the OPLC tomorrow because the ODA has no locus or interest there. It is not that they are doing badly because these guys have built an Olympic site on time and on budget, they are doing a really good job from the point of view of doing what they were asked to do, but they will not be there after 2014 and we will. The OPLC should be the people who are actually involved in the Village now. I am very clear about that",6,0.613745
17551,"3. After school activities often exclude this group of children ie no possibility of inclusion in team sports such as football, cricket, rugby, basket ball etc., limited access to swimming baths, athletic fields etc. All of these sports are undertaken by people with a disability, but not now normally at school. Specialist schools did provide a massive range of sporting opportunities and sport played a major part of my own adjustment to disability. I learned about teamwork, was able to set individual goals, have competition to extend my abilities, occasionally experienced ""being a winner"" and had that thrill of competition. One example of efforts to redress this issue within a school setting is in Leeds, which has a programme of monthly sporting and physical activities arranged in school time for vision impaired children within the schools. One solution easily achieved would be for groups of schools to come together monthly to provide sporting and recreational activities for disabled children and young people within their schools",5,0.59458
8304,"You think that the assessment of risks is our best estimate of the most likely outcome of the budget as a whole. But actually the assessment of risk-and how we have compiled it-is this: we have not sought to estimate how likely it is that every risk arises. We just said, ""Let us think about every risk that could arise, and let us assume that they all arise and work out the likely cost of them all arising."" On top of that, we said, ""And there will be some risks that we just cannot think about that are unknown unknowns. There will be some multiple consequentials if everything came together."" So we end up with an estimate not of the most likely cost of the project, which is what the burden of paragraph 1 of the PAC Report understands it is, but an estimate of how much we would need to set aside in the very unlikely event that all risks arise and some more unknown risks arise as well. The purpose of that is not to get to an estimate of the likely outcome of the budget. Its purpose and why we do it is to see, against any reasonable view of the likely risk that might arise, even on an assumption that they all arise and some more unknown risks arise, whether we have enough money. The conclusion has always been, yes, we had. Against what is therefore, in my view, a conservative and prudent estimate, we had £36 million headroom at the time of the NAO Report. We had more, and indeed the picture over the six-month period since the original figures on which the NAO was recording this, is that the contingency has gone down by £27 million or so-we reckon, because these are provisional figures, but I want to give our best figures-and the assessed risks on that very conservative and prudent basis have gone down by £136 million. So the picture on the budget as a whole is that we are spending contingency significantly slower than risks are disappearing from the programme. That is why, without in any sense being complacent, I am confident that we will bring this in within budget, and I do not think that the budget is close to being used up",11,0.591896
2847,"9 The planning work has, however, identified new operational requirements and risks to delivery, with significant additional costs. In some cases, programme management information shows that planning is also behind schedule, although delivery bodies are seeking to mitigate delays. For example:Preparations for the London 2012 Olympic and Paralympic Games: Progress report December 2011 Summary 7• the Home Office is responsible for setting security requirements and funding LOCOG’s delivery of venue security: LOCOG is responsible for working out the operational implications of the requirements, including recruitment. In 2006, LOCOG estimated that 10,000 guards would be a reasonable basis to invite tenders from contractors and this informed the 2010 Spending Review settlement for venue security of £282 million. The guarding contract was let to G4S by LOCOG in December 2010. Detailed planning was undertaken by LOCOG and security partners only once the competition schedule and venues were finalised in early spring 2011. This planning, and the revised security requirements flowing from the implementation of the agreed security standards, have increased the peak requirement of guards to 23,700 and the likely cost to an estimated £553 million, a £271 million increase, although this sum is not yet finalised. The near doubling of the costs has increased the strain on the Public Sector Funding Package; • the increased guarding requirement is a significant recruitment challenge and means LOCOG is having to renegotiate its contractual requirements. In addition, the Home Office is in discussions with the Ministry of Defence about the provision of military personnel to act in security roles;• in accordance with its responsibilities under the 2006 Olympics and Paralympic Games Act, the Olympic Delivery Authority has produced a transport plan for the Games, with which other delivery bodies must cooperate. The Delivery Authority produced plans for transport operations in competition venues",4,0.583286


### 5.3.3.  N most prototypical paragraphs where the belong() function is greater than the threshold L
I followed the method of extraction of prototypical text suggested by PTBI (Marchetti and Puranam, 2020. p. 14). PTBI attempts to not only extract the prototypical documents to improve interpretability, but also to find the minimum number of prototypical documents for topic interpretation. The algorithm is shown as follows:
1. Defines a threshold L  (L < ∈ [0,1]). For instance, we set L to be 0.5.
2. For each topic, select the documents with the probability that they belong to the topic is not less than L (0.5). 
3. For each topic, check whether the number of documents selected is not less than 1/L. For instance, if L = 0.5, for each topic we need at least 2 documents for topic interpretation. This method weakens the limitation that a few documents have a high proportion of a topic is because of randomness.
4. Compute the percentage of interpretable topics as described in step iii
5. Change L, keep iterating and find the optimal L with which the percentage of interpretable topics is the highest. 

##### Indenfication of the optimal L and miminum number of paragraphs for topic interpretation

In [241]:
List_num_doc = [x for x in range(1, 20, 1)] # generate a list of 1/L (minimum number of documents to interpret a topic)
list_L = [1/x for x in List_num_doc]        # generate a list of L

In [242]:
# create the function for computing the percentage of potentially interpretable topics against parameter L
def perc(i, df):
    list_num_topics = []
    for j in df:                                  
        topic_filter = df[j] >= i         
        m = df[j][topic_filter].count()           
        list_num_topics.append(m)                                             
        count1 = sum(map(lambda x : x >= 1/i, list_num_topics))                                     
        perc1 = count1 / k
    return(perc1)

The following chart shows that when L = 0.25, the percentage of interpretable topics is 86.7%, so we set L = 0.25 - ie, each topic needs at least 4 (1/0.25) paragraphs to be interpreted.
It is worth noting that L is inversely proportional to the mininum number of paragaphs of each topic for interpreation (1/L), in other words, the lower the threshold L is, the more paragraphs we need to interpret the topics. Although when L = 0.1 the percentage of interpratable topics reaches the highest point (100%), the mininum number of paragaphs of each topic for interpreation also rises to 10 (1/0.1), which increases the workload of interpretation.

In [243]:
list_perc2 = []
for i in list_L:
    num = perc(i, df_topic_para.drop(list_remove_para))
    list_perc2.append(num)

df_L2 = pd.DataFrame({'Parameter L': list_L, 'Percentage of interpretable topics': list_perc2})
fig_L2 = px.line(df_L2, x = 'Parameter L', y="Percentage of interpretable topics", title = 'Percentage of interpretable topics (paragraph-based )')
fig_L2.update_layout(autosize=False, width=800, height=400)
fig_L2.update_traces(mode = "lines + markers")
fig_L2.show()

In [244]:
# define the function for extracting the highest N ranked paragraphs from each topic
def top_n_filter(df, top_n):
    list_topic_id = [x+1 for x in range(0,k)]
    list_n_para = []
    list_n_p = []
    list_n_index = []
    for x in range(1, k + 1): 
        n_para = [i for i in df.nlargest(top_n, [x])['paragraphs']]
        n_p = [i for i in df.nlargest(top_n, [x])[x]]
        n_index = [i for i in df_topic_para1.nlargest(top_n, [x]).index]
        list_n_para.append(n_para)
        list_n_p.append(n_p)
        list_n_index.append(n_index)
    pd_n_para = pd.DataFrame({'Index':list_n_index, 'topic_id': list_topic_id, 'salient_paragraph': list_n_para, 'probability': list_n_p})
    return(pd_n_para.apply(pd.Series.explode).reset_index().drop('index', axis = 1))

Below we get the the 4 most prototypical paragraphs of each topic when we set the optimal L to be 0.25. 

In [245]:
L = 0.25                                                # set the optimal L based on the analysis above                                             
top_n_above_L = top_n_filter(df_topic_para2, int(1/L))
top_n_above_L['porobability >= L'] = top_n_above_L['probability'] >= L
top_n_above_L.style.set_properties(subset = ['salient_paragraph'], **{'width':'500px', 'length': '50px'})

Unnamed: 0,Index,topic_id,salient_paragraph,probability,porobability >= L
0,2115,1,"In July LOCOG issued guidelines for UK sporting facilities to help them in their bid to offer Pre-Games Training facilities to host international teams in 2012. Following the on-line application process, due to end in January 2007, those facilities meeting the criteria will be included in a guide for all National Olympic Committees (NOCs) and National Paralympic Committees (NPCs) which will be issued in 2008. Pre-Games Training Camps will help to engage the whole country and spread the benefits of the Games throughout the UK",0.133998,False
1,10022,1,"Lord Coe: Some of these sports are less specific. If you are a track and field nation then you might decide on the track and field facilities at Bath, Loughborough, Sheffield or wherever, it is not quite as specific as sailing. The other important issue is that we have a whole raft of facilities that can be used at any one stage in that whole build-up process but it is, as Keith said, for the National Olympic Committees to make that decision. I have to say nobody would have told Craig or Simon Clegg from the British Olympic Association that Narromine was better than Noosa and Noosa better than somewhere on the Queensland coast of Australia, that was a judgment those guys made",0.130003,False
2,12442,1,"ODA response There has been significant work to date on the venue specific business and legacy planning for each of the major sporting venues. This work has informed design development and investment decisions for venues in legacy. The LDA, working with the ODA and wider stakeholders, will now take forward through the Legacy Master-planning Framework and business-planning work (shortly to be commissioned through Grant Thornton) the detailed legacy plans for the parkland and retained venues",0.125486,False
3,15065,1,"4.2 Whilst some members are optimistic about the benefits of the Games and the Olympics legacy, someare concerned that the Games will not benefit their local areas. The Government and LOCOG shouldaddress this by putting in place specific prgarammes to help support grassroots organisations and provideguidance on how they and their communities might better benefit from the Olympics",0.12207,False
4,12700,2,"2. British Cycling told us that the cycling facilities at the Velopark had “the potential to be absolutely world-class” and that they “should be the very best anywhere in the world”.152 There has nonetheless been a certain amount of controversy about the extent to which the Velopark will offer a suitable replacement for off-road facilities at the former Eastway Circuit, lost when land was assembled by the LDA for incorporation into the Olympic Park. The design currently proposed by the ODA for the Velopark offers most of the facilities previously available at Eastway, albeit in a more fragmented layout. British Cycling, despite being supportive of the proposed design for use during the Games and despite anticipating that, after the Games, the Velopark will “provide a boost for cycling”,153 initially lodged objections to the relevant planning applications on the grounds that they did “not provide an adequate or comparable replacement for the road and off-road facilities provided to cycling on the Eastway Circuit”. British Cycling is now satisfied that the ODA has taken on board its concerns and that current plans for the Velopark offer an acceptable replacement for Eastway. The Eastway Users Group, which has campaigned for off-road cycling facilities in the Velopark in legacy mode, remains frustrated by the uncertainty about future provision, and it has pointed out to us that facilities at Eastway closed before the ODA or LDA had provided any suitable temporary alternative, causing 149 Ev 107",0.429325,True
5,13056,2,"2. Unlike many of the swimming facilities built for previous Olympics, which have been little used afterthe games, the design of the Aquatic Centre is based upon the technical swimming and diving requirementsfor the games with certain of the other specific requirements for the Games being added as a temporary“overlay” and being dispensed with after the Games. In summary the Centre in legacy mode will have thefollowing:— a 50 metre by 10 lane competition pool with a depth of 3 metres with a moveable floor andbulkhead and permanent spectator seating for 2,500 with an ability for this to be expanded to 3,500when required;Processed: 24-04-2008 19:36:04 Page Layout: COENEW [O] PPSysB Job: 386236 Unit: PAG1Culture, Media and Sport Committee: Evidence Ev 7— in the same hall as the competition pool a 25 metre by 21 metre diving pool with a moveable floorproviding a range of platforms and spring boards to Olympic and World Championshipstandards;— a separate 50 metre by 8 lane training pool with a depth of 2 metres and two moveable floors andbulkheads; and— ancillary matters including timing, scoreboard with video recording and playback, and areas forchanging, catering, dryland training and provision for sports science and will meet therequirements of the various swimming disciplines",0.381628,True
6,1519,2,"Legacy Mode 12. Unlike many of the swimming facilities built for previous Olympics, which have been little used after the games, the design of the Aquatic Centre is based upon the technical swimming and diving requirements for the games with certain of the other specific requirements for the Games being added as a temporary ""overlay"" and being dispensed with after the Games. In summary the Centre in legacy mode will have the following: — a 50 metre by 10 lane competition pool with a depth of 3 metres with a moveable floor and bulkhead and permanent spectator seating for 2,500 with an ability for this to be expanded to 3,500 when required; — in the same hall as the competition pool a 25 metre by 21 metre diving pool with a moveable floor providing a range of platforms and spring boards to Olympic and World Championship standards; — a separate 50 metre by 8 lane training pool with a depth of 2 metres and two moveable floors and bulkheads; and — ancillary matters including timing, scoreboard with video recording and playback, and areas for changing, catering, dryland training and provision for sports science and will meet the requirements of the various swimming disciplines",0.377969,True
7,13309,2,"A fuller explanation of what Mission 2012 involves is contained within a three page document attachedas Annex A.1Driving Standards of Performance—Other InitiativesIn addition to Mission 2012, UK Sport is also seeking to facilitate the development of a network of “EliteTraining Centres” across the UK. The concept reflects the conclusion that the UK’s athletes need to havea clearer sense of location for their world class ambitions than is often currently the case. Such Centres willoVer an environment in which athletes can experience a seamless integration of world-class coaching andtraining in appropriate facilities together with scientific and medical support, all backed-up with lifestyleand educational support systems that ensure their wider development. A full explanation of what the EliteTraining Centre concept involves is contained within a two page document attached as Annex B.2UK Sport also continues to operate dedicated programmes to support its World Class PerformanceProgramme investment. Three examples are:Elite CoachA major initiative designed to ensure that a new generation of world-class British coaches will be availableto pick up the baton and ensure long term success for our athletes post-London. The scheme providesdedicated programme support to up to a total of 50 coaches over a three year programme, helping them todevelop their technical skills and understanding through both residential and individual learnings. Thereare currently 34 coaches on the programme across 18 sports, with the first year graduating at the UK SportWorld Class Coaching Conference on 12–15 November 2007",0.292883,True
8,728,3,"Mr Johnson: As I said at the very beginning, I do think it is my duty and, indeed, I think it is absolutely possible and necessary for us to deliver the Olympic Games within the £9.3 billion. It is half what the Chinese spent, but we will produce a Games that will be every bit as good. The cost pressures at the moment, for your guidance, are really on the Village, the media centre and the stadium. Those are the things where it is obvious that there are the biggest difficulties, and I am determined to make sure that we extract value. I will tell you why: because the LDA, the body for which I am responsible, are putting in a huge amount of money, the tax payer is putting a huge amount of money into that site in East London. It is absolutely vital that we get value from it for generations to come, and that means that we cannot afford to waste money and we cannot afford to lose money that is going to be of legacy value, if you see what I mean. I do not want to see money unnecessarily squandered on this or that when it might be making that site more valuable to Londoners for generations to come, and that is why, obviously we are looking at some changes, some economies that we might be able to make. I do not propose to go into the detail, but there are on-going discussions about whether there are savings to be made about venues, about whether there is more we could do to bear down on some of the expenditure. I will just give you an example of some of the ways we save money in our own small way. I do not know whether you saw the party in London House. I am afraid that it was on television, so I cannot deny that it took place. There was a celebration at London House in Beijing which actually cost considerably less, about a million pounds less, four million pounds less than was originally thought, and yet was widely thought to have been the best event in Beijing. Keeping London House and that operation going for the whole period of the Olympic Games we managed to cut the budget by more than a third on the proposals of the previous mayor, and I do not think it is any particular secret that me and my team flew, not exactly steerage, but not club class either, in our bid to save the tax payer many, many thousands of pounds. You can take it from me, Chairman, that I do regard it as my duty on behalf of London tax payers to bear down on these costs. We can deliver a fantastic Games without wasting money",0.305503,True
9,700,3,"The budget is about 600 million for policing of the Olympics and, of course, it is vital that we get that right. One of the most important lessons from Beijing was looking at how the Chinese did it, and there is clearly a balance to be struck between having very intrusive, endless airport-style screening everywhere and having an approach that lets people circulate more freely. I think I am right in thinking that the Chinese had searches on the underground system as well, which I think some people found a little bit oppressive. It is an open question, I think, whether we go down that route or whether we use a more intelligence-based approach and exactly where we strike the balance, but it is, of course, vital that we have a 100%, as far as we can make it, secure Games. I would just say that if you think about what happened in the G8 in 2005, during the G8, when 12,000 police were moved to Scotland, the tragedy was, of course, that the terrorists struck in London. So I make an elementary point that the security threat does not merely arise in London. The security threat to the Olympic Games, in my view, could arise anywhere across the country throughout the Olympic period",0.301467,True


### 5.3.4. N most prototypical paragraphs of each topic

In [246]:
# 2 most prototypical paragraphs of each topic
N2 = 2
top_n_filter(df_topic_para2, N2).style.set_properties(subset = ['salient_paragraph'], **{'width':'500px', 'length': '50px'})

Unnamed: 0,Index,topic_id,salient_paragraph,probability
0,2115,1,"In July LOCOG issued guidelines for UK sporting facilities to help them in their bid to offer Pre-Games Training facilities to host international teams in 2012. Following the on-line application process, due to end in January 2007, those facilities meeting the criteria will be included in a guide for all National Olympic Committees (NOCs) and National Paralympic Committees (NPCs) which will be issued in 2008. Pre-Games Training Camps will help to engage the whole country and spread the benefits of the Games throughout the UK",0.133998
1,10022,1,"Lord Coe: Some of these sports are less specific. If you are a track and field nation then you might decide on the track and field facilities at Bath, Loughborough, Sheffield or wherever, it is not quite as specific as sailing. The other important issue is that we have a whole raft of facilities that can be used at any one stage in that whole build-up process but it is, as Keith said, for the National Olympic Committees to make that decision. I have to say nobody would have told Craig or Simon Clegg from the British Olympic Association that Narromine was better than Noosa and Noosa better than somewhere on the Queensland coast of Australia, that was a judgment those guys made",0.130003
2,12700,2,"2. British Cycling told us that the cycling facilities at the Velopark had “the potential to be absolutely world-class” and that they “should be the very best anywhere in the world”.152 There has nonetheless been a certain amount of controversy about the extent to which the Velopark will offer a suitable replacement for off-road facilities at the former Eastway Circuit, lost when land was assembled by the LDA for incorporation into the Olympic Park. The design currently proposed by the ODA for the Velopark offers most of the facilities previously available at Eastway, albeit in a more fragmented layout. British Cycling, despite being supportive of the proposed design for use during the Games and despite anticipating that, after the Games, the Velopark will “provide a boost for cycling”,153 initially lodged objections to the relevant planning applications on the grounds that they did “not provide an adequate or comparable replacement for the road and off-road facilities provided to cycling on the Eastway Circuit”. British Cycling is now satisfied that the ODA has taken on board its concerns and that current plans for the Velopark offer an acceptable replacement for Eastway. The Eastway Users Group, which has campaigned for off-road cycling facilities in the Velopark in legacy mode, remains frustrated by the uncertainty about future provision, and it has pointed out to us that facilities at Eastway closed before the ODA or LDA had provided any suitable temporary alternative, causing 149 Ev 107",0.429325
3,13056,2,"2. Unlike many of the swimming facilities built for previous Olympics, which have been little used afterthe games, the design of the Aquatic Centre is based upon the technical swimming and diving requirementsfor the games with certain of the other specific requirements for the Games being added as a temporary“overlay” and being dispensed with after the Games. In summary the Centre in legacy mode will have thefollowing:— a 50 metre by 10 lane competition pool with a depth of 3 metres with a moveable floor andbulkhead and permanent spectator seating for 2,500 with an ability for this to be expanded to 3,500when required;Processed: 24-04-2008 19:36:04 Page Layout: COENEW [O] PPSysB Job: 386236 Unit: PAG1Culture, Media and Sport Committee: Evidence Ev 7— in the same hall as the competition pool a 25 metre by 21 metre diving pool with a moveable floorproviding a range of platforms and spring boards to Olympic and World Championshipstandards;— a separate 50 metre by 8 lane training pool with a depth of 2 metres and two moveable floors andbulkheads; and— ancillary matters including timing, scoreboard with video recording and playback, and areas forchanging, catering, dryland training and provision for sports science and will meet therequirements of the various swimming disciplines",0.381628
4,728,3,"Mr Johnson: As I said at the very beginning, I do think it is my duty and, indeed, I think it is absolutely possible and necessary for us to deliver the Olympic Games within the £9.3 billion. It is half what the Chinese spent, but we will produce a Games that will be every bit as good. The cost pressures at the moment, for your guidance, are really on the Village, the media centre and the stadium. Those are the things where it is obvious that there are the biggest difficulties, and I am determined to make sure that we extract value. I will tell you why: because the LDA, the body for which I am responsible, are putting in a huge amount of money, the tax payer is putting a huge amount of money into that site in East London. It is absolutely vital that we get value from it for generations to come, and that means that we cannot afford to waste money and we cannot afford to lose money that is going to be of legacy value, if you see what I mean. I do not want to see money unnecessarily squandered on this or that when it might be making that site more valuable to Londoners for generations to come, and that is why, obviously we are looking at some changes, some economies that we might be able to make. I do not propose to go into the detail, but there are on-going discussions about whether there are savings to be made about venues, about whether there is more we could do to bear down on some of the expenditure. I will just give you an example of some of the ways we save money in our own small way. I do not know whether you saw the party in London House. I am afraid that it was on television, so I cannot deny that it took place. There was a celebration at London House in Beijing which actually cost considerably less, about a million pounds less, four million pounds less than was originally thought, and yet was widely thought to have been the best event in Beijing. Keeping London House and that operation going for the whole period of the Olympic Games we managed to cut the budget by more than a third on the proposals of the previous mayor, and I do not think it is any particular secret that me and my team flew, not exactly steerage, but not club class either, in our bid to save the tax payer many, many thousands of pounds. You can take it from me, Chairman, that I do regard it as my duty on behalf of London tax payers to bear down on these costs. We can deliver a fantastic Games without wasting money",0.305503
5,700,3,"The budget is about 600 million for policing of the Olympics and, of course, it is vital that we get that right. One of the most important lessons from Beijing was looking at how the Chinese did it, and there is clearly a balance to be struck between having very intrusive, endless airport-style screening everywhere and having an approach that lets people circulate more freely. I think I am right in thinking that the Chinese had searches on the underground system as well, which I think some people found a little bit oppressive. It is an open question, I think, whether we go down that route or whether we use a more intelligence-based approach and exactly where we strike the balance, but it is, of course, vital that we have a 100%, as far as we can make it, secure Games. I would just say that if you think about what happened in the G8 in 2005, during the G8, when 12,000 police were moved to Scotland, the tragedy was, of course, that the terrorists struck in London. So I make an elementary point that the security threat does not merely arise in London. The security threat to the Olympic Games, in my view, could arise anywhere across the country throughout the Olympic period",0.301467
6,2847,4,"9 The planning work has, however, identified new operational requirements and risks to delivery, with significant additional costs. In some cases, programme management information shows that planning is also behind schedule, although delivery bodies are seeking to mitigate delays. For example:Preparations for the London 2012 Olympic and Paralympic Games: Progress report December 2011 Summary 7• the Home Office is responsible for setting security requirements and funding LOCOG’s delivery of venue security: LOCOG is responsible for working out the operational implications of the requirements, including recruitment. In 2006, LOCOG estimated that 10,000 guards would be a reasonable basis to invite tenders from contractors and this informed the 2010 Spending Review settlement for venue security of £282 million. The guarding contract was let to G4S by LOCOG in December 2010. Detailed planning was undertaken by LOCOG and security partners only once the competition schedule and venues were finalised in early spring 2011. This planning, and the revised security requirements flowing from the implementation of the agreed security standards, have increased the peak requirement of guards to 23,700 and the likely cost to an estimated £553 million, a £271 million increase, although this sum is not yet finalised. The near doubling of the costs has increased the strain on the Public Sector Funding Package; • the increased guarding requirement is a significant recruitment challenge and means LOCOG is having to renegotiate its contractual requirements. In addition, the Home Office is in discussions with the Ministry of Defence about the provision of military personnel to act in security roles;• in accordance with its responsibilities under the 2006 Olympics and Paralympic Games Act, the Olympic Delivery Authority has produced a transport plan for the Games, with which other delivery bodies must cooperate. The Delivery Authority produced plans for transport operations in competition venues",0.583286
7,3329,4,"6 PREPARATIONS FOR THE LONDON 2012 OLyMPIC AND PARALyMPIC GAMES: PROGRESS REPORT JuNE 2008 10 The start and completion dates for the construction of the main venue and infrastructure projects delivered by the Olympic Delivery Authority at the end of March 2008 compared with the milestones in the November 2007 Programme Baseline ReportProjectEnabling Works (site preparation) Power Lines under Grounding (switchover only) Structures, Bridges and Highways utilities Main Stadium Aquatics Centre VeloparkHandball/Indoor Sports ArenaBasketballInternational Broadcast Centre/ Main Press CentreOlympic Village Eton Manor (training facilities and Paralympic events) Broxbourne (white water canoeing) Eton Dorney (rowing) Weymouth and Portland (sailing)construction start date November 2007 March 2008 Change in programme Forecast start date baseline (months)October 2006 October 2006 0 July 2008 July 2008 0 April 2008 April 2008 0 January 2008 January 2008 0 July 2008 May 2008 –21 September 2008 September 2008 0 March 2009 March 2009 0August 2009 June 2009 –2July 2009 November 2009 4May 2009 March 2009 –2 June 2008 May 2008 –1 March 2010 January 2010 –2 August 2008 May 2009 9 March 2009 January 2009 –2 May 2008 January 2008 –4construction end date November 2007 March 2008 Change in programme Forecast end date baseline (months)September 2009 September 2009 0 September 2008 November 2008 2 December 2011 December 2011 0 December 2011 August 2011 –4 Construction Construction end date end dateFebruary 2011 April 2011 2Completion date Completion date for construction for construction and initial overlay and initial overlay for test events for test eventsJune 2011 June 2011 0Construction Construction end date end dateApril 2011 August 2011 4Completion date Completion date for construction for construction and initial overlay and initial overlay for test events for test eventsJuly 2011 August 2011 1April 2011 February 2011 –2April 2011 March 2011 –1April 2011 April 2011 0June 2011 July 2011 1 December 2011 December 2011 0 February 2012 April 2011 –10 June 2010 October 2010 4 April 2010 July 2009 –9 February 2009 January 2009 –1Source: National Audit Office examination of actual and forecast progress against the November 2007 Programme BaselineNOTE",0.541452
8,17552,5,"4. Most clubs seem ill-prepared for enquiries from, and inclusion of, people with disabilities who wish to participate in the sport offered by the club. There is little or no support for specialist clubs who provide opportunities for sport that cannot be integrated ie wheelchair basketball or blind cricket! There is often nothing locally to support the child or their parent in accessing the specialist provision and this often involves their having to undertake extensive travel to specialist facilities or organisations catering for this group. There is poor information regarding availability etc and http://www.parasport.org.uk was established as a portal to provide pathway and provision information. The then Mayor of London published a strategy in 2007, which highlighted all these issues and to date there has been little, if any, action to redress these anomalies in London or in the rest of the country. DCSF have, in my view, shown no leadership regarding the legacy of 2012 and its impact on PE in schools and the inclusion of those with disabilities in core curriculum activities or sporting opportunities within or after school. DCMS held a legacy event in 2008 and again in April 2009 focusing on the legacy of the Games for those with disabilities. One outcome was to seek greater links between DWP, DCSF and themselves to ensure that joint strategies were developed and pathways established that enabled children to enjoy and participate in PE and sport within schools/after school clubs/integrated and specialist provision in the community, with good national talent forums and pathways established for those who wish to participate in sport at a higher level and finally, with governing bodies having clear inclusive programmes for sports men and women with disabilities active at a national and international level",0.657481
9,17551,5,"3. After school activities often exclude this group of children ie no possibility of inclusion in team sports such as football, cricket, rugby, basket ball etc., limited access to swimming baths, athletic fields etc. All of these sports are undertaken by people with a disability, but not now normally at school. Specialist schools did provide a massive range of sporting opportunities and sport played a major part of my own adjustment to disability. I learned about teamwork, was able to set individual goals, have competition to extend my abilities, occasionally experienced ""being a winner"" and had that thrill of competition. One example of efforts to redress this issue within a school setting is in Leeds, which has a programme of monthly sporting and physical activities arranged in school time for vision impaired children within the schools. One solution easily achieved would be for groups of schools to come together monthly to provide sporting and recreational activities for disabled children and young people within their schools",0.59458


### 5.3.5.  N most prototypical paragraphs of a specific topic

In [247]:
topic_id_chosen = 15                                    # choose the topic ID
num_para = 2                                            # set N to extract the N most prototypical paragraphs of a specific topic
df_n_topic_k = top_n_filter(df_topic_para2, num_para)
topic_id_filter = df_n_topic_k['topic_id'] == topic_id_chosen
df_n_topic_k[topic_id_filter].style.set_properties(subset = ['salient_paragraph'], **{'width':'500px', 'length': '50px'})

Unnamed: 0,Index,topic_id,salient_paragraph,probability
28,4538,15,".2.8Sub-objectiveMaximise the economic, social, health and environmental benefits the Games bring to the UK and all sections of the UK populationMaximise the employment and skills benefits for the UK arising from Games-related businessMaximise the wider economic benefits of the Games across the UK, including those for tourism and business promotionMaximise cultural benefits from hosting the Games and the Cultural OlympiadMaximise social benefits, including in health, education and volunteering, of hosting the GamesEnsure that the Games contribute to Sustainable Communities priorities, including the wider Thames GatewayAgree and promote sustainable development and procurement policies, including commitments to sustainable energy and waste management goalsPromote positive images of the UK to an international audienceEnsure the UK’s diverse communities are engaged with, and benefit from, the changes and opportunities arising from hosting the Games in the UKMaximise the economic, social, health and environmental benefits the Games bring to London and all LondonersMaximise the employment and skills benefits for Londoners arising from Games-related businessMaximise the wider economic benefits of the Games to London, including those for tourism and business promotionMaximise cultural benefits to Londoners from hosting the Games and the Cultural OlympiadMaximise social benefits to Londoners, including in health, education and volunteering, of hosting the GamesEnsure that the Games contribute to Sustainable Communities priorities, including the London Thames GatewayAgree and promote sustainable development and procurement policies, including commitments to sustainable energy and waste management goalsPromote London’s image as a leading world city to an international audienceEnsure London’s diverse communities are engaged with, and benefit from, the changes and opportunities arising from hosting the Games in LondonAPPENDIX FIVE",0.455466
29,16483,15,"4.1 The aim is that in the next 20 years, residents in the host boroughs will equal the London average in a range of the life indicators which you would expect to find in a successful community: — employment rates will increase to the London average; — average incomes in the bottom two fifths of earners in the host borough area will be increased to the London average; — young people in the host borough area will have improved GCSE results to at least the London average; — host borough 11 year olds will have at least the same educational attainment as the London average; — the number of families in receipt of benefits in the host boroughs area will fall to no more than the London average; — the rate of violent crime will continue to fall and reflect the London average; and — residents in the host boroughs area, particularly men, will have increased life expectancy to the London average",0.44192
