# 1. Prerequisites

Install the libraries below. Download the two css files from https://github.com/suhao3123/CSS, create a folder named assets in the root of your app directory and include the two files in that folder to lauch the Dashboard we created in the final step.

In [76]:
# pip install numpy                      # （install numpy）
# pip intall pandas                      # (install pandas)
# pip install PyMuPDF                    # (install PyMuPDF for extracting info from PDF files)
# pip install tika                       # (install tika for extracting paragraphs from PDF files)
# pip install spacy==2.2.0               # (install spacy for lemmatization)
# conda install gensim                   # (intall gesim for topic modelling)
# pip install pyLDAvis                   # (install pyLDAvis for topic modelling visulisation)
# conda install -c conda-forge pyldavis  # (if you use aconda to install pyLADvis)
# pip install plotly                     # (install plotly for visualisation)

In [77]:
import pandas as pd
import numpy as np
import re

# glob for extracting the directories of metadata
import glob

# PyMuPDF
import fitz

# tika
import tika               
from tika import parser   

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Visualisation
import plotly.express as px
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import os

# 2. Import pdf files, data wrangling and overview

In [78]:
# Extract the directories of the PDF files, make sure the folder name does not contain number
pdf_dir = "D:\LEON\Business Analytics\Study\9. Business Project\Data set\Olympics"
pdf_files = glob.glob("%s/*.pdf" % pdf_dir)
pdf_files[:1]

['D:\\LEON\\Business Analytics\\Study\\9. Business Project\\Data set\\Olympics\\Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf']

In [79]:
# Use PyMuPDF to extract all info of the PDF files (text, title, date, etc)
list_metadata = []
for i in pdf_files:
    with fitz.open(i) as doc:
        info = doc.metadata
        info['file_name'] = os.path.basename(i)
        text = ''
        for page in doc:
            text+= page.getText()
        info['Content'] = text       
    list_metadata.append(info)

In [80]:
df = pd.DataFrame(list_metadata)
df['document_id'] = df.index
df = df.drop_duplicates(subset = ['Content'])             # drop duplicate rows
df = df.dropna(subset=df.columns[[12]], how='any')       # drop rows whose text content is NaN                   
df['Word_count'] = df ['Content'].str.count(' ') + 1
df.head(3)

Unnamed: 0,format,title,author,subject,keywords,creator,producer,creationDate,modDate,trapped,encryption,file_name,Content,document_id,Word_count
0,PDF 1.7,,B Lewis,,,Microsoft Word,,D:20210822083603+00'00',D:20210822083603+00'00',,,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,Examination of Witnesses (1-19) \n16 SEPTEMBER...,0,6115
1,PDF 1.7,,B Lewis,,,Microsoft Word,,D:20210822083606+00'00',D:20210822083606+00'00',,,Examination_of_Witnesses_Sept_2003_-_Q20-39.pdf,Examination of Witnesses (20-39) \n16 SEPTEMBE...,1,4002
2,PDF 1.7,,B Lewis,,,Microsoft Word,,D:20210822083609+00'00',D:20210822083609+00'00',,,Examination_of_Witnesses_Sept_2003_-_Q40-44.pdf,Examination of Witnesses (40-44) \n16 SEPTEMBE...,2,1007


In [81]:
# check if there are documents with few words
min_word_count= 10                                               # set the threshold of the minimum word count of each document 
min_word_count_filter = df['Word_count'] <= min_word_count
df_few_words = df[min_word_count_filter][['file_name', 'Content']]
df_few_words

Unnamed: 0,file_name,Content


In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169 entries, 0 to 168
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   format        169 non-null    object
 1   title         169 non-null    object
 2   author        169 non-null    object
 3   subject       169 non-null    object
 4   keywords      169 non-null    object
 5   creator       169 non-null    object
 6   producer      169 non-null    object
 7   creationDate  169 non-null    object
 8   modDate       169 non-null    object
 9   trapped       169 non-null    object
 10  encryption    3 non-null      object
 11  file_name     169 non-null    object
 12  Content       169 non-null    object
 13  document_id   169 non-null    int64 
 14  Word_count    169 non-null    int64 
dtypes: int64(2), object(13)
memory usage: 21.1+ KB


In [83]:
# Word count
df['Word_count'].sum( )

1054090

# 3. Natural language processing

### 3.1. Tokenisation

In [84]:
data = df.Content.values.tolist()

In [85]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence).encode('utf-8'), deacc=True))  # deacc=True removes punctuations

data_words= list(sent_to_words(data))

### 3.2. Processing words: 
Remove Stopwords, Make Bigrams and Trigrams,Lemmatisation, remove short words and meaningless words

In [86]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [87]:
# import the stop_words from gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
stop_words = [i for i in STOPWORDS]

# add more stop words after analysing the overall term frequncy of each topic in pyLDAvis in the "Word frequency of each topic" section
new_stop_words = ['go', 'would', 'make', 'think', 'take', 'say', 'need', 'want', 'thing', 'have', 'lot', 'people', 'year',
                   'work','time', 'know', 'use', 'try', 'happen', 'ask', 'new', 'way', 'jonathan_stephen', 'david_higgin', 'dame_helen_ghosh']              
stop_words.extend(new_stop_words)

In [88]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stop_words(texts):
    return [[word for word in doc if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [89]:
# Form Trigrams
data_words_trigrams = make_trigrams(data_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

#increase the maximum length of text that the parser or NER can process
nlp.max_length = 13000000 #

# Do lemmatization keeping only noun, adj, verb
data_lemmatized1 = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB'])

# Set a threshold for removing the words with length less than the threshold
minimum_len = 3 
data_lemmatized2 = []
for i in data_lemmatized1:
    new_element = [x for x in i if len(x) >= minimum_len]
    data_lemmatized2.append(new_element)

# remove stop words
data_lemmatized = remove_stop_words(data_lemmatized2)

###  3.3. Dictionary and Corpus

In [90]:
# Create Dictionary, set the parameters to filter out tokens in the dictionary by their frequency
no_below = 5             # remove the tokens less frequent than no_below documents (absolute number)
no_above = 0.85          # remove the tokens more frequent than no_above documents (fraction of the total corpus size)
id2word = corpora.Dictionary(data_lemmatized)
id2word.filter_extremes(no_below = no_below, no_above = no_above)

# print the number of reserved unique tokens and word count afer removal of high and low frequency words
print('After removal of high and low frequency words - Number of unique tokens: %d, %d' % (len(id2word),id2word.num_pos))

After removal of high and low frequency words - Number of unique tokens: 3406, 339810


In [91]:
# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

#  4. LDA Model

### 4.1. Building LDA Model, Perparameter/Hyperparameter tuning

In [92]:
# set training parameters and hyperameters
k = 20
passes = 20
iterations = 100
alpha = 50.0/k   
eta = 0.01
random_state = 12345
minimum_probability = 0

Plotting the coherence score against k to identify the opitmal k where the coherence socre reaches the highest point. Because running it is quite time-consuming, I stopped some chunks below and just set k to be 10 based on the analysis of the reuslt. If users want to fit the model to the other corpus, they can remove the hashs to reactivate the chunks and analyse the coherence socres aginst k.

In [93]:
#start=1; limit=21; step=1 # set the parameters to generate a sequence of k values starting with "start" and ending in "limit" by a step of "step" f
#coherence_values = []
#model_list = []
#for i in range(start,limit,step):
    #model = gensim.models.LdaModel(corpus = corpus,id2word = id2word,alpha = alpha,eta = eta,
                                  #iterations = iterations,num_topics = i,passes = passes,random_state = 12345,minimum_probability = minimum_probability)
    #model_list.append(model)
    #coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
    #coherence_values.append(coherencemodel.get_coherence())

In [94]:
#list_num_topics = [i for i in range(start, limit, step)]
#df_coherence1 = pd.DataFrame({'Number_of_Topics': list_num_topics, 'Coherence_Score': coherence_values})
#df_coherence1.to_pickle('./df_coherence1.pkl') #save the result to disk
#df_coherence = pd.read_pickle('./df_coherence1.pkl') #load the result from disk

In [95]:
#fig1 = px.line(df_coherence, x = 'Number_of_Topics', y = "Coherence_Score", title = 'Coherence scores against number of topics')
#fig1.update_layout(autosize=False, width=1000, height=400)
#fig1.update_traces(mode = "lines + markers")
#fig1.show()

In [96]:
# num of topics =  to get the highest coherence socre
k = 10
lda_model = gensim.models.LdaModel(
    corpus = corpus,
    id2word = id2word,
    alpha = alpha,
    eta = eta,
    iterations = iterations,
    num_topics = k,
    passes = passes,
    random_state = 12345,
    minimum_probability = minimum_probability)

In [97]:
# print the coherence of the LDA model
coherencemodel2 = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_score = coherencemodel2.get_coherence()
coherence_score

0.4214233672915886

### 4.2. Topic distribution of documents

In [98]:
# create the function for converting a list of tuples into a dictionary
def Convert(tup, di):
    di = dict(tup)
    return di

In [99]:
# topic distribution of documents
list_topic = []
dictionary_topic = {}
for d in texts:
    bow = id2word.doc2bow(d)
    belong = lda_model[bow]                        # generate a list of tuples of topic distribution of a document
    belong_dic = Convert(belong, dictionary_topic) # convert the list of tuples into a dictionary
    list_topic.append(belong_dic)           
                      
df_topic_distribution = pd.DataFrame(list_topic)   # convert the list of dictionaries into a dataframe

# rename the topic IDs to ensure they are as same as the topic IDs in the pyLDAvis
original_topic_id = [*df_topic_distribution]; new_topic_id = [x + 1 for x in original_topic_id]
df_topic_distribution = df_topic_distribution.rename(columns = dict(zip(original_topic_id, new_topic_id))) #rename the topic IDs to ensure they are as same as the topic IDs in the pyLDAvis
df_topic = pd.merge(df, df_topic_distribution, how = 'left', left_index=True, right_index=True) # merge with info of documents
df_topic.drop(['title','format','creator', 'producer', 'keywords', 'trapped', 'encryption','subject', 'modDate'], axis = 1)

Unnamed: 0,author,creationDate,file_name,Content,document_id,Word_count,1,2,3,4,5,6,7,8,9,10
0,B Lewis,D:20210822083603+00'00',Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,Examination of Witnesses (1-19) \n16 SEPTEMBER...,0,6115,0.006733,0.032729,0.338991,0.007977,0.014635,0.010442,0.006408,0.561253,0.005858,0.014975
1,B Lewis,D:20210822083606+00'00',Examination_of_Witnesses_Sept_2003_-_Q20-39.pdf,Examination of Witnesses (20-39) \n16 SEPTEMBE...,1,4002,0.009603,0.008758,0.220258,0.015240,0.015121,0.027104,0.006672,0.675124,0.009199,0.012920
2,B Lewis,D:20210822083609+00'00',Examination_of_Witnesses_Sept_2003_-_Q40-44.pdf,Examination of Witnesses (40-44) \n16 SEPTEMBE...,2,1007,0.021492,0.024135,0.242508,0.023948,0.029972,0.029233,0.019753,0.546063,0.028111,0.034784
3,Bronwen Lewis,D:20210822084116+00'00',Further_supplementary_memorandum_submitted_by_...,Further supplementary memorandum submitted by ...,3,431,0.036792,0.130970,0.035828,0.027276,0.022223,0.030978,0.500668,0.030064,0.149588,0.035612
4,Bronwen Lewis,D:20210822083921+00'00',Further_Supplementary_Memorandum_submitted_by_...,Further supplementary memorandum submitted by ...,4,288,0.083911,0.053000,0.093137,0.119460,0.066731,0.060815,0.070883,0.241594,0.155329,0.055140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,Bronwen Lewis,D:20210822084528+00'00',Written_evidence_submitted_by_UK_Sport_-_Jan_2...,Written evidence submitted by UK Sport \n \n ...,164,3089,0.025478,0.006420,0.025063,0.698303,0.169433,0.015175,0.007070,0.028591,0.007063,0.017403
165,Bronwen Lewis,D:20210822084531+00'00',Written_evidence_submitted_by_Vision_2020_UK_-...,Written evidence submitted by Vision 2020 UK ...,165,2284,0.015467,0.016690,0.008076,0.079248,0.767609,0.013739,0.017736,0.058075,0.013306,0.010053
166,Bronwen Lewis,D:20210822084535+00'00',Written_evidence_submitted_by_VisitBritain_-_J...,Written evidence submitted by VisitBritain \n...,166,2372,0.006974,0.012218,0.022306,0.012489,0.109075,0.784511,0.011448,0.014708,0.010362,0.015911
167,Bronwen Lewis,D:20210822084543+00'00',Written_evidence_submitted_by_Womens_Sport_and...,Written evidence submitted by the Women's Spor...,167,1966,0.009271,0.014071,0.010122,0.024958,0.864630,0.012340,0.017318,0.024847,0.013601,0.008841


# 5. Topic interpretation tools

I first identify the salient topics defined by PTBI proposed by Marchetti and Puranam (2020), then combine both the topic visualisation of pyLDAvis and the prototypical texts defined by PTBI to facilitate the topic interpretation.

##  5.1. Salient topics for interpretation
PTBI assumes that the topics with little salience are not worthy of interpretation. To extract the most salient topics for interpretation, for each topic, we need to compute the fraction of documents with the probability that the documents belong to the topic is more than > 1/K (Marchetti and Puranam, 2020, p. 14), and I defined the fraction as the “salience” of the topic. 

The scree plot below shows that when the topics are sorted by salience in descending order, the salience tends to level off on topic 1, as a result, we can select the topics ahead of topic 1 as the salient topics for interpretation.

In [100]:
# compute salience: the fraction of documents with the probability that the document belongs to the topic is more than > 1/K for each document
list_percent_above = []
for i in df_topic_distribution:
    num_above = df_topic_distribution[i][df_topic_distribution[i] > 1/k].count()
    percent_above =  num_above/len(df_topic_distribution)
    list_percent_above.append(percent_above)
    
df_salient_topic = pd.DataFrame({'topic_ID':  [str(i) for i in new_topic_id], 'salience': list_percent_above}).sort_values(
    by = 'salience', ascending = False)

In [101]:
fig_L1 = px.line(df_salient_topic, x = 'topic_ID', y = 'salience', title="Scree plot of salience of topics")
fig_L1.update_layout(autosize=False, width=800, height=400)
fig_L1.update_traces(mode = "lines + markers")
fig_L1.show()

## 5.2. Topic visualisation

Check the words of each topic, if there're common words with high overall frequency such as "think" "want" or "make", return to the "import the stop_words from gensim" section, add these words to the list of stop words to remove them.

In [102]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, sort_topics = False )
pyLDAvis.save_html(vis, './assets/lda.html') # save the reult to disk   
vis

## 5.3. prototypical paragraphs
The prototypical paragraphs, the paragraphs with a high probabitiy that they belong to a topic, can be used to assist topic interpretation. This section classify the paragraphs into topics and provides the users 4 types of filters to select the prototypical paragraphs: N most prototypical paragraphs overall, N most prototypical paragraphs where the belong() function is greater than the threshold L, N most prototypical paragraphs of each topic and N most prototypical paragraphs of a specific topic.

### 5.3.1.  Classify the paragraphs based on the trained model

##### Extract paragraphs from documents

In [103]:
# define the function for spliting texts into paragraphs by delimiter '.\n\n' or '. \n\n'
def para_split(i):
    j = parser.from_file(i)
    m = j['content']
    import re
    return re.split('[?.!-]\n|[?.!-] \n|  \n\n|\n\n[0-9]', m)

In [104]:
list_paragraphs = []
list_para_id = []
for i in pdf_files:
    para = para_split(i)
    para = [w.replace('\n', '') for w in para]
    para = [x.strip() for x in para if x.strip()] # remove empty elements
    para_id = [x for x in range(len(para))] 
    list_paragraphs.append(para)
    list_para_id.append(para_id)

In [105]:
df_para1 = df.copy()
df_para1['paragraphs'] = list_paragraphs
df_para1['para_id'] = list_para_id
df_para2 = df_para1.apply(pd.Series.explode)
df_para3 = df_para2.reset_index()
df_para4 = df_para3[['creationDate', 'document_id', 'file_name', 'para_id', 'paragraphs']]
len(df_para4) # number of paragraphs extracted

21640

In [106]:
# set a filter to filter out the paragraphs with short words
n_word_count = 10                                                        # set the threshold of word count
para_word_count = df_para4['paragraphs'].str.split().str.len()           # word count of each paragraph
df_para = df_para4[(para_word_count>=n_word_count)].reset_index()        # select the paragraphs with word count not less than the threshold
df_para

Unnamed: 0,index,creationDate,document_id,file_name,para_id,paragraphs
0,2,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,2,"MS BARBARA CASSANI Q1 Chairman: Good morning, ..."
1,3,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,3,Ms Cassani: Thank you very much. Thank you ver...
2,4,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,4,8 months I shall be able to meet frequently wi...
3,5,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,5,The first thing I should like to say is that I...
4,6,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,6,Really the backdrop is that I believe in the G...
...,...,...,...,...,...,...
17709,21631,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,37,7.3 When the impact of Olympics and Paralympi...
17710,21633,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,39,11 2007-08 School Sport Survey. 12 As ...
17711,21634,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,40,13 Gold Young Ambassadors work across School...
17712,21635,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,41,14 From national data supplied by Department...


##### Process the paragraphs

In [107]:
# tokenization
data2 = df_para.paragraphs.values.tolist()
data_words2 = list(sent_to_words(data2))

In [108]:
# Form Trigrams
data_words_trigrams2 = make_trigrams(data_words2)

# Do lemmatization keeping only noun, adj, vb
data_lemmatized2 = lemmatization(data_words_trigrams2, allowed_postags=['NOUN', 'ADJ', 'VERB'])

# set the length of word threshold as same as before for removing the words less than the threshold
data_lemmatized2_2 = []
for i in data_lemmatized2:
    new_element = [x for x in i if len(x) >= minimum_len]
    data_lemmatized2_2.append(new_element)
    
# Remove Stop Words
data_lemmatized2_1 = remove_stop_words(data_lemmatized2_2)

##### Classify the paragraphs based on the extracted topics

In [109]:
# belong function: classify topics of paragraphs, it might take a long time because there are 148,651 paragraphs in the 11,132,849-word corpus
list_topic_para = []
dictionary_topic_para = {}
for d in data_lemmatized2_1:
    bow = id2word.doc2bow(d)
    belong = lda_model[bow]
    doc_dic = Convert(belong, dictionary_topic_para)
    list_topic_para.append(doc_dic)
    df_topic_para = pd.DataFrame(list_topic_para)

In [110]:
# rename the topic IDs to ensure they are as same as the topic IDs in the pyLDAvis
df_topic_para = df_topic_para.rename(columns = dict(zip(original_topic_id, new_topic_id)))

# topic distribution of paragraphs
df_topic_para1_1 = pd.merge(df_para, df_topic_para, how = 'left', left_index=True, right_index=True)
df_topic_para1_1

Unnamed: 0,index,creationDate,document_id,file_name,para_id,paragraphs,1,2,3,4,5,6,7,8,9,10
0,2,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,2,"MS BARBARA CASSANI Q1 Chairman: Good morning, ...",0.063916,0.124839,0.194832,0.055195,0.067197,0.070795,0.057485,0.220236,0.094435,0.051069
1,3,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,3,Ms Cassani: Thank you very much. Thank you ver...,0.075215,0.082753,0.129494,0.073575,0.078204,0.086212,0.077309,0.217785,0.097698,0.081755
2,4,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,4,8 months I shall be able to meet frequently wi...,0.089452,0.094595,0.104457,0.091917,0.084713,0.093249,0.111744,0.128840,0.100635,0.100398
3,5,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,5,The first thing I should like to say is that I...,0.025881,0.071182,0.088061,0.032208,0.099692,0.050140,0.031363,0.522551,0.037673,0.041248
4,6,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,6,Really the backdrop is that I believe in the G...,0.051199,0.084564,0.166066,0.139687,0.099277,0.112839,0.057875,0.145985,0.066059,0.076449
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17709,21631,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,37,7.3 When the impact of Olympics and Paralympi...,0.066862,0.063191,0.057503,0.094378,0.377863,0.089630,0.067530,0.058593,0.060486,0.063964
17710,21633,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,39,11 2007-08 School Sport Survey. 12 As ...,0.083357,0.068824,0.089464,0.090349,0.223662,0.096629,0.080137,0.098947,0.083594,0.085038
17711,21634,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,40,13 Gold Young Ambassadors work across School...,0.135806,0.079290,0.076687,0.177863,0.114509,0.086698,0.077985,0.086662,0.081836,0.082664
17712,21635,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,41,14 From national data supplied by Department...,0.090812,0.084098,0.082604,0.101100,0.165467,0.093090,0.103398,0.090760,0.097093,0.091579


In [111]:
# save the result to disk
df_topic_para1_1.to_pickle('./df_topic_para_Olympics.pkl')

In [112]:
# load the result from disk
df_topic_para1 = pd.read_pickle('./df_topic_para_Olympics.pkl') 

In [113]:
# drop the paragraphs with high frequency but meaningless for interperation based on the extraction of prototypical paragraphs below
list_remove_para = []                                # input the index of the paragraphs you want to drop
df_topic_para2 = df_topic_para1.copy().drop(list_remove_para) 
df_topic_para2.to_pickle('./df_topic_para_Olympics2.pkl') # save the resuilt to disk

### 5.3.2. N most prototypical paragraphs overall

In [114]:
df_topic_para2

Unnamed: 0,index,creationDate,document_id,file_name,para_id,paragraphs,1,2,3,4,5,6,7,8,9,10
0,2,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,2,"MS BARBARA CASSANI Q1 Chairman: Good morning, ...",0.063916,0.124839,0.194832,0.055195,0.067197,0.070795,0.057485,0.220236,0.094435,0.051069
1,3,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,3,Ms Cassani: Thank you very much. Thank you ver...,0.075215,0.082753,0.129494,0.073575,0.078204,0.086212,0.077309,0.217785,0.097698,0.081755
2,4,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,4,8 months I shall be able to meet frequently wi...,0.089452,0.094595,0.104457,0.091917,0.084713,0.093249,0.111744,0.128840,0.100635,0.100398
3,5,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,5,The first thing I should like to say is that I...,0.025881,0.071182,0.088061,0.032208,0.099692,0.050140,0.031363,0.522551,0.037673,0.041248
4,6,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,6,Really the backdrop is that I believe in the G...,0.051199,0.084564,0.166066,0.139687,0.099277,0.112839,0.057875,0.145985,0.066059,0.076449
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17709,21631,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,37,7.3 When the impact of Olympics and Paralympi...,0.066862,0.063191,0.057503,0.094378,0.377863,0.089630,0.067530,0.058593,0.060486,0.063964
17710,21633,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,39,11 2007-08 School Sport Survey. 12 As ...,0.083357,0.068824,0.089464,0.090349,0.223662,0.096629,0.080137,0.098947,0.083594,0.085038
17711,21634,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,40,13 Gold Young Ambassadors work across School...,0.135806,0.079290,0.076687,0.177863,0.114509,0.086698,0.077985,0.086662,0.081836,0.082664
17712,21635,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,41,14 From national data supplied by Department...,0.090812,0.084098,0.082604,0.101100,0.165467,0.093090,0.103398,0.090760,0.097093,0.091579


In [115]:
#N most prototypical paragraphs overall
df_topic_para2_n = df_topic_para2.copy()
df_topic_para2_n['highest_p'] = df_topic_para2_n.iloc[:, 6:].max(axis = 1)          # get the highest probability among the topic distribution of each paragraph
df_topic_para2_n['salient_topic'] = df_topic_para2_n.iloc[:, 6:].idxmax(axis = 1)    # get the corresponding topic id
df_topic_para2_n = df_topic_para2_n[['index','file_name','salient_topic','paragraphs','highest_p',]]
df_topic_para2_n.columns = ['Index','file','topic', 'paragraph','probability']

In [116]:
N1 = 5   # Set N to get the N most prototypical paragraphs overall
df_topic_para2_n.nlargest(N1,['probability']).style.set_properties(subset = ['paragraph'], **{'width':'1000px', 'length': '50px'})

Unnamed: 0,Index,file,topic,paragraph,probability
8304,9897,PAC_26_April_2012_-_Olympic_Costs_-_corrected_evidence_(no_report).pdf,2,"You think that the assessment of risks is our best estimate of the most likely outcome of the budget as a whole. But actually the assessment of risk-and how we have compiled it-is this: we have not sought to estimate how likely it is that every risk arises. We just said, ""Let us think about every risk that could arise, and let us assume that they all arise and work out the likely cost of them all arising."" On top of that, we said, ""And there will be some risks that we just cannot think about that are unknown unknowns. There will be some multiple consequentials if everything came together."" So we end up with an estimate not of the most likely cost of the project, which is what the burden of paragraph 1 of the PAC Report understands it is, but an estimate of how much we would need to set aside in the very unlikely event that all risks arise and some more unknown risks arise as well. The purpose of that is not to get to an estimate of the likely outcome of the budget. Its purpose and why we do it is to see, against any reasonable view of the likely risk that might arise, even on an assumption that they all arise and some more unknown risks arise, whether we have enough money. The conclusion has always been, yes, we had. Against what is therefore, in my view, a conservative and prudent estimate, we had £36 million headroom at the time of the NAO Report. We had more, and indeed the picture over the six-month period since the original figures on which the NAO was recording this, is that the contingency has gone down by £27 million or so-we reckon, because these are provisional figures, but I want to give our best figures-and the assessed risks on that very conservative and prudent basis have gone down by £136 million. So the picture on the budget as a whole is that we are spending contingency significantly slower than risks are disappearing from the programme. That is why, without in any sense being complacent, I am confident that we will bring this in within budget, and I do not think that the budget is close to being used up",0.684665
8388,9995,PAC_26_April_2012_-_Olympic_Costs_-_corrected_evidence_(no_report).pdf,2,"Jonathan Stephens: There are two aspects to risk: likelihood and impact. What we are saying is that we made no estimate of likelihood, we just wrote in a 100% likelihood of all the risks we could think of and some unknown risks that we could not think of. We then looked at impact, and on impact we said, ""If this risk were to materialise-we are assuming a 100% likelihood that it materialises-what is the likely cost?"" That is where you get the low, the most likely outcome and the high outcome. When you add those together, you do not get to an outcome of, ""What is the most likely expenditure on the programme?""; you get to an outcome of, ""If all conceivable risks arise, plus some unknown risks that we cannot identify, what is the likely expenditure?"" That is a conservative and prudent view of, ""Do we have enough contingency left, if all those risks arise?"" In practice, they won’t all arise. It is conceivable that some will arise, but it is pretty unlikely that all of them will arise. It is perfectly conceivable that some individual risk will arise at a higher estimate than the most likely estimate, but the prospect of all those risks arising is unlikely. The prospect of them all arising at the very highest possible cost is so unlikely as to not provide a good basis for planning. I am sorry. I am going on at some length, but there is a real point",0.664391
17552,21447,Written_evidence_submitted_by_Vision_2020_UK_-_Jan_2010.pdf,5,"4. Most clubs seem ill-prepared for enquiries from, and inclusion of, people with disabilities who wish to participate in the sport offered by the club. There is little or no support for specialist clubs who provide opportunities for sport that cannot be integrated ie wheelchair basketball or blind cricket! There is often nothing locally to support the child or their parent in accessing the specialist provision and this often involves their having to undertake extensive travel to specialist facilities or organisations catering for this group. There is poor information regarding availability etc and http://www.parasport.org.uk was established as a portal to provide pathway and provision information. The then Mayor of London published a strategy in 2007, which highlighted all these issues and to date there has been little, if any, action to redress these anomalies in London or in the rest of the country. DCSF have, in my view, shown no leadership regarding the legacy of 2012 and its impact on PE in schools and the inclusion of those with disabilities in core curriculum activities or sporting opportunities within or after school. DCMS held a legacy event in 2008 and again in April 2009 focusing on the legacy of the Games for those with disabilities. One outcome was to seek greater links between DWP, DCSF and themselves to ensure that joint strategies were developed and pathways established that enabled children to enjoy and participate in PE and sport within schools/after school clubs/integrated and specialist provision in the community, with good national talent forums and pathways established for those who wish to participate in sport at a higher level and finally, with governing bodies having clear inclusive programmes for sports men and women with disabilities active at a national and international level",0.605244
3329,3935,NAO_Preparations_for_the_Olympics_-_Progress_report_-_June_2008.pdf,7,"6 PREPARATIONS FOR THE LONDON 2012 OLyMPIC AND PARALyMPIC GAMES: PROGRESS REPORT JuNE 2008 10 The start and completion dates for the construction of the main venue and infrastructure projects delivered by the Olympic Delivery Authority at the end of March 2008 compared with the milestones in the November 2007 Programme Baseline ReportProjectEnabling Works (site preparation) Power Lines under Grounding (switchover only) Structures, Bridges and Highways utilities Main Stadium Aquatics Centre VeloparkHandball/Indoor Sports ArenaBasketballInternational Broadcast Centre/ Main Press CentreOlympic Village Eton Manor (training facilities and Paralympic events) Broxbourne (white water canoeing) Eton Dorney (rowing) Weymouth and Portland (sailing)construction start date November 2007 March 2008 Change in programme Forecast start date baseline (months)October 2006 October 2006 0 July 2008 July 2008 0 April 2008 April 2008 0 January 2008 January 2008 0 July 2008 May 2008 –21 September 2008 September 2008 0 March 2009 March 2009 0August 2009 June 2009 –2July 2009 November 2009 4May 2009 March 2009 –2 June 2008 May 2008 –1 March 2010 January 2010 –2 August 2008 May 2009 9 March 2009 January 2009 –2 May 2008 January 2008 –4construction end date November 2007 March 2008 Change in programme Forecast end date baseline (months)September 2009 September 2009 0 September 2008 November 2008 2 December 2011 December 2011 0 December 2011 August 2011 –4 Construction Construction end date end dateFebruary 2011 April 2011 2Completion date Completion date for construction for construction and initial overlay and initial overlay for test events for test eventsJune 2011 June 2011 0Construction Construction end date end dateApril 2011 August 2011 4Completion date Completion date for construction for construction and initial overlay and initial overlay for test events for test eventsJuly 2011 August 2011 1April 2011 February 2011 –2April 2011 March 2011 –1April 2011 April 2011 0June 2011 July 2011 1 December 2011 December 2011 0 February 2012 April 2011 –10 June 2010 October 2010 4 April 2010 July 2009 –9 February 2009 January 2009 –1Source: National Audit Office examination of actual and forecast progress against the November 2007 Programme BaselineNOTE",0.604578
3639,4286,NAO_Preparing_for_sporting_success_-_March_2008.pdf,4,"4 UK Sport’s ‘ultimate goals’ for medal success at the London 2012 Games will require a step change in performance amongst elite athletes. The achievements of athletes at recent elite international events in a number of sports, including sailing, cycling, rowing, boxing, disability equestrian and disability shooting, suggest that performance levels in some sports are already improving significantly. Following increased spending on elite sport, host nations can typically expect to win an extra six or seven gold medals at an Olympic Games and to win medals across a wider range of sports. This ‘host nation effect’ would not in itself be enough to deliver UK Sport’s Olympic goal, which is likely to require an improvement of eight or nine gold medals over the Great Britain team’s performance at the Athens Games in 2004 if the relative performance of other nations remained the same. Changes in the performance of other nations since 2004, especially in the context of a general trend of increased spending on elite sport, sometimes referred to as a ‘global sporting arms race’, may also have implications for UK Sport in delivering its medal aspirations",0.599309


### 5.3.3.  N most prototypical paragraphs where the belong() function is greater than the threshold L
I followed the method of extraction of prototypical text suggested by PTBI (Marchetti and Puranam, 2020. p. 14). PTBI attempts to not only extract the prototypical documents to improve interpretability, but also to find the minimum number of prototypical documents for topic interpretation. The algorithm is shown as follows:
1. Defines a threshold L  (L < ∈ [0,1]). For instance, we set L to be 0.5.
2. For each topic, select the documents with the probability that they belong to the topic is not less than L (0.5). 
3. For each topic, check whether the number of documents selected is not less than 1/L. For instance, if L = 0.5, for each topic we need at least 2 documents for topic interpretation. This method weakens the limitation that a few documents have a high proportion of a topic is because of randomness.
4. Compute the percentage of interpretable topics as described in step iii
5. Change L, keep iterating and find the optimal L with which the percentage of interpretable topics is the highest. 

##### Indenfication of the optimal L and miminum number of paragraphs for topic interpretation

In [117]:
List_num_doc = [x for x in range(1, 20, 1)] # generate a list of 1/L (minimum number of documents to interpret a topic)
list_L = [1/x for x in List_num_doc]        # generate a list of L

In [118]:
# define the function for computing the percentage of potentially interpretable topics against parameter L
def perc(i, df):
    list_num_topics = []
    for j in df:                                  
        topic_filter = df[j] >= i         
        m = df[j][topic_filter].count()           
        list_num_topics.append(m)                                             
        count1 = sum(map(lambda x : x >= 1/i, list_num_topics))                                     
        perc1 = count1 / k
    return(perc1)

The plot shows that when L = 0.333, the percentage of interpretable topics is 100%, so I set L to be 0.333 - ie, each topic needs at least 3 (1/0.333) paragraphs with the probability that they belong to the topic is no less than 3  for interpretation. It is worth noting that L is inversely proportional to the minimum number of paragraphs of each topic for interpretation (1/L), in other words, the lower the threshold L is, the more paragraphs that users need to interpret the topics. Although when L = 0.1 the percentage of interpretable topics is also 100%, the minimum number of paragraphs of each topic for interpretation also rises to 10 (1/0.1), which increases the workload of interpretation significantly.

In [119]:
list_perc2 = []
for i in list_L:
    num = perc(i, df_topic_para.drop(list_remove_para))
    list_perc2.append(num)

df_L2 = pd.DataFrame({'Threshold_L': list_L, 'Percentage of interpretable topics': list_perc2})
fig_L2 = px.line(df_L2, x = 'Threshold_L', y="Percentage of interpretable topics", title = 'Percentage of interpretable topics')
fig_L2.update_layout(autosize=False, width=800, height=400)
fig_L2.update_traces(mode = "lines + markers")
fig_L2.show()

In [120]:
# define the function for extracting the highest N ranked paragraphs from each topic
def top_n_filter(df, top_n):
    list_topic_id = [x+1 for x in range(0,k)]
    list_n_para = []
    list_n_p = []
    list_n_index = []
    list_n_file_name = []
    for x in range(1, k + 1): 
        n_para = [i for i in df.nlargest(top_n, [x])['paragraphs']]
        n_p = [i for i in df.nlargest(top_n, [x])[x]]
        n_index = [i for i in df_topic_para2.nlargest(top_n, [x]).index]
        n_file_name = [i for i in df.nlargest(top_n, [x])['file_name']]
        list_n_para.append(n_para)
        list_n_p.append(n_p)
        list_n_file_name.append(n_file_name)
        list_n_index.append(n_index)
    pd_n_para = pd.DataFrame({'Index':list_n_index, 'topic_id': list_topic_id, 'file': list_n_file_name, 'paragraph': list_n_para, 'probability': list_n_p})
    return(pd_n_para.apply(pd.Series.explode).reset_index().drop('index', axis = 1))

Below we get the the 3 most prototypical paragraphs of each topic when we set the optimal L to be 0.333. 

In [121]:
L = 1/3 # set the optimal L based on the analysis above
top_n_filter(df_topic_para2, int(1/L))[top_n_filter(df_topic_para2, int(1/L))['probability'] >= L].style.set_properties(subset = ['paragraph'], **{'width':'500px', 'length': '50px'})

Unnamed: 0,Index,topic_id,file,paragraph,probability
0,17263,1,Written_evidence_submitted_by_Richard_Baldwin_-_Dec_2009.pdf,"In my view what is needed is a specific initiative to promote the Olympic Legacy linked to CASC status and in which those responsible for that Legacy will actively participate. This would involve the provision of funding to develop and execute the CASC initiative which would be aimed at the increase of awareness of the CASC scheme and the provision of support to clubs wishing to register. The effectiveness of it could be measured—eg by setting targets for the number of club registrations and total cash benefits received. The initiative would include: — working with the Olympic ""authority"" responsible for ""soft legacy"" to exploit all CASC opportunities; — working with National Governing Bodies of Sport particularly the Olympic sports to assist their clubs to register; — circulars/communication with sports clubs; — regional workshops working with County Sports Partnerships and Local Authorities; — hotline support—by telephone or e mail; and — monitoring progress, feedback and easing administration in consultation with HMRC CASC Unit in Bootle",0.53141
1,12700,1,The_next_lap_-_April_2008_-_vol_1.pdf,"2. British Cycling told us that the cycling facilities at the Velopark had “the potential to be absolutely world-class” and that they “should be the very best anywhere in the world”.152 There has nonetheless been a certain amount of controversy about the extent to which the Velopark will offer a suitable replacement for off-road facilities at the former Eastway Circuit, lost when land was assembled by the LDA for incorporation into the Olympic Park. The design currently proposed by the ODA for the Velopark offers most of the facilities previously available at Eastway, albeit in a more fragmented layout. British Cycling, despite being supportive of the proposed design for use during the Games and despite anticipating that, after the Games, the Velopark will “provide a boost for cycling”,153 initially lodged objections to the relevant planning applications on the grounds that they did “not provide an adequate or comparable replacement for the road and off-road facilities provided to cycling on the Eastway Circuit”. British Cycling is now satisfied that the ODA has taken on board its concerns and that current plans for the Velopark offer an acceptable replacement for Eastway. The Eastway Users Group, which has campaigned for off-road cycling facilities in the Velopark in legacy mode, remains frustrated by the uncertainty about future provision, and it has pointed out to us that facilities at Eastway closed before the ODA or LDA had provided any suitable temporary alternative, causing 149 Ev 107",0.529183
2,17258,1,Written_evidence_submitted_by_Richard_Baldwin_-_Dec_2009.pdf,"To date over 5,500 clubs have registered under the scheme benefiting from a cash injection of around £65 million. The CASC scheme therefore has the capacity to inject the necessary finance—if there were no further registrations the registered clubs will continue to benefit at the rate of £15 million per year but increasing the number of registered clubs by 1,000 would increase the flow of cash by around £3 million per year in total. The average club benefits by just under £3,000 each year and this would treble the surpluses by the average club within sports other than golf providing much needed finance to support an increased capacity for participation",0.499889
3,8304,2,PAC_26_April_2012_-_Olympic_Costs_-_corrected_evidence_(no_report).pdf,"You think that the assessment of risks is our best estimate of the most likely outcome of the budget as a whole. But actually the assessment of risk-and how we have compiled it-is this: we have not sought to estimate how likely it is that every risk arises. We just said, ""Let us think about every risk that could arise, and let us assume that they all arise and work out the likely cost of them all arising."" On top of that, we said, ""And there will be some risks that we just cannot think about that are unknown unknowns. There will be some multiple consequentials if everything came together."" So we end up with an estimate not of the most likely cost of the project, which is what the burden of paragraph 1 of the PAC Report understands it is, but an estimate of how much we would need to set aside in the very unlikely event that all risks arise and some more unknown risks arise as well. The purpose of that is not to get to an estimate of the likely outcome of the budget. Its purpose and why we do it is to see, against any reasonable view of the likely risk that might arise, even on an assumption that they all arise and some more unknown risks arise, whether we have enough money. The conclusion has always been, yes, we had. Against what is therefore, in my view, a conservative and prudent estimate, we had £36 million headroom at the time of the NAO Report. We had more, and indeed the picture over the six-month period since the original figures on which the NAO was recording this, is that the contingency has gone down by £27 million or so-we reckon, because these are provisional figures, but I want to give our best figures-and the assessed risks on that very conservative and prudent basis have gone down by £136 million. So the picture on the budget as a whole is that we are spending contingency significantly slower than risks are disappearing from the programme. That is why, without in any sense being complacent, I am confident that we will bring this in within budget, and I do not think that the budget is close to being used up",0.684665
4,8388,2,PAC_26_April_2012_-_Olympic_Costs_-_corrected_evidence_(no_report).pdf,"Jonathan Stephens: There are two aspects to risk: likelihood and impact. What we are saying is that we made no estimate of likelihood, we just wrote in a 100% likelihood of all the risks we could think of and some unknown risks that we could not think of. We then looked at impact, and on impact we said, ""If this risk were to materialise-we are assuming a 100% likelihood that it materialises-what is the likely cost?"" That is where you get the low, the most likely outcome and the high outcome. When you add those together, you do not get to an outcome of, ""What is the most likely expenditure on the programme?""; you get to an outcome of, ""If all conceivable risks arise, plus some unknown risks that we cannot identify, what is the likely expenditure?"" That is a conservative and prudent view of, ""Do we have enough contingency left, if all those risks arise?"" In practice, they won’t all arise. It is conceivable that some will arise, but it is pretty unlikely that all of them will arise. It is perfectly conceivable that some individual risk will arise at a higher estimate than the most likely estimate, but the prospect of all those risks arising is unlikely. The prospect of them all arising at the very highest possible cost is so unlikely as to not provide a good basis for planning. I am sorry. I am going on at some length, but there is a real point",0.664391
5,8320,2,PAC_26_April_2012_-_Olympic_Costs_-_corrected_evidence_(no_report).pdf,"Jonathan Stephens: No, what I am saying is that we have tried to identify all possible risks that we can think of. We have then said, what impact do they have? What is the range? The high, medium and low, the medium being the most likely. If that risk materialises, what is the most likely outcome? You might then also say, having identified all your risks, how likely is it that each one materialises? That would then lead you to something that could be, as the Committee’s Report refers to it, characterised as ""the most likely expenditure"". But I am just trying to be clear. I suspect we haven’t explained it clearly enough, but there is a real important point at the bottom of it: that is not what we did. We said we will just assume all the risk materialised. We will not try to estimate which one is more likely to materialise and which one isn’t, and what sort of percentage figure to put against it. We will just assume they will all materialise, and put in the most likely cost if it does materialise to get to a conservative and prudent",0.594573
6,284,3,Jan_2003_-_Qs_140-159.pdf,"That is why you do it, but that is the easy bit. The hard bit, as the experience of the Commonwealth Games has shown, the experience of Picketts Lock has shown, the experience of Wembley has shown, is that it is very easy to be driven by euphoria alone and then the hard reckoning follows afterwards and that is what we want to avoid, so that we say to people, if we decide to bid, yes, we have decided because, in full understanding of the consequences, this is such a great thing for Britain and if we do not bid it will be because we have decided after rigorous examination that the costs are just too great and other very precious priorities, not just of the Government but of people up and down the country, would have to suffer if we were to do this. I think the poll, which I hope you got in good time, that we commissioned showed some very interesting conclusions. Yes, people are overwhelmingly in favour of our making a bid. The numbers fall when people are pressed in committing themselves to that alongside the consequences. There is one very interesting chart in the polling evidence which shows that, from memory, if you tell somebody they have £100, overwhelmingly what the sample showed was that you spend the largest slug of your money on more schools, the next largest slug on more hospitals, the next largest slug on increasing the value of pensions and then, interestingly, above reducing taxes, you spend money on an Olympic bid. I think the public's priorities are very clear indeed, but nobody should believe that hosting the Olympics would be somehow a free good, that we can just decide to do it as a decision that is divorced from the costs, the costs for sport, the costs for transport and all our commitments to public service renewal",0.569327
7,113,3,Jan_2003_-_Qs_1-19.pdf,"(Mr Bostock) Thank you, Mr Chairman. There are three very specific points there. Could you just give me a few minutes to give you the context of the work we did, and then I would like to answer those three specific points very clearly? I have a very short introduction to start with because I think it is very important for you to understand the context in which the work was done. My name is Mark Bostock; I am a director of Arup and was project director for the particular study which is the subject of the discussion today. First of all, very quickly, Arup is a very large, global multi disciplinary, planning, project management and engineering practice with a very large resource base here in London. We were http://www.publications.parliament.uk/pa/cm200203/cmselect/cmcumeds/268/3011401.htmcommissioned, appointed, to undertake this work in association with Insignia Richard Ellis following a competitive tender, and during the 16-week elapsed time for this work we met with our steering committee approximately every two weeks and a pretty rigorous programme was set out, and that steering committee comprised representatives from the Greater London Authority, the British Olympic Association, the London Development Agency, UK Sport, Sport England, Her Majesty's Treasury and the Department of Culture, Media and Sport, and we had the Cabinet Office's Performance and Innovation Unit with observer status. We reported on 21 May last year, 2002. After approximately five months Government put into the public domain the 12-page summary report which you made reference to, and I need to emphasise again that the main report is not in the public domain and I am very pleased you have made reference to that in your introduction. I would like to make a comment on the scope of work. What we were commissioned to do was to establish the outline construction costs for all the permanent and temporary sport facilities and all the sporting infrastructure; we were asked to look at the costs for bidding for and staging the Games; we were asked to establish the benefits of bidding for and staging the Games and establishing the residual legacy costs and benefits. What we were not asked to do was examine winnability, nor were we asked to examine the opportunity costs of Government spending, and I must make that very clear. Coming closer to the points in the context of the answers that I will give you in a second, we developed our own specimen Games—and I must emphasise specimen Games—and these were centred on an Olympic zone in the Lower Lee Valley in East London, so that provided the basis of our appraisal. During our work and in our report we have been totally transparent in terms of all the various assumptions that we have made in our evaluation and I need to say, with great emphasis, that the financial appraisal methodology was accepted by the Treasury. We always anticipated, as we did this work and completed it, that each of the key stakeholders—the Government, the London Government and the Britain Olympic Association—would evaluate the project from their own point of view, including risks, risk assessment and mitigation of that risk in order to examine the impact of a possible Olympic Games on their respective organisations, and I need to make this very specific point, because risk assessment from the point of view of each of the key stakeholders was not part of our commission. What I want to make absolutely clear, therefore, is that in providing an independent assessment of whether London should or should not bid or stage for the London Olympic Games we concluded very strongly that the key stakeholders needed to enter into a concordat, and I would be very happy to indicate what that meant, because this is planning for a major event in 12 years plus a legacy thereafter, and we have concluded that in the event that an appropriate concordat cannot be entered into between the three stakeholders there is absolutely no point in proceeding with these propositions. Also, we have indicated in the summary report five areas where we felt more work needed to be done from the time we completed our work in May and June and a decision which we thought would have been made in November—it is January now—and that additional work is very clearly set out in section S16 of the summary report",0.568172
8,451,3,Jan_2003_-_Qs_200-220.pdf,"(Sir Rodney Walker) I hardly think I am the appropriate person to pass judgement on this but, as always, I am happy to give you a view. You asked the questions of the previous witnesses and the fact is that this project can succeed providing you get the ownership right. The ownership has to be with government, absolutely no question. It seems to me that it is unlikely that Crossrail could be relied upon to be ready in time. I think there would need to be very convincing arguments put forward as to how the transport can be organised so as to make traffic movements around London acceptable to those who go about their work and those who come to London to attend the Olympic Games. This is a very personal view. I do not think it is necessarily helpful that at this stage two expert groups using the same base data on what the underwriting costs of the Olympic Games might be managed to come up with figures about a billion pounds apart, although I heard today that those numbers are still being worked upon. I accept the Chairman's point that Manchester worked because there was in the end clear ownership. Wembley became difficult because at several stages there was no clear ownership of the project. If there is clear ministerial-led ownership of this project and if all the right bodies work together, if vested interests are set aside, there is absolutely no reason why this country is not capable of both bidding for and succeeding in securing the Olympic Games, and indeed organising a successful event",0.561426
9,3639,4,NAO_Preparing_for_sporting_success_-_March_2008.pdf,"4 UK Sport’s ‘ultimate goals’ for medal success at the London 2012 Games will require a step change in performance amongst elite athletes. The achievements of athletes at recent elite international events in a number of sports, including sailing, cycling, rowing, boxing, disability equestrian and disability shooting, suggest that performance levels in some sports are already improving significantly. Following increased spending on elite sport, host nations can typically expect to win an extra six or seven gold medals at an Olympic Games and to win medals across a wider range of sports. This ‘host nation effect’ would not in itself be enough to deliver UK Sport’s Olympic goal, which is likely to require an improvement of eight or nine gold medals over the Great Britain team’s performance at the Athens Games in 2004 if the relative performance of other nations remained the same. Changes in the performance of other nations since 2004, especially in the context of a general trend of increased spending on elite sport, sometimes referred to as a ‘global sporting arms race’, may also have implications for UK Sport in delivering its medal aspirations",0.599309


### 5.3.4. N most prototypical paragraphs of each topic

In [122]:
# 2 most prototypical paragraphs of each topic
N2 = 2
top_n_filter(df_topic_para2, N2).style.set_properties(subset = ['paragraph'], **{'width':'500px', 'length': '50px'})

Unnamed: 0,Index,topic_id,file,paragraph,probability
0,17263,1,Written_evidence_submitted_by_Richard_Baldwin_-_Dec_2009.pdf,"In my view what is needed is a specific initiative to promote the Olympic Legacy linked to CASC status and in which those responsible for that Legacy will actively participate. This would involve the provision of funding to develop and execute the CASC initiative which would be aimed at the increase of awareness of the CASC scheme and the provision of support to clubs wishing to register. The effectiveness of it could be measured—eg by setting targets for the number of club registrations and total cash benefits received. The initiative would include: — working with the Olympic ""authority"" responsible for ""soft legacy"" to exploit all CASC opportunities; — working with National Governing Bodies of Sport particularly the Olympic sports to assist their clubs to register; — circulars/communication with sports clubs; — regional workshops working with County Sports Partnerships and Local Authorities; — hotline support—by telephone or e mail; and — monitoring progress, feedback and easing administration in consultation with HMRC CASC Unit in Bootle",0.53141
1,12700,1,The_next_lap_-_April_2008_-_vol_1.pdf,"2. British Cycling told us that the cycling facilities at the Velopark had “the potential to be absolutely world-class” and that they “should be the very best anywhere in the world”.152 There has nonetheless been a certain amount of controversy about the extent to which the Velopark will offer a suitable replacement for off-road facilities at the former Eastway Circuit, lost when land was assembled by the LDA for incorporation into the Olympic Park. The design currently proposed by the ODA for the Velopark offers most of the facilities previously available at Eastway, albeit in a more fragmented layout. British Cycling, despite being supportive of the proposed design for use during the Games and despite anticipating that, after the Games, the Velopark will “provide a boost for cycling”,153 initially lodged objections to the relevant planning applications on the grounds that they did “not provide an adequate or comparable replacement for the road and off-road facilities provided to cycling on the Eastway Circuit”. British Cycling is now satisfied that the ODA has taken on board its concerns and that current plans for the Velopark offer an acceptable replacement for Eastway. The Eastway Users Group, which has campaigned for off-road cycling facilities in the Velopark in legacy mode, remains frustrated by the uncertainty about future provision, and it has pointed out to us that facilities at Eastway closed before the ODA or LDA had provided any suitable temporary alternative, causing 149 Ev 107",0.529183
2,8304,2,PAC_26_April_2012_-_Olympic_Costs_-_corrected_evidence_(no_report).pdf,"You think that the assessment of risks is our best estimate of the most likely outcome of the budget as a whole. But actually the assessment of risk-and how we have compiled it-is this: we have not sought to estimate how likely it is that every risk arises. We just said, ""Let us think about every risk that could arise, and let us assume that they all arise and work out the likely cost of them all arising."" On top of that, we said, ""And there will be some risks that we just cannot think about that are unknown unknowns. There will be some multiple consequentials if everything came together."" So we end up with an estimate not of the most likely cost of the project, which is what the burden of paragraph 1 of the PAC Report understands it is, but an estimate of how much we would need to set aside in the very unlikely event that all risks arise and some more unknown risks arise as well. The purpose of that is not to get to an estimate of the likely outcome of the budget. Its purpose and why we do it is to see, against any reasonable view of the likely risk that might arise, even on an assumption that they all arise and some more unknown risks arise, whether we have enough money. The conclusion has always been, yes, we had. Against what is therefore, in my view, a conservative and prudent estimate, we had £36 million headroom at the time of the NAO Report. We had more, and indeed the picture over the six-month period since the original figures on which the NAO was recording this, is that the contingency has gone down by £27 million or so-we reckon, because these are provisional figures, but I want to give our best figures-and the assessed risks on that very conservative and prudent basis have gone down by £136 million. So the picture on the budget as a whole is that we are spending contingency significantly slower than risks are disappearing from the programme. That is why, without in any sense being complacent, I am confident that we will bring this in within budget, and I do not think that the budget is close to being used up",0.684665
3,8388,2,PAC_26_April_2012_-_Olympic_Costs_-_corrected_evidence_(no_report).pdf,"Jonathan Stephens: There are two aspects to risk: likelihood and impact. What we are saying is that we made no estimate of likelihood, we just wrote in a 100% likelihood of all the risks we could think of and some unknown risks that we could not think of. We then looked at impact, and on impact we said, ""If this risk were to materialise-we are assuming a 100% likelihood that it materialises-what is the likely cost?"" That is where you get the low, the most likely outcome and the high outcome. When you add those together, you do not get to an outcome of, ""What is the most likely expenditure on the programme?""; you get to an outcome of, ""If all conceivable risks arise, plus some unknown risks that we cannot identify, what is the likely expenditure?"" That is a conservative and prudent view of, ""Do we have enough contingency left, if all those risks arise?"" In practice, they won’t all arise. It is conceivable that some will arise, but it is pretty unlikely that all of them will arise. It is perfectly conceivable that some individual risk will arise at a higher estimate than the most likely estimate, but the prospect of all those risks arising is unlikely. The prospect of them all arising at the very highest possible cost is so unlikely as to not provide a good basis for planning. I am sorry. I am going on at some length, but there is a real point",0.664391
4,284,3,Jan_2003_-_Qs_140-159.pdf,"That is why you do it, but that is the easy bit. The hard bit, as the experience of the Commonwealth Games has shown, the experience of Picketts Lock has shown, the experience of Wembley has shown, is that it is very easy to be driven by euphoria alone and then the hard reckoning follows afterwards and that is what we want to avoid, so that we say to people, if we decide to bid, yes, we have decided because, in full understanding of the consequences, this is such a great thing for Britain and if we do not bid it will be because we have decided after rigorous examination that the costs are just too great and other very precious priorities, not just of the Government but of people up and down the country, would have to suffer if we were to do this. I think the poll, which I hope you got in good time, that we commissioned showed some very interesting conclusions. Yes, people are overwhelmingly in favour of our making a bid. The numbers fall when people are pressed in committing themselves to that alongside the consequences. There is one very interesting chart in the polling evidence which shows that, from memory, if you tell somebody they have £100, overwhelmingly what the sample showed was that you spend the largest slug of your money on more schools, the next largest slug on more hospitals, the next largest slug on increasing the value of pensions and then, interestingly, above reducing taxes, you spend money on an Olympic bid. I think the public's priorities are very clear indeed, but nobody should believe that hosting the Olympics would be somehow a free good, that we can just decide to do it as a decision that is divorced from the costs, the costs for sport, the costs for transport and all our commitments to public service renewal",0.569327
5,113,3,Jan_2003_-_Qs_1-19.pdf,"(Mr Bostock) Thank you, Mr Chairman. There are three very specific points there. Could you just give me a few minutes to give you the context of the work we did, and then I would like to answer those three specific points very clearly? I have a very short introduction to start with because I think it is very important for you to understand the context in which the work was done. My name is Mark Bostock; I am a director of Arup and was project director for the particular study which is the subject of the discussion today. First of all, very quickly, Arup is a very large, global multi disciplinary, planning, project management and engineering practice with a very large resource base here in London. We were http://www.publications.parliament.uk/pa/cm200203/cmselect/cmcumeds/268/3011401.htmcommissioned, appointed, to undertake this work in association with Insignia Richard Ellis following a competitive tender, and during the 16-week elapsed time for this work we met with our steering committee approximately every two weeks and a pretty rigorous programme was set out, and that steering committee comprised representatives from the Greater London Authority, the British Olympic Association, the London Development Agency, UK Sport, Sport England, Her Majesty's Treasury and the Department of Culture, Media and Sport, and we had the Cabinet Office's Performance and Innovation Unit with observer status. We reported on 21 May last year, 2002. After approximately five months Government put into the public domain the 12-page summary report which you made reference to, and I need to emphasise again that the main report is not in the public domain and I am very pleased you have made reference to that in your introduction. I would like to make a comment on the scope of work. What we were commissioned to do was to establish the outline construction costs for all the permanent and temporary sport facilities and all the sporting infrastructure; we were asked to look at the costs for bidding for and staging the Games; we were asked to establish the benefits of bidding for and staging the Games and establishing the residual legacy costs and benefits. What we were not asked to do was examine winnability, nor were we asked to examine the opportunity costs of Government spending, and I must make that very clear. Coming closer to the points in the context of the answers that I will give you in a second, we developed our own specimen Games—and I must emphasise specimen Games—and these were centred on an Olympic zone in the Lower Lee Valley in East London, so that provided the basis of our appraisal. During our work and in our report we have been totally transparent in terms of all the various assumptions that we have made in our evaluation and I need to say, with great emphasis, that the financial appraisal methodology was accepted by the Treasury. We always anticipated, as we did this work and completed it, that each of the key stakeholders—the Government, the London Government and the Britain Olympic Association—would evaluate the project from their own point of view, including risks, risk assessment and mitigation of that risk in order to examine the impact of a possible Olympic Games on their respective organisations, and I need to make this very specific point, because risk assessment from the point of view of each of the key stakeholders was not part of our commission. What I want to make absolutely clear, therefore, is that in providing an independent assessment of whether London should or should not bid or stage for the London Olympic Games we concluded very strongly that the key stakeholders needed to enter into a concordat, and I would be very happy to indicate what that meant, because this is planning for a major event in 12 years plus a legacy thereafter, and we have concluded that in the event that an appropriate concordat cannot be entered into between the three stakeholders there is absolutely no point in proceeding with these propositions. Also, we have indicated in the summary report five areas where we felt more work needed to be done from the time we completed our work in May and June and a decision which we thought would have been made in November—it is January now—and that additional work is very clearly set out in section S16 of the summary report",0.568172
6,3639,4,NAO_Preparing_for_sporting_success_-_March_2008.pdf,"4 UK Sport’s ‘ultimate goals’ for medal success at the London 2012 Games will require a step change in performance amongst elite athletes. The achievements of athletes at recent elite international events in a number of sports, including sailing, cycling, rowing, boxing, disability equestrian and disability shooting, suggest that performance levels in some sports are already improving significantly. Following increased spending on elite sport, host nations can typically expect to win an extra six or seven gold medals at an Olympic Games and to win medals across a wider range of sports. This ‘host nation effect’ would not in itself be enough to deliver UK Sport’s Olympic goal, which is likely to require an improvement of eight or nine gold medals over the Great Britain team’s performance at the Athens Games in 2004 if the relative performance of other nations remained the same. Changes in the performance of other nations since 2004, especially in the context of a general trend of increased spending on elite sport, sometimes referred to as a ‘global sporting arms race’, may also have implications for UK Sport in delivering its medal aspirations",0.599309
7,1468,4,Memorandum_submitted_by_British_Olympic_Association_-_Nov_2007.pdf,"Six days after London was awarded the 2012 Olympic Games during a conference called by the BOA, British Sport agreed to set an aspirational target for Team GB to finish in fourth place in the 2012 medal table. The BOA has had a succession of outstanding performance personnel, Sir Clive Woodward being the current Director of Elite Performance who is taking forward the brief to analyse the ways in which the organisation could continue to strengthen its services to the Olympic governing bodies and their athletes in order to help reach the 2012 target. After a year's research, the result has been the development of an ""Elite Performance Programme"" which places the athlete and coach at the centre of a support network made up of leading specialists from areas including kinesiology, physiology, nutrition and performance analysis. A unique communication and analysis system will ensure the athlete receives 24/7 support from the network",0.590843
8,17552,5,Written_evidence_submitted_by_Vision_2020_UK_-_Jan_2010.pdf,"4. Most clubs seem ill-prepared for enquiries from, and inclusion of, people with disabilities who wish to participate in the sport offered by the club. There is little or no support for specialist clubs who provide opportunities for sport that cannot be integrated ie wheelchair basketball or blind cricket! There is often nothing locally to support the child or their parent in accessing the specialist provision and this often involves their having to undertake extensive travel to specialist facilities or organisations catering for this group. There is poor information regarding availability etc and http://www.parasport.org.uk was established as a portal to provide pathway and provision information. The then Mayor of London published a strategy in 2007, which highlighted all these issues and to date there has been little, if any, action to redress these anomalies in London or in the rest of the country. DCSF have, in my view, shown no leadership regarding the legacy of 2012 and its impact on PE in schools and the inclusion of those with disabilities in core curriculum activities or sporting opportunities within or after school. DCMS held a legacy event in 2008 and again in April 2009 focusing on the legacy of the Games for those with disabilities. One outcome was to seek greater links between DWP, DCSF and themselves to ensure that joint strategies were developed and pathways established that enabled children to enjoy and participate in PE and sport within schools/after school clubs/integrated and specialist provision in the community, with good national talent forums and pathways established for those who wish to participate in sport at a higher level and finally, with governing bodies having clear inclusive programmes for sports men and women with disabilities active at a national and international level",0.605244
9,16542,5,Written_evidence_submitted_by_Incorporated_Society_of_Musicians_-_Jan_2010.pdf,"2.3.1.1 Creative industries grow at least 1% more than the rest of the economy. [1] Music grew by 4.7% from 2007 to 2008. [2] 2.3.1.2 Creative industries more than double our investment. The contribution of music to the UK economy reached well over £3.5 billion with the arts estimated to put over £2 back into the economy for every £1 invested [2]. Over 120,000 people are employed in music. [1] In addition, consumers spent over £4 billion on music in 2000. [2] 2.3.2 Music education and effective teaching of music technology are vital to the continuing world leadership of the UK in music. The UK currently lacks the capability to take full advantage of music technology even though music technology has the ability to motivate pupils. [3] The Cultural Olympiad gives an opportunity to put music technology at the centre of Continuing Professional Development. We believe that there is an opportunity to create Cultural Olympiad music technology teachers, available to every local authority and able to provide training and continuing professional development to the current music teacher workforce",0.580403


### 5.3.5.  N most prototypical paragraphs of a specific topic

In [123]:
topic_id_chosen = 2                                    # choose the topic ID
num_para = 2                                            # set N to extract the N most prototypical paragraphs of a specific topic
df_n_topic_k = top_n_filter(df_topic_para2, num_para)
topic_id_filter = df_n_topic_k['topic_id'] == topic_id_chosen
df_n_topic_k[topic_id_filter].style.set_properties(subset = ['paragraph'], **{'width':'500px', 'length': '50px'})

Unnamed: 0,Index,topic_id,file,paragraph,probability
2,8304,2,PAC_26_April_2012_-_Olympic_Costs_-_corrected_evidence_(no_report).pdf,"You think that the assessment of risks is our best estimate of the most likely outcome of the budget as a whole. But actually the assessment of risk-and how we have compiled it-is this: we have not sought to estimate how likely it is that every risk arises. We just said, ""Let us think about every risk that could arise, and let us assume that they all arise and work out the likely cost of them all arising."" On top of that, we said, ""And there will be some risks that we just cannot think about that are unknown unknowns. There will be some multiple consequentials if everything came together."" So we end up with an estimate not of the most likely cost of the project, which is what the burden of paragraph 1 of the PAC Report understands it is, but an estimate of how much we would need to set aside in the very unlikely event that all risks arise and some more unknown risks arise as well. The purpose of that is not to get to an estimate of the likely outcome of the budget. Its purpose and why we do it is to see, against any reasonable view of the likely risk that might arise, even on an assumption that they all arise and some more unknown risks arise, whether we have enough money. The conclusion has always been, yes, we had. Against what is therefore, in my view, a conservative and prudent estimate, we had £36 million headroom at the time of the NAO Report. We had more, and indeed the picture over the six-month period since the original figures on which the NAO was recording this, is that the contingency has gone down by £27 million or so-we reckon, because these are provisional figures, but I want to give our best figures-and the assessed risks on that very conservative and prudent basis have gone down by £136 million. So the picture on the budget as a whole is that we are spending contingency significantly slower than risks are disappearing from the programme. That is why, without in any sense being complacent, I am confident that we will bring this in within budget, and I do not think that the budget is close to being used up",0.684665
3,8388,2,PAC_26_April_2012_-_Olympic_Costs_-_corrected_evidence_(no_report).pdf,"Jonathan Stephens: There are two aspects to risk: likelihood and impact. What we are saying is that we made no estimate of likelihood, we just wrote in a 100% likelihood of all the risks we could think of and some unknown risks that we could not think of. We then looked at impact, and on impact we said, ""If this risk were to materialise-we are assuming a 100% likelihood that it materialises-what is the likely cost?"" That is where you get the low, the most likely outcome and the high outcome. When you add those together, you do not get to an outcome of, ""What is the most likely expenditure on the programme?""; you get to an outcome of, ""If all conceivable risks arise, plus some unknown risks that we cannot identify, what is the likely expenditure?"" That is a conservative and prudent view of, ""Do we have enough contingency left, if all those risks arise?"" In practice, they won’t all arise. It is conceivable that some will arise, but it is pretty unlikely that all of them will arise. It is perfectly conceivable that some individual risk will arise at a higher estimate than the most likely estimate, but the prospect of all those risks arising is unlikely. The prospect of them all arising at the very highest possible cost is so unlikely as to not provide a good basis for planning. I am sorry. I am going on at some length, but there is a real point",0.664391


## 5.4 Dashboard for topic interpretation

Below the visualisation of PyLDAvis and the prototypical paragraphs are integrated into a dashboard, users can click the link generated to open the dashboard and interpret the topics more easily. To 
launch the dash, remember to download the two css files from https://github.com/suhao3123/CSS, create a folder named assets in the root of your app directory and include the two files in that folder.
After the first run of the whole program, users can run the chunks below independently.

In [130]:
import plotly.express as px 
import plotly.graph_objects as go

from jupyter_dash import JupyterDash

import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
from dash_table.Format import Format, Scheme, Trim
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate

In [131]:
# load the topic distribution of paragraphs from disk
df_topic_para3 = pd.read_pickle('./df_topic_para_Olympics2.pkl')
df_topic_para3_n = df_topic_para3.copy()
df_topic_para3_n['highest_p'] = df_topic_para3_n.iloc[:, 6:].max(axis = 1)         
df_topic_para3_n['salient_topic'] = df_topic_para3_n.iloc[:, 6:].idxmax(axis = 1)  
df_topic_para3_n = df_topic_para3_n[['index','file_name','salient_topic','paragraphs','highest_p',]]
df_topic_para3_n.columns = ['Index','file','topic', 'paragraph','probability']

In [132]:
# define the function for extracting the highest N ranked paragraphs from each topic
def top_n_filter(df, top_n):
    list_topic_id = [x+1 for x in range(0,k)]
    list_n_para = []
    list_n_p = []
    list_n_index = []
    list_n_file_name = []
    for x in range(1, k + 1): 
        n_para = [i for i in df.nlargest(top_n, [x])['paragraphs']]
        n_p = [i for i in df.nlargest(top_n, [x])[x]]
        n_index = [i for i in df_topic_para3.nlargest(top_n, [x]).index]
        n_file_name = [i for i in df.nlargest(top_n, [x])['file_name']]
        list_n_para.append(n_para)
        list_n_p.append(n_p)
        list_n_file_name.append(n_file_name)
        list_n_index.append(n_index)
    pd_n_para = pd.DataFrame({'Index':list_n_index, 'topic_id': list_topic_id, 'file': list_n_file_name, 'paragraph': list_n_para, 'probability': list_n_p})
    return(pd_n_para.apply(pd.Series.explode).reset_index().drop('index', axis = 1))

In [133]:
list_mark = list(np.arange(0,1.050,0.050))
list_mark_round = [round(i, 2) for i in list_mark]
marks= {x: str(x) for x in list_mark_round}

In [134]:
# Set up the app
external_stylesheets = [dbc.themes.BOOTSTRAP, "assets/bootstrap.min.css"]
app = JupyterDash(__name__, external_stylesheets=external_stylesheets)

# Bootstrap's cards provide a flexible content container with multiple variants and options.
pyLDAcard = dbc.Card(
    [
            dbc.CardHeader(html.H4("Topic visualisation")),                # title
            dbc.CardBody(
            [
                dbc.Row(
                        dbc.Col(
                            [
                                html.Embed(src = "assets/lda.html" ,style={ 'position': 'relative', 'left': '-250px', 'top': '-100px',
                                                                            'width':'1400px', 'height':'860px', 'transform': 'scale(0.70)'}), 
                            ]
                        )
                )
            ]
        ),
    ]
)


table_card = dbc.Card(
    [
        dbc.CardHeader(
            dbc.Row([
                  dbc.Col(html.H4("Prototypical paragraphs"))
            ])            
        ),
        
        
        dbc.CardHeader(
                         dbc.Row(
                            [
                            dbc.Col(
                            [
                                html.H6("Threshold of probability "),
                                dcc.Slider(
                                            id='slider',
                                            min=0,
                                            max=1,
                                            step=0.01,
                                marks=marks,
                                        value=0.1,
                                        ),html.Div(style={'width': '1000px'})
                            ]
                        ),
                        dbc.Col(
                        [
                                html.H6("Topic Selection"),
                                dcc.Input(id="topic_selection", type="number",min=1, max=11, step=1, value=1),
                                html.Div(style={'width': '100px'})
                        ]
                        ),
                          dbc.Col(
                        [
                                html.H6("N Selection"),
                                dcc.Input(id="rank_selection", type="number",min=1, max=20, step=1,value=5),
                                html.Div(style={'width': '100px'})
                        ]
                        ),
                                dbc.Col(
                                    [
                                        html.H6("Mode"),
                                        dcc.Dropdown(
                                                            id='dropdown',
                                                            options=[
                                                                {'label': 'N most prototypical paragraphs for topic K', 'value': 'c1'},
                                                                {'label': 'N most prototypical paragraphs overall', 'value': 'c2'},
                                                                {'label': 'N most prototypical paragraphs for each topic', 'value': 'c3'}
                                                            ],
                                               #             value = 'c1',
                                                            searchable=False,
                                                            clearable=False,
                                                            placeholder="Select a mode",
                                                        ),html.Div(style={'width': '380px'})
                                    ]
                                ),                                
                ]
            )                    
                ),
        
        dbc.CardBody(
                dbc.Col([
                    dash_table.DataTable(),html.Div(id="data_table")           
                ])    
                ),
        
        dbc.CardFooter(
            dbc.Row([
                dbc.Col(
                                    [
                                        html.H6('Please click the "Submit" button after setting the parameters above'),html.Div(style={'width': '500px'})

                                    ]
                                ),
                                
                dbc.Col(
                                    [
                                        dbc.Button("Submit", id='submit', color="success"),
                                        html.Div(id='button')
                                    ]
                                )
                ])
                )       
    ]
)
        
app.layout = html.Div(
    [
        dbc.Container(
            [dbc.Row(
                [
                dbc.Col(pyLDAcard,md=7), 
                dbc.Col(table_card,md=5)
            ]             
            )
            ],
            fluid=True,
        ),
    ]
)

@app.callback(
    Output('data_table','children'),
    Input('submit', 'n_clicks'), Input('dropdown', 'value'), Input('slider', 'value'), Input('topic_selection','value'), Input('rank_selection','value')
     )

def update_datatable(n_clicks, dropdown_value, slider_value,topic_value,top_n):
  
    ctx = dash.callback_context
    if not ctx.triggered:
        button_id = 'No clicks'
    else:
        button_id = ctx.triggered[0]['prop_id'].split('.')[0]
    
#    print(button_id)
                 
    if button_id=="submit":
        topic = topic_value                        #Topic filter of the Highest ranked paragraphs
        Top_N = top_n                          #Set rank of for for topic 
#        print(topic_value)
#        print(Top_N)   
        
        minimum_probability = slider_value #Topics with an assigned probability lower than this threshold will be discarded.
#        print(minimum_probability)            

        if dropdown_value=='c1':
            c_df = top_n_filter(df_topic_para3, Top_N)[top_n_filter(df_topic_para3, Top_N)['topic_id'] == topic]
        elif dropdown_value=='c2':
            c_df = df_topic_para3_n.nlargest(Top_N,['probability'])
        elif dropdown_value=='c3':
            c_df = top_n_filter(df_topic_para3, Top_N)
        else:
            return None
#        print(dropdown_value)
        
        table = dash_table.DataTable(
                                    id="table-line-1",
                                    columns=[
                                                dict(id=c_df.columns[0], name=c_df.columns[0]),
                                                dict(id=c_df.columns[1], name=c_df.columns[1]),
                                                dict(id=c_df.columns[2], name=c_df.columns[2]),
                                                dict(id=c_df.columns[3], name=c_df.columns[3]),
                                                dict(id=c_df.columns[4], name=c_df.columns[4], type='numeric', format=Format(precision=2, scheme=Scheme.fixed)),             
                                            ],
                                    data=c_df.to_dict("records"),
                       #             page_action='none',
                                    page_size=5,
                                    style_table={'height': '1000px', 'overflowY': 'auto'},
                                    fixed_rows={'headers': True},
                                    style_header={ 'border': '1px solid black', 'fontWeight': 'bold','textAlign': 'center', 'fontSize':'1px'},
                                    style_cell={  'fontSize':'1px','border': '1px solid grey','minWidth': 10, 'maxWidth': 30, 'width': 30,'whiteSpace': 'normal',
                                                'height': 'auto', 'lineHeight': '15px','textAlign': 'center','textOverflow': 'ellipsis', 'maxWidth': 0},
                                    css=[{
                                            'selector': '.dash-spreadsheet td div',
                                            'rule': '''
                                                line-height: 15px;
                                                max-height: 300px; min-height: 50px; height: 300px;
                                                display: block;
                                                overflow-y: hidden;
                                            '''
                                        }],
                                     style_cell_conditional=[
                                                                    {'if': {'column_id': 'Index'},'width': '5%'},
                                                                    {'if': {'column_id': 'file'},'width': '10%' },
                                                                    {'if': {'column_id': 'topic_id'},'width': '5%' },
                                                                    {'if': {'column_id': 'paragraph'},'width': '75%','textAlign': 'left'},
                                                                    {'if': {'column_id': 'probability'},'width': '5%'},
                                                                    
    
                                                                ],
    
                                    style_as_list_view=True,
                 )
#        print('end')
        return table
          
app.run_server(mode = 'external')

Dash app running on http://127.0.0.1:8050/


In [135]:
# remove the hash below and run the chunk to terminate the Dash
# app._terminate_server_for_port('localhost', 8050)