# Prerequisites

In [57]:
# pip install PyMuPDF                    # (install PyMuPDF for extracting info from PDF files)
# pip install tika                       # (install tika for extracting paragraphs from PDF files)
# pip install spacy==2.2.0               # (install spacy for lemmatization)
# conda install gensim                   # (intall gesim for topic modelling)
# pip install pyLDAvis                   # (install pyLDAvis for topic modelling visulisation)
# conda install -c conda-forge pyldavis  # (if you use aconda to install pyLADvis)

In [58]:
import pandas as pd
import numpy as np
import re
from pprint import pprint

# glob for extracting the directories of metadata
import glob

# PyMuPDF
import fitz

# tika
import tika               
from tika import parser   

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Visualisation
import plotly.express as px
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import os

# Import pdf files, data wrangling and overview

In [59]:
# Extract the directories of the PDF files, make sure the folder name does not contain number
pdf_dir = "D:\LEON\Business Analytics\Study\9. Business Project\Data set\Olympics"
pdf_files = glob.glob("%s/*.pdf" % pdf_dir)
pdf_files[:1]

['D:\\LEON\\Business Analytics\\Study\\9. Business Project\\Data set\\Olympics\\Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf']

In [60]:
# Use PyMuPDF to extract all info of the PDF files (text, title, date, etc)
list_metadata = []
for i in pdf_files:
    with fitz.open(i) as doc:
        info = doc.metadata
        info['file_name'] = os.path.basename(i)
        text = ''
        for page in doc:
            text+= page.getText()
        info['Content'] = text
        
    list_metadata.append(info)

In [61]:
df = pd.DataFrame(list_metadata)
df['document_id'] = df.index
df = df.drop_duplicates(subset = 'Content') # drop duplicate rows
df = df.dropna(subset=['Content']) # drop rows whose text content is NaN
df['Word_count'] = df ['Content'].str.count(' ') + 1
df.head(3)

Unnamed: 0,format,title,author,subject,keywords,creator,producer,creationDate,modDate,trapped,encryption,file_name,Content,document_id,Word_count
0,PDF 1.7,,B Lewis,,,Microsoft Word,,D:20210822083603+00'00',D:20210822083603+00'00',,,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,Examination of Witnesses (1-19) \n16 SEPTEMBER...,0,6115
1,PDF 1.7,,B Lewis,,,Microsoft Word,,D:20210822083606+00'00',D:20210822083606+00'00',,,Examination_of_Witnesses_Sept_2003_-_Q20-39.pdf,Examination of Witnesses (20-39) \n16 SEPTEMBE...,1,4002
2,PDF 1.7,,B Lewis,,,Microsoft Word,,D:20210822083609+00'00',D:20210822083609+00'00',,,Examination_of_Witnesses_Sept_2003_-_Q40-44.pdf,Examination of Witnesses (40-44) \n16 SEPTEMBE...,2,1007


In [62]:
# check if there are documents with few words
min_word_count= 10                                               # set the mininum word_count of a document
min_word_count_filter = df['Word_count'] <= min_word_count
df_few_words = df[min_word_count_filter][['file_name', 'Content']]
df_few_words

Unnamed: 0,file_name,Content


In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169 entries, 0 to 168
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   format        169 non-null    object
 1   title         169 non-null    object
 2   author        169 non-null    object
 3   subject       169 non-null    object
 4   keywords      169 non-null    object
 5   creator       169 non-null    object
 6   producer      169 non-null    object
 7   creationDate  169 non-null    object
 8   modDate       169 non-null    object
 9   trapped       169 non-null    object
 10  encryption    3 non-null      object
 11  file_name     169 non-null    object
 12  Content       169 non-null    object
 13  document_id   169 non-null    int64 
 14  Word_count    169 non-null    int64 
dtypes: int64(2), object(13)
memory usage: 21.1+ KB


### Word count

In [64]:
# Word count
df['Word_count'].sum( )

1054090

In [65]:
# Word count distribution
#import seaborn as sns
#ax1 = sns.distplot(df['Word_count'])
#ax1.set(title = 'Word Count Distribution',
#       xlabel = 'Word Count of Each Document');

# Tokenization

In [66]:
data = df.Content.values.tolist()

In [67]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence).encode('utf-8'), deacc=True))  # deacc=True removes punctuations

data_words= list(sent_to_words(data))

# Processing words: 
Remove Stopwords, Make Bigrams and Trigrams,Lemmatisation, remove short words and meaningless words

In [68]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [69]:
# import the stop_words from gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
stop_words = [i for i in STOPWORDS]

# add more stop words after analysing the overall term frequncy of each topic in pyLDAvis in the "Word frequency of each topic" section
new_stop_words = ['go', 'would', 'make', 'think', 'take', 'say', 'need', 'want', 'thing', 'have', 'lot'] 
stop_words.extend(new_stop_words)
stop_words

['further',
 'against',
 'which',
 'found',
 'their',
 'toward',
 'fill',
 'them',
 'couldnt',
 'yourself',
 'about',
 'by',
 'own',
 'only',
 'always',
 'therein',
 'go',
 'nobody',
 'thru',
 'a',
 'if',
 'these',
 'bottom',
 'have',
 'moreover',
 'take',
 'computer',
 'done',
 'doesn',
 'onto',
 'made',
 'ltd',
 'empty',
 'herself',
 'among',
 'should',
 'various',
 'amount',
 'her',
 'some',
 'sometime',
 'per',
 'up',
 'hers',
 'hence',
 'quite',
 'herein',
 'full',
 'most',
 'through',
 'who',
 'bill',
 'such',
 'con',
 'unless',
 'us',
 'either',
 'are',
 'may',
 'whatever',
 'someone',
 'because',
 'again',
 'across',
 'we',
 'ours',
 'those',
 'within',
 'anywhere',
 'third',
 'he',
 'seemed',
 'twenty',
 'others',
 'everywhere',
 'being',
 'his',
 'no',
 'was',
 'meanwhile',
 'somewhere',
 'been',
 'same',
 'give',
 'they',
 'my',
 'hereby',
 'another',
 'say',
 'mostly',
 'everything',
 'system',
 'towards',
 'thereafter',
 'noone',
 'whenever',
 'several',
 'for',
 'wherein'

In [70]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stop_words(texts):
    return [[word for word in doc if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [71]:
# Form Trigrams
data_words_trigrams = make_trigrams(data_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

#increase the maximum length of text that the parser or NER can process
nlp.max_length = 13000000 #

# Do lemmatization keeping only noun, adj, vb
data_lemmatized1 = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB'])

# Set the length of word threshold for removing the words less than the threshold
minimum_len = 3 #
data_lemmatized2 = []
for i in data_lemmatized1:
    new_element = [x for x in i if len(x) >= minimum_len]
    data_lemmatized2.append(new_element)

# remove stop words
data_lemmatized = remove_stop_words(data_lemmatized2)
data_lemmatized[:1]

[['examination',
  'witness',
  'morning',
  'like',
  'welcome',
  'today',
  'occasional',
  'series',
  'checking',
  'session',
  'follow',
  'regard',
  'olympic',
  'bid',
  'committee',
  'report',
  'bring',
  'good',
  'express',
  'number',
  'concern',
  'issue',
  'ought',
  'consider',
  'interested',
  'hear',
  'understand',
  'course',
  'acceptable',
  'appearance',
  'like',
  'open',
  'statement',
  'happy',
  'listen',
  'thank',
  'thank',
  'opportunity',
  'come',
  'today',
  'talk',
  'little_bit',
  'plan',
  'hope',
  'come',
  'month',
  'month',
  'shall',
  'able',
  'meet',
  'update',
  'progress',
  'like',
  'role',
  'week',
  'explain',
  'sitting',
  'role',
  'offer',
  'accept',
  'work',
  'like',
  'little_bit',
  'idea',
  'job',
  'philosophy',
  'bid',
  'update',
  'initial',
  'thought',
  'reaction',
  'work',
  'date',
  'sense',
  'progress',
  'week',
  'talk',
  'little_bit',
  'future',
  'motivation',
  'role',
  'come',
  'number',

#  Create the Dictionary and Corpus needed for Topic Modeling

In [72]:
# Create Dictionary, set the parameters to filter out tokens in the dictionary by their frequency
no_below = 5             # filter out the tokens less frequent than no_below documents (absolute number)
no_above = 0.85         # filter out the tokens more frequent than no_above documents (fraction of the total corpus size)
id2word = corpora.Dictionary(data_lemmatized)
id2word.filter_extremes(no_below = no_below, no_above = no_above)
print('After removal of high and low frequency words - Number of unique tokens: %d, %d' % (len(id2word),id2word.num_pos))

After removal of high and low frequency words - Number of unique tokens: 3420, 356211


In [73]:
# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 2), (1, 4), (2, 3), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 3), (13, 1), (14, 1), (15, 2), (16, 3), (17, 1), (18, 2), (19, 3), (20, 1), (21, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 5), (29, 1), (30, 4), (31, 2), (32, 1), (33, 1), (34, 2), (35, 9), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 2), (42, 4), (43, 3), (44, 1), (45, 1), (46, 1), (47, 2), (48, 1), (49, 3), (50, 1), (51, 1), (52, 1), (53, 1), (54, 11), (55, 17), (56, 1), (57, 1), (58, 61), (59, 4), (60, 3), (61, 1), (62, 1), (63, 5), (64, 1), (65, 3), (66, 1), (67, 1), (68, 11), (69, 1), (70, 2), (71, 2), (72, 8), (73, 5), (74, 1), (75, 2), (76, 1), (77, 1), (78, 2), (79, 1), (80, 10), (81, 2), (82, 3), (83, 1), (84, 4), (85, 1), (86, 1), (87, 2), (88, 4), (89, 16), (90, 1), (91, 1), (92, 2), (93, 3), (94, 16), (95, 1), (96, 1), (97, 3), (98, 2), (99, 1), (100, 2), (101, 5), (102, 1), (103, 1), (104, 1), (105, 1), (106, 4), (107, 1), (108, 3), (109, 1), (

# Building LDA Model, Hyperameter (k) tuning

In [74]:
# set training parameters
k = 20
passes = 20
iterations = 100
alpha = 50.0/k   
eta = 0.01
random_state = 12345
minimum_probability = 0

In [75]:
#start=3; limit=63; step=3
#coherence_values = []
#model_list = []
#for i in range(start,limit,step):
    #model = gensim.models.LdaModel(corpus = corpus,id2word = id2word,alpha = alpha,eta = eta,
    #                               iterations = iterations,num_topics = i,passes = passes,random_state = 12345,minimum_probability = minimum_probability)
    #model_list.append(model)
    #coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
    #coherence_values.append(coherencemodel.get_coherence())

In [76]:
#list_num_topics = [i for i in range(start, limit, step)]
#df_coherence1 = pd.DataFrame({'Number_of_Topics': list_num_topics, 'Coherence_Score': coherence_values})
# save the result to disk
#df_coherence1.to_pickle('./df_coherence1.pkl')
# load the result from disk
#df_coherence = pd.read_pickle('./df_coherence1.pkl') 

In [77]:
# plot the coherence score against number of topics
#fig1 = px.line(df_coherence, x = 'Number_of_Topics', y="Coherence_Score", title = 'Coherence score against number of topics')
#fig1.update_layout(autosize=False, width=1000, height=400)
#fig1.update_traces(mode = "lines + markers")
#fig1.show()

In [78]:
# num of topics = 15 to get the optimal coherence socre
k = 15
lda_model = gensim.models.LdaModel(
    corpus = corpus,
    id2word = id2word,
    alpha = alpha,
    eta = eta,
    iterations = iterations,
    num_topics = k,
    passes = passes,
    random_state = 12345,
    minimum_probability = minimum_probability)

# Classify the paragraphs based on the trained model

### Extract paragraphs from documents

In [79]:
# define the function for spliting texts into paragraphs by delimiter '.\n\n' or '. \n\n'
def para_split(i):
    j = parser.from_file(i)
    m = j['content']
    if m.count('.\n\n') > 0:
        return m.split('.\n\n')
    else:
        return m.split('. \n\n')

In [80]:
list_paragraphs = []
list_para_id = []
for i in pdf_files:
    para = para_split(i)
    para = [w.replace('\n', '') for w in para]
    para = [x.strip() for x in para if x.strip()] # remove empty elements
    para_id = [x for x in range(len(para))] 
    list_paragraphs.append(para)
    list_para_id.append(para_id)

2021-08-24 16:38:38,781 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


In [81]:
df_para1 = df.copy()
df_para1['paragraphs'] = list_paragraphs
df_para1['para_id'] = list_para_id
df_para2 = df_para1.apply(pd.Series.explode)
df_para3 = df_para2.reset_index()
df_para4 = df_para3[['creationDate', 'document_id', 'file_name', 'para_id', 'paragraphs']]
df_para4

Unnamed: 0,creationDate,document_id,file_name,para_id,paragraphs
0,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,0,Examination of Witnesses (1-19) 16 SEPTEMBER 2...
1,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,1,Ms Cassani: Thank you very much. Thank you ver...
2,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,2,The first thing I should like to say is that I...
3,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,3,Really the backdrop is that I believe in the G...
4,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,4,"Work had been done before I arrived, but it wa..."
...,...,...,...,...,...
9413,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,8,3.6 The UK School Games (UKSG) is in its fift...
9414,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,9,3.7 National Talent Orientation Camp (NTOC) i...
9415,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,10,4. The use and management of the Olympic Par...
9416,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,11,To reflect the diverse range of learning oppor...


In [82]:
# set a filter to filter out the paragraphs with short words
n_word_count = 10                                                        # set the threshold of word count
para_word_count = df_para4['paragraphs'].str.split().str.len()           # word count of each paragraph
df_para = df_para4[(para_word_count>=n_word_count)].reset_index()        # select only the paragraphs with word count above the threshold
df_para

Unnamed: 0,index,creationDate,document_id,file_name,para_id,paragraphs
0,0,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,0,Examination of Witnesses (1-19) 16 SEPTEMBER 2...
1,1,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,1,Ms Cassani: Thank you very much. Thank you ver...
2,2,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,2,The first thing I should like to say is that I...
3,3,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,3,Really the backdrop is that I believe in the G...
4,4,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,4,"Work had been done before I arrived, but it wa..."
...,...,...,...,...,...,...
9031,9413,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,8,3.6 The UK School Games (UKSG) is in its fift...
9032,9414,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,9,3.7 National Talent Orientation Camp (NTOC) i...
9033,9415,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,10,4. The use and management of the Olympic Par...
9034,9416,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,11,To reflect the diverse range of learning oppor...


### Process the paragraphs

In [83]:
# tokenization
data2 = df_para.paragraphs.values.tolist()
data_words2 = list(sent_to_words(data2))

In [84]:
# Form Trigrams
data_words_trigrams2 = make_trigrams(data_words2)

# Do lemmatization keeping only noun, adj, vb
data_lemmatized2 = lemmatization(data_words_trigrams2, allowed_postags=['NOUN', 'ADJ', 'VERB'])

# set the length of word threshold as same as before for removing the words less than the threshold
data_lemmatized2_2 = []
for i in data_lemmatized2:
    new_element = [x for x in i if len(x) >= minimum_len]
    data_lemmatized2_2.append(new_element)
    
# Remove Stop Words
data_lemmatized2_1 = remove_stop_words(data_lemmatized2_2)

### Classify topics of paragraphs

In [85]:
# create the function for converting a list of tuples into a dictionary
def Convert(tup, di):
    di = dict(tup)
    return di

In [86]:
# belong function: classify topics of paragraphs, it might take a long time because there are 148,651 paragraphs in the 11,132,849-word corpus
list_topic_para = []
dictionary_topic_para = {}
for d in data_lemmatized2_1:
    bow = id2word.doc2bow(d)
    belong = lda_model[bow]
    doc_dic = Convert(belong, dictionary_topic_para)
    list_topic_para.append(doc_dic)
    df_topic_para = pd.DataFrame(list_topic_para)

In [87]:
# topic distribution across paragraphs
df_topic_para1_1 = pd.merge(df_para, df_topic_para, how = 'left', left_index=True, right_index=True)
df_topic_para1_1

Unnamed: 0,index,creationDate,document_id,file_name,para_id,paragraphs,0,1,2,3,...,5,6,7,8,9,10,11,12,13,14
0,0,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,0,Examination of Witnesses (1-19) 16 SEPTEMBER 2...,0.264099,0.056441,0.053210,0.043741,...,0.039536,0.041172,0.041856,0.057480,0.049892,0.049552,0.046832,0.046121,0.085228,0.074961
1,1,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,1,Ms Cassani: Thank you very much. Thank you ver...,0.178765,0.060504,0.059209,0.055825,...,0.068108,0.050133,0.053222,0.060253,0.051254,0.047354,0.060308,0.072113,0.062526,0.071474
2,2,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,2,The first thing I should like to say is that I...,0.448698,0.042182,0.050906,0.032559,...,0.041565,0.042981,0.067326,0.042903,0.042512,0.033146,0.026889,0.021902,0.043790,0.040101
3,3,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,3,Really the backdrop is that I believe in the G...,0.086949,0.082949,0.088710,0.072257,...,0.064899,0.066332,0.061827,0.069837,0.055869,0.071979,0.046693,0.044440,0.089825,0.058302
4,4,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,4,"Work had been done before I arrived, but it wa...",0.544812,0.027690,0.040986,0.020258,...,0.063154,0.020994,0.022601,0.031870,0.023925,0.074512,0.022566,0.026889,0.030235,0.034321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9031,9413,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,8,3.6 The UK School Games (UKSG) is in its fift...,0.051836,0.044182,0.140922,0.035175,...,0.064867,0.158303,0.156996,0.050473,0.061442,0.071039,0.034026,0.026567,0.037492,0.041286
9032,9414,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,9,3.7 National Talent Orientation Camp (NTOC) i...,0.044399,0.037435,0.190747,0.032430,...,0.063587,0.077983,0.195458,0.096571,0.039054,0.031579,0.052412,0.030616,0.029055,0.036747
9033,9415,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,10,4. The use and management of the Olympic Par...,0.029769,0.031935,0.149961,0.015824,...,0.029594,0.195677,0.314154,0.043026,0.047350,0.018870,0.036016,0.021760,0.018036,0.023226
9034,9416,D:20210822084546+00'00',168,Written_evidence_submitted_by_Youth_Sport_Trus...,11,To reflect the diverse range of learning oppor...,0.039409,0.050738,0.093577,0.026326,...,0.063554,0.207879,0.249455,0.046287,0.069563,0.029143,0.030464,0.020033,0.023470,0.026987


In [88]:
# save the result to disk
df_topic_para1_1.to_pickle('./df_topic_para_Olympics.pkl')

In [89]:
# load the result from disk
df_topic_para1 = pd.read_pickle('./df_topic_para_Olympics.pkl') 

### Highest N ranked paragraphs overall

In [117]:
df_topic_para1_n = df_topic_para1.copy()
df_topic_para1_n['highest_p'] = df_topic_para1_n.iloc[:, 6:].max(axis = 1)         # get the highest probability among the topic distribution of each paragraph
df_topic_para1_n['salient_topic'] = df_topic_para1.iloc[:, 6:].idxmax(axis = 1)    # get the corresponding topic id
df_topic_para1_n = df_topic_para1_n[['file_name', 'para_id', 'paragraphs', 'salient_topic', 'highest_p']]

In [118]:
# highest 10 ranked paragraphs overall
df_topic_para1_n.nlargest(10,['highest_p'])

Unnamed: 0,file_name,para_id,paragraphs,salient_topic,highest_p
4308,Preparations_for_Olympic_Games_March_2012_Repo...,9,cobber Pack: U PL: COE1 [O] Processed: [05-03-...,14,0.811896
3690,PAC_-_Preparations_for_the_Olympics_-_July_200...,1,DCMS/UK SportELITE SPORTLOCOGSTAGEOLYMPIC PARK...,11,0.784927
8070,Written_evidence_-_Appendix_6_-_TfL.pdf,5,Although part of this proposed new capacity wi...,4,0.78314
3689,PAC_-_Preparations_for_the_Olympics_-_July_200...,0,Microsoft Word - Preparations for the London 2...,11,0.763432
4302,Preparations_for_Olympic_Games_March_2012_Repo...,3,Jonathan Stephens: That was reduced and we hav...,14,0.759308
4303,Preparations_for_Olympic_Games_March_2012_Repo...,4,Q14 Matthew Hancock: If your risks materialise...,14,0.751763
1987,NAO_Preparations_for_the_Olympics_-_Progress_r...,193,PART FOuR41PREPARATIONS FOR THE LONDON 2012 OL...,11,0.746663
6181,The_next_lap_-_April_2008_-_vol_1.pdf,68,"289 Ev 126 290 Q 1 291 Ev 29 57 medals, of wh...",8,0.741447
1911,NAO_Preparations_for_the_Olympics_-_Progress_r...,117,Whether the Olympic Delivery Authority’s expe...,11,0.736445
4301,Preparations_for_Olympic_Games_March_2012_Repo...,2,Q1 Chair: Apologies for keeping you waiting. I...,14,0.733505


### Highest N ranked paragraphs from each topic

In [119]:
# define the function for extracting the highest N ranked paragraphs from each topic
def top_n_filter(df, top_n):
    list_topic_id = [x for x in range(0,k)]
    list_n_para = []
    list_n_p = []
    list_n_file_name = []
    list_n_para_id = []
    for x in range(0, k): 
        n_para = [i for i in df.nlargest(top_n, [x])['paragraphs']]
        n_p = [i for i in df.nlargest(top_n, [x])[x]]
        n_file_name = [i for i in df.nlargest(top_n, [x])['file_name']]
        n_para_id = [i for i in df.nlargest(top_n, [x])['para_id']]
        list_n_para.append(n_para)
        list_n_p.append(n_p)
        list_n_file_name.append(n_file_name)
        list_n_para_id.append(n_para_id)
    pd_n_para = pd.DataFrame({'topic_id': list_topic_id, 'salient_paragraph': list_n_para, 'probability': list_n_p, 
                              'file_name': list_n_file_name, 'para_id': list_n_para_id})
    return(pd_n_para.apply(pd.Series.explode))

In [120]:
# highest 2 ranked paragraphs from each topic
top_n_filter(df_topic_para1, 2)

Unnamed: 0,topic_id,salient_paragraph,probability,file_name,para_id
0,0,Q195 Paul Farrelly: I just wanted to explore s...,0.612425,Qs_180-199.pdf,18
0,0,Q36 Alan Keen: Mischievously I was thinking t...,0.586679,Qs_20-39.pdf,16
1,1,282 Q 185 283 Paper from West Sussex County Co...,0.581509,Report_and_Minutes_-_Jan_2007.pdf,70
1,1,118. The Tourism Alliance calculated that any ...,0.491753,Report_and_Minutes_-_Jan_2007.pdf,62
2,2,Q79 Adam Price: We do fantastically well in t...,0.595137,Qs_65-79.pdf,27
2,2,"MR CRAIG REEDIE CBE, MS SUE CAMPBELL CBE, MR D...",0.593952,Qs_80-87.pdf,0
3,3,24% of the respondents said they had been infl...,0.118456,Report_and_Minutes_-_Jan_2007_-_vol_2_-_eviden...,1243
3,3,11. ESCA urges the Government to support a bid...,0.109099,Written_evidence_-_Appendix_20_-_European_Spon...,10
4,4,Although part of this proposed new capacity wi...,0.78314,Written_evidence_-_Appendix_6_-_TfL.pdf,5
4,4,"However, the distance of the venues and facili...",0.660676,Written_evidence_-_Appendix_6_-_TfL.pdf,2


### Highest N ranked paragraphs for topic K

In [121]:
topic_id_chosen = 7                                    # choose the topic ID
num_para = 2                                           # set N
df_n_topic_k = top_n_filter(df_topic_para1, num_para)
topic_id_filter = df_n_topic_k['topic_id'] == topic_id_chosen
df_n_topic_k[topic_id_filter]

Unnamed: 0,topic_id,salient_paragraph,probability,file_name,para_id
7,7,"RECOMMENDATION 2 — Clarify the term ""sports l...",0.57963,Written_evidence_submitted_by_Institute_for_Sp...,32
7,7,11. Progress towards meeting targets to incr...,0.574416,Written_evidence_submitted_by_Business_in_Spor...,9


### Highest N ranked paragraphs where the belong() function is greater than the threshold for M topics at a time

In [122]:
# selecting the paragraphs where the belong() function is greater than the threshold for M topics at a time
threshold = 1/3                                                    # set threshold 
topic_filter = df_topic_para1.iloc[:, 5:].max(axis=1) > threshold  # set filter
df_topic_para_M = df_topic_para1[topic_filter]                     # extract the qualified paragraphs
df_topic_para_M

Unnamed: 0,index,creationDate,document_id,file_name,para_id,paragraphs,0,1,2,3,...,5,6,7,8,9,10,11,12,13,14
2,2,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,2,The first thing I should like to say is that I...,0.448698,0.042182,0.050906,0.032559,...,0.041565,0.042981,0.067326,0.042903,0.042512,0.033146,0.026889,0.021902,0.043790,0.040101
4,4,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,4,"Work had been done before I arrived, but it wa...",0.544812,0.027690,0.040986,0.020258,...,0.063154,0.020994,0.022601,0.031870,0.023925,0.074512,0.022566,0.026889,0.030235,0.034321
7,7,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,7,Q4 Michael Fabricant: On the subject of chief ...,0.347304,0.047192,0.064022,0.035998,...,0.037753,0.055128,0.051708,0.048856,0.036429,0.051404,0.035263,0.035098,0.049965,0.066178
10,10,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,10,Q5 Michael Fabricant: You have quite rightly p...,0.416513,0.055265,0.051724,0.033384,...,0.033659,0.033500,0.034667,0.045696,0.034636,0.051259,0.030681,0.026341,0.075497,0.045886
25,25,D:20210822083603+00'00',0,Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,25,"Q15 Derek Wyatt: When push comes to shove, bec...",0.418475,0.041771,0.048149,0.026838,...,0.034159,0.038141,0.041549,0.036144,0.032368,0.037540,0.030989,0.027088,0.116715,0.041385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8975,9357,D:20210822084535+00'00',166,Written_evidence_submitted_by_VisitBritain_-_J...,0,Written evidence submitted by VisitBritain ...,0.040789,0.413946,0.024507,0.026332,...,0.035273,0.022108,0.065532,0.033995,0.138878,0.060030,0.030546,0.026136,0.022509,0.037007
8979,9361,D:20210822084535+00'00',166,Written_evidence_submitted_by_VisitBritain_-_J...,4,— Germany 2006 helped propel Germany to the nu...,0.037791,0.363466,0.045149,0.044737,...,0.038407,0.035220,0.071158,0.039809,0.066573,0.085637,0.035973,0.036451,0.030553,0.032698
8986,9368,D:20210822084535+00'00',166,Written_evidence_submitted_by_VisitBritain_-_J...,11,(ii) Industry engagement: We are providing inf...,0.042116,0.390533,0.030286,0.022817,...,0.060635,0.024670,0.058087,0.026387,0.129058,0.031158,0.037205,0.025153,0.021997,0.040223
9012,9394,D:20210822084543+00'00',167,Written_evidence_submitted_by_Womens_Sport_and...,5,— Of specific concern to us is the real risk t...,0.025756,0.040511,0.503758,0.016669,...,0.024758,0.030565,0.136145,0.029987,0.033208,0.019933,0.035771,0.019698,0.020927,0.036597


In [123]:
# Highest 2 ranked paragraphs where the belong() function is greater than the threshold for M topics at a time
top_n_filter(df_topic_para_M, 2)

Unnamed: 0,topic_id,salient_paragraph,probability,file_name,para_id
0,0,Q195 Paul Farrelly: I just wanted to explore s...,0.612425,Qs_180-199.pdf,18
0,0,Q36 Alan Keen: Mischievously I was thinking t...,0.586679,Qs_20-39.pdf,16
1,1,282 Q 185 283 Paper from West Sussex County Co...,0.581509,Report_and_Minutes_-_Jan_2007.pdf,70
1,1,118. The Tourism Alliance calculated that any ...,0.491753,Report_and_Minutes_-_Jan_2007.pdf,62
2,2,Q79 Adam Price: We do fantastically well in t...,0.595137,Qs_65-79.pdf,27
2,2,"MR CRAIG REEDIE CBE, MS SUE CAMPBELL CBE, MR D...",0.593952,Qs_80-87.pdf,0
3,3,Q37 Alan Keen: Of course not. Have you given a...,0.07532,Examination_of_Witnesses_Sept_2003_-_Q20-39.pdf,21
3,3,2.4 With the additional funding awarded to UK ...,0.072699,NAO_Preparing_for_sporting_success_-_March_200...,47
4,4,Although part of this proposed new capacity wi...,0.78314,Written_evidence_-_Appendix_6_-_TfL.pdf,5
4,4,"However, the distance of the venues and facili...",0.660676,Written_evidence_-_Appendix_6_-_TfL.pdf,2


# Overview of topics

### The most frequent 10 words of each topic

In [124]:
pprint(lda_model.print_topics())

[(0,
  '0.018*"people" + 0.013*"come" + 0.010*"work" + 0.010*"look" + 0.009*"year" '
  '+ 0.009*"time" + 0.008*"good" + 0.007*"country" + 0.007*"way" + '
  '0.007*"know"'),
 (1,
  '0.011*"work" + 0.009*"legacy" + 0.009*"lottery" + 0.008*"tourism" + '
  '0.007*"opportunity" + 0.007*"year" + 0.007*"community" + 0.007*"cost" + '
  '0.007*"ensure" + 0.006*"funding"'),
 (2,
  '0.021*"athlete" + 0.019*"people" + 0.011*"facility" + 0.011*"young" + '
  '0.010*"legacy" + 0.009*"work" + 0.009*"woman" + 0.009*"coach" + '
  '0.009*"year" + 0.009*"participation"'),
 (3,
  '0.030*"support" + 0.023*"country" + 0.023*"event" + 0.020*"legacy" + '
  '0.020*"people" + 0.019*"year" + 0.016*"work" + 0.016*"benefit" + '
  '0.016*"government" + 0.015*"world"'),
 (4,
  '0.079*"transport" + 0.020*"demand" + 0.018*"capacity" + 0.017*"strategy" + '
  '0.017*"traffic" + 0.016*"service" + 0.016*"measure" + 0.015*"network" + '
  '0.014*"provide" + 0.014*"additional"'),
 (5,
  '0.025*"work" + 0.020*"venue" + 0.015*"

### Topic distribution across documents

In [125]:
# topic distribution over documents
list_topic = []
dictionary_topic = {}
for d in texts:
    bow = id2word.doc2bow(d)
    belong = lda_model[bow]                        # generate a list of tuples of topic distribution of a document
    belong_dic = Convert(belong, dictionary_topic) # convert the list of tuples into a dictionary
    list_topic.append(belong_dic)           
                      
df_topic_distribution = pd.DataFrame(list_topic)   # convert the list of dictionaries into a dataframe
df_topic = pd.merge(df, df_topic_distribution, how = 'left', left_index=True, right_index=True) # merge with info of documents
df_topic.drop(['title','format','creator', 'producer', 'keywords', 'trapped', 'encryption','subject', 'modDate'], axis = 1)

Unnamed: 0,author,creationDate,file_name,Content,document_id,Word_count,0,1,2,3,...,5,6,7,8,9,10,11,12,13,14
0,B Lewis,D:20210822083603+00'00',Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf,Examination of Witnesses (1-19) \n16 SEPTEMBER...,0,6115,0.762270,0.014045,0.011273,0.003808,...,0.032934,0.007550,0.015072,0.009716,0.006242,0.026078,0.007708,0.008904,0.069848,0.006448
1,B Lewis,D:20210822083606+00'00',Examination_of_Witnesses_Sept_2003_-_Q20-39.pdf,Examination of Witnesses (20-39) \n16 SEPTEMBE...,1,4002,0.721928,0.040403,0.025434,0.010214,...,0.012686,0.017510,0.024745,0.025301,0.012830,0.017021,0.007002,0.008145,0.063700,0.007418
2,B Lewis,D:20210822083609+00'00',Examination_of_Witnesses_Sept_2003_-_Q40-44.pdf,Examination of Witnesses (40-44) \n16 SEPTEMBE...,2,1007,0.601631,0.026055,0.041408,0.020618,...,0.036289,0.014932,0.028744,0.028767,0.022419,0.022237,0.017032,0.031813,0.063847,0.028610
3,Bronwen Lewis,D:20210822084116+00'00',Further_supplementary_memorandum_submitted_by_...,Further supplementary memorandum submitted by ...,3,431,0.026270,0.029447,0.021083,0.020948,...,0.024601,0.022109,0.020284,0.034565,0.024005,0.028920,0.558704,0.042472,0.040768,0.077290
4,Bronwen Lewis,D:20210822083921+00'00',Further_Supplementary_Memorandum_submitted_by_...,Further supplementary memorandum submitted by ...,4,288,0.108709,0.045323,0.080110,0.035178,...,0.034570,0.041434,0.043759,0.172957,0.044690,0.044557,0.059830,0.094619,0.064567,0.078919
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,Bronwen Lewis,D:20210822084528+00'00',Written_evidence_submitted_by_UK_Sport_-_Jan_2...,Written evidence submitted by UK Sport \n \n ...,164,3089,0.014719,0.009452,0.118485,0.010908,...,0.009336,0.008633,0.397369,0.351860,0.013551,0.028016,0.008437,0.007704,0.006472,0.005597
165,Bronwen Lewis,D:20210822084531+00'00',Written_evidence_submitted_by_Vision_2020_UK_-...,Written evidence submitted by Vision 2020 UK ...,165,2284,0.004685,0.004967,0.005504,0.004435,...,0.004582,0.932329,0.005961,0.005188,0.004420,0.005068,0.004531,0.004355,0.004663,0.004458
166,Bronwen Lewis,D:20210822084535+00'00',Written_evidence_submitted_by_VisitBritain_-_J...,Written evidence submitted by VisitBritain \n...,166,2372,0.021996,0.585849,0.009938,0.006729,...,0.010932,0.005936,0.070428,0.008040,0.184818,0.044575,0.008384,0.010812,0.005491,0.009665
167,Bronwen Lewis,D:20210822084543+00'00',Written_evidence_submitted_by_Womens_Sport_and...,Written evidence submitted by the Women's Spor...,167,1966,0.011424,0.025606,0.616108,0.005895,...,0.013362,0.010853,0.212886,0.013126,0.022316,0.010588,0.015661,0.006185,0.008599,0.014753


# Topic interpretation
To interpret the topics, I combined the word frequncy demonstrated by pyLDAvis with prototypical documents or paragraphs suggested by PTBI proposed by Marchetti and Puranam (2020)

### Selection of most salient topics for interpretation
according to the PTBI proposed by Marchetti and Puranam (2020), not all topics are worth interpretation. To extract most salient topics for interpretation, for each topic, we need to compute the percentage of documents loading on the topic with the probability that the document belong to the topic > 1/k. Then graph the metric and select the sub-set of topics scoring the highest on it (e.g., based on scree plot).

In [126]:
# percentage of documents above threshold(1/k)
list_percent_above = []
for i in df_topic_distribution:
    num_above = df_topic_distribution[i][df_topic_distribution[i] > 1/k].count()
    percent_above =  num_above/len(df_topic_distribution)
    list_percent_above.append(percent_above)

In [127]:
list_topic_id = [x for x in range(0,k)]
df_salient_topic = pd.DataFrame({'topic_id': list_topic_id, 'percentage_of_documents_above_threshold': list_percent_above})
df_salient_topic.sort_values(by = 'percentage_of_documents_above_threshold', ascending = False)

Unnamed: 0,topic_id,percentage_of_documents_above_threshold
0,0,0.449704
9,9,0.278107
11,11,0.272189
1,1,0.260355
7,7,0.236686
8,8,0.236686
10,10,0.201183
2,2,0.189349
5,5,0.183432
13,13,0.177515


### Word frequency of each topic
check the words of each topic, if there're common words with high overall frequency such as "think" "want" or "make", return to the "import the stop_words from gensim" section, add these words to the list of stop words to remove them.

In [128]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, sort_topics = False )
vis

### Prototypical texts for each topic
I followed the method of extraction of prototypical text suggested by PTBI proposed by Marchetti and Puranam (2020). Its heart lies in that for parameter L (probability that a document belongs to a topic), at least 1/L documents with probablity that they belong to the topic >= L are needed to interpret the topics, this method tries to find out the opitimal value of L to maximize the percentage of interpretable topics (Marchetti and Puranam, 2020, p. 20)

In [129]:
List_num_doc = [x for x in range(1, 20, 1)] # generate a list of 1/L (minimum number of documents to interpret a topic)
list_L = [1/x for x in List_num_doc]        # generate a list of L

In [130]:
# create the function for computing the percentage of potentially interpretable topics against parameter L
def perc(i, df):
    list_num_topics = []
    for j in df:                                  
        topic_filter = df[j] >= i         
        m = df[j][topic_filter].count()           
        list_num_topics.append(m)                                             
        count1 = sum(map(lambda x : x >= 1/i, list_num_topics))                                     
        perc1 = count1 / k
    return(perc1)

The following chart shows that the percentage of potentially interpretable topics for “high enough” levels of L is not large enough, so the paragraph-based interpretation can be explored. 

In [131]:
list_perc1 = []
for i in list_L:
    num = perc(i, df_topic_distribution)
    list_perc1.append(num)

df_L1 = pd.DataFrame({'Parameter L': list_L, 'Percentage of interpretable topics': list_perc1})
fig_L1 = px.line(df_L1, x = 'Parameter L', y="Percentage of interpretable topics", title = 'Value selection for parameter L (document-based)')
fig_L1.update_layout(autosize=False, width=1200, height=400)
fig_L1.update_traces(mode = "lines + markers")
fig_L1.show()

The following chart shows that when L = 0.5, the the percentage of interpretable topics is 86.7%, so we set L = 0.5 - ie, each topic needs at least 2(1/L) paragraphs to be interpreted.

In [132]:
#df_topic_para2 = df_topic_para1.drop(['document_id', 'paragraphs'], axis = 1)
list_perc2 = []
for i in list_L:
    num = perc(i, df_topic_para)
    list_perc2.append(num)

df_L2 = pd.DataFrame({'Parameter L': list_L, 'Percentage of interpretable topics': list_perc2})
fig_L2 = px.line(df_L2, x = 'Parameter L', y="Percentage of interpretable topics", title = 'Value selection for parameter L (paragraph-based )')
fig_L2.update_layout(autosize=False, width=1200, height=400)
fig_L2.update_traces(mode = "lines + markers")
fig_L2.show()

# Build topic model on paragraphs

In [133]:
# tokenization
#data2 = df_para.paragraphs.values.tolist()
#data_words2_2 = list(sent_to_words(data2))

In [134]:
# set the length of word threshold for removing the words less than the threshold
#minimum_len = 4 
#data_words2 = []
#for i in data_words2_2:
#    new_element = [x for x in i if len(x) >= minimum_len]
#    data_words2.append(new_element)

In [135]:
# Bigram & Trigram
#bigram2 = gensim.models.Phrases(data_words2, min_count=5, threshold=100) # higher threshold fewer phrases.
#trigram2 = gensim.models.Phrases(bigram2[data_words2], threshold=100)  
#bigram_mod2 = gensim.models.phrases.Phraser(bigram2)
#trigram_mod2 = gensim.models.phrases.Phraser(trigram2)

In [136]:
# Remove Stop Words
#data_words_nostops2 = remove_stopwords(data_words2)

# Form Trigrams
#data_words_trigrams2 = make_trigrams(data_words_nostops2)

# Do lemmatization keeping only noun, adj, vb, adv
#data_lemmatized2 = lemmatization(data_words_trigrams2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [137]:
# Create Dictionary
#id2word2 = corpora.Dictionary(data_lemmatized2)

# Create Corpus
#texts2 = data_lemmatized2

# Term Document Frequency
#corpus2 = [id2word2.doc2bow(text) for text in texts]

In [138]:
#lda_model2 = gensim.models.LdaModel(
#    corpus=corpus2,
#    id2word=id2word2,
#   alpha=alpha,
#    eta=eta,
#    iterations=iterations,
#    num_topics=k, 
#   passes=passes)

In [139]:
# Compute Coherence Score
#coherence_model_lda2 = CoherenceModel(model=lda_model2, texts=data_lemmatized2, dictionary=id2word2, coherence='c_v')
#coherence_lda2 = coherence_model_lda2.get_coherence()
#print('\nCoherence Score: ', coherence_lda2)

In [140]:
# Visualize the topics
#vis2 = pyLDAvis.gensim_models.prepare(lda_model2, corpus2, id2word2, sort_topics = False)
#vis2