# Prerequisites

In [1]:
# pip install PyMuPDF                    # (install PyMuPDF for extracting info from PDF files)
# pip install tika                       # (install tika for extracting paragraphs from PDF files)
# pip install spacy==2.2.0               # (install spacy for lemmatization)
# conda install gensim                   # (install gensim for topic modelling)
# pip install pyLDAvis                   # (install pyLDAvis for topic modelling visualisation)
# conda install -c conda-forge pyldavis  # (if you use Anaconda to install pyLDAvis)

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus, then load the English stop-word list
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11004]
[nltk_data]     getaddrinfo failed>


In [3]:
import re
from pprint import pprint

# glob for extracting the directories of metadata
import glob

# PyMuPDF
import fitz

# tika
import tika               
from tika import parser   

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Visualisation
import plotly.express as px
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import os

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps



# Import pdf files, data wrangling and overview

In [4]:
# Extract the directories of the PDF files
# Raw string: the Windows path is full of backslashes, and sequences such as "\L" or "\9"
# are invalid escapes in a normal string literal (they only work by accident and trigger
# warnings on recent Python versions). The resulting value is unchanged.
pdf_dir = r"D:\LEON\Business Analytics\Study\9. Business Project\Data set\Crossrail"
pdf_files = glob.glob("%s/*.pdf" % pdf_dir)
pdf_files[3]

'D:\\LEON\\Business Analytics\\Study\\9. Business Project\\Data set\\Crossrail\\British_Railways_(London)_Bill_Lords_(By_Order)_20_Jan_1988.pdf'

In [5]:
# Use PyMuPDF to extract all info of the PDF files (text, title, date, etc)
# Each entry of `list_metadata` is the document's metadata dict extended with the
# file name and the concatenated text of all pages.
list_metadata = []
for pdf_path in pdf_files:
    with fitz.open(pdf_path) as doc:
        info = doc.metadata
        info['file_name'] = os.path.basename(pdf_path)
        # `Page.getText` is the legacy camelCase name; newer PyMuPDF releases expose
        # `Page.get_text`, so prefer it and fall back only on old installations.
        info['Content'] = ''.join(
            page.get_text() if hasattr(page, 'get_text') else page.getText()
            for page in doc
        )

    list_metadata.append(info)

mupdf: cmsOpenProfileFromMem failed


In [6]:
# One row per PDF; the row position is kept as an explicit `document_id` column
df = pd.DataFrame(list_metadata).assign(document_id=lambda frame: frame.index)
df.head(3)

Unnamed: 0,format,title,author,subject,keywords,creator,producer,creationDate,modDate,trapped,encryption,file_name,Content,document_id
0,PDF 1.7,,Katherine A Bloomfield,,,Microsoft® Word 2019,Microsoft® Word 2019,D:20201012160043+01'00',D:20201012160043+01'00',,,10-Year_Transport_Plan_02_Jul_2002.pdf,10-Year Transport Plan \n \n6. \n \nMr. Geoffr...,0
1,PDF 1.4,,,,,pdftk 2.02 - www.pdftk.com,itext-paulo-155 (itextpdf.sf.net-lowagie.com),D:20190730052236Z,D:20190730052236Z,,,10_year_plan_for_transport_21_May_2002.pdf,"House of Commons \nTransport, Local Government...",1
2,PDF 1.4,Microsoft Word - 021588,stellent,,,PScript5.dll Version 5.2,Acrobat Distiller 5.0.5 (Windows),D:20041008151844+01'00',D:20041008151844+01'00',,,A_New_deal_for_Transport_Better_for_everyone_W...,AnNew deal for Transport: Better for everyone ...,2


In [7]:
# Drop rows with duplicated or missing text. The index is rebuilt afterwards so that
# later index-based merges with positionally built frames (one row per remaining
# document) stay aligned; the original position is still available in `document_id`.
df = (
    df.drop_duplicates(subset='Content')   # drop duplicate rows
      .dropna(subset=['Content'])          # drop rows whose text content is NaN
      .reset_index(drop=True)
)
# Rough word count: number of space characters plus one
df['Word_count'] = df['Content'].str.count(' ') + 1
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 229 entries, 0 to 228
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   format        229 non-null    object
 1   title         229 non-null    object
 2   author        229 non-null    object
 3   subject       229 non-null    object
 4   keywords      229 non-null    object
 5   creator       229 non-null    object
 6   producer      229 non-null    object
 7   creationDate  229 non-null    object
 8   modDate       229 non-null    object
 9   trapped       229 non-null    object
 10  encryption    5 non-null      object
 11  file_name     229 non-null    object
 12  Content       229 non-null    object
 13  document_id   229 non-null    int64 
 14  Word_count    229 non-null    int64 
dtypes: int64(2), object(13)
memory usage: 28.6+ KB


### Word count

In [8]:
# Total word count over all documents (sum of the per-document `Word_count` column)
df['Word_count'].sum( )

11132849

In [10]:
# Word count distribution
#import seaborn as sns
#ax1 = sns.distplot(df['Word_count'])
#ax1.set(title = 'Word Count Distribution',
#       xlabel = 'Word Count of Each Document');

# Tokenization

In [11]:
# Plain Python list holding the text of every document, in row order
data = df['Content'].tolist()

In [12]:
def sent_to_words(sentences):
    """Lazily tokenise each text with gensim's ``simple_preprocess``.

    Every item is converted with ``str()``, UTF-8 encoded and tokenised with
    ``deacc=True``; one token list is yielded per input item.
    """
    for raw_text in sentences:
        encoded = str(raw_text).encode('utf-8')
        yield gensim.utils.simple_preprocess(encoded, deacc=True)  # deacc=True strips accent marks

# Materialise the token lists for all documents
data_words = list(sent_to_words(data))

# Processing words: 
Remove stopwords, make bigrams and trigrams, lemmatise, and remove short and meaningless words

In [13]:
# Fit the phrase detectors: bigrams on the raw tokens, trigrams on the bigram-merged tokens
# (a higher threshold yields fewer phrases)
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Wrap both fitted models in Phraser objects for applying them to token lists
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(fitted) for fitted in (bigram, trigram)
)

In [14]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenise each document and drop every token found in the global ``stop_words``.

    Args:
        texts: iterable of documents; each one is passed through ``str()`` and
            ``simple_preprocess`` before filtering.

    Returns:
        A list with one list of kept tokens per input document.
    """
    # Build the lookup set once: set membership is O(1), whereas scanning the stop-word
    # list for every token is needlessly slow on a multi-million-word corpus.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Apply the fitted bigram phraser ``bigram_mod`` to every token list in ``texts``."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every token list in ``texts``."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise token lists, keeping only tokens whose POS tag is allowed.

    Each token list is joined with spaces and run through the global spaCy
    pipeline ``nlp``; the lemma of every token whose ``pos_`` is in
    ``allowed_postags`` is kept. See https://spacy.io/api/annotation

    Returns:
        A list with one list of lemmas per input token list.
    """
    def _lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(tokens) for tokens in texts]

In [15]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
# data_words_bigrams = make_bigrams(data_words_nostops)

# Form Trigrams
data_words_trigrams = make_trigrams(data_words_nostops)

# Load the small English spaCy pipeline without its parser and NER components
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Raise the pipeline's maximum accepted text length so whole documents can be passed in
nlp.max_length = 13000000

# Lemmatise, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized1 = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized1[:1])

# Drop every lemma shorter than `minimum_len` characters
minimum_len = 4
data_lemmatized = [
    [lemma for lemma in doc_lemmas if len(lemma) >= minimum_len]
    for doc_lemmas in data_lemmatized1
]
print(data_lemmatized[:1])

[['year', 'transport', 'plan', 'cotswold', 'make', 'statement', 'recommendation', 'recent', 'report', 'transport', 'committee', 'relate', 'investment', 'transport', 'state', 'transport', 'committee', 'report', 'normal', 'way', 'addition', 'say', 'appoint', 'job', 'look', 'department', 'policy', 'keep', 'house', 'informed', 'policy', 'development', 'appropriate', 'committee', 'stingingly', 'critical', 'report', 'government', 'year', 'remain', 'concerned', 'lack', 'clarity', 'surround', 'financing', 'rail', 'lack', 'detailed', 'implementation', 'plan', 'major', 'barrier', 'improve', 'railway', 'expect', 'government', 'interim', 'target', 'define', 'work', 'programme', 'indeed', 'government', 'respond', 'report', 'revise', 'plan', 'produce', 'matter', 'smuggle', 'summer_recess', 'parliament', 'sit', 'report', 'table', 'normal', 'way', 'house', 'sit', 'time', 'particularly', 'conservative', 'government', 'funding', 'railway', 'indeed', 'everything_else', 'arrange', 'annual', 'basis', 'gove

#  Create the Dictionary and Corpus needed for Topic Modeling

In [16]:
# The lemmatised documents are the texts used for the dictionary, the corpus and coherence
texts = data_lemmatized

# Token <-> id mapping built from all documents
id2word = corpora.Dictionary(texts)

# Bag-of-words representation (token id, count) of every document
corpus = list(map(id2word.doc2bow, texts))

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 4), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 3), (21, 1), (22, 3), (23, 1), (24, 1), (25, 2), (26, 1), (27, 1), (28, 2), (29, 1), (30, 1), (31, 2), (32, 1), (33, 1), (34, 1), (35, 1), (36, 3), (37, 4), (38, 1), (39, 1), (40, 1), (41, 6), (42, 1), (43, 1), (44, 1), (45, 1), (46, 3), (47, 3), (48, 2), (49, 1), (50, 2), (51, 1), (52, 3), (53, 4), (54, 2), (55, 1), (56, 2), (57, 2), (58, 3), (59, 1), (60, 1), (61, 1), (62, 1), (63, 2), (64, 2), (65, 1), (66, 1), (67, 1), (68, 3), (69, 1), (70, 1), (71, 1), (72, 1), (73, 3), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 2), (96, 2), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 10), (103, 4), (104, 1), (105, 2), (106, 1), (107, 2), (108, 4), (109, 3), (110, 1

# Building LDA Model, Hyperparameter (k) tuning

In [17]:
# set training parameters
# These notebook-level values are passed to gensim.models.LdaModel further down.
k = 20                    # number of topics (passed as num_topics)
passes = 20               # passed as LdaModel(passes=...)
iterations = 100          # passed as LdaModel(iterations=...)
# alpha evaluates to 2.5 with k = 20. It is NOT recomputed when k is reassigned later,
# so models trained with another k still use this value.
alpha = 50.0/k   
eta = 0.01                # passed as LdaModel(eta=...)
random_state = 12345      # seed passed to LdaModel for reproducible training
minimum_probability = 0   # passed as LdaModel(minimum_probability=...)

In [36]:
# create the function for computing the coherence score of different models with different number of topics.
def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """Train one LDA model per candidate topic count and score each with c_v coherence.

    Args:
        dictionary: gensim dictionary used both for training and for the coherence model.
        corpus: bag-of-words corpus to train on.
        texts: tokenised texts used by the coherence model.
        limit: exclusive upper bound of the topic counts to try.
        start: first topic count to try.
        step: increment between topic counts.

    Returns:
        ``(model_list, coherence_values)``: the trained models and their coherence
        scores, in the order of ``range(start, limit, step)``.

    The remaining training settings (``alpha``, ``eta``, ``iterations``, ``passes``,
    ``random_state``, ``minimum_probability``) are read from notebook-level variables.
    """
    coherence_values = []
    model_list = []
    for k in range(start, limit, step):
        # Train with the `dictionary` argument rather than a notebook global, so the
        # model and the coherence score are always based on the same vocabulary.
        model = gensim.models.LdaModel(num_topics=k, corpus=corpus, id2word=dictionary, alpha=alpha, eta=eta,
                                       iterations=iterations, passes=passes, random_state=random_state,
                                       minimum_probability=minimum_probability)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# apply the function, it might take a long time.
#limit=80; start=0; step=5;
#model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=texts, start=start, limit=limit, step=step)

In [None]:
# plot the coherence score against number of topics
#x = range(start, limit, step)
#list_num_topics = [i for i in x]
#df_coherence = pd.DataFrame({'Number_of_Topics': list_num_topics, 'Coherence_Score': coherence_values})
#fig1 = px.line(df_coherence, x = 'Number_of_Topics', y="Coherence_Score", title = 'Coherence score against number of topics')
#fig1.update_layout(autosize=False, width=1000, height=400)
#fig1.update_traces(mode = "lines + markers")
#fig1.show()

In [28]:
# Train the final LDA model with k topics.
# NOTE(review): this cell was previously annotated with "35 topics for the optimal coherence
# score" while k is set to 15, and `alpha` still holds the value computed from the earlier k
# — confirm both are intended.
k = 15
lda_model = gensim.models.LdaModel(
    corpus=corpus, id2word=id2word, num_topics=k,
    alpha=alpha, eta=eta,
    passes=passes, iterations=iterations,
    random_state=12345, minimum_probability=minimum_probability,
)

# Classify the paragraphs based on the trained model

### Extract paragraphs from documents

In [29]:
# create the funtion for extraction of paragraphs by splitting the documents by new lines
def para_split(i):
    """Split a document's text into paragraphs on blank lines.

    Args:
        i: full text of a document, or ``None`` when no text could be extracted.

    Returns:
        A list of paragraph strings. The text is split on ``'\\n \\n'`` when that
        separator occurs anywhere in it, otherwise on ``'\\n\\n'``. ``None`` yields
        an empty list instead of raising ``TypeError``.
    """
    if i is None:
        return []
    if '\n \n' in i:
        return i.split('\n \n')
    return i.split('\n\n')

In [30]:
# For every PDF: extract the text with Tika, split it into paragraphs, and record the
# cleaned paragraphs together with their running ids (one list per file, in file order).
list_paragraphs = []
list_para_id = []
for pdf_path in pdf_files:
    parsed = parser.from_file(pdf_path)
    # `content` may be missing or None when no text is extracted from a file
    # (NOTE(review): confirm against the Tika client in use); fall back to an empty
    # string so such a file simply contributes no paragraphs.
    content = parsed.get('content') or ''
    para = [chunk.replace('\n', '').strip() for chunk in para_split(content)]
    para = [chunk for chunk in para if chunk]  # remove empty elements
    list_paragraphs.append(para)
    list_para_id.append(list(range(len(para))))

2021-08-21 22:01:18,618 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


In [69]:
df_para1 = df.copy()
# `list_paragraphs` / `list_para_id` hold one entry per file in `pdf_files`, whereas `df`
# can have lost rows to de-duplication. `document_id` is each row's original position in
# that file list, so look the lists up through it instead of assigning them positionally.
df_para1['paragraphs'] = [list_paragraphs[doc_id] for doc_id in df_para1['document_id']]
df_para1['para_id'] = [list_para_id[doc_id] for doc_id in df_para1['document_id']]
# One row per paragraph: explode the list-valued columns
df_para2 = df_para1.apply(pd.Series.explode)
df_para3 = df_para2.reset_index()
df_para4 = df_para3[['creationDate', 'document_id', 'file_name', 'para_id', 'paragraphs']]
df_para4

Unnamed: 0,creationDate,document_id,file_name,para_id,paragraphs
0,D:20201012160043+01'00',0,10-Year_Transport_Plan_02_Jul_2002.pdf,0,10-Year Transport Plan
1,D:20201012160043+01'00',0,10-Year_Transport_Plan_02_Jul_2002.pdf,1,6.
2,D:20201012160043+01'00',0,10-Year_Transport_Plan_02_Jul_2002.pdf,2,Mr. Geoffrey Clifton-Brown (Cotswold) If he w...
3,D:20201012160043+01'00',0,10-Year_Transport_Plan_02_Jul_2002.pdf,3,The Secretary of State for Transport (Mr. Alis...
4,D:20201012160043+01'00',0,10-Year_Transport_Plan_02_Jul_2002.pdf,4,Mr. Clifton-Brown Paragraph 84 of the Select C...
...,...,...,...,...,...
148647,D:20201012160658+01'00',228,West_Coast_Main_Line_05_Mar_2002.pdf,7,Mr. Spellar That was rather ungallant of Oppos...
148648,D:20201012160658+01'00',228,West_Coast_Main_Line_05_Mar_2002.pdf,8,Chris Grayling (Epsom and Ewell) The Governme...
148649,D:20201012160658+01'00',228,West_Coast_Main_Line_05_Mar_2002.pdf,9,"Mr. Spellar That is fine cheek, coming from a ..."
148650,D:20201012160658+01'00',228,West_Coast_Main_Line_05_Mar_2002.pdf,10,Andrew Bennett (Denton and Reddish) Does my r...


I first tried to classify the topics of all 148,652 paragraphs above, but this was very time-consuming: no result had been produced after 2 hours. So, as shown below, I provide a threshold n so that only paragraphs with at least n words are classified. Here I set n = 30, which reduces the number of paragraphs to be classified to 52,697; classification then takes about half an hour.

In [70]:
# Minimum number of words a paragraph needs in order to be classified
n_word_count = 30
# Whitespace-separated word count of each paragraph
para_word_count = df_para4['paragraphs'].str.split().str.len()
# Keep the paragraphs that reach the threshold; the old row labels move to an `index` column
df_para = df_para4.loc[para_word_count.ge(n_word_count)].reset_index()
df_para

Unnamed: 0,index,creationDate,document_id,file_name,para_id,paragraphs
0,2,D:20201012160043+01'00',0,10-Year_Transport_Plan_02_Jul_2002.pdf,2,Mr. Geoffrey Clifton-Brown (Cotswold) If he w...
1,3,D:20201012160043+01'00',0,10-Year_Transport_Plan_02_Jul_2002.pdf,3,The Secretary of State for Transport (Mr. Alis...
2,4,D:20201012160043+01'00',0,10-Year_Transport_Plan_02_Jul_2002.pdf,4,Mr. Clifton-Brown Paragraph 84 of the Select C...
3,5,D:20201012160043+01'00',0,10-Year_Transport_Plan_02_Jul_2002.pdf,5,Mr. Darling The report will be tabled in the n...
4,6,D:20201012160043+01'00',0,10-Year_Transport_Plan_02_Jul_2002.pdf,6,Clive Efford (Eltham) I welcome my right hon....
...,...,...,...,...,...,...
52692,148647,D:20201012160658+01'00',228,West_Coast_Main_Line_05_Mar_2002.pdf,7,Mr. Spellar That was rather ungallant of Oppos...
52693,148648,D:20201012160658+01'00',228,West_Coast_Main_Line_05_Mar_2002.pdf,8,Chris Grayling (Epsom and Ewell) The Governme...
52694,148649,D:20201012160658+01'00',228,West_Coast_Main_Line_05_Mar_2002.pdf,9,"Mr. Spellar That is fine cheek, coming from a ..."
52695,148650,D:20201012160658+01'00',228,West_Coast_Main_Line_05_Mar_2002.pdf,10,Andrew Bennett (Denton and Reddish) Does my r...


### Process the paragraphs

In [49]:
# Tokenise the selected paragraphs with the same tokeniser as the full documents
data2 = df_para['paragraphs'].tolist()
data_words2 = list(sent_to_words(data2))

In [50]:
# Paragraph-level preprocessing: the same three steps that were applied to whole documents.

# Remove Stop Words
data_words_nostops2 = remove_stopwords(data_words2)

# Form Trigrams
# (make_trigrams applies the phrase models `bigram_mod` / `trigram_mod`, which were
#  fitted on the full-document tokens, not on these paragraphs)
data_words_trigrams2 = make_trigrams(data_words_nostops2)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized2 = lemmatization(data_words_trigrams2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [51]:
# Drop every lemma shorter than `minimum_len` characters from each paragraph
minimum_len = 4
data_lemmatized2_1 = [
    [lemma for lemma in para_lemmas if len(lemma) >= minimum_len]
    for para_lemmas in data_lemmatized2
]

### Classify topics of paragraphs

In [52]:
# create the function for converting a list of tuples into a dictionary
def Convert(tup, di):
    """Turn an iterable of (key, value) pairs into a new dict.

    ``di`` is accepted for call compatibility only; it is never read or modified.
    """
    return dict(tup)

In [53]:
# Classify the topics of the selected paragraphs: infer the topic distribution of each
# preprocessed paragraph with the trained model. This can take a long time on a large corpus.
list_topic_para = []
dictionary_topic_para = {}
for d in data_lemmatized2_1:
    bow = id2word.doc2bow(d)                             # bag-of-words of the paragraph
    belong = lda_model[bow]                              # list of (topic id, probability) tuples
    doc_dic = Convert(belong, dictionary_topic_para)     # -> {topic id: probability}
    list_topic_para.append(doc_dic)

# Build the frame once, after the loop: constructing it inside the loop re-created an
# ever-growing DataFrame for every paragraph, which is quadratic in the number of paragraphs.
df_topic_para = pd.DataFrame(list_topic_para)

In [71]:
# Attach each paragraph's topic distribution to its metadata (rows are matched on the index)
df_topic_para1_1 = df_para.merge(df_topic_para, how='left', left_index=True, right_index=True)
df_topic_para1_1

Unnamed: 0,index,creationDate,document_id,file_name,para_id,paragraphs,0,1,2,3,...,5,6,7,8,9,10,11,12,13,14
0,2,D:20201012160043+01'00',0,10-Year_Transport_Plan_02_Jul_2002.pdf,2,Mr. Geoffrey Clifton-Brown (Cotswold) If he w...,0.072448,0.062922,0.056170,0.059287,...,0.075801,0.064500,0.060214,0.068891,0.064993,0.055450,0.077713,0.076425,0.068982,0.070236
1,3,D:20201012160043+01'00',0,10-Year_Transport_Plan_02_Jul_2002.pdf,3,The Secretary of State for Transport (Mr. Alis...,0.070747,0.070882,0.050330,0.061963,...,0.062193,0.065695,0.061436,0.068397,0.062524,0.049954,0.093780,0.076701,0.064980,0.063693
2,4,D:20201012160043+01'00',0,10-Year_Transport_Plan_02_Jul_2002.pdf,4,Mr. Clifton-Brown Paragraph 84 of the Select C...,0.086332,0.063053,0.037152,0.042598,...,0.065017,0.061546,0.050136,0.055723,0.066761,0.037708,0.116469,0.081088,0.071017,0.066188
3,5,D:20201012160043+01'00',0,10-Year_Transport_Plan_02_Jul_2002.pdf,5,Mr. Darling The report will be tabled in the n...,0.131403,0.068612,0.025374,0.029843,...,0.044684,0.044783,0.040568,0.055095,0.052669,0.024974,0.061665,0.052602,0.235253,0.040666
4,6,D:20201012160043+01'00',0,10-Year_Transport_Plan_02_Jul_2002.pdf,6,Clive Efford (Eltham) I welcome my right hon....,0.069285,0.069053,0.044518,0.049888,...,0.078048,0.061086,0.068690,0.062046,0.102345,0.043165,0.064314,0.079124,0.093914,0.056291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52692,148647,D:20201012160658+01'00',228,West_Coast_Main_Line_05_Mar_2002.pdf,7,Mr. Spellar That was rather ungallant of Oppos...,0.096125,0.071386,0.027858,0.041813,...,0.052810,0.040474,0.051293,0.056721,0.133052,0.026874,0.047339,0.051183,0.162787,0.045006
52693,148648,D:20201012160658+01'00',228,West_Coast_Main_Line_05_Mar_2002.pdf,8,Chris Grayling (Epsom and Ewell) The Governme...,0.085474,0.068257,0.047044,0.056173,...,0.066754,0.061583,0.050914,0.060732,0.076330,0.047374,0.057462,0.060376,0.128952,0.061780
52694,148649,D:20201012160658+01'00',228,West_Coast_Main_Line_05_Mar_2002.pdf,9,"Mr. Spellar That is fine cheek, coming from a ...",0.073769,0.074035,0.047510,0.057346,...,0.059344,0.056209,0.057094,0.060466,0.061084,0.047341,0.054888,0.058834,0.131114,0.070625
52695,148650,D:20201012160658+01'00',228,West_Coast_Main_Line_05_Mar_2002.pdf,10,Andrew Bennett (Denton and Reddish) Does my r...,0.083192,0.059174,0.047981,0.057561,...,0.068059,0.071295,0.062680,0.064459,0.084050,0.046833,0.059579,0.062758,0.117769,0.049012


In [145]:
# save the result to disk
# (the classification above is slow, so the paragraph/topic table is cached as a pickle
#  in the current working directory)
df_topic_para1_1.to_pickle('./df_topic_para1.pkl')

In [146]:
# load the result from disk
# NOTE: unpickling can execute arbitrary code - only load pickle files you created yourself.
df_topic_para1 = pd.read_pickle('./df_topic_para1.pkl') 

### Highest N ranked paragraphs overall

In [212]:
# Columns from position 6 onwards hold the per-topic probabilities of each paragraph.
df_topic_para1_n = df_topic_para1.assign(
    highest_p=df_topic_para1.iloc[:, 6:].max(axis=1),          # highest probability in the row
    salient_topic=df_topic_para1.iloc[:, 6:].idxmax(axis=1),   # topic id with that probability
)[['file_name', 'para_id', 'paragraphs', 'salient_topic', 'highest_p']]

In [213]:
# highest 5 ranked paragraphs overall
# (ranked by `highest_p`, the largest topic probability of each paragraph)
df_topic_para1_n.nlargest(5,['highest_p'])

Unnamed: 0,file_name,para_id,paragraphs,salient_topic,highest_p
32358,House_of_Commons._Fifth_report_from_the_Transp...,3,TRANSPORT COMMITTEE 35 24 February 1981] [Cont...,8,0.977223
32362,House_of_Commons._Fifth_report_from_the_Transp...,7,TRANSPORT COMMITTEE 209 IQiMay 1981] [Continue...,8,0.969239
32889,House_of_Commons._Transport_Committee._Session...,1,TRANSPORT COMMITTEE 185 \1 March \9%1] [Contin...,8,0.962825
49481,Transport_Select_Committee_The_future_of_light...,0,House of Commons Transport Committee Integrate...,6,0.953018
32360,House_of_Commons._Fifth_report_from_the_Transp...,5,186 MINUTES OF EVIDENCE TAKEN BEFORE THE 13 Ma...,8,0.948732


### Highest N ranked paragraphs from each topic

In [174]:
# define the function for extracting the highest N ranked paragraphs from each topic
def top_n_filter(df, top_n, num_topics=None):
    """Return the ``top_n`` highest-probability paragraphs for every topic.

    Args:
        df: frame with a ``'paragraphs'`` column and one probability column per topic,
            labelled with the integer topic ids ``0 .. num_topics - 1``.
        top_n: number of paragraphs to keep per topic.
        num_topics: number of topics to scan; defaults to the notebook-level ``k``.

    Returns:
        A frame with columns ``topic_id``, ``salient_paragraph`` and ``probability``,
        holding one row per (topic, selected paragraph) and indexed by the topic's position.
    """
    n_topics = k if num_topics is None else num_topics
    list_topic_id = list(range(n_topics))
    list_n_para = []
    list_n_p = []
    for topic_id in list_topic_id:
        # Rank once per topic and read both the text and the probability from the result
        top_rows = df.nlargest(top_n, [topic_id])
        list_n_para.append(list(top_rows['paragraphs']))
        list_n_p.append(list(top_rows[topic_id]))
    pd_n_para = pd.DataFrame({'topic_id': list_topic_id, 'salient_paragraph': list_n_para, 'probability': list_n_p})
    # Expand the per-topic lists into one row per paragraph
    return pd_n_para.apply(pd.Series.explode)

In [175]:
# highest 2 ranked paragraphs from each topic
# (uses the paragraph/topic table reloaded from disk)
top_n_filter(df_topic_para1, 2)

Unnamed: 0,topic_id,salient_paragraph,probability
0,0,"Lord Falconer of Thoroton My Lords, obviously ...",0.640784
0,0,"Lord Falconer of Thoroton My Lords, it is unaf...",0.629251
1,1,somewhere—I think it is that one—is number 19 ...,0.707127
1,1,"hall, the big studio there which we designed, ...",0.681743
2,2,£40 million to £60 million £20 million to £40 ...,0.066667
2,2,July 1998 July 1998 July 1998 July 1998 July 1...,0.066667
3,3,1. In the Compulsory Purchase Act 1965 (hereaf...,0.756951
3,3,Clause 12 requires notice of any street works ...,0.692393
4,4,"Lord Clinton-Davis My Lords, I thank the Minis...",0.789729
4,4,"Lord Brabazon of Tara My Lords, I beg to move ...",0.782108


### Highest N ranked paragraphs for topic K

In [221]:
topic_id_chosen = 7                    # choose the topic ID
num_para = 2                           # set N
# Use N for the ranking itself (it was previously defined but the call hard-coded 2)
df_n_topic_k = top_n_filter(df_topic_para1, num_para)
topic_id_filter = df_n_topic_k['topic_id'] == topic_id_chosen
df_n_topic_k[topic_id_filter]

Unnamed: 0,topic_id,salient_paragraph,probability
7,7,18084. What you were suggesting earlier was a ...,0.554868
7,7,aoi-t House of Commons Transport Committee Ove...,0.53027


### Highest N ranked paragraphs where the belong() function is greater than the threshold for M topics at a time

In [176]:
# selecting the paragraphs where the belong() function is greater than the threshold for M topics at a time
threshold = 1/3                                                    # set threshold
# The topic-probability columns start at position 6 (positions 0-5 are index, creationDate,
# document_id, file_name, para_id and paragraphs). Starting at 5 would pull the text column
# `paragraphs` into the row-wise max, which only works on pandas versions that silently
# drop non-numeric columns.
topic_filter = df_topic_para1.iloc[:, 6:].max(axis=1) > threshold  # set filter
df_topic_para_M = df_topic_para1[topic_filter]                     # extract the qualified paragraphs
df_topic_para_M

Unnamed: 0,index,creationDate,document_id,file_name,para_id,paragraphs,0,1,2,3,...,5,6,7,8,9,10,11,12,13,14
74,255,D:20190730052236Z,1,10_year_plan_for_transport_21_May_2002.pdf,241,45. The Transport Act 2000 makes provisions fo...,0.044262,0.043324,0.030238,0.042949,...,0.044425,0.077738,0.050043,0.061554,0.047844,0.028773,0.055126,0.335951,0.055171,0.043903
84,283,D:20190730052236Z,1,10_year_plan_for_transport_21_May_2002.pdf,269,51. The power to introduce local charging sche...,0.050313,0.044189,0.026413,0.029249,...,0.037758,0.063563,0.051164,0.056167,0.045450,0.025352,0.052043,0.334488,0.105345,0.037271
85,285,D:20190730052236Z,1,10_year_plan_for_transport_21_May_2002.pdf,271,52. Hie 10 Year Plan acknowledges that “people...,0.050369,0.035537,0.023224,0.025945,...,0.046300,0.072777,0.041990,0.058119,0.042016,0.022812,0.055307,0.362818,0.093287,0.034373
153,483,D:20190730052236Z,1,10_year_plan_for_transport_21_May_2002.pdf,469,102. Outside the Department there is overwhelm...,0.039320,0.030659,0.019356,0.022694,...,0.041283,0.046922,0.039678,0.072998,0.030887,0.019056,0.069240,0.466428,0.044025,0.028318
155,495,D:20190730052236Z,1,10_year_plan_for_transport_21_May_2002.pdf,481,"104, The Commission for Integrated Transport b...",0.052787,0.034785,0.020927,0.028427,...,0.039780,0.051108,0.037862,0.079260,0.044644,0.020515,0.060161,0.390724,0.071802,0.032363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52623,148543,D:20201012155409+01'00',226,Tube_Investment_debate_in_Commons_Chamber_08_D...,347,Mr. Jenkin: The notes say that income from the...,0.157773,0.030668,0.007399,0.015877,...,0.039327,0.032086,0.023835,0.020724,0.022524,0.007307,0.020543,0.015676,0.562022,0.018751
52624,148545,D:20201012155409+01'00',226,Tube_Investment_debate_in_Commons_Chamber_08_D...,349,The Parliamentary Under-Secretary of State for...,0.025860,0.025818,0.009835,0.013093,...,0.015744,0.015999,0.015915,0.019068,0.017515,0.009789,0.015963,0.017362,0.751439,0.016530
52625,148547,D:20201012155409+01'00',226,Tube_Investment_debate_in_Commons_Chamber_08_D...,351,Mr. Hill: I am sorry. The hon. Gentleman has n...,0.025708,0.014416,0.009230,0.010668,...,0.015945,0.015724,0.016117,0.023949,0.017505,0.009025,0.018608,0.023142,0.770463,0.012866
52637,148564,D:20201012155409+01'00',226,Tube_Investment_debate_in_Commons_Chamber_08_D...,368,"Division No. 15] [10.15 pm AYES Adams, Mrs Ire...",0.079947,0.039264,0.022127,0.028502,...,0.031971,0.040964,0.033352,0.036987,0.036890,0.022422,0.054178,0.098775,0.408451,0.032336


In [214]:
# Highest 2 ranked paragraphs where the belong() function is greater than the threshold for M topics at a time
# (same ranking as before, restricted to the paragraphs kept in `df_topic_para_M`)
top_n_filter(df_topic_para_M, 2)

Unnamed: 0,topic_id,salient_paragraph,probability
0,0,"Lord Falconer of Thoroton My Lords, obviously ...",0.640784
0,0,"Lord Falconer of Thoroton My Lords, it is unaf...",0.629251
1,1,somewhere—I think it is that one—is number 19 ...,0.707127
1,1,"hall, the big studio there which we designed, ...",0.681743
2,2,(3) Subsection (2) shall have effect as if lan...,0.045091
2,2,(3) Subsection (2) shall have effect as if lan...,0.045091
3,3,1. In the Compulsory Purchase Act 1965 (hereaf...,0.756951
3,3,Clause 12 requires notice of any street works ...,0.692393
4,4,"Lord Clinton-Davis My Lords, I thank the Minis...",0.789729
4,4,"Lord Brabazon of Tara My Lords, I beg to move ...",0.782108


# Overview of topics

### The most frequent 10 words of each topic

In [222]:
# Pretty-print the (topic id, weighted top words) pairs returned by print_topics()
# with its default arguments
pprint(lda_model.print_topics())

[(0,
  '0.016*"would" + 0.013*"railtrack" + 0.012*"year" + 0.010*"think" + '
  '0.009*"investment" + 0.008*"take" + 0.008*"make" + 0.008*"public" + '
  '0.008*"company" + 0.007*"cost"'),
 (1,
  '0.019*"would" + 0.016*"crossrail" + 0.009*"committee" + 0.008*"think" + '
  '0.008*"work" + 0.007*"make" + 0.007*"point" + 0.007*"take" + '
  '0.007*"station" + 0.006*"area"'),
 (2,
  '0.009*"transport" + 0.008*"service" + 0.007*"year" + 0.007*"rail" + '
  '0.006*"scheme" + 0.006*"would" + 0.005*"make" + 0.005*"line" + '
  '0.005*"government" + 0.005*"take"'),
 (3,
  '0.029*"work" + 0.023*"shall" + 0.019*"land" + 0.019*"paragraph" + '
  '0.014*"provision" + 0.014*"section" + 0.014*"part" + 0.013*"schedule" + '
  '0.013*"purpose" + 0.011*"railway"'),
 (4,
  '0.023*"bill" + 0.014*"would" + 0.013*"crossrail" + 0.012*"make" + '
  '0.009*"clause" + 0.009*"project" + 0.008*"government" + 0.008*"give" + '
  '0.008*"amendment" + 0.007*"member"'),
 (5,
  '0.014*"line" + 0.012*"train" + 0.012*"year" + 0.

### Topic distribution across documents

In [226]:
# topic distribution over documents
list_topic = []
dictionary_topic = {}
for d in texts:
    bow = id2word.doc2bow(d)
    belong = lda_model[bow]                        # generate a list of tuples of topic distribution of a document
    belong_dic = Convert(belong, dictionary_topic) # convert the list of tuples into a dictionary
    list_topic.append(belong_dic)

# convert the list of dictionaries into a dataframe.
# `texts` was derived from `df` row by row, so reuse `df`'s index: the index-based merge
# below then pairs every document with its own distribution even when `df`'s index is
# not 0..n-1 (e.g. after duplicate rows were dropped).
df_topic_distribution = pd.DataFrame(list_topic, index=df.index)
df_topic = pd.merge(df, df_topic_distribution, how = 'left', left_index=True, right_index=True) # merge with info of documents
# display only: the result of drop() is shown, `df_topic` itself keeps all columns
df_topic.drop(['title','format','creator', 'producer', 'keywords', 'trapped', 'encryption','subject', 'modDate'], axis = 1)

Unnamed: 0,author,creationDate,file_name,Content,document_id,Word_count,0,1,2,3,...,5,6,7,8,9,10,11,12,13,14
0,Katherine A Bloomfield,D:20201012160043+01'00',10-Year_Transport_Plan_02_Jul_2002.pdf,10-Year Transport Plan \n \n6. \n \nMr. Geoffr...,0,1437,0.046091,0.025581,0.005751,0.010421,...,0.016342,0.018955,0.016739,0.023464,0.029125,0.005569,0.055000,0.088750,0.511309,0.014800
1,,D:20190730052236Z,10_year_plan_for_transport_21_May_2002.pdf,"House of Commons \nTransport, Local Government...",1,292641,0.116414,0.000963,0.000027,0.000154,...,0.000489,0.039231,0.003378,0.067757,0.028585,0.000026,0.007787,0.617993,0.116872,0.000163
2,stellent,D:20041008151844+01'00',A_New_deal_for_Transport_Better_for_everyone_W...,AnNew deal for Transport: Better for everyone ...,2,67602,0.000597,0.000495,0.000098,0.000772,...,0.019016,0.004444,0.009851,0.000971,0.024285,0.000094,0.018857,0.918314,0.001196,0.000385
3,Katherine A Bloomfield,D:20200429153449+01'00',British_Railways_(London)_Bill_Lords_(By_Order...,British Railways (London) Bill Lords (By Order...,3,26519,0.001833,0.174634,0.000393,0.004542,...,0.004299,0.003642,0.005085,0.035104,0.068484,0.000375,0.001093,0.001104,0.428075,0.002353
4,Katherine A Bloomfield,D:20201012144724+01'00',British_Railways_Bill_19_Mar_1991.pdf,British Railways Bill \n \nOrder for Second Re...,4,3167,0.007888,0.047891,0.002888,0.031235,...,0.018869,0.017723,0.032891,0.021458,0.204248,0.002736,0.008439,0.018922,0.307411,0.007159
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,Katherine A Bloomfield,D:20201012164224+01'00',Transport__London_Underground_07_May_2008.pdf,Transport: London Underground \n \nLord Bridge...,224,512,0.254691,0.034235,0.015145,0.017681,...,0.062010,0.028310,0.026871,0.037356,0.028373,0.016293,0.030588,0.030522,0.077234,0.037115
225,Katherine A Bloomfield,D:20201012164258+01'00',Transport__Rail_and_Air_Travel_06_Mar_2008.pdf,Transport: Rail and Air Travel \n \nWhat is mi...,225,5173,0.006610,0.021962,0.001528,0.003640,...,0.005300,0.046562,0.016488,0.013617,0.104275,0.001472,0.034401,0.204020,0.524514,0.004688
226,Katherine A Bloomfield,D:20201012155409+01'00',Tube_Investment_debate_in_Commons_Chamber_08_D...,Tube Investment \n \n[Relevant documents: Seve...,226,59658,0.021269,0.000975,0.000149,0.000874,...,0.000752,0.002586,0.000753,0.001191,0.000849,0.000145,0.001007,0.003048,0.964035,0.000561
227,Katherine A Bloomfield,D:20201012163723+01'00',Waterloo_Station_14_Mar_2007.pdf,Waterloo Station \n \n11:00:00 \n \nSusan Kra...,227,4714,0.009605,0.034907,0.001724,0.005358,...,0.012611,0.010449,0.209508,0.007894,0.287885,0.001658,0.018974,0.006365,0.373946,0.007304


# Topic interpretation
To interpret the topics, I combined the word frequency shown by pyLDAvis with the prototypical documents or paragraphs suggested by the PTBI approach proposed by Marchetti and Puranam (2020).

### Selection of most salient topics for interpretation
According to the PTBI approach proposed by Marchetti and Puranam (2020), not all topics are worth interpreting. To select the most salient topics, we compute, for each topic, the percentage of documents that load on it, i.e. documents whose probability of belonging to the topic is greater than 1/k. We then graph this metric and select the subset of topics scoring highest on it (e.g., based on a scree plot).

In [265]:
# For every topic column: share of documents whose probability exceeds the threshold 1/k
list_percent_above = [
    (df_topic_distribution[topic] > 1/k).sum() / len(df_topic_distribution)
    for topic in df_topic_distribution
]

In [262]:
# `list_percent_above` was built by iterating over the columns of `df_topic_distribution`,
# so those column labels are the matching topic ids. (No notebook-level `list_topic_id`
# exists; that name is only a local variable inside `top_n_filter`.)
df_salient_topic = pd.DataFrame({
    'topic_id': list(df_topic_distribution.columns),
    'percentage_of_documents_above_threshold': list_percent_above,
})
df_salient_topic.sort_values(by = 'percentage_of_documents_above_threshold', ascending = False)

Unnamed: 0,topic_id,percentage_of_documents_above_threshold
13,13,0.606987
4,4,0.305677
0,0,0.262009
9,9,0.257642
12,12,0.235808
8,8,0.218341
5,5,0.174672
1,1,0.131004
11,11,0.126638
14,14,0.09607


### Word frequency of each topic

In [227]:
# Visualize the topics
# Render the interactive pyLDAvis panel inline in the notebook.
pyLDAvis.enable_notebook()
# sort_topics=False: presumably keeps the panel's topic order aligned with the model's
# topic ids instead of re-ordering them - TODO confirm against the pyLDAvis docs.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, sort_topics = False )
vis

### Prototypical texts for each topic
I followed the method for extracting prototypical texts suggested by the PTBI approach of Marchetti and Puranam (2020). Its core idea is that, for a parameter L (the probability that a document belongs to a topic), at least 1/L documents whose probability of belonging to the topic is >= L are needed to interpret that topic. The method looks for the value of L that maximises the percentage of interpretable topics (Marchetti and Puranam, 2020, p. 20).

In [228]:
# Candidate values of 1/L: the minimum number of documents needed to interpret a topic
List_num_doc = list(range(1, 20))
# Matching values of L, the probability level itself
list_L = [1/num_doc for num_doc in List_num_doc]

In [229]:
# create the function for computing the percentage of potentially interpretable topics against parameter L
def perc(i, df, num_topics=None):
    """Return the share of topics that are potentially interpretable at threshold ``i``.

    A topic (one column of ``df``) counts as interpretable at level L = ``i`` when at
    least 1/L rows have a value >= L in that column.

    Parameters
    ----------
    i : float
        Threshold L on the topic probability. Must be non-zero, because 1/i is the
        minimum number of rows required.
    df : pandas.DataFrame
        Topic distribution; every column is treated as one topic and every row as
        one document or paragraph.
    num_topics : int, optional
        Denominator of the share. Defaults to the number of columns in ``df``, so
        the function does not depend on a notebook-level ``k``.

    Returns
    -------
    float
        Number of interpretable topics divided by ``num_topics``; 0.0 when there
        are no topics at all.
    """
    if num_topics is None:
        num_topics = df.shape[1]
    if num_topics == 0:
        # Without topics there is nothing to interpret; avoids an unbound result
        # (and a division by zero) on an empty frame.
        return 0.0

    # Rows per topic whose probability reaches the threshold. NaN compares False,
    # which matches a filter-then-count on each column.
    rows_at_or_above = (df >= i).sum()
    # Aggregate once, after every topic has been counted.
    interpretable = int((rows_at_or_above >= 1 / i).sum())
    return interpretable / num_topics

The following chart shows that the percentage of potentially interpretable topics for “high enough” levels of L is not large enough, so the paragraph-based interpretation can be explored. 

In [230]:
# Share of interpretable topics for every candidate L, computed on the document-level distribution
list_perc1 = [perc(L_value, df_topic_distribution) for L_value in list_L]

df_L1 = pd.DataFrame(
    {
        'Parameter L': list_L,
        'Percentage of interpretable topics': list_perc1,
    }
)
fig_L1 = px.line(
    df_L1,
    x='Parameter L',
    y="Percentage of interpretable topics",
    title='Value selection for parameter L (document-based)',
)
fig_L1.update_layout(autosize=False, width=1200, height=400)
fig_L1.update_traces(mode="lines + markers")
fig_L1.show()

The following chart shows that when L = 0.5, the percentage of interpretable topics is 86.7%, so we set L = 0.5, i.e. each topic needs at least 2 (= 1/L) paragraphs to be interpreted.

In [231]:
# Display the paragraph-level topic distribution; the output below shows topic columns 0-14
# (presumably one row per paragraph -- confirm where df_topic_para is built).
df_topic_para

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.072448,0.062922,0.056170,0.059287,0.065967,0.075801,0.064500,0.060214,0.068891,0.064993,0.055450,0.077713,0.076425,0.068982,0.070236
1,0.070747,0.070882,0.050330,0.061963,0.076724,0.062193,0.065695,0.061436,0.068397,0.062524,0.049954,0.093780,0.076701,0.064980,0.063693
2,0.086332,0.063053,0.037152,0.042598,0.099212,0.065017,0.061546,0.050136,0.055723,0.066761,0.037708,0.116469,0.081088,0.071017,0.066188
3,0.131403,0.068612,0.025374,0.029843,0.091810,0.044684,0.044783,0.040568,0.055095,0.052669,0.024974,0.061665,0.052602,0.235253,0.040666
4,0.069285,0.069053,0.044518,0.049888,0.058232,0.078048,0.061086,0.068690,0.062046,0.102345,0.043165,0.064314,0.079124,0.093914,0.056291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52692,0.096125,0.071386,0.027858,0.041813,0.095278,0.052810,0.040474,0.051293,0.056721,0.133052,0.026874,0.047339,0.051183,0.162787,0.045006
52693,0.085474,0.068257,0.047044,0.056173,0.070794,0.066754,0.061583,0.050914,0.060732,0.076330,0.047374,0.057462,0.060376,0.128952,0.061780
52694,0.073769,0.074035,0.047510,0.057346,0.090340,0.059344,0.056209,0.057094,0.060466,0.061084,0.047341,0.054888,0.058834,0.131114,0.070625
52695,0.083192,0.059174,0.047981,0.057561,0.065598,0.068059,0.071295,0.062680,0.064459,0.084050,0.046833,0.059579,0.062758,0.117769,0.049012


In [232]:
#df_topic_para2 = df_topic_para1.drop(['document_id', 'paragraphs'], axis = 1)
# Share of interpretable topics for every candidate L, computed on the paragraph-level distribution
list_perc2 = [perc(L_value, df_topic_para) for L_value in list_L]

df_L2 = pd.DataFrame(
    {
        'Parameter L': list_L,
        'Percentage of interpretable topics': list_perc2,
    }
)
fig_L2 = px.line(
    df_L2,
    x='Parameter L',
    y="Percentage of interpretable topics",
    title='Value selection for parameter L (paragraph-based )',
)
fig_L2.update_layout(autosize=False, width=1200, height=400)
fig_L2.update_traces(mode="lines + markers")
fig_L2.show()

# Build topic model on paragraphs

In [None]:
# tokenization
#data2 = df_para.paragraphs.values.tolist()
#data_words2_2 = list(sent_to_words(data2))

In [None]:
# set the length of word threshold for removing the words less than the threshold
# NOTE: this cell is kept fully commented out, like the other cells in this section.
# A live loop body under a commented-out `for` line is an IndentationError on Run All.
#minimum_len = 4
#data_words2 = []
#for i in data_words2_2:
#    new_element = [x for x in i if len(x) >= minimum_len]
#    data_words2.append(new_element)

In [None]:
# Bigram & Trigram
#bigram2 = gensim.models.Phrases(data_words2, min_count=5, threshold=100) # higher threshold fewer phrases.
#trigram2 = gensim.models.Phrases(bigram2[data_words2], threshold=100)  
#bigram_mod2 = gensim.models.phrases.Phraser(bigram2)
#trigram_mod2 = gensim.models.phrases.Phraser(trigram2)

In [None]:
# Remove Stop Words
#data_words_nostops2 = remove_stopwords(data_words2)

# Form Trigrams
#data_words_trigrams2 = make_trigrams(data_words_nostops2)

# Do lemmatization keeping only noun, adj, vb, adv
#data_lemmatized2 = lemmatization(data_words_trigrams2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [None]:
# Create Dictionary
#id2word2 = corpora.Dictionary(data_lemmatized2)

# Create Corpus
#texts2 = data_lemmatized2

# Term Document Frequency
#corpus2 = [id2word2.doc2bow(text) for text in texts2]

In [None]:
#lda_model2 = gensim.models.LdaModel(
#    corpus=corpus2,
#    id2word=id2word2,
#   alpha=alpha,
#    eta=eta,
#    iterations=iterations,
#    num_topics=k, 
#   passes=passes)

In [None]:
# Compute Coherence Score
#coherence_model_lda2 = CoherenceModel(model=lda_model2, texts=data_lemmatized2, dictionary=id2word2, coherence='c_v')
#coherence_lda2 = coherence_model_lda2.get_coherence()
#print('\nCoherence Score: ', coherence_lda2)

In [None]:
# Visualize the topics
#vis2 = pyLDAvis.gensim_models.prepare(lda_model2, corpus2, id2word2, sort_topics = False)
#vis2