In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import missingno as msn
import re
import string
import nltk

import glob

pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
# pd.set_option('display.float_format', '{:.2f}'.format)
# pd.set_option('display.width', None)


# Creating pickle 2018 - 2020

In [None]:
cols = ['yearly_budget', 'yearly_forecast', 'wlc_baseline_incl_NCG']

def load_and_clean_data(file_path):
    df = pd.read_csv(file_path, thousands=',', encoding="latin2")
    cols_to_clean = cols
    
    for col in cols_to_clean:
        df[col] = df[col].str.replace('Ł', '', regex=False).str.replace(',', '', regex=False)
    
    return df

# Load and clean data for each year
df_2018 = load_and_clean_data("../data/raw_data/uk_2018.csv")
df_2019 = load_and_clean_data("../data/raw_data/uk_2019.csv")
df_2020 = load_and_clean_data("../data/raw_data/uk_2020.csv")

# Concatenate dataframes
df1820 = pd.concat([df_2018, df_2019, df_2020], axis=0)

df1820["colour_rating"] = df1820["colour_rating"].str.lower()

# Transform datatype
for col in cols:
    df1820[col] = pd.to_numeric(df1820[col], errors='coerce')

    
df1820["start_date"] = pd.to_datetime(df1820["start_date"], errors="coerce", format="%d.%m.%y")
df1820["end_date"] = pd.to_datetime(df1820["end_date"], errors="coerce", format="%d.%m.%y")

# Cleaning colour rating and making exempts into NaNs
colour_rating = ['amber', 'green', 'red', 'amber/red', 'amber/green', "reset"]
df1820["colour_rating"] = df1820["colour_rating"].str.lower()
df1820["colour_rating"] = np.where(df1820["colour_rating"].isin(colour_rating), df1820["colour_rating"], np.nan)

month_mapping = {
        'jan': '01', 'january': '01',
        'feb': '02', 'february': '02',
        'mar': '03', 'march': '03',
        'apr': '04', 'april': '04',
        'may': '05', 'may': '05',
        'jun': '06', 'june': '06',
        'jul': '07', 'july': '07',
        'aug': '08', 'august': '08',
        'sep': '09', 'september': '09',
        'oct': '10', 'october': '10',
        'nov': '11', 'november': '11',
        'dec': '12', 'december': '12'
    }

def extract_date_after_to_or_on(comment):
    REGEX = "(to|is|on)\s*(\d{1,2})\s+(\w+)\s+(\d{2,4})"
    x = re.compile(REGEX)
    
    match_REGEX = x.search(comment)      
    
    if match_REGEX:
        day = match_REGEX.group(2)
        month = match_REGEX.group(3)
        year = match_REGEX.group(4)
        if len(day) == 1:
            day = '0'+ day
        month = month_mapping.get(month.lower())
        if len(year) == 2:
            year = '20' + year
        return year +'-'+ month +'-'+ day

for index, row in df1820.iterrows():
    if pd.isna(row['end_date']):
        extracted_date = extract_date_after_to_or_on(row['schedule_comment'])
        if extracted_date:
            df1820.at[index, 'end_date'] = extracted_date


#df1820.to_pickle("../data/pickle/cleaned_2018_2020.pkl")



# EDA

In [3]:
df = pd.read_pickle("../data/pickle/final_pickle.pkl")
df = df.drop(1727)


In [None]:
msn.matrix(df)

In [None]:
df["start_date"] = pd.to_datetime(df["start_date"])
df["end_date"] = pd.to_datetime(df["end_date"])
# Calculate the duration in years
df["year_duration"] = (df["end_date"] - df["start_date"]).dt.days / 365.25

# Optionally, round the duration to a specific number of decimal places
df["year_duration"] = df["year_duration"].round(2)

In [None]:
df

In [None]:
df.describe()[["yearly_budget", "yearly_forecast", "wlc_baseline_incl_NCG", "total_benefits"]]

In [None]:
df.groupby("year").describe()[["yearly_budget", "yearly_forecast", "wlc_baseline_incl_NCG", "total_benefits"]]

In [None]:
#sns.pairplot(df)

In [None]:
corr_matrix = df.select_dtypes(np.number).corr()
plt.figure(figsize=(12,12))

sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
msn.matrix(df)

# Topic Modelling

In [24]:
df = pd.read_pickle("../data/pickle/final_pickle.pkl")
df = df.drop(1727)
df["description_aims"] = df['description_aims'].fillna("")

In [24]:
df

Unnamed: 0,project_name,description_aims,rating_comment,start_date,end_date,schedule_comment,yearly_budget,yearly_forecast,wlc_baseline_incl_NCG,variance_comment,budget_comment,year,report_category,project_number,total_benefits,benefits_comment,department_CO,department_CPS,department_DBT,department_DCMS,department_DEFRA,department_DEFRA & DFT,department_DESNZ,department_DFE,department_DFID,department_DFT,department_DHSC,department_DLUHC,department_DSIT,department_DWP,department_FCDO,department_HMLR,department_HMRC,department_HMT,department_HO,department_MOD,department_MOJ,department_NCA,department_NS&I,department_ONS,department_VOA,colour_rating_amber/green,colour_rating_amber/red,colour_rating_green,colour_rating_red,colour_rating_reset,year_duration,start_year,end_year,GDP,annual_earning_ft,unemployment_rate,youth_unemployment_rate,inflation_rate,population,gov_debt,tax_revenue,revenue_excl_grants,grants_and_other_revenue
0,Civil Service Pensions 2015 Remedy,The 2015 Pensions Remedy Programme was created...,"Compared to financial year 21/22-Q4, the Deliv...",2020-05-20,2024-09-30,"Compared to financial year 21/22-Q4, the proje...",7.90,4.20,34.00,The budget variance exceeds 5%. This is primar...,"Compared to financial year 21/22-Q4, the proje...",2023.0,Government Transformation and Service Delivery,CO_0024_2021-Q2,,The project did not provide departmental-agree...,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,4.36,2020.0,2024.0,2270764.0,33061.0,0.0373,0.1051,0.092,67791000.0,35655860.0,6842110.0,9215550.0,598780.0
1,Commercial Capability Expansion Programme,The original Commercial Capability Programme s...,"Compared to financial year 21/22-Q4, the Seni...",2017-04-01,2023-03-31,"Compared to financial year 21/22-Q4, the proje...",3.70,3.10,26.00,The budget variance exceeds 5%. We are continu...,"Compared to financial year 21/22-Q4, the proje...",2023.0,Government Transformation and Service Delivery,CO_0020_1718-Q4,,The project did not provide departmental-agree...,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,6.00,2017.0,2023.0,2270764.0,33061.0,0.0373,0.1051,0.092,67791000.0,35655860.0,6842110.0,9215550.0,598780.0
2,Falcon IT Platform Refresh and Migration,Falcon is a business change programme that wil...,The Senior Responsible Owner's Delivery Confid...,2022-05-01,2025-03-31,The project's end-date at 22/23-Q4 is 2025-03-...,0.00,0.00,52.00,The programme was challenged to move at pace b...,The project's departmental-agree Whole Life Co...,2023.0,Government Transformation and Service Delivery,CO_0176_2223-Q3,60.0,The project's departmental-agree monetised ben...,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,2.92,2022.0,2025.0,2270764.0,33061.0,0.0373,0.1051,0.092,67791000.0,35655860.0,6842110.0,9215550.0,598780.0
3,Future Service Programme,Future Services Programme is a series of procu...,"Compared to financial year 21/22-Q4, the Deliv...",2020-10-01,2025-12-31,"Compared to financial year 21/22-Q4, the proje...",1.60,1.70,,Data already completed on GMPP,NA\r\nThe project did not provide data,2023.0,Government Transformation and Service Delivery,CO_0027_2021-Q4,11.0,The project's departmental-agree monetised ben...,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,5.25,2020.0,2025.0,2270764.0,33061.0,0.0373,0.1051,0.092,67791000.0,35655860.0,6842110.0,9215550.0,598780.0
4,GOV.UK One Login,GOV.UK One Login will provide a single account...,"Compared to financial year 21/22-Q4, the Infra...",2021-01-04,2025-03-31,"Compared to financial year 21/22-Q4, the proje...",114.00,114.00,305.00,The programme's departmental-agree Whole Life ...,"Compared to financial year 21/22-Q4, the proje...",2023.0,ICT,CO_0033_2122-Q1,1752.0,The project's departmental-agree monetised ben...,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,4.24,2021.0,2025.0,2270764.0,33061.0,0.0373,0.1051,0.092,67791000.0,35655860.0,6842110.0,9215550.0,598780.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722,MoJ Shared Services Evolve (SS Evolve) Programme,The MoJ Shared Services ProgrammeÂs aim is to...,"At quarter 2, the programme was working to a p...",2013-09-07,2015-11-05,The revised June 2016 baseline had a delivery ...,76.76,75.51,395.15,Budget variance less than 5%,The Whole Life Costs (WLC) represent the move ...,2017.0,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,2.16,2013.0,2015.0,2111357.0,28159.0,0.0481,0.1289,0.018,65611593.0,31190050.0,5066560.0,6686710.0,397240.0
1723,Secure Training centre (STC) Retendering Project,The project is retendering and mobilising a Se...,Quarter 2 was assessed as Amber/Red as there w...,2012-11-05,2017-05-01,The new service at Rainsbrook STC commenced on...,10.83,10.83,86.98,Budget variance less than 5%,The whole life cost of the project reflects th...,2017.0,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,4.48,2012.0,2017.0,2111357.0,28159.0,0.0481,0.1289,0.018,65611593.0,31190050.0,5066560.0,6686710.0,397240.0
1724,IT Transformation Programme,IT Transformation is one of three programmes t...,The IPA Review Team made 7 recommendations whe...,2016-04-01,2022-07-31,Schedule on track,14.00,14.00,168.68,Variance less than 5%,Whole life costs continue to be on target,2017.0,,,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,6.33,2016.0,2022.0,2111357.0,28159.0,0.0481,0.1289,0.018,65611593.0,31190050.0,5066560.0,6686710.0,397240.0
1725,NCA Transformation Programme,The NCAÂs Transformation Programme which will...,The current status of the NCA Transformation P...,2014-04-01,2019-03-31,Schedule on track,25.50,25.50,306.80,Variance less than 5%,Whole life costs continue to be on target.,2017.0,Government Transformation and Service Delivery,NCA_0001_1415-Q2,,,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,5.00,2014.0,2019.0,2111357.0,28159.0,0.0481,0.1289,0.018,65611593.0,31190050.0,5066560.0,6686710.0,397240.0


In [35]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Assuming df is already defined and contains the column "description_aims"
corpus = df["description_aims"].tolist()

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Add domain-specific stopwords
domain_specific_stopwords = set(["service", "deliver", "programme", "support", "provide", "new", "system", "government"])

# Combine with existing stopwords
stop = set(stopwords.words('english')).union(domain_specific_stopwords)

exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

# Define a function to clean a document, and handle cases where doc is not a string
def clean(doc):
    if isinstance(doc, str):  # Check if the doc is a string
        stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
        punc_free = "".join(ch for ch in stop_free if ch not in exclude)
        normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
        return normalized
    else:
        return ""  # Return an empty string for non-string entries (e.g., NaN)

# Apply cleaning function to the corpus
clean_corpus = [clean(doc).split() for doc in corpus]

# Now clean_corpus should not throw any errors

[nltk_data] Downloading package stopwords to /Users/hieu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/hieu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/hieu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [28]:
corpus = df["description_aims"][0]

In [36]:
clean_corpus

[['2015',
  'pension',
  'remedy',
  'created',
  'end',
  'age',
  'discrimination',
  'within',
  'civil',
  'pension',
  'scheme',
  'also',
  'creating',
  'solution',
  'remediate',
  'affected',
  'historic',
  'member',
  'scheme'],
 ['original',
  'commercial',
  'capability',
  'successfully',
  'established',
  'commercial',
  'organisation',
  'gco',
  'single',
  'central',
  'employer',
  'several',
  'hundred',
  'commercial',
  'specialist',
  'grade',
  '6',
  'above',
  'central',
  'department',
  'civil',
  'board',
  'endorsed',
  'proposal',
  'commercial',
  'capability',
  'building',
  'intervention',
  'extended',
  'grade',
  '7',
  'commercial',
  'professional',
  'within',
  'central',
  'government',
  'wider',
  'body',
  'wgbs',
  'training',
  'accreditation',
  'developed',
  'delivered',
  'civil',
  'contract',
  'management',
  'community',
  'commercial',
  'capability',
  'expansion',
  'established',
  'impact',
  'target',
  'population',
  'dee

In [33]:
clean_corpus = [clean(corpus)]

In [34]:
clean_corpus

['2015 pension remedy created end age discrimination within civil pension scheme also creating solution remediate affected historic member scheme']

In [7]:
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'deliver',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'government',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'new',
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourse

In [18]:
from gensim import corpora

# Creating document-term matrix 
dictionary = corpora.Dictionary(clean_corpus)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean_corpus]

In [9]:
from gensim.models import LdaModel

# LDA model
lda = LdaModel(doc_term_matrix, num_topics=20, id2word = dictionary)

# Results
print(lda.print_topics(num_topics=20, num_words=3))

[(0, '0.014*"prison" + 0.007*"place" + 0.006*"value"'), (1, '0.007*"project" + 0.006*"capability" + 0.006*"current"'), (2, '0.009*"service" + 0.009*"energy" + 0.008*"â\x95"'), (3, '0.011*"capability" + 0.009*"service" + 0.008*"technology"'), (4, '0.008*"work" + 0.008*"nh" + 0.008*"uk"'), (5, '0.009*"contract" + 0.009*"â\x95" + 0.008*"health"'), (6, '0.013*"service" + 0.012*"uk" + 0.005*"building"'), (7, '0.010*"scheme" + 0.008*"pension" + 0.007*"employer"'), (8, '0.006*"capability" + 0.006*"uk" + 0.005*"service"'), (9, '0.021*"service" + 0.008*"business" + 0.007*"digital"'), (10, '0.011*"project" + 0.011*"service" + 0.008*"contract"'), (11, '0.008*"benefit" + 0.007*"â\x95" + 0.006*"system"'), (12, '0.010*"service" + 0.007*"health" + 0.007*"â\x95"'), (13, '0.011*"service" + 0.009*"uk" + 0.009*"capability"'), (14, '0.015*"service" + 0.011*"nh" + 0.007*"project"'), (15, '0.010*"project" + 0.009*"service" + 0.008*"work"'), (16, '0.011*"capability" + 0.009*"â\x95" + 0.007*"service"'), (17, 

In [10]:
lda

<gensim.models.ldamodel.LdaModel at 0x163fd1050>

---

In [11]:
import nltk
nltk.download('stopwords')

import re
import numpy as np
import pandas as  pd
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel# spaCy for preprocessing
import spacy# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline


[nltk_data] Downloading package stopwords to /Users/hieu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'programme'])

In [39]:
data = df["description_aims"].tolist()

data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]  
# Remove new line characters 
data = [re.sub('\s+', ' ', sent) for sent in data]  
# Remove distracting single quotes 
data = [re.sub("\'", "", sent) for sent in data]  
pprint(data[:1])

['The 2015 Pensions Remedy Programme was created to end age discrimination '
 'within the Civil Service Pension Schemes and is also creating solutions to '
 'remediate any affected to historic members of the scheme.']


  data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
  data = [re.sub('\s+', ' ', sent) for sent in data]


In [40]:
data

['The 2015 Pensions Remedy Programme was created to end age discrimination within the Civil Service Pension Schemes and is also creating solutions to remediate any affected to historic members of the scheme.',
 'The original Commercial Capability Programme successfully established the Government Commercial Organisation (GCO) - a single central employer of several hundred Commercial Specialists (Grade 6 and above) for central government departments. The Civil Service Board endorsed proposals that commercial capability building interventions should be extended to Grade 7 commercial professionals within central government, Wider Government Bodies (WGBs) and training and accreditation developed and delivered to the Civil Service contract management community. The Commercial Capability Expansion Programme has been established to impact these new target populations deeper within the Civil Service and more broadly across the Public Sector.',
 'Falcon is a business change programme that will f

In [41]:
def sent_to_words(sentences):
  for sentence in sentences:
    yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))            #deacc=True removes punctuations
data_words = list(sent_to_words(data))
print(data_words[:1])


[['the', 'pensions', 'remedy', 'programme', 'was', 'created', 'to', 'end', 'age', 'discrimination', 'within', 'the', 'civil', 'service', 'pension', 'schemes', 'and', 'is', 'also', 'creating', 'solutions', 'to', 'remediate', 'any', 'affected', 'to', 'historic', 'members', 'of', 'the', 'scheme']]


In [42]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['the', 'pensions', 'remedy', 'programme', 'was', 'created', 'to', 'end', 'age', 'discrimination', 'within', 'the', 'civil', 'service', 'pension_schemes', 'and', 'is', 'also', 'creating', 'solutions', 'to', 'remediate', 'any', 'affected', 'to', 'historic', 'members', 'of', 'the', 'scheme']


In [43]:
# Define function for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [44]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load("en_core_web_sm")

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['pension', 'remedy', 'create', 'end', 'age', 'discrimination', 'civil', 'service', 'pension', 'scheme', 'also', 'create', 'solution', 'remediate', 'affect', 'historic', 'member', 'scheme']]


In [45]:
# Create Dictionary 
id2word = corpora.Dictionary(data_lemmatized)  
# Create Corpus 
texts = data_lemmatized  
# Term Document Frequency 
corpus = [id2word.doc2bow(text) for text in texts]  
# View 
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1)]]


In [46]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]


[[('affect', 1),
  ('age', 1),
  ('also', 1),
  ('civil', 1),
  ('create', 2),
  ('discrimination', 1),
  ('end', 1),
  ('historic', 1),
  ('member', 1),
  ('pension', 2),
  ('remediate', 1),
  ('remedy', 1),
  ('scheme', 2),
  ('service', 1),
  ('solution', 1)]]

In [47]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [48]:
# Print the keyword of topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.109*"operate" + 0.068*"range" + 0.044*"control" + 0.042*"age" + '
  '0.041*"activity" + 0.038*"operating" + 0.038*"term" + 0.033*"defra" + '
  '0.027*"agreement" + 0.021*"fraud"'),
 (1,
  '0.086*"project" + 0.033*"electronic" + 0.029*"investment" + 0.028*"school" '
  '+ 0.025*"hospital" + 0.024*"decision" + 0.023*"major" + 0.023*"capital" + '
  '0.022*"number" + 0.020*"time"'),
 (2,
  '0.089*"joint" + 0.073*"offender" + 0.056*"asset" + 0.042*"non" + '
  '0.032*"criminal" + 0.029*"product" + 0.027*"space" + 0.027*"limit" + '
  '0.025*"register" + 0.024*"tackle"'),
 (3,
  '0.102*"production" + 0.094*"fit" + 0.078*"become" + 0.068*"condition" + '
  '0.042*"route" + 0.036*"diverse" + 0.033*"see" + 0.028*"culture" + '
  '0.020*"north" + 0.020*"regulator"'),
 (4,
  '0.095*"network" + 0.066*"create" + 0.058*"efficient" + 0.037*"move" + '
  '0.037*"construction" + 0.035*"put" + 0.029*"able" + 0.026*"collection" + '
  '0.024*"apprenticeship" + 0.024*"open"'),
 (5,
  '0.066*"cost" + 0.

In [49]:
df["description_aims"]

0       The 2015 Pensions Remedy Programme was created...
1       The original Commercial Capability Programme s...
2       Falcon is a business change programme that wil...
3       Future Services Programme is a series of procu...
4       GOV.UK One Login will provide a single account...
                              ...                        
1722    The MoJ Shared Services ProgrammeÂs aim is to...
1723    The project is retendering and mobilising a Se...
1724    IT Transformation is one of three programmes t...
1725    The NCAÂs Transformation Programme which will...
1726    The Census Transformation Programme will run f...
Name: description_aims, Length: 1727, dtype: object

In [50]:
df["project_name"]

0                      Civil Service Pensions 2015 Remedy
1               Commercial Capability Expansion Programme
2                Falcon IT Platform Refresh and Migration
3                                Future Service Programme
4                                        GOV.UK One Login
                              ...                        
1722    MoJ Shared Services Evolve (SS Evolve) Programme 
1723     Secure Training centre (STC) Retendering Project
1724                          IT Transformation Programme
1725                         NCA Transformation Programme
1726                      Census Transformation Programme
Name: project_name, Length: 1727, dtype: object