In [None]:
import io

import pandas as pd
from google.colab import files

# Open the Colab upload dialog; the result maps each uploaded file name to
# that file's raw bytes.
uploaded = files.upload()

# Parse the bytes that were just uploaded rather than re-opening a fixed path.
# Colab may store the upload under a different local name when a file of the
# same name already exists ("Saving <name> to <local name>"), in which case
# the hard-coded path would silently load a stale copy.
if uploaded:
    # Only one CSV is expected; if several were selected, the first is used.
    csv_bytes = next(iter(uploaded.values()))
    df = pd.read_csv(io.BytesIO(csv_bytes), encoding='MacRoman')
else:
    # Nothing was uploaded (e.g. the dialog was cancelled): fall back to a
    # copy that is already on disk.
    df = pd.read_csv('data_cleaned_final.csv', encoding='MacRoman')

Saving data_cleaned_final.csv to data_cleaned_final.csv


In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import nltk

# Download necessary NLTK datasets
nltk.download('punkt')
nltk.download('stopwords')

# Use only the title for keyword extraction. Missing titles become empty
# strings so that ' '.join() and .lower() never receive a non-string (NaN)
# value, while row positions stay aligned with df.
all_texts = df['Title'].fillna('').astype(str)

# Load the stopword list once and keep it as a set: calling
# stopwords.words() inside the comprehension would rebuild the list for
# every token and test membership with a linear scan.
english_stopwords = set(stopwords.words('english'))

# Tokenize, clean stopwords and non-alphabetic words, and convert to lowercase
tokens = word_tokenize(' '.join(all_texts).lower())
clean_tokens = [
    token
    for token in tokens
    if token.isalpha() and token not in english_stopwords
]

# Count the frequency of each word
word_freq = Counter(clean_tokens)

# Get the 30 most common words
most_common_words = word_freq.most_common(30)
most_common_words


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


[('services', 899),
 ('provision', 538),
 ('service', 362),
 ('system', 304),
 ('supply', 285),
 ('framework', 265),
 ('support', 260),
 ('works', 248),
 ('management', 226),
 ('maintenance', 218),
 ('care', 208),
 ('council', 197),
 ('health', 189),
 ('programme', 187),
 ('market', 187),
 ('dps', 175),
 ('tender', 169),
 ('contract', 169),
 ('engagement', 159),
 ('dynamic', 144),
 ('development', 143),
 ('purchasing', 138),
 ('project', 130),
 ('research', 119),
 ('community', 116),
 ('centre', 114),
 ('children', 112),
 ('refurbishment', 111),
 ('housing', 109),
 ('school', 109)]

In [None]:
# Tokenize each document separately and clean.
# The stopword list is loaded once into a set; calling stopwords.words()
# inside the nested comprehension would rebuild the list for every single
# token and test membership with a linear scan.
stop_words = set(stopwords.words('english'))

# One list of lowercase tokens per title.
document_tokens = [word_tokenize(doc.lower()) for doc in all_texts]

# Keep only alphabetic, non-stopword tokens, preserving one list per title.
clean_document_tokens = [
    [token for token in doc if token.isalpha() and token not in stop_words]
    for doc in document_tokens
]


In [None]:
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

# Build the token <-> integer-id vocabulary from the cleaned titles.
dictionary = Dictionary(clean_document_tokens)

# Prune the vocabulary: drop tokens that occur in fewer than 15 documents or
# in more than half of all documents, then cap it at the 100000 most
# frequent remaining tokens.
dictionary.filter_extremes(
    no_below=15,
    no_above=0.5,
    keep_n=100000,
)

# Convert every title into its bag-of-words representation
# (a list of (token_id, count) pairs) using the pruned vocabulary.
corpus = list(map(dictionary.doc2bow, clean_document_tokens))

In [None]:
# Training configuration for the LDA topic model, gathered in one place so
# each setting is easy to find and tune. random_state is fixed so repeated
# runs use the same seed.
lda_settings = dict(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Fit the model on the bag-of-words corpus.
lda_model = LdaModel(**lda_settings)

# Note: Adjust num_topics according to how many distinct topics you believe the dataset contains.

In [None]:
# Show every topic learned by the model (-1 requests all of them) together
# with its weighted top words.
topic_template = 'Topic: {} \nWords: {}'
for topic_id, top_words in lda_model.print_topics(-1):
    print(topic_template.format(topic_id, top_words))

Topic: 0 
Words: 0.507*"new" + 0.152*"one" + 0.111*"two" + 0.102*"responsive" + 0.005*"mode" + 0.004*"esrc" + 0.002*"grants" + 0.000*"provision" + 0.000*"works" + 0.000*"school"
Topic: 1 
Words: 0.372*"market" + 0.286*"engagement" + 0.201*"public" + 0.092*"improvement" + 0.000*"provision" + 0.000*"services" + 0.000*"soft" + 0.000*"service" + 0.000*"test" + 0.000*"management"
Topic: 2 
Words: 0.460*"health" + 0.182*"platform" + 0.114*"energy" + 0.081*"mental" + 0.066*"alternative" + 0.027*"hubs" + 0.000*"services" + 0.000*"provision" + 0.000*"service" + 0.000*"occupational"
Topic: 3 
Words: 0.174*"opportunity" + 0.171*"data" + 0.148*"fund" + 0.123*"green" + 0.106*"engineering" + 0.079*"infrastructure" + 0.079*"strategic" + 0.000*"services" + 0.000*"local" + 0.000*"goods"
Topic: 4 
Words: 0.262*"use" + 0.213*"impact" + 0.176*"flexible" + 0.157*"uk" + 0.001*"system" + 0.001*"procurement" + 0.001*"services" + 0.001*"contracts" + 0.001*"access" + 0.001*"single"
Topic: 5 
Words: 0.658*"netwo

In [None]:
# Infer the topic mixture of every document once and keep the results, so
# later cells can look up any document without re-running inference.
# Each entry is a list of (topic_id, probability) pairs.
document_topics = [lda_model.get_document_topics(doc) for doc in corpus]

# Print only a short preview: emitting one line per document floods the cell
# output, which Colab then truncates to the last 5000 lines, hiding the
# earliest documents. Inspect document_topics[i] for any specific document.
PREVIEW_DOCS = 10
for index, doc_topics in enumerate(document_topics[:PREVIEW_DOCS]):
    print(f"Document {index} Topics: {doc_topics}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Document 293 Topics: [(0, 0.027886521), (1, 0.049673747), (2, 0.04219765), (3, 0.034585427), (4, 0.018475205), (5, 0.36494574), (6, 0.059262883), (7, 0.056014366), (8, 0.026081027), (9, 0.0426161), (10, 0.026036983), (11, 0.023001214), (12, 0.024123814), (13, 0.020848969), (14, 0.09470682), (15, 0.018711558), (16, 0.01876075), (19, 0.033486)]
Document 294 Topics: [(0, 0.020564906), (1, 0.036631886), (2, 0.031118643), (3, 0.025505014), (4, 0.27632362), (6, 0.04370339), (7, 0.04130777), (8, 0.019233447), (9, 0.031427227), (10, 0.019200966), (11, 0.01696224), (12, 0.0177901), (13, 0.015375066), (14, 0.06984147), (15, 0.013798833), (16, 0.01383511), (19, 0.28726527)]
Document 295 Topics: [(0, 0.016284965), (1, 0.029008105), (2, 0.02464227), (3, 0.22822013), (4, 0.0107890135), (6, 0.034607895), (7, 0.03271085), (8, 0.015230606), (9, 0.024886634), (10, 0.015204886), (11, 0.013432079), (12, 0.014087647), (13, 0.012175227), (14, 

In [None]:
pip install pyLDAvis


Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [None]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Prepare the interactive pyLDAvis view of the trained model.
# sort_topics=False keeps the topics in the model's original order instead of
# re-sorting them by prevalence.
vis = gensimvis.prepare(
    lda_model,
    corpus,
    dictionary,
    sort_topics=False,
)

# Render the visualization inline as the cell's output.
pyLDAvis.display(vis)
